In [2]:
import json
import plotly
import matplotlib
import pandas as pd
import plotly.express as px

# Harry Potter Data Info

In [3]:
# Load the pre-computed inputs for this notebook from local pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by this project.
characters = pd.read_pickle('data/characters_by_house.pkl')
# Remove the 'House Unknown' entry before any per-house processing.
# presumably `characters` maps house name -> collection of character names
# (it is iterated with .items() further down) -- TODO confirm
del characters['House Unknown']
clean_books = pd.read_pickle('data/books_in_chapters_clean.pkl')
interactions = pd.read_pickle('data/interaction_df.pkl')

# Characters

In [4]:
# Flatten the house -> characters mapping into (house, character) pairs,
# one pair per character, preserving the iteration order of `characters`.
house_character_pairs = [
    (house_name, member)
    for house_name, members in characters.items()
    for member in members
]

# Two parallel lists: the i-th house belongs to the i-th character.
character_house_map = {
    'houses': [house_name for house_name, _ in house_character_pairs],
    'characters': [member for _, member in house_character_pairs],
}

characters_df = pd.DataFrame(data=character_house_map)

In [5]:
# Prism palette; `colr` is also read by the Reddit chart cell further down.
colr = plotly.colors.qualitative.Prism
house_colors = {
    'Gryffindor': colr[7],
    'Hufflepuff': colr[5],
    'Ravenclaw': colr[1],
    'Slytherin': colr[3],
}

# Give every character the color of its house through an explicit mapping.
# A positional color list only lines up when the rows happen to be grouped in
# exactly Gryffindor/Hufflepuff/Ravenclaw/Slytherin order with unique names;
# the mapping is independent of row order. If a character is listed more than
# once, the first listed house wins. Characters whose house is not in
# `house_colors` are left out and fall back to plotly's default colors.
character_houses = (
    characters_df
    .drop_duplicates('characters')
    .set_index('characters')['houses']
)
character_color_map = character_houses.map(house_colors).dropna().to_dict()

# One text annotation per house, placed slightly above the top of its bar.
total_count = characters_df.groupby('houses').count()
total_labels = [
    {"x": x, "y": total*1.05, "text": str(total), "showarrow": False}
    for x, total in zip(total_count.index, total_count['characters'])
]

# color='characters' keeps one stacked segment per character in each bar.
fig = px.histogram(characters_df, x='houses', color='characters',
                   color_discrete_map=character_color_map,
                   title="Count of Characters by Hogwarts House")
fig.update_layout(annotations=total_labels)
fig

In [6]:
# Serialize the current figure to JSON on disk.
fig.write_json('website/characters-by-house.json')

# Reddit User Data Info

Unnamed: 0,author,subreddit,subreddit_id,house
0,PetevonPete,TheDragonPrince,t5_lghxu,Gryffindor
1,flooperdooper4,intj,t5_2qowo,Ravenclaw
2,ofcabbagesandkings14,drawing,t5_2qvp9,Ravenclaw
3,ofcabbagesandkings14,TrollXChromosomes,t5_2sekm,Ravenclaw
4,newfriend999,reylo,t5_3by99,Gryffindor
...,...,...,...,...
25298,TurtleKing0505,subwaysurfers,t5_2ydam,Hufflepuff
25299,Vocadofries,whatstheword,t5_2sjdt,Hufflepuff
25300,TurtleKing0505,BokuNoShipAcademia,t5_i9f3h,Hufflepuff
25301,InquisitorCOC,InquisitorCOC,t5_9f8vb,Slytherin


In [None]:
# Load the Reddit table; the columns used later are author, subreddit and house.
reddit_users_by_house = pd.read_csv(
    filepath_or_buffer='reddit_data/HP_author_subreddit_data.csv',
)
reddit_users_by_house

In [12]:
top = 100
house_names = ["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin"]


def _top_subreddit_names(frame: pd.DataFrame, count_col: str, limit: int) -> pd.Series:
    """Return the names of the `limit` most frequent subreddits in `frame`.

    Parameters
    ----------
    frame : rows to rank; must contain a "subreddit" column and `count_col`.
    count_col : column whose per-subreddit non-null count is the ranking key.
    limit : maximum number of subreddit names to return.

    Returns
    -------
    A Series of subreddit names, most frequent first, indexed 0..k-1 where
    k <= limit (k is smaller when `frame` has fewer distinct subreddits).
    """
    per_subreddit = frame.groupby("subreddit").count()
    ranked = per_subreddit.sort_values(by=count_col, ascending=False)
    return pd.Series(ranked.index[:limit])


# Rank across the whole table first, then once per house.
top_by_group = {"All": _top_subreddit_names(reddit_users_by_house, "house", top)}
for col in house_names:
    house_rows = reddit_users_by_house[reddit_users_by_house.house == col]
    top_by_group[col] = _top_subreddit_names(house_rows, "author", top)

# Building the frame from Series aligns the columns on their position index,
# so a group with fewer than `top` subreddits is padded with NaN instead of
# raising a length-mismatch ValueError.
top_subreddits = pd.DataFrame(top_by_group).reset_index(drop=True)
top_subreddits

Unnamed: 0,All,Gryffindor,Hufflepuff,Ravenclaw,Slytherin
0,harrypotter,AskReddit,harrypotter,memes,harrypotter
1,AskReddit,harrypotter,AskReddit,harrypotter,InquisitorCOC
2,memes,memes,BokuNoHeroAcademia,drawing,HPHogwartsMystery
3,drawing,lgbt,BNHA_OC_Characters,AskReddit,AnimalCrossing
4,PrequelMemes,shittysuperpowers,teenagers,Enough_Sanders_Spam,friendship
...,...,...,...,...,...
95,gameofthrones,AskOuija,CellsAtWork,puppy101,KingdomHearts
96,BokuNoShipAcademia,toofers,titanfolk,saltierthankrayt,FreeKarma4U
97,SubredditDrama,Cuber_01,camphalfblood,insanepeoplefacebook,HolUp
98,SequelMemes,intj,rarepuppers,shittymoviedetails,nextfuckinglevel


In [17]:
# Same palette-index -> house assignment as the character chart above.
house_colors = {
    'Gryffindor': colr[7],
    'Hufflepuff': colr[5],
    'Ravenclaw': colr[1],
    'Slytherin': colr[3],
}

# Color each author by house through an explicit mapping. plotly hands out a
# positional color list to the unique `author` values in order of first
# appearance; the rows here are not grouped by house and authors repeat, so a
# list built as per-house runs of colors does not line up with the authors.
# If an author appears with more than one house, the first one listed wins.
author_houses = (
    reddit_users_by_house
    .drop_duplicates('author')
    .set_index('author')['house']
)
author_color_map = author_houses.map(house_colors).dropna().to_dict()

# NOTE(review): both the bars and the labels count rows of the table, and the
# same author can appear on several rows (one per subreddit), so the totals
# may overstate the number of distinct users named in the title -- confirm
# which of the two is intended.
total_count = reddit_users_by_house.groupby('house').count()
total_labels = [
    {"x": x, "y": total*1.1, "text": str(total), "showarrow": False}
    for x, total in zip(total_count.index, total_count['author'])
]

fig = px.histogram(reddit_users_by_house, x='house', color='author',
                   color_discrete_map=author_color_map,
                   title="Count of Reddit Users by Hogwarts House")
fig.update_layout(annotations=total_labels)
fig

In [15]:
# Number of rows whose house is Gryffindor (True counts as 1 in the sum).
int((reddit_users_by_house['house'] == 'Gryffindor').sum())

6464