In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

# https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html
#
# Categories are ordered by descending frequency in dataset so that
# setting unsorted=True makes graphs come out correctely where
# the category is the primary category of the population.
#
# To get the frequency of, say, 'language', run
# src/explore/table/histogram-table.py -i data/database/20220612.yaml language

categories_type = pd.api.types.CategoricalDtype(
    categories=[
        'M/M',
        'Gen',
        'F/M',
        'F/F',
        'Multi',
        'No category',
        'Other'
    ],
    ordered=True)

# Ordered by frequency in dataset
warnings_type = pd.api.types.CategoricalDtype(
    categories=[
        'No Archive Warnings Apply',
        'Choose Not To Use Archive Warnings',
        'Graphic Depictions Of Violence',
        'Major Character Death',
        'Rape/Non-Con',
        'Underage',
    ],
    ordered=True)

# Ordered by frequency in dataset
rating_type = pd.api.types.CategoricalDtype(
    categories=[
        'General Audiences',
        'Teen And Up Audiences',
        'Explicit',
        'Mature',
        'Not Rated',
    ],
    ordered=True)

language_type = pd.api.types.CategoricalDtype(
    categories=[
        'en',
        'ru',
        'de',
        'zh-Hans',
        'it',
        'pt-br',
        'ko',
        'fr',
        'es',
        'cy',
        'pl',
        'cs',
        'ja',
        'he',
        'tlh-Latn',
        'nl'
    ],
    ordered=True)

dtypes = { 'id': 'int64',
           'author': 'string',
           'chapter': 'Int64',
           'chapters': 'Int64',
           'comments': 'Int64',
           'complete': 'bool',
           'filename': 'string',
           'hits': 'Int64',
           'kudos': 'Int64',
           'language': 'category',
           'summary': 'string',
           'title': 'string',
           'userid': 'Int64',
           'words': 'Int64',
           'rating': rating_type,
           'language': language_type }

# Load data from CSV into Pandas dataframe
# See https://pbpython.com/pandas_dtypes.html
df = pd.read_csv('../../../data/database/20220612.csv', dtype=dtypes)
df.set_index('id', inplace=True)

# Convert some strings to lists
def strtolist(s):
    if pd.isna(s):
        return list()
    else:
        return eval(s)

df['categories'] = df['categories'].apply(strtolist)
df['characters'] = df['characters'].apply(strtolist)
df['charactersclean'] = df['charactersclean'].apply(strtolist)
df['fandoms'] = df['fandoms'].apply(strtolist)
df['freeforms'] = df['freeforms'].apply(strtolist)
df['relationships'] = df['relationships'].apply(strtolist)
df['relationshipspair'] = df['relationshipspair'].apply(strtolist)
df['relationshipspairslash'] = df['relationshipspairslash'].apply(strtolist)
df['relationshipspairamp'] = df['relationshipspairamp'].apply(strtolist)
df['relationshipspax'] = df['relationshipspax'].apply(strtolist)
df['relationshipspaxslash'] = df['relationshipspaxslash'].apply(strtolist)
df['warnings'] = df['warnings'].apply(strtolist)

# Convert to pandas datetime
# Only publications after 2010
df['publicationdate'] = pd.to_datetime(df['publicationdate'])
dawn = pd.Timestamp('2010-01-01')
df = df[df['publicationdate'] >= dawn]

# Only English
df = df[df['language'] == 'en']

# Complete works
df = df[df['complete'] == True]


In [2]:
df

Unnamed: 0_level_0,author,bookmarks,categories,chapter,chapters,characters,charactersclean,cleandate,comments,complete,...,relationshipspairamp,relationshipspairslash,relationshipspax,relationshipspaxamp,relationshipspaxslash,summary,title,userid,warnings,words
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45841,thehoyden,1188,[M/M],1,1,[],[],2022-08-12 20:45:47.422521,234,True,...,[],[Elim Garak/Julian Bashir],"[Elim Garak, Julian Bashir]",,"[Elim Garak, Julian Bashir]",It just figured that the first time Julian Bas...,Letters from the Northern Continent,474,[No Archive Warnings Apply],7966
45842,thehoyden,417,[M/M],1,1,[],[],2022-08-12 20:45:47.422521,71,True,...,[],[Elim Garak/Julian Bashir],"[Elim Garak, Julian Bashir]",,"[Elim Garak, Julian Bashir]","""You want me to read to you?"" Julian repeated,...",Literacy,474,[No Archive Warnings Apply],5581
45843,thehoyden,675,[M/M],1,1,[],[],2022-08-12 20:45:47.422521,122,True,...,[],[Elim Garak/Julian Bashir],"[Elim Garak, Julian Bashir]",,"[Elim Garak, Julian Bashir]","""'True enough', my dear doctor, is my favorite...",Opacity of Paradise,474,[No Archive Warnings Apply],10154
45844,thehoyden,538,[M/M],1,1,[],[],2022-08-12 20:45:47.422521,111,True,...,[],[Elim Garak/Julian Bashir],"[Elim Garak, Julian Bashir]",,"[Elim Garak, Julian Bashir]","Exile was certainly dark, but in Julian Bashir...",Exile,474,[Choose Not To Use Archive Warnings],8949
51767,glacis,8,[F/M],1,1,[],[],2022-08-12 20:45:47.422521,2,True,...,[],"[Jadzia Dax/Julian Bashir, Julian Bashir/non-c...","[Jadzia Dax, Julian Bashir, Kira Nerys, non-cast]",,"[Jadzia Dax, Julian Bashir, Kira Nerys, non-cast]","An alternate back history for Julian Bashir, i...",Flashback by Sue Castle.,534,"[Graphic Depictions Of Violence, Rape/Non-Con]",26599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39326181,EEW101,0,[Gen],1,1,[Elim Garak],[Elim Garak],2022-08-12 20:45:47.422521,6,True,...,[],[],[],,[],just a portrait of the plain simple tailor,Through a glass darkly,13321660,[No Archive Warnings Apply],0
39329223,apollojusticeforall,0,[No category],1,1,"[Benjamin Sisko, Jadzia Dax, Julian Bashir, Ki...","[Benjamin Sisko, Jadzia Dax, Julian Bashir, Ki...",2022-08-12 20:45:47.422521,1,True,...,[],[Odo/Quark],"[Odo, Quark]",,"[Odo, Quark]",It’s the DS9 prank war episode we all deserved...,The Ferengi Is Wild,2374696,[No Archive Warnings Apply],8189
39335598,BrokenBlade,0,[Other],1,1,"[Kira Nerys, Tekeny Ghemor]","[Kira Nerys, Tekeny Ghemor]",2022-08-12 20:45:47.422521,0,True,...,[Kira Nerys & Tekeny Ghemor],[],"[Kira Nerys, Tekeny Ghemor]","['Kira Nerys', 'Tekeny Ghemor']",[],,All Need No Need,7740136,[No Archive Warnings Apply],100
39338307,deathlybijoumme,0,[Other],1,1,[Elim Garak],[Elim Garak],2022-08-12 20:45:47.422521,1,True,...,[],[],[],,[],Doodles I can't post on Tumblr that I was poss...,"Mature, Robust Cultivar",1268838,[No Archive Warnings Apply],174
