In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

# https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html
#
# Categories are ordered by descending frequency in dataset so that
# setting unsorted=True makes graphs come out correctely where
# the category is the primary category of the population.
#
# To get the frequency of, say, 'language', run
# src/explore/table/histogram-table.py -i data/database/20220612.yaml language

categories_type = pd.api.types.CategoricalDtype(
    categories=[
        'M/M',
        'Gen',
        'F/M',
        'F/F',
        'Multi',
        'No category',
        'Other'
    ],
    ordered=True)

# Ordered by frequency in dataset
warnings_type = pd.api.types.CategoricalDtype(
    categories=[
        'No Archive Warnings Apply',
        'Choose Not To Use Archive Warnings',
        'Graphic Depictions Of Violence',
        'Major Character Death',
        'Rape/Non-Con',
        'Underage',
    ],
    ordered=True)

# Ordered by frequency in dataset
rating_type = pd.api.types.CategoricalDtype(
    categories=[
        'General Audiences',
        'Teen And Up Audiences',
        'Explicit',
        'Mature',
        'Not Rated',
    ],
    ordered=True)

language_type = pd.api.types.CategoricalDtype(
    categories=[
        'en',
        'ru',
        'de',
        'zh-Hans',
        'it',
        'pt-br',
        'ko',
        'fr',
        'es',
        'cy',
        'pl',
        'cs',
        'ja',
        'he',
        'tlh-Latn',
        'nl'
    ],
    ordered=True)

dtypes = { 'id': 'int64',
           'author': 'string',
           'chapter': 'Int64',
           'chapters': 'Int64',
           'comments': 'Int64',
           'complete': 'bool',
           'filename': 'string',
           'hits': 'Int64',
           'kudos': 'Int64',
           'language': 'category',
           'summary': 'string',
           'title': 'string',
           'userid': 'Int64',
           'words': 'Int64',
           'rating': rating_type,
           'language': language_type }

# Load data from CSV into Pandas dataframe
# See https://pbpython.com/pandas_dtypes.html
df = pd.read_csv('../../../data/database/20220612.csv', dtype=dtypes)
df.set_index('id', inplace=True)

# Convert some strings to lists
def strtolist(s):
    if pd.isna(s):
        return list()
    else:
        return eval(s)

df['categories'] = df['categories'].apply(strtolist)
df['characters'] = df['characters'].apply(strtolist)
df['charactersclean'] = df['charactersclean'].apply(strtolist)
df['fandoms'] = df['fandoms'].apply(strtolist)
df['freeforms'] = df['freeforms'].apply(strtolist)
df['relationships'] = df['relationships'].apply(strtolist)
df['relationshipspair'] = df['relationshipspair'].apply(strtolist)
df['relationshipspairslash'] = df['relationshipspairslash'].apply(strtolist)
df['relationshipspairamp'] = df['relationshipspairamp'].apply(strtolist)
df['relationshipspax'] = df['relationshipspax'].apply(strtolist)
df['relationshipspaxslash'] = df['relationshipspaxslash'].apply(strtolist)
df['warnings'] = df['warnings'].apply(strtolist)

# Convert to pandas datetime
# Only publications after 2010
df['publicationdate'] = pd.to_datetime(df['publicationdate'])
dawn = pd.Timestamp('2010-01-01')
df = df[df['publicationdate'] >= dawn]

# Only English
df = df[df['language'] == 'en']

# Complete works
df = df[df['complete'] == True]


In [2]:
df['freeforms']

id
45841                                [Epistolary, Post-Canon]
45842                                                      []
45843                               [Marriage of Convenience]
45844                          [Alternate Reality, Pre-Canon]
51767                                                      []
                                  ...                        
39326181                                                [Art]
39329223                                                   []
39335598    [Canon Relationships, Enemies to Family, Episo...
39338307                 [Crossdressing, Effeminacy, Fan Art]
39340149    [Cardassian flirting by somebody who doesn't w...
Name: freeforms, Length: 8645, dtype: object

In [8]:
df2 = df['freeforms'].apply(pd.Series)
df3percent = df2.iloc[:,0].value_counts(normalize = True)*100
df3percent.info
df3 = df2.iloc[:,0].value_counts()
df3.info

AttributeError: 'Series' object has no attribute 'info'