# Sum the number of comments, kudos, hits, and the number of works

For each character, the number of stories they appear in, and the total number of comments, kudos and hits those stories attracted.

Similarly for each slash pairing.

## Load data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

# https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html
#
# Categories are listed in descending order of frequency in the dataset,
# so that setting unsorted=True makes graphs come out correctly where
# the category is the primary category of the population.
#
# To get the frequency of, say, 'language', run
# src/explore/table/histogram-table.py -i data/database/20220612.yaml language

# Ordered categorical dtype for the work category labels.
categories_type = pd.CategoricalDtype(
    ['M/M', 'Gen', 'F/M', 'F/F', 'Multi', 'No category', 'Other'],
    ordered=True,
)

# Ordered categorical dtype for the archive warning labels.
# Categories are listed by frequency in the dataset.
warnings_type = pd.CategoricalDtype(
    [
        'No Archive Warnings Apply',
        'Choose Not To Use Archive Warnings',
        'Graphic Depictions Of Violence',
        'Major Character Death',
        'Rape/Non-Con',
        'Underage',
    ],
    ordered=True,
)

# Ordered categorical dtype for the rating labels.
# Categories are listed by frequency in the dataset.
rating_type = pd.CategoricalDtype(
    [
        'General Audiences',
        'Teen And Up Audiences',
        'Explicit',
        'Mature',
        'Not Rated',
    ],
    ordered=True,
)

# Ordered categorical dtype for the language codes.
language_type = pd.CategoricalDtype(
    [
        'en', 'ru', 'de', 'zh-Hans',
        'it', 'pt-br', 'ko', 'fr',
        'es', 'cy', 'pl', 'cs',
        'ja', 'he', 'tlh-Latn', 'nl',
    ],
    ordered=True,
)

# Column dtypes handed to pd.read_csv.
#
# 'Int64' (capital I) is the pandas nullable integer dtype, which can
# hold missing values; 'int64' is the plain NumPy integer dtype.
#
# 'language' appears exactly once: a dict literal keeps only the last
# value for a repeated key, so an earlier 'language': 'category' entry
# would be silently overridden by language_type anyway.
dtypes = {
    'id': 'int64',
    'author': 'string',
    'chapter': 'Int64',
    'chapters': 'Int64',
    'comments': 'Int64',
    'complete': 'bool',
    'filename': 'string',
    'hits': 'Int64',
    'kudos': 'Int64',
    'summary': 'string',
    'title': 'string',
    'userid': 'Int64',
    'words': 'Int64',
    'rating': rating_type,
    'language': language_type,
}

# Read the CSV into a pandas DataFrame, applying the dtypes declared
# above, and use the 'id' column as the index.
# See https://pbpython.com/pandas_dtypes.html
df = pd.read_csv(
    '../../../data/database/20220612.csv',
    dtype=dtypes,
).set_index('id')

# Convert some strings to lists
def strtolist(s):
    """Parse the text form of a Python list literal back into a list.

    Args:
        s: A string such as "['a', 'b']", or a missing value
            (None, NaN, pd.NA).

    Returns:
        An empty list when ``s`` is missing, otherwise the value
        parsed from ``s``.

    Raises:
        ValueError, SyntaxError: If ``s`` is not a valid Python literal.
    """
    # Imported here so the function does not depend on a top-of-file import.
    import ast

    if pd.isna(s):
        return []
    # NOTE: this used to be eval(s). ast.literal_eval accepts only
    # literals (lists, strings, numbers, ...), so a cell in the data
    # file can no longer execute arbitrary code while being parsed.
    return ast.literal_eval(s)

# Each of these columns is passed through strtolist(), which turns the
# string stored in the CSV into a list (missing cells become empty
# lists). One loop replaces thirteen copy-pasted assignments, so a new
# column only has to be added to this tuple.
for _list_column in (
    'categories',
    'characters',
    'charactersclean',
    'fandoms',
    'freeforms',
    'relationships',
    'relationshipspair',
    'relationshipspairslash',
    'relationshipspairamp',
    'relationshipspax',
    'relationshipspaxslash',
    'relationshipspaxamp',
    'warnings',
):
    df[_list_column] = df[_list_column].apply(strtolist)

# Parse the publication dates into pandas datetimes
df['publicationdate'] = pd.to_datetime(df['publicationdate'])

# Keep only the works that satisfy all three conditions:
#   - published on or after 1 January 2010
#   - written in English
#   - marked complete ('complete' is loaded as a bool column, so it can
#     be used as a mask directly)
dawn = pd.Timestamp('2010-01-01')
df = df[
    (df['publicationdate'] >= dawn)
    & (df['language'] == 'en')
    & df['complete']
]


## Drop the unneeded columns

If you want to look into other columns, then add them here.

In [2]:
# Keep the two list columns that get summarised, plus the four numeric
# columns that get summed.
df = df.loc[
    :,
    [
        'charactersclean',
        'relationshipspairslash',
        'comments',
        'hits',
        'kudos',
        'words',
    ],
]

# Program definitions

### allrows()

Find every distinct value that appears in the lists of the given column. Return the values in popularity order, most frequent first.

In [3]:
def allrows(df, match):
    """Return the distinct values of a list column, most frequent first.

    Args:
        df: DataFrame whose column ``match`` holds a list in every row.
        match: Name of the list column to examine.

    Returns:
        A Series named 'index' with a default integer index, holding
        each distinct list element once, in descending order of how
        often it occurs. It is empty when the column contains no
        values at all.
    """
    # explode() gives every list element a row of its own in a single
    # pass, replacing the row-by-row pd.concat. Rows with an empty list
    # become NaN, which value_counts() ignores.
    counts = df[match].explode().value_counts()
    # Build the result from the index of the counts rather than via
    # reset_index()['index']: the column produced by reset_index() is
    # only called 'index' on pandas < 2.0.
    return pd.Series(counts.index, name='index')

### summarisecharacter()

Go through the list of characters from the step above. For each character: find all the rows of the dataframe where they are in some `column` (like `charactersclean`); count the number of those rows; sum the values of the columns `comments`, `kudos`, `hits` and `words`.

In [4]:
def summarisecharacter(df, character, column):
    """Summarise the works whose list column contains ``character``.

    Args:
        df: DataFrame with the list column ``column`` and the numeric
            columns 'comments', 'kudos', 'hits' and 'words'.
        character: Value to look for inside each row's list.
        column: Name of the list column to search.

    Returns:
        A one-row DataFrame with the columns ``column`` (holding
        ``character``), 'works' (number of matching rows), 'comments',
        'kudos', 'hits' and 'words' (sums over the matching rows). When
        no row matches, the count and the sums are all 0.
    """
    # One boolean per row, selected in a single step. This replaces the
    # row-by-row pd.concat, keeps the columns present when nothing
    # matches, and never copies a row twice when index labels repeat.
    is_tagged = pd.Series(
        [character in members for members in df[column]],
        index=df.index,
        dtype=bool,
    )
    character_df = df[is_tagged]

    # Easiest way to create a dataframe is to populate a dictionary of lists.
    # Since this is a one-row dataframe, each list has one item.
    summary = {column: [character], 'works': [len(character_df)]}
    for measure in ('comments', 'kudos', 'hits', 'words'):
        summary[measure] = [character_df[measure].sum()]
    return pd.DataFrame(summary)

### summariseallcharacters()

Return a dataframe of all the characters found in the column with a summary of the works statistics for that character.

In [5]:
def summariseallcharacters(df, column):
    """Summarise the works statistics for every value found in ``column``.

    Args:
        df: DataFrame with the list column ``column`` and the numeric
            columns 'comments', 'kudos', 'hits' and 'words'.
        column: Name of the list column to summarise.

    Returns:
        A DataFrame indexed by the values of ``column``, in the order
        allrows() returns them, with one row per value as produced by
        summarisecharacter(). If allrows() yields no values, an empty
        frame with the same column names is returned.
    """
    # Collect the one-row summaries and concatenate them once, rather
    # than re-copying the growing frame on every iteration.
    summaries = [
        summarisecharacter(df, character, column)
        for character in allrows(df, column)
    ]
    if not summaries:
        # pd.concat() raises ValueError when given an empty list.
        return pd.DataFrame(
            columns=['works', 'comments', 'kudos', 'hits', 'words'],
            index=pd.Index([], name=column),
        )
    return pd.concat(summaries, ignore_index=True).set_index(column)

## Comments, hits and kudos for characters

That is, for `charactersclean`.

In [6]:
# Allow up to 999 rows in the displayed output.
pd.set_option('display.max_rows', 999)

# Build the per-character summary and save it as CSV.
sum_df = summariseallcharacters(df, 'charactersclean')
sum_df.to_csv('sum-comments-hits-kudos-charactersclean.csv')

# Last expression of the cell: the notebook displays the table.
sum_df

Unnamed: 0_level_0,works,comments,kudos,hits,words
charactersclean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Julian Bashir,4938,110911,633161,6343689,29428092
Elim Garak,4521,106712,606841,5971556,24994180
Kira Nerys,1945,38681,142215,1791858,16928641
non-cast,1922,49296,147675,1839595,21799734
Jadzia Dax,1631,35007,163586,1743817,12520276
Odo,1380,28614,127886,1444062,11561861
Benjamin Sisko,1190,27159,107504,1233615,12703665
Quark,1122,24492,108962,1091406,9433072
Miles O'Brien,1051,35308,138997,1452537,10809417
Ezri Dax,577,12505,34412,479394,5297759


### Interpreting these results

There are 110911 comments in works in which Julian Bashir is tagged as a character (the tagging done by the author of the work).

Works tagged with the character Julian Bashir have attracted more hits in total than works tagged with the character Elim Garak.

Note that 'non-cast' refers to the sum of all non-cast characters, not just one exemplar character.

## Comments, hits and kudos for slash pairings

That is, for `relationshipspairslash`.

In [7]:
# Allow up to 999 rows in the displayed output.
pd.set_option('display.max_rows', 999)

# Build the per-pairing summary and save it as CSV.
sum_df = summariseallcharacters(df, 'relationshipspairslash')
sum_df.to_csv('sum-comments-hits-kudos-relationshipspairslash.csv')

# Last expression of the cell: the notebook displays the table.
sum_df

Unnamed: 0_level_0,works,comments,kudos,hits,words
relationshipspairslash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Elim Garak/Julian Bashir,3850,96841,601979,5868705,19683503
Jadzia Dax/Kira Nerys,406,4783,25603,317247,1524299
Odo/Quark,372,6319,38204,325751,1706558
non-cast/non-cast,299,6977,16810,362785,5809313
Kira Nerys/Odo,247,2514,14733,298714,1917333
Julian Bashir/non-cast,145,3835,10301,136550,1825956
Jadzia Dax/Worf,144,2776,10460,123833,958660
Keiko O'Brien/Miles O'Brien,126,4275,13277,127902,1299390
Corat Damar/Weyoun,124,1527,6412,69006,732637
Elim Garak/Kelas Parmak,120,4105,7445,97294,1031230
