# Cross-tabulate

Cross-tabulations of 'slash' relationship pairs against the categorical variables.

In [1]:
import pandas as pd
import numpy as np

import ds9

# Load the works table through the project-local `ds9` helper module.
# NOTE(review): the schema is not visible from this notebook; the cells below
# read the columns `relationshipspairslash`, `categories` and `warnings`
# (each iterated per row, so presumably list-valued) and `rating` -- confirm in ds9.
df = ds9.df()

## Explode list `relationshipspairslash` into rows

We first drop the columns this analysis does not use, to keep the size of the exploded table under control.

In [2]:
# Drop the columns this analysis never reads, so the exploded frame stays small.
df = df.drop(columns=['cleandate', 'complete', 'language', 'publicationdate', 'summary',
                      'characters', 'filename', 'parsedate', 'fandoms'])

# Produce one row per (work, pair).
#
# DataFrame.explode() does this in a single vectorised pass.  The previous
# implementation appended one row at a time with pd.concat() inside a loop,
# which re-copies the accumulated frame on every iteration (quadratic time),
# and selected the source row with `df.index == index`, which would duplicate
# rows if the index ever contained repeated labels.
#
# Works whose pair list is empty are filtered out first: explode() would keep
# them as a single NaN row, whereas the row-by-row loop emitted nothing for them.
# Assumes every cell of `relationshipspairslash` is a list -- TODO confirm in ds9.
has_pairs = df['relationshipspairslash'].map(len) > 0
df = (
    df[has_pairs]
    .explode('relationshipspairslash')
    .reset_index(drop=True)  # fresh 0..n-1 index, as ignore_index=True gave before
)

## Cross-tabulate with `rating`

See https://datagy.io/pandas-crosstab/

Note that we can only cross-tabulate categories, so we might need to convert strings to categories.

In [3]:
# We want the category for relationshipspairslash to be most-common first
# (in stats jargon, descending order of frequency). So we need to compute
# the frequency of relationshipspairslash before we can define the ordered
# categorical type relationshipspairslash_type.
#
# value_counts() already sorts by descending frequency.  Its index is named
# explicitly before reset_index(): pandas >= 2.0 names that index after the
# column, so the implicit 'index' column older versions produced no longer
# exists and f['index'] raised KeyError.  rename_axis() gives the same
# two-column table ('index', 'n') on every pandas version.
f = (
    df['relationshipspairslash']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='n')
)
cat_list = f['index'].tolist()
relationshipspairslash_type = pd.api.types.CategoricalDtype(categories=cat_list, ordered=True)
df['relationshipspairslash'] = df['relationshipspairslash'].astype(relationshipspairslash_type)

Pandas has a nice crosstab() function. For categorical data it tabulates the frequency, which is exactly what we want.

In [4]:
# Frequency table: one row per pair, one column per rating, with 'Total' margins.
crosstab_relationshipspairslash_rating = pd.crosstab(
    index=df['relationshipspairslash'],
    columns=df['rating'],
    margins=True,
    margins_name='Total',
)

# Write the table out as CSV.
crosstab_relationshipspairslash_rating.to_csv('crosstab-relationshipspairslash-rating.csv')

# Raise the display row limit so the table is not truncated.  This is a
# session-wide pandas option, so it also applies to tables shown by later cells.
pd.set_option('display.max_rows', 10000)
crosstab_relationshipspairslash_rating

rating,General Audiences,Teen And Up Audiences,Mature,Explicit,Not Rated,Total
relationshipspairslash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Elim Garak/Julian Bashir,1096,1238,588,751,139,3812
Jadzia Dax/Kira Nerys,159,158,35,35,7,394
Odo/Quark,94,144,47,80,7,372
non-cast/non-cast,67,106,61,56,14,304
Kira Nerys/Odo,100,85,23,25,11,244
Jadzia Dax/Worf,64,46,10,14,10,144
Julian Bashir/non-cast,31,34,38,36,4,143
Keiko O'Brien/Miles O'Brien,38,60,10,12,5,125
Corat Damar/Weyoun,13,37,21,51,2,124
Elim Garak/Kelas Parmak,38,32,23,23,2,118


In [5]:
# Same cross-tabulation as a percentage of each pair's row (normalize='index').
#
# Scale to percent first and round last.  Rounding the fraction and then
# multiplying by 100 re-introduces binary floating-point noise
# (0.288 * 100 == 28.799999999999997), which ended up verbatim in the CSV.
crosstab_relationshipspairslash_rating_percent = (
    pd.crosstab(index=df.relationshipspairslash,
                columns=df.rating,
                normalize='index',
                margins=True,
                margins_name='Total')
    .mul(100)
    .round(1)
)
crosstab_relationshipspairslash_rating_percent.to_csv('crosstab-relationshipspairslash-rating-percent.csv')
crosstab_relationshipspairslash_rating_percent

rating,General Audiences,Teen And Up Audiences,Mature,Explicit,Not Rated
relationshipspairslash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Elim Garak/Julian Bashir,28.8,32.5,15.4,19.7,3.6
Jadzia Dax/Kira Nerys,40.4,40.1,8.9,8.9,1.8
Odo/Quark,25.3,38.7,12.6,21.5,1.9
non-cast/non-cast,22.0,34.9,20.1,18.4,4.6
Kira Nerys/Odo,41.0,34.8,9.4,10.2,4.5
Jadzia Dax/Worf,44.4,31.9,6.9,9.7,6.9
Julian Bashir/non-cast,21.7,23.8,26.6,25.2,2.8
Keiko O'Brien/Miles O'Brien,30.4,48.0,8.0,9.6,4.0
Corat Damar/Weyoun,10.5,29.8,16.9,41.1,1.6
Elim Garak/Kelas Parmak,32.2,27.1,19.5,19.5,1.7


## Cross-tabulate with `categories`

In [6]:
# Explode categories: one row per (pair, category).
# Use `cat_df` as we don't want to damage `df`.
#
# DataFrame.explode() replaces the former pd.concat()-in-a-loop, which
# re-copied the growing frame on every iteration (quadratic time).
# Rows whose category list is empty are filtered out first, because explode()
# would keep them as a NaN row while the loop emitted nothing for them.
# Assumes every cell of `categories` is a list -- TODO confirm in ds9.
has_categories = df['categories'].map(len) > 0
cat_df = (
    df[has_categories]
    .explode('categories')
    .reset_index(drop=True)
)

# Cross-tabulate
cat_df['categories'] = cat_df['categories'].astype(ds9.categories_type)
crosstab_relationshipspairslash_categories = pd.crosstab(index=cat_df.relationshipspairslash,
                                                         columns=cat_df.categories,
                                                         margins=True,
                                                         margins_name='Total')

# Save .CSV
crosstab_relationshipspairslash_categories.to_csv('crosstab-relationshipspairslash-categories.csv')

# Display table
crosstab_relationshipspairslash_categories

categories,M/M,Gen,F/M,F/F,Multi,No category,Other,Total
relationshipspairslash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Elim Garak/Julian Bashir,3525,315,197,76,127,104,51,4395
Jadzia Dax/Kira Nerys,45,37,23,366,13,2,4,490
Odo/Quark,338,37,26,11,25,3,14,454
non-cast/non-cast,136,72,145,66,54,9,27,509
Kira Nerys/Odo,42,76,178,18,25,10,31,380
Jadzia Dax/Worf,45,40,92,16,22,5,2,222
Julian Bashir/non-cast,109,15,40,11,13,5,12,205
Keiko O'Brien/Miles O'Brien,58,45,73,20,28,0,3,227
Corat Damar/Weyoun,118,12,6,7,8,0,30,181
Elim Garak/Kelas Parmak,85,12,7,7,15,1,33,160


In [7]:
# Repeat for percentages: each cell is the share of that pair's row.
#
# Scale to percent first and round last.  Rounding the fraction and then
# multiplying by 100 leaves binary floating-point noise in the values
# (e.g. 0.288 * 100 == 28.799999999999997), which was written to the CSV.
crosstab_relationshipspairslash_categories_percent = (
    pd.crosstab(index=cat_df.relationshipspairslash,
                columns=cat_df.categories,
                margins=True,
                margins_name='Total',
                normalize='index')
    .mul(100)
    .round(1)
)
crosstab_relationshipspairslash_categories_percent.to_csv('crosstab-relationshipspairslash-categories-percent.csv')
crosstab_relationshipspairslash_categories_percent

categories,M/M,Gen,F/M,F/F,Multi,No category,Other
relationshipspairslash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Elim Garak/Julian Bashir,80.2,7.2,4.5,1.7,2.9,2.4,1.2
Jadzia Dax/Kira Nerys,9.2,7.6,4.7,74.7,2.7,0.4,0.8
Odo/Quark,74.4,8.1,5.7,2.4,5.5,0.7,3.1
non-cast/non-cast,26.7,14.1,28.5,13.0,10.6,1.8,5.3
Kira Nerys/Odo,11.1,20.0,46.8,4.7,6.6,2.6,8.2
Jadzia Dax/Worf,20.3,18.0,41.4,7.2,9.9,2.3,0.9
Julian Bashir/non-cast,53.2,7.3,19.5,5.4,6.3,2.4,5.9
Keiko O'Brien/Miles O'Brien,25.6,19.8,32.2,8.8,12.3,0.0,1.3
Corat Damar/Weyoun,65.2,6.6,3.3,3.9,4.4,0.0,16.6
Elim Garak/Kelas Parmak,53.1,7.5,4.4,4.4,9.4,0.6,20.6


## Cross-tabulate with `warnings`

In [8]:
# Explode warnings: one row per (pair, warning), built in `warnings_df` so
# that `df` itself is left untouched.
#
# DataFrame.explode() replaces the former pd.concat()-in-a-loop, which
# re-copied the growing frame on every iteration (quadratic time) and bound a
# notebook-global loop variable called `warnings`, the name of a stdlib module.
# Rows whose warning list is empty are filtered out first, because explode()
# would keep them as a NaN row while the loop emitted nothing for them.
# Assumes every cell of `warnings` is a list -- TODO confirm in ds9.
has_warnings = df['warnings'].map(len) > 0
warnings_df = (
    df[has_warnings]
    .explode('warnings')
    .reset_index(drop=True)
)

# Cross-tabulate
warnings_df['warnings'] = warnings_df['warnings'].astype(ds9.warnings_type)
crosstab_relationshipspairslash_warnings = pd.crosstab(index=warnings_df.relationshipspairslash,
                                                       columns=warnings_df.warnings,
                                                       margins=True,
                                                       margins_name='Total')

# Save .CSV
crosstab_relationshipspairslash_warnings.to_csv('crosstab-relationshipspairslash-warnings.csv')

# Display table
crosstab_relationshipspairslash_warnings


warnings,No Archive Warnings Apply,Choose Not To Use Archive Warnings,Graphic Depictions Of Violence,Major Character Death,Rape/Non-Con,Underage,Total
relationshipspairslash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Elim Garak/Julian Bashir,3041,654,74,52,52,6,3879
Jadzia Dax/Kira Nerys,338,49,1,15,1,0,404
Odo/Quark,320,46,3,6,2,0,377
non-cast/non-cast,197,73,23,17,8,3,321
Kira Nerys/Odo,159,77,6,8,3,0,253
Jadzia Dax/Worf,104,34,4,9,1,0,152
Julian Bashir/non-cast,98,28,9,6,9,0,150
Keiko O'Brien/Miles O'Brien,105,16,3,1,1,0,126
Corat Damar/Weyoun,83,27,5,5,8,0,128
Elim Garak/Kelas Parmak,97,16,2,3,1,1,120


In [9]:
# Warnings cross-tabulation as a percentage of each pair's row.
#
# Scale to percent first and round last.  Rounding the fraction and then
# multiplying by 100 leaves binary floating-point noise in the values
# (e.g. 0.288 * 100 == 28.799999999999997), which was written to the CSV.
crosstab_relationshipspairslash_warnings_percent = (
    pd.crosstab(index=warnings_df.relationshipspairslash,
                columns=warnings_df.warnings,
                margins=True,
                margins_name='Total',
                normalize='index')
    .mul(100)
    .round(1)
)
crosstab_relationshipspairslash_warnings_percent.to_csv('crosstab-relationshipspairslash-warnings-percent.csv')
crosstab_relationshipspairslash_warnings_percent

warnings,No Archive Warnings Apply,Choose Not To Use Archive Warnings,Graphic Depictions Of Violence,Major Character Death,Rape/Non-Con,Underage
relationshipspairslash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Elim Garak/Julian Bashir,78.4,16.9,1.9,1.3,1.3,0.2
Jadzia Dax/Kira Nerys,83.7,12.1,0.2,3.7,0.2,0.0
Odo/Quark,84.9,12.2,0.8,1.6,0.5,0.0
non-cast/non-cast,61.4,22.7,7.2,5.3,2.5,0.9
Kira Nerys/Odo,62.8,30.4,2.4,3.2,1.2,0.0
Jadzia Dax/Worf,68.4,22.4,2.6,5.9,0.7,0.0
Julian Bashir/non-cast,65.3,18.7,6.0,4.0,6.0,0.0
Keiko O'Brien/Miles O'Brien,83.3,12.7,2.4,0.8,0.8,0.0
Corat Damar/Weyoun,64.8,21.1,3.9,3.9,6.2,0.0
Elim Garak/Kelas Parmak,80.8,13.3,1.7,2.5,0.8,0.8
