In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from datetime import date
import seaborn as sns
from statsmodels.formula.api import ols
import statsmodels.api as sm
from matplotlib import colors

### Create regression data

In [3]:
# Directory containing the per-fandom
# `<fandom>_temporal_tfidf_cos_merged_chapters_1000.tsv` files read by the merge loop.
tfidf_path = '../../data/tfidf_merged_chs_1000/'

In [5]:
# Directory containing the per-fandom
# `<fandom>_temporal_lda_jsd_toprev_with_dist_merged_chs.tsv` files read by the merge loop.
lda_path = '../../data/lda_merged_chs/'

In [4]:
# fandom_list = ['harry_potter',
#  'dcu',
#  'doctor_who_&_related_fandoms',
#  'star_wars_all_media_types',
#  'arthurian_mythology_&_related_fandoms',
#  'supernatural',
#  'haikyuu',
#  'kuroko_no_basuke',
#  'hamilton_miranda',
#  'dragon_age_all_media_types',
#  'the_walking_dead_&_related_fandoms',
#  'buffy_the_vampire_slayer',
#  'les_miserables_all_media_types',
#  'naruto',
#  'tolkien_j_r_r_works_&_related_fandoms',
#  'shakespare_william_works',
#  'hetalia_axis_powers',
#  'attack_on_titan',
#  'ms_paint_adventures',
#  'marvel',
#  'bishoujo_senshi_sailor_moon',
#  'one_direction',
#  'sherlock_holmes_&_related_fandoms']

In [6]:
# Fandoms to load. Each entry is used verbatim as the filename prefix of that
# fandom's TF-IDF and LDA TSV files, so the spelling must match the files on disk.
fandom_list = [
    'star_wars_all_media_types',
    'arthurian_mythology_&_related_fandoms',
    'haikyuu',
    'kuroko_no_basuke',
    'hamilton_miranda',
    'the_walking_dead_&_related_fandoms',
    'buffy_the_vampire_slayer',
    'les_miserables_all_media_types',
    'naruto',
    'shakespare_william_works',
    'bishoujo_senshi_sailor_moon',
    'sherlock_holmes_&_related_fandoms',
]

In [7]:
def first_rel(field):
    """Return the first comma-separated entry of a relationship field.

    Parameters
    ----------
    field : str or any
        Raw cell value. Strings are split on ',' and the first piece is
        returned unchanged (no whitespace stripping).

    Returns
    -------
    The first piece of ``field`` when it is a string; otherwise ``field``
    itself (e.g. NaN for a missing cell is passed through untouched).
    """
    # Explicit type check instead of a bare ``except:`` so that unrelated
    # errors (and KeyboardInterrupt) are no longer silently swallowed.
    if isinstance(field, str):
        return field.split(',')[0]
    return field

In [8]:
def find_major_rel(df, top_n=5):
    """Return the most frequent primary relationships in ``df``.

    Each value in ``df['Relationship']`` is reduced to its first
    comma-separated entry with ``first_rel`` and the results are counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Relationship'`` column. It is NOT modified; the
        reduction is done on a derived Series so the function has no hidden
        side effect on the caller's data.
    top_n : int, default 5
        How many of the most frequent values to return.

    Returns
    -------
    pandas.Index
        Up to ``top_n`` relationship values, most frequent first. Missing
        values are excluded because ``value_counts`` drops NaN by default.
    """
    # Series.map applies first_rel element-wise; this avoids building a row
    # object per record as the row-wise DataFrame.apply did.
    primary = df['Relationship'].map(first_rel)
    return primary.value_counts().index[:top_n]


In [9]:
# Metadata columns present in both the TF-IDF and the LDA table; used as the join key.
merge_keys = ['AdditionalTags', 'ArchiveWarnings', 'Author', 'Bookmarks', 'Category',
              'Chapters', 'Characters', 'Comments', 'CompleteDate',
              'Fandoms', 'Hits', 'Kudos', 'Language', 'PublishDate', 'Rating',
              'Relationship', 'Title', 'URL', 'UpdateDate', 'Words']
# Count columns in which a missing value is replaced by 0.
count_cols = ['Hits', 'Kudos', 'Bookmarks', 'Comments']

# One merged frame per fandom is collected here.
df_all = []
for fandom in fandom_list:
    df_tfidf = pd.read_csv(os.path.join(tfidf_path, fandom + '_temporal_tfidf_cos_merged_chapters_1000.tsv'), sep='\t')
    del df_tfidf['index']
    df_lda = pd.read_csv(os.path.join(lda_path, fandom + '_temporal_lda_jsd_toprev_with_dist_merged_chs.tsv'), sep='\t')
    del df_lda['index']
    # Inner join: keep only rows that appear in both tables with identical metadata.
    df = pd.merge(df_tfidf, df_lda, on=merge_keys, how='inner')

    df = df.replace([np.inf, -np.inf], np.nan)
    # Assign the filled columns back rather than calling
    # ``df[col].fillna(0, inplace=True)``: that chained in-place form operates
    # on a temporary and does not update ``df`` under pandas Copy-on-Write.
    df[count_cols] = df[count_cols].fillna(0)

    # Flag rows whose first listed relationship is one of the fandom's most
    # frequent ones (as returned by find_major_rel), then drop the raw column.
    freq_rel = find_major_rel(df)
    df['Relationship'] = df['Relationship'].apply(first_rel)
    df['Freq_relationship'] = df['Relationship'].apply(lambda x: 1 if x in freq_rel else 0)
    del df['Relationship']
    df_all.append(df)
    

In [10]:
# Stack the per-fandom frames into one table. ignore_index=True gives every
# row a unique 0..n-1 label; without it each fandom's 0-based labels repeat,
# which makes any later label-based selection or alignment ambiguous.
# NOTE: this rebinds `df_all` from a list to a DataFrame, so the cell cannot
# be re-run without first re-running the loading loop.
df_all = pd.concat(df_all, ignore_index=True)

In [11]:
# Number of rows in the combined frame.
len(df_all)

71215

In [12]:
# Count the rows whose Kudos value equals 0.
int((df_all['Kudos'] == 0).sum())

0

In [13]:
# df_all = df.head(500)

In [14]:
# Remove columns that are not carried into the exported table.
for unused_col in ('Language', 'Title', 'URL', 'AdditionalTags', 'Characters', 'Words'):
    del df_all[unused_col]

In [16]:
def first_category(field):
    """Return the leading category label of a Category cell.

    Parameters
    ----------
    field : str or any
        Raw cell value, expected to start with one of the known labels
        ('F/F', 'F/M', 'Gen', 'M/M', 'Multi', 'Other').

    Returns
    -------
    str
        The label ``field`` starts with, or the string ``'None'`` when
        ``field`` is not a string (e.g. NaN) or starts with no known label.
    """
    # Previously an unmatched string fell off the end of the function and
    # yielded Python ``None`` while a non-string yielded the string 'None';
    # both cases now return the same 'None' marker.
    if not isinstance(field, str):
        return 'None'
    for label in ('F/F', 'F/M', 'Gen', 'M/M', 'Multi', 'Other'):
        if field.startswith(label):
            return label
    return 'None'

In [17]:
# Reduce each Category cell to its leading label. Series.map works on the one
# column directly; the row-wise DataFrame.apply(axis=1) it replaces built a
# full row object per record only to read a single field.
df_all['Category'] = df_all['Category'].map(first_category)

In [18]:
# df_all['ArchiveWarnings_underage'] = df_all['ArchiveWarnings'].apply(lambda x: 1 if 'Underage' in x else 0)
# df_all['ArchiveWarnings_death'] = df_all['ArchiveWarnings'].apply(lambda x: 1 if 'Major Character Death' in x else 0)
# df_all['ArchiveWarnings_choose_no'] = df_all['ArchiveWarnings'].apply(lambda x: 1 if 'Creator Chose Not To Use Archive Warnings' in x else 0)
# df_all['ArchiveWarnings_no_apply'] = df_all['ArchiveWarnings'].apply(lambda x: 1 if 'No Archive Warnings Apply' in x else 0)
# df_all['ArchiveWarnings_violence'] = df_all['ArchiveWarnings'].apply(lambda x: 1 if 'Graphic Depictions Of Violence' in x else 0)
# df_all['ArchiveWarnings_noncon'] = df_all['ArchiveWarnings'].apply(lambda x: 1 if 'Rape/Non-Con' in x else 0)

# The per-warning indicator columns above are commented out, so the raw
# ArchiveWarnings column is dropped without deriving anything from it.
del df_all['ArchiveWarnings']

In [19]:
def first_warning(field):
    """Unfinished stub: builds a list of known warning labels and returns None.

    ``field`` is currently ignored.
    """
    # TODO: implement or remove; the list below is never used.
    known_warnings = ['Creator Chose Not To Use Archive Warnings']

In [23]:
# Distinct Rating values across ALL fandoms. The previous version read `df`,
# the variable left over from the last iteration of the loading loop, and so
# only reflected the final fandom.
set(df_all['Rating'].tolist())

{'Explicit',
 'General Audiences',
 'Mature',
 'Not Rated',
 'Teen And Up Audiences'}

In [24]:
# df_all['Rating_E'] = df_all['Rating'].apply(lambda x: 1 if x == 'Explicit' else 0)
# df_all['Rating_G'] = df_all['Rating'].apply(lambda x: 1 if x == 'General Audiences' else 0)
# df_all['Rating_M'] = df_all['Rating'].apply(lambda x: 1 if x == 'Mature' else 0)
# df_all['Rating_N'] = df_all['Rating'].apply(lambda x: 1 if x == 'Not Rated' else 0)
# df_all['Rating_T'] = df_all['Rating'].apply(lambda x: 1 if x == 'Teen And Up Audiences' else 0)
# The per-rating indicator columns above are commented out, so the raw
# Rating column is dropped without deriving anything from it.
del df_all['Rating']

In [25]:
# df_all['Fandom_harry_potter'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'harry_potter' else 0)
# df_all['Fandom_dcu'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'dcu' else 0)
# df_all['Fandom_doctor_who'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'doctor_who_&_related_fandoms' else 0)
# df_all['Fandom_star_wars'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'star_wars_all_media_types' else 0)
# df_all['Fandom_arthurian'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'arthurian_mythology_&_related_fandoms' else 0)
# df_all['Fandom_supernatural'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'supernatural' else 0)
# df_all['Fandom_haikyuu'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'haikyuu' else 0)
# df_all['Fandom_kuroko_no_basuke'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'kuroko_no_basuke' else 0)
# df_all['Fandom_hamilton_miranda'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'hamilton_miranda' else 0)
# df_all['Fandom_dragon_age'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'dragon_age_all_media_types' else 0)
# df_all['Fandom_the_walking_dead'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'the_walking_dead_&_related_fandoms' else 0)
# df_all['Fandom_buffy'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'buffy_the_vampire_slayer' else 0)
# df_all['Fandom_les_miserables'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'les_miserables_all_media_types' else 0)
# df_all['Fandom_naruto'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'naruto' else 0)
# df_all['Fandom_tolkien'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'tolkien_j_r_r_works_&_related_fandoms' else 0)
# df_all['Fandom_shakespare'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'shakespare_william_works' else 0)
# df_all['Fandom_hetalia'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'hetalia_axis_powers' else 0)
# df_all['Fandom_attack_on_titan'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'attack_on_titan' else 0)
# df_all['Fandom_ms_paint_adventures'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'ms_paint_adventures' else 0)
# df_all['Fandom_marvel'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'marvel' else 0)
# df_all['Fandom_sailor_moon'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'bishoujo_senshi_sailor_moon' else 0)
# df_all['Fandom_one_direction'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'one_direction' else 0)
# df_all['Fandom_sherlock'] = df_all['fandom_category'].apply(lambda x: 1 if x == 'sherlock_holmes_&_related_fandoms' else 0)

# del df_all['fandom_category']

In [26]:
# set(df['Fandoms'].tolist())

In [27]:
# NOTE(review): `df` is the frame left over from the last iteration of the
# loading loop, not `df_all`; it still has the columns deleted from `df_all`
# above. Confirm that inspecting the pre-cleanup columns is what is intended.
df.columns.values

       'Category', 'Chapters', 'Characters', 'Fandoms', 'Hits', 'Kudos',
       'Language', 'Rating', 'Title', 'Words', 'PublishDate',
       'UpdateDate', 'CompleteDate', 'Comments', 'URL', 'Cos', 'Text',
       'Dist', 'JSD', 'Freq_relationship'], dtype=object)

In [28]:
def date_today(cell, today=None):
    """Return the number of days between a 'YYYY-MM-DD' string and a reference date.

    Parameters
    ----------
    cell : str or any
        Date written as three '-'-separated integers (year, month, day).
    today : datetime.date, optional
        Reference date. Defaults to ``date.today()``, which makes the result
        depend on the day the notebook is run; pass a fixed date for
        reproducible output.

    Returns
    -------
    int or float
        Absolute day difference, or NaN when ``cell`` is not a string, does
        not have exactly three '-'-separated parts, or is not a valid date.
    """
    if today is None:
        today = date.today()
    try:
        y, m, d = cell.split('-')
        return abs(today - date(int(y), int(m), int(d))).days
    # AttributeError: non-string cell (e.g. NaN); ValueError: wrong number of
    # parts, non-numeric part or impossible date; TypeError/OverflowError:
    # values the date constructor cannot accept. Anything else propagates
    # instead of being hidden by a bare ``except:``.
    except (AttributeError, TypeError, ValueError, OverflowError):
        return float('nan')

In [29]:
# Replace each date string with its age in days (see date_today). One loop
# over Series.map replaces three copy-pasted row-wise DataFrame.apply calls.
# NOTE: not idempotent - re-running this cell feeds the numeric ages back
# into date_today, which turns them into NaN.
for date_col in ('PublishDate', 'CompleteDate', 'UpdateDate'):
    df_all[date_col] = df_all[date_col].map(date_today)

In [30]:
def find_history(field_list):
    """Return the smallest non-NaN value in ``field_list``.

    Parameters
    ----------
    field_list : sequence of numbers
        Values to compare; NaN entries are ignored.

    Returns
    -------
    number
        The minimum of the non-NaN entries, or NaN when there are none.
        (The earlier implementation substituted 10000000 for NaN and would
        return that placeholder as if it were real data when every entry
        was missing.)
    """
    present = [x for x in field_list if not np.isnan(x)]
    if not present:
        return float('nan')
    return np.amin(present)

In [31]:
# For every row, pass the three date-derived values to find_history and store
# the result in a new 'history' column.
date_columns = ['PublishDate', 'CompleteDate', 'UpdateDate']
df_all['history'] = df_all.apply(
    lambda row: find_history([row[name] for name in date_columns]),
    axis=1,
)

In [32]:
# The individual date columns have been summarised into 'history'; remove them.
for spent_col in ('PublishDate', 'CompleteDate', 'UpdateDate'):
    del df_all[spent_col]

In [33]:
# Turn +/-infinity into NaN so that a later dropna() can remove those rows as well.
df_all = df_all.replace([np.inf, -np.inf], np.nan)

In [34]:
# Discard every row that has a missing value in any remaining column.
df_all = df_all.dropna()

In [35]:
# Row count after dropping rows with missing values.
len(df_all)

71215

In [36]:
# Columns remaining in the cleaned frame.
df_all.columns.values

array(['Bookmarks', 'Category', 'Chapters', 'Fandoms', 'Hits', 'Kudos',
       'Comments', 'Cos', 'Text', 'Dist', 'JSD', 'Freq_relationship',
       'author_fic_cnt', 'history'], dtype=object)

In [37]:
# Give the derived measures their final column names.
column_renames = {
    'history': 'History',
    'Cos': 'Term_novelty',
    'JSD': 'Topic_novelty',
}
df_all = df_all.rename(columns=column_renames)

In [38]:
# Preview the first rows of the final table.
df_all.head()

Unnamed: 0,Bookmarks,Category,Chapters,Fandoms,Hits,Kudos,Comments,Term_novelty,Text,Dist,Topic_novelty,Freq_relationship,author_fic_cnt,History
0,1,Gen,5,"['Star Wars - All Media Types', 'Star Wars: Ri...",2091.0,41.0,2,0.289051,"\nStar WarsReign of the ConfederacyFinally, af...","[1.6693486e-05, 1.6693486e-05, 1.6693486e-05, ...",0.105213,0,1,2444.0
1,3,Gen,1,['Star Wars Prequel Trilogy'],209.0,10.0,2,0.90367,\nObi Wan is almost more surprised than the re...,"[7.702042e-05, 7.702042e-05, 7.702042e-05, 7.7...",0.180222,0,3,2445.0
2,2,F/M,1,"['Star Wars', 'Star Wars: New Jedi Order Era -...",302.0,4.0,0,0.202239,\nTitle: Jagged HeartsAuthor: bactaqueenRating...,"[2.707458e-05, 2.707458e-05, 2.707458e-05, 2.7...",0.279718,0,3,2448.0
3,6,F/M,16,"['Star Wars', 'Star Wars: New Jedi Order Era -...",399.0,13.0,5,0.900934,Title: Changing Course: Chapter OneAuthor: bac...,"[8.5000823e-07, 8.5000823e-07, 8.5000823e-07, ...",0.235651,0,3,2446.0
4,79,M/M,1,"['Star Wars Prequel Trilogy', 'Star Wars - All...",9347.0,363.0,19,0.25576,"\nThey were fighting.\nHarsh and violent, they...","[5.481338e-06, 5.481338e-06, 5.481338e-06, 5.4...",0.40288,1,5,2453.0


In [39]:
# Write the final table as a tab-separated file in the current working
# directory, without the row index.
output_path = 'fanfic_regression_data_merged_chs_rgam.tsv'
df_all.to_csv(output_path, sep='\t', index=False)