In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from datetime import date
import seaborn as sns
from statsmodels.formula.api import ols
import statsmodels.api as sm
from matplotlib import colors

### Create regression data

In [3]:
# Directory holding the per-fandom TF-IDF cosine-similarity TSV exports (relative to this notebook).
tfidf_path = "../../data/tfidf_toprev_conlen_fulldata/"

In [4]:
# Directory holding the per-fandom LDA Jensen-Shannon-divergence TSV exports (relative to this notebook).
lda_path = "../../data/lda_jsd_toprev_conlen/"

In [23]:
# Fandom identifiers. Each one is used verbatim as the prefix of the per-fandom
# input file names, so the spelling must match the files on disk.
fandom_list = [
    'harry_potter',
    'dcu',
    'doctor_who_&_related_fandoms',
    'star_wars_all_media_types',
    'arthurian_mythology_&_related_fandoms',
    'supernatural',
    'haikyuu',
    'kuroko_no_basuke',
    'hamilton_miranda',
    'dragon_age_all_media_types',
    'the_walking_dead_&_related_fandoms',
    'buffy_the_vampire_slayer',
    'les_miserables_all_media_types',
    'naruto',
    'tolkien_j_r_r_works_&_related_fandoms',
    # NOTE(review): "shakespare" is kept as-is because it is part of a file name — confirm.
    'shakespare_william_works',
    'hetalia_axis_powers',
    'attack_on_titan',
    'ms_paint_adventures',
    'homestuck',
    'marvel',
    'bishoujo_senshi_sailor_moon',
    'one_direction',
    'sherlock_holmes_&_related_fandoms',
]

In [24]:
def first_rel(field):
    """Return the first comma-separated entry of a relationship field.

    Parameters
    ----------
    field : str or any
        Raw field value. Strings are split on ',' and the first piece is
        returned unchanged (no whitespace stripping).

    Returns
    -------
    The text before the first comma for strings; any non-string value
    (e.g. NaN for a missing cell, or None) is returned as-is.
    """
    # Explicit type check instead of a bare ``except:``, which would also
    # swallow unrelated errors such as KeyboardInterrupt.
    if isinstance(field, str):
        return field.split(',')[0]
    return field

In [25]:
def first_warn(field):
    """Return the first comma-separated entry of an archive-warning field.

    Parameters
    ----------
    field : str or any
        Raw field value. Strings are split on ',' and the first piece is
        returned unchanged (no whitespace stripping).

    Returns
    -------
    The text before the first comma for strings; any non-string value
    (e.g. NaN for a missing cell, or None) is returned as-is.
    """
    # Explicit type check instead of a bare ``except:``, which would also
    # swallow unrelated errors such as KeyboardInterrupt.
    if isinstance(field, str):
        return field.split(',')[0]
    return field

In [11]:
from collections import Counter

In [26]:
def find_major_rel(df):
    """Return the (up to) five most frequent first-listed relationships in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Relationship' column.

    Returns
    -------
    pandas.Index
        Labels of the five largest entries of ``value_counts()`` (fewer if the
        column has fewer distinct values). Missing values are not counted.

    Notes
    -----
    Side effect: ``df['Relationship']`` is overwritten in place with its first
    comma-separated entry (see ``first_rel``).
    """
    # Series.map transforms the single column directly; a row-wise
    # DataFrame.apply(axis=1) builds a full row object for every record.
    df['Relationship'] = df['Relationship'].map(first_rel)
    return df['Relationship'].value_counts().index[:5]


In [50]:
# Build one frame per fandom by joining its TF-IDF and LDA exports, then
# collect the frames in a list (concatenated in a later cell).

# Metadata columns present in both exports; the inner join matches on all of them.
merge_keys = [
    'AdditionalTags', 'ArchiveWarnings', 'Author', 'Bookmarks', 'Category',
    'ChapterIndex', 'Chapters', 'Characters', 'Comments', 'CompleteDate',
    'Fandoms', 'Hits', 'Kudos', 'Language', 'Notes', 'PublishDate', 'Rating',
    'Relationship', 'Summary', 'Title', 'URL', 'UpdateDate', 'Words',
]

df_all = []
for fandom in fandom_list:
    tfidf_file = os.path.join(tfidf_path, fandom + '_temporal_tfidf_cos_toprev_conlen.tsv')
    lda_file = os.path.join(lda_path, fandom + '_temporal_lda_jsd_toprev_full.tsv')

    # Both exports carry a saved 'index' column that is not needed.
    df_tfidf = pd.read_csv(tfidf_file, sep='\t').drop(columns='index')
    df_lda = pd.read_csv(lda_file, sep='\t').drop(columns='index')

    df = df_tfidf.merge(df_lda, on=merge_keys, how='inner')
    df['fandom_category'] = fandom

    # Flag works whose first-listed relationship is among this fandom's top five.
    freq_rel = find_major_rel(df)
    df['Relationship'] = df['Relationship'].map(first_rel)
    df['Freq_relationship'] = df['Relationship'].isin(freq_rel).astype(int)
    df = df.drop(columns='Relationship')

    df_all.append(df)
    

In [51]:
# Stack the per-fandom frames into one table. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every fandom restarts at 0 and the
# combined frame carries duplicated row labels.
# NOTE: df_all changes from a list to a DataFrame here, so this cell cannot be
# re-run without re-running the loading loop first.
df_all = pd.concat(df_all, ignore_index=True)

In [52]:
# Row count after concatenating all fandoms.
df_all.shape[0]

701667

In [53]:
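# Debug shortcut (disabled): uncomment to continue with only the first 500 rows
# of `df`, i.e. of the last fandom processed by the loading loop.
# df_all = df.head(500)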

In [54]:
# Remove the text/metadata columns that are not used as regression variables.
unused_columns = [
    'Language', 'Notes', 'Summary', 'Title', 'URL',
    'AdditionalTags', 'Fandoms', 'Characters', 'Words',
]
df_all = df_all.drop(columns=unused_columns)

In [55]:
# Encode 'Category' as integer codes, with missing values as their own
# 'Unknown' level. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on df_all['Category'] is chained assignment, which does
# not update df_all under pandas Copy-on-Write.
df_all['Category'] = df_all['Category'].fillna('Unknown').astype('category')
df_all['Category_cat'] = df_all['Category'].cat.codes
del df_all['Category']

# Keep only rows whose category code occurs at least 50 times.
c = Counter(df_all['Category_cat'].tolist())
df_all = df_all[df_all['Category_cat'].map(lambda code: c[code] >= 50)]

In [56]:
# Encode 'ArchiveWarnings' as integer codes, with missing values as their own
# 'Unknown' level. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on df_all['ArchiveWarnings'] is chained assignment,
# which does not update df_all under pandas Copy-on-Write.
# NOTE(review): the full warning string is encoded here; first_warn is not
# applied in the visible cells — confirm that is intended.
df_all['ArchiveWarnings'] = df_all['ArchiveWarnings'].fillna('Unknown').astype('category')
df_all['ArchiveWarnings_cat'] = df_all['ArchiveWarnings'].cat.codes
del df_all['ArchiveWarnings']

# Keep only rows whose warning code occurs at least 50 times.
c = Counter(df_all['ArchiveWarnings_cat'].tolist())
df_all = df_all[df_all['ArchiveWarnings_cat'].map(lambda code: c[code] >= 50)]

In [57]:
# Row count after dropping rare Category / ArchiveWarnings levels.
df_all.shape[0]

697778

In [58]:
# Treat a missing chapter index as 0. Assigned back explicitly because
# df_all['ChapterIndex'].fillna(..., inplace=True) is chained assignment and
# does not update df_all under pandas Copy-on-Write.
df_all['ChapterIndex'] = df_all['ChapterIndex'].fillna(0)

In [59]:
# Replace each author name by the number of rows attributed to that author.
# The filled column is assigned back explicitly; fillna(inplace=True) on
# df_all['Author'] is chained assignment and does not update df_all under
# pandas Copy-on-Write.
# NOTE(review): all rows with a missing author share the single label
# 'Unknown' and are therefore counted together — confirm that is intended.
df_all['Author'] = df_all['Author'].fillna('Unknown')
counts = df_all['Author'].value_counts()
# Authors with at least 10 rows; not used by any visible cell.
idx = counts[counts >= 10].index
# Look every author up in the counts table in one vectorized step.
df_all['author_fic_cnt'] = df_all['Author'].map(counts)
del df_all['Author']

In [60]:
# Encode 'Rating' as integer category codes (a missing rating gets code -1),
# then drop the original column.
rating_levels = df_all["Rating"].astype('category')
df_all["Rating_cat"] = rating_levels.cat.codes
df_all = df_all.drop(columns='Rating')

# Keep only rows whose rating code occurs at least 50 times.
c = Counter(df_all['Rating_cat'].tolist())
is_common_rating = df_all['Rating_cat'].map(lambda code: c[code] >= 50)
df_all = df_all[is_common_rating]

In [61]:
# Number of rows per remaining rating code.
Counter(df_all['Rating_cat'].values.tolist())

Counter({0: 107799, 1: 147992, 2: 132689, 4: 73906, 5: 235389})

In [62]:
# Store the fandom label as a categorical and add its integer code alongside it.
# Unlike the other encoded columns, the label column itself is kept.
fandom_levels = df_all["fandom_category"].astype('category')
df_all["fandom_category"] = fandom_levels
df_all["fandom_cat"] = fandom_levels.cat.codes
# Disabled: dropping the label column and filtering rare fandoms.
# del df_all['fandom_category']
# c = Counter(df_all['fandom_cat'].tolist())
# df_all = df_all[df_all['fandom_cat'].apply(lambda x: True if c[x] >= 50 else False)]

In [64]:
def date_today(cell, today=None):
    """Return the absolute number of days between a 'YYYY-MM-DD' string and a reference day.

    Parameters
    ----------
    cell : str or any
        Date text with year, month and day separated by '-'.
    today : datetime.date, optional
        Reference day. Defaults to ``date.today()``; pass a fixed date to make
        the result reproducible across runs.

    Returns
    -------
    int or float
        Non-negative day count, or NaN when ``cell`` is not a string or cannot
        be parsed as a valid calendar date.
    """
    # Missing cells arrive as non-strings (e.g. NaN); report them as NaN.
    if not isinstance(cell, str):
        return float('nan')
    try:
        y, m, d = cell.split('-')
        parsed = date(int(y), int(m), int(d))
    except (ValueError, OverflowError):
        # Wrong number of parts, non-numeric parts, or an impossible date.
        return float('nan')
    reference = date.today() if today is None else today
    return abs(reference - parsed).days

In [65]:
# Convert the three date columns from 'YYYY-MM-DD' text to "days before today"
# (NaN where the date is missing or unparseable). Series.map works on each
# column directly instead of building a full row object per record.
for date_col in ('PublishDate', 'CompleteDate', 'UpdateDate'):
    df_all[date_col] = df_all[date_col].map(date_today)

In [66]:
def find_history(field_list, missing=10000000):
    """Return the smallest value in ``field_list``, treating NaN as ``missing``.

    Parameters
    ----------
    field_list : sequence of numbers
        Values to compare; NaN entries are replaced by ``missing`` first.
    missing : number, optional
        Stand-in for NaN entries (default 10000000). It is also the result
        when every entry is NaN or the sequence is empty.

    Returns
    -------
    The minimum of the substituted values, as computed by ``np.amin``.
    """
    # np.amin raises on an empty sequence; treat "no values" like "all missing".
    if len(field_list) == 0:
        return missing
    filled = [missing if np.isnan(value) else value for value in field_list]
    return np.amin(filled)

In [67]:
# 'history' is the smallest of the three day counts for each work (missing
# dates are handled inside find_history). Only the three date columns are
# passed, and raw=True hands each row over as a plain ndarray.
df_all['history'] = df_all[['PublishDate', 'CompleteDate', 'UpdateDate']].apply(
    find_history, axis=1, raw=True
)

In [68]:
# The individual date columns are no longer needed once 'history' exists.
df_all = df_all.drop(columns=['PublishDate', 'CompleteDate', 'UpdateDate'])

In [69]:
# Turn infinities into NaN, then treat missing engagement counts as zero.
# The filled columns are assigned back explicitly: fillna(inplace=True) on
# df_all[col] is chained assignment and does not update df_all under pandas
# Copy-on-Write.
df_all = df_all.replace([np.inf, -np.inf], np.nan)
count_columns = ['Hits', 'Kudos', 'Bookmarks', 'Comments']
df_all[count_columns] = df_all[count_columns].fillna(0)

In [76]:
# Express Hits, Kudos and Bookmarks per chapter.
# A zero chapter count would yield +/-inf here, after the earlier inf cleanup
# has already run, and dropna() does not remove inf. Such results are turned
# into NaN so the following dropna() discards those rows.
# NOTE: this cell is not idempotent — running it twice divides twice.
# NOTE(review): 'Comments' is not normalized — confirm that is intended.
for count_col in ('Hits', 'Kudos', 'Bookmarks'):
    per_chapter = df_all[count_col] / df_all['Chapters']
    df_all[count_col] = per_chapter.replace([np.inf, -np.inf], np.nan)


In [77]:
# Discard every row that still has a missing value in any column.
df_all = df_all.dropna(axis=0, how='any')

In [78]:
# Row count after removing rows with missing values.
df_all.shape[0]

697744

In [79]:
# List the remaining column names.
# NOTE(review): the recorded output already shows renamed columns such as
# 'History' although the rename cell comes later, so it was produced in a
# different execution order — re-run the notebook top to bottom.
df_all.columns.to_numpy()

array(['Bookmarks', 'ChapterIndex', 'Chapters', 'Comments', 'Hits',
       'Kudos', 'Term_novelty', 'Topic_novelty', 'fandom_category',
       'author_fic_cnt', 'Rating_cat', 'fandom_cat', 'History'],
      dtype=object)

In [80]:
# Give the derived columns their final names.
final_names = {
    'history': 'History',
    'Cos': 'Term_novelty',
    'JSD': 'Topic_novelty',
}
df_all = df_all.rename(columns=final_names)

In [81]:
# Preview the first five rows of the curated table.
df_all.head(n=5)

Unnamed: 0,Bookmarks,ChapterIndex,Chapters,Comments,Hits,Kudos,Term_novelty,Topic_novelty,fandom_category,Freq_relationship,Category_cat,ArchiveWarnings_cat,author_fic_cnt,Rating_cat,fandom_cat,History
0,0.666667,1.0,3,0,106.666667,2.666667,0.778655,0.599456,harry_potter,0,153,122,6,1,9,5136.0
1,0.0,0.0,1,0,25.0,1.0,0.713975,0.605999,harry_potter,0,153,122,17,5,9,5589.0
2,0.0,0.0,1,1,1359.0,19.0,0.879855,0.625602,harry_potter,1,246,122,2,2,9,5595.0
3,0.0,0.0,1,0,60.0,0.0,0.332479,0.573035,harry_potter,0,64,122,11,1,9,5596.0
4,0.0,0.0,1,0,495.0,12.0,0.795259,0.650978,harry_potter,1,246,122,2,5,9,5599.0


In [82]:
# Write the curated table as a tab-separated file in the working directory,
# without the row index.
output_file = 'fanfic_regression_data_curated_autocat.tsv'
df_all.to_csv(output_file, sep='\t', index=False)