In [18]:
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from collections import Counter
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from datetime import date
from sklearn.feature_extraction.text import CountVectorizer


In [3]:
data_path = '../../data/tag_novelty/'

In [4]:
fandom_list = [n for n in os.listdir(data_path) if 'tsv' in n]

In [44]:
fandom_list

['arthurian_mythology_&_related_fandoms_tag_novelty.tsv',
 'attack_on_titan_tag_novelty.tsv',
 'bishoujo_senshi_sailor_moon_tag_novelty.tsv',
 'buffy_the_vampire_slayer_tag_novelty.tsv',
 'dcu_tag_novelty.tsv',
 'doctor_who_&_related_fandoms_tag_novelty.tsv',
 'dragon_age_all_media_types_tag_novelty.tsv',
 'haikyuu_tag_novelty.tsv',
 'harry_potter_tag_novelty.tsv',
 'hetalia_axis_powers_tag_novelty.tsv',
 'homestuck_tag_novelty.tsv',
 'kuroko_no_basuke_tag_novelty.tsv',
 'les_miserables_all_media_types_tag_novelty.tsv',
 'marvel_tag_novelty.tsv',
 'ms_paint_adventures_tag_novelty.tsv',
 'naruto_tag_novelty.tsv',
 'one_direction_tag_novelty.tsv',
 'shakespare_william_works_tag_novelty.tsv',
 'star_wars_all_media_types_tag_novelty.tsv',
 'supernatural_tag_novelty.tsv',
 'the_avengers_all_media_types_tag_novelty.tsv',
 'the_walking_dead_&_related_fandoms_tag_novelty.tsv',
 'tolkien_j_r_r_works_&_related_fandoms_tag_novelty.tsv']

## Using other features + novelty

In [7]:
def date_today(cell):
    if '-' in str(cell):
        y, m, d = cell.split('-')
        return abs(date.today() - date(int(y), int(m), int(d))).days

In [8]:
base_feat = ['Chapters', 'Words', 'PublishDate']
base_nov_feat = ['Chapters', 'Words', 'PublishDate', 'tag_novelty']
target = ['top_kudos']



In [9]:
df_all = []
for fandom in fandom_list:
    df = pd.read_csv(os.path.join(data_path, fandom_list[0]), sep = '\t')
    df['PublishDate'] = df.apply(lambda row: date_today(row['PublishDate']), axis = 1)
    df['UpdateDate'] = df.apply(lambda row: date_today(row['UpdateDate']), axis = 1)
    df = df.dropna(subset=['Kudos'])
    top = np.percentile(df.Kudos.tolist(), 75)
    df['top_kudos'] = df.apply(lambda row: 1 if row['Kudos'] > top else 0, axis=1)
    df = df[base_nov_feat + target]
    df_all.append(df)
    
df_all = pd.concat(df_all)
df_all = df_all.reset_index()

print(len(df_all[df_all.top_kudos == 0]))
print(len(df_all[df_all.top_kudos == 1]))

train = df_all.sample(frac=0.8, random_state=1)
test = df_all.loc[~df_all.index.isin(train.index)]
print(len(train), len(test))
model = RandomForestClassifier()
model.fit(train[base_feat], train[target])
print(classification_report(test[target], model.predict(test[base_feat])))

3404
1127
3625 906
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       697
          1       1.00      1.00      1.00       209

avg / total       1.00      1.00      1.00       906





In [61]:
1127/3404

0.3310810810810811

In [52]:
for fandom in fandom_list:
    df = pd.read_csv(os.path.join(data_path, fandom_list[0]), sep = '\t')
    df['PublishDate'] = df.apply(lambda row: date_today(row['PublishDate']), axis = 1)
    df['UpdateDate'] = df.apply(lambda row: date_today(row['UpdateDate']), axis = 1)
    df = df.dropna(subset=['Kudos'])
    top = np.percentile(df.Kudos.tolist(), 75)
    df['top_kudos'] = df.apply(lambda row: 1 if row['Kudos'] > top else 0, axis=1)
    df = df[base_nov_feat + target]
    train = df.sample(frac=0.8, random_state=1)
    test = df.loc[~df.index.isin(train.index)]
    model = RandomForestClassifier()
    model.fit(train[base_nov_feat], train[target])
    print(classification_report(test[target], model.predict(test[base_nov_feat])))

  if sys.path[0] == '':
  if sys.path[0] == '':


             precision    recall  f1-score   support

          0       0.79      0.97      0.87        31
          1       0.00      0.00      0.00         8

avg / total       0.63      0.77      0.69        39

             precision    recall  f1-score   support

          0       0.82      0.87      0.84        31
          1       0.33      0.25      0.29         8

avg / total       0.72      0.74      0.73        39



  if sys.path[0] == '':
  if sys.path[0] == '':


             precision    recall  f1-score   support

          0       0.80      0.90      0.85        31
          1       0.25      0.12      0.17         8

avg / total       0.69      0.74      0.71        39

             precision    recall  f1-score   support

          0       0.78      0.90      0.84        31
          1       0.00      0.00      0.00         8

avg / total       0.62      0.72      0.66        39



  if sys.path[0] == '':
  if sys.path[0] == '':


             precision    recall  f1-score   support

          0       0.79      0.97      0.87        31
          1       0.00      0.00      0.00         8

avg / total       0.63      0.77      0.69        39

             precision    recall  f1-score   support

          0       0.81      0.94      0.87        31
          1       0.33      0.12      0.18         8

avg / total       0.71      0.77      0.73        39



  if sys.path[0] == '':
  if sys.path[0] == '':


             precision    recall  f1-score   support

          0       0.78      0.90      0.84        31
          1       0.00      0.00      0.00         8

avg / total       0.62      0.72      0.66        39

             precision    recall  f1-score   support

          0       0.79      0.87      0.83        31
          1       0.20      0.12      0.15         8

avg / total       0.67      0.72      0.69        39



  if sys.path[0] == '':
  if sys.path[0] == '':


             precision    recall  f1-score   support

          0       0.82      1.00      0.90        31
          1       1.00      0.12      0.22         8

avg / total       0.85      0.82      0.76        39

             precision    recall  f1-score   support

          0       0.78      0.94      0.85        31
          1       0.00      0.00      0.00         8

avg / total       0.62      0.74      0.68        39



  if sys.path[0] == '':
  if sys.path[0] == '':


             precision    recall  f1-score   support

          0       0.78      0.90      0.84        31
          1       0.00      0.00      0.00         8

avg / total       0.62      0.72      0.66        39

             precision    recall  f1-score   support

          0       0.78      0.94      0.85        31
          1       0.00      0.00      0.00         8

avg / total       0.62      0.74      0.68        39



  if sys.path[0] == '':
  'precision', 'predicted', average, warn_for)
  if sys.path[0] == '':


             precision    recall  f1-score   support

          0       0.79      1.00      0.89        31
          1       0.00      0.00      0.00         8

avg / total       0.63      0.79      0.70        39

             precision    recall  f1-score   support

          0       0.82      0.90      0.86        31
          1       0.40      0.25      0.31         8

avg / total       0.74      0.77      0.75        39



  if sys.path[0] == '':
  if sys.path[0] == '':


             precision    recall  f1-score   support

          0       0.79      0.97      0.87        31
          1       0.00      0.00      0.00         8

avg / total       0.63      0.77      0.69        39

             precision    recall  f1-score   support

          0       0.81      0.94      0.87        31
          1       0.33      0.12      0.18         8

avg / total       0.71      0.77      0.73        39



  if sys.path[0] == '':
  if sys.path[0] == '':


             precision    recall  f1-score   support

          0       0.79      0.97      0.87        31
          1       0.00      0.00      0.00         8

avg / total       0.63      0.77      0.69        39

             precision    recall  f1-score   support

          0       0.79      0.87      0.83        31
          1       0.20      0.12      0.15         8

avg / total       0.67      0.72      0.69        39



  if sys.path[0] == '':
  if sys.path[0] == '':


             precision    recall  f1-score   support

          0       0.81      0.97      0.88        31
          1       0.50      0.12      0.20         8

avg / total       0.75      0.79      0.74        39

             precision    recall  f1-score   support

          0       0.81      0.97      0.88        31
          1       0.50      0.12      0.20         8

avg / total       0.75      0.79      0.74        39



  if sys.path[0] == '':
  if sys.path[0] == '':


             precision    recall  f1-score   support

          0       0.81      0.97      0.88        31
          1       0.50      0.12      0.20         8

avg / total       0.75      0.79      0.74        39

             precision    recall  f1-score   support

          0       0.84      1.00      0.91        31
          1       1.00      0.25      0.40         8

avg / total       0.87      0.85      0.81        39

             precision    recall  f1-score   support

          0       0.78      0.94      0.85        31
          1       0.00      0.00      0.00         8

avg / total       0.62      0.74      0.68        39



  if sys.path[0] == '':


## Using only tags

In [28]:
def tokenize(word):
    return [w.strip() for w in word.split(',')]

In [43]:
df_all = []
for fandom in fandom_list:
    df = pd.read_csv(os.path.join(data_path, fandom), sep = '\t')
    df = df.dropna(subset=['Kudos'])
    top = np.percentile(df.Kudos.tolist(), 75)
    df['top_kudos'] = df.apply(lambda row: 1 if row['Kudos'] > top else 0, axis=1)
    df_all.append(df)
    
df = pd.concat(df_all).reset_index()
    
df_train = df.sample(frac=0.8, random_state=1)
df_test = df.loc[~df.index.isin(train.index)]

train_tags = df_train.AdditionalTags.tolist()
train_labels = df_train.top_kudos.tolist()


test_tags = df_test.AdditionalTags.tolist()
test_labels = df_test.top_kudos.tolist()

vectorizer = CountVectorizer(tokenizer=tokenize, min_df=2)
train_vectors = vectorizer.fit_transform(train_tags)
test_vectors = vectorizer.transform(test_tags)

model = RandomForestClassifier()
model.fit(train_vectors, train_labels)
print(classification_report(test_labels, model.predict(test_vectors)))
feat_rank = dict(zip(model.feature_importances_, vectorizer.get_feature_names()))
for k in sorted(feat_rank, reverse=True)[0:20]:
    print(k, feat_rank[k])


# df_all = pd.concat(df_all)
# df_all = df_all.reset_index()


# train = df_all.sample(frac=0.8, random_state=1)
# test = df_all.loc[~df_all.index.isin(train.index)]
# print(len(train), len(test))
# model = RandomForestClassifier()
# model.fit(train[base_feat], train[target])
# print(classification_report(test[target], model.predict(test[base_feat])))

             precision    recall  f1-score   support

          0       0.81      0.93      0.87       687
          1       0.58      0.32      0.41       215

avg / total       0.76      0.78      0.76       902

0.0136470476426 slow burn
0.0116281619734 romance
0.00920187032128 hurt/comfort
0.00916817350764 alternate universe
0.00901038759284 angst
0.00853373201975 fluff
0.00718404849247 masturbation
0.00650743296644 smut
0.00616337209918 alternate universe - canon divergence
0.00559526415932 alternate universe - modern setting
0.00550366106665 mpreg
0.00522780767934 humor
0.00509917124581 au
0.00496699111444 crossover
0.00450599920589 drama
0.00436523695737 alpha/beta/omega dynamics
0.00429926325465 minor character death
0.00411752503934 slow build
0.00397971396322 friendship
0.00388285946622 action/adventure
