In [1]:
import pandas as pd
from datetime import date
import numpy as np

#### Some features are recorded per chapter, but the Kudos and Hits data is recorded per work.
#### So the chapter-level features are aggregated up to the work level.

In [2]:
# Load the preprocessed, tab-separated dataset into a DataFrame.
df = pd.read_csv('homestuck_preprocessed.tsv', delimiter='\t')

In [3]:
# Show the names of all columns in the loaded frame.
df.columns.values

       'Category', 'ChapterIndex', 'Chapters', 'Characters', 'Comments',
       'CompleteDate', 'Fandoms', 'Hits', 'Kudos', 'Language', 'Notes',
       'PublishDate', 'Rating', 'Relationship', 'Summary', 'Text', 'Title',
       'URL', 'UpdateDate', 'Words'], dtype=object)

In [4]:
# Continue with a random quarter of the rows.
# A fixed random_state makes the subsample -- and every number derived from it
# further down -- identical on each Restart & Run All.
df = df.sample(len(df)//4, random_state=42)

#### Fields we're interested in

In [5]:
# Keep only the columns used in the rest of the notebook.
# .copy() makes the result an independent frame, so the column assignments
# performed later do not raise SettingWithCopyWarning.
df = df[['Author', 'Bookmarks', 'Chapters', 'Comments', 'CompleteDate', 'Hits', 'Kudos',
         'PublishDate', 'Text', 'Title', 'UpdateDate', 'Words']].copy()

In [6]:
# Preview the first two rows of the selected columns.
df.head(2)

Unnamed: 0,Author,Bookmarks,Chapters,Comments,CompleteDate,Hits,Kudos,PublishDate,Text,Title,UpdateDate,Words
74380,Phenomenon,1,1,4,2012-12-28,317.0,17.0,2012-12-28,\nVriska woke to the sound of her spider lusus...,Games for Girls,,3548.0
23564,AbelinCilion69,2,3,0,,430.0,27.0,2015-04-12,\xc2\xa0\xc2\xa0\xc2\xa0 Rose\'s feet ached as...,Apocalyptica,2015-06-18,2438.0


In [7]:
def date_today(cell, today=None):
    """Return the absolute number of days between a 'YYYY-MM-DD' date and today.

    Parameters
    ----------
    cell : object
        A value taken from a date column. Only strings containing '-' are
        parsed as dates; anything else (NaN, None, numbers, empty strings)
        is treated as missing.
    today : datetime.date, optional
        Reference date. Defaults to ``date.today()``. Pass a fixed date to get
        results that do not change from one day to the next.

    Returns
    -------
    int or None
        ``abs(today - parsed_date).days`` for a date string, otherwise ``None``.

    Raises
    ------
    ValueError
        If a string contains '-' but is not a valid 'YYYY-MM-DD' date.
    """
    # Only real strings are parsed. str() of a negative number or of a
    # timestamp object also contains '-', but such values cannot be split
    # into year-month-day and must not crash the conversion.
    if not isinstance(cell, str) or '-' not in cell:
        return None
    if today is None:
        today = date.today()
    y, m, d = cell.split('-')
    # abs() so that dates after the reference date also give a positive count.
    return abs(today - date(int(y), int(m), int(d))).days


#### Convert each date column to the number of days between that date and today

In [8]:
# Replace each date column with "days between that date and today".
# Series.map touches only the one column being converted, instead of building
# a full row object (including the long Text field) for every record.
# pd.to_numeric turns the None returned for missing dates into NaN and keeps
# the column numeric.
# NOTE: this cell overwrites its inputs. Re-running it on already-converted
# values yields all-missing columns, because date_today returns None for
# anything that does not contain '-'.
for date_col in ('PublishDate', 'UpdateDate', 'CompleteDate'):
    df[date_col] = pd.to_numeric(df[date_col].map(date_today))

In [9]:
# Preview the frame after the date columns were converted to day counts.
df.head()

Unnamed: 0,Author,Bookmarks,Chapters,Comments,CompleteDate,Hits,Kudos,PublishDate,Text,Title,UpdateDate,Words
74380,Phenomenon,1,1,4,1626.0,317.0,17.0,1626,\nVriska woke to the sound of her spider lusus...,Games for Girls,,3548.0
23564,AbelinCilion69,2,3,0,,430.0,27.0,791,\xc2\xa0\xc2\xa0\xc2\xa0 Rose\'s feet ached as...,Apocalyptica,724.0,2438.0
12541,LateNiteSlacker,15,19,2,,2039.0,160.0,616,It had been a while since he had come to the m...,A Series of Drabbles,553.0,18908.0
81353,joaniedark,13,31,0,1989.0,8488.0,159.0,2008,The two of them truly surprised one another in...,31/100,,3061.0
59238,sachi_sama,93,10,0,1319.0,4961.0,374.0,1375,You\xe2\x80\x99d like to say time passed by qu...,A Purgatory Story,,42276.0


In [10]:
# Collapse the rows to one row per (Author, Hits, Kudos, Title, Words) group.
# The date columns hold "days ago" at this point, so max means the earliest
# date and min means the most recent one.
# Aggregations are given by name ('sum', 'min', 'max'): pandas already
# dispatches np.sum / np.min / np.max to these, and passing the NumPy
# callables is deprecated in recent pandas versions.
# NOTE(review): rows with NaN in any grouping key are excluded by groupby's
# default behaviour -- confirm that is intended.
# NOTE(review): 'Chapters' is summed over the rows of a group; if it already
# holds a per-work total, 'max' may be what is wanted -- confirm.
df_agg = (
    df.groupby(['Author', 'Hits', 'Kudos', 'Title', 'Words'])
      .agg({
          'Bookmarks': 'sum',
          'Chapters': 'sum',
          'Comments': 'sum',
          'CompleteDate': 'min',
          'PublishDate': 'max',
          'UpdateDate': 'max',
          # Skip missing texts so a single NaN cannot break str.join.
          'Text': lambda x: ','.join(x.dropna().astype(str)),
      })
      .reset_index()
)

In [11]:
# Days from publication to completion: both columns are "days ago" counts,
# so the difference is NaN wherever CompleteDate is missing.
df_agg['completed_in_days'] = df_agg['PublishDate'].sub(df_agg['CompleteDate'])

In [12]:
# Number of rows before aggregation.
df.shape[0]

22176

In [13]:
# Number of rows after aggregation.
df_agg.shape[0]

12967

In [14]:
# Preview the first rows of the aggregated frame.
df_agg.head()

Unnamed: 0,Author,Hits,Kudos,Title,Words,Text,Comments,CompleteDate,Chapters,PublishDate,UpdateDate,Bookmarks,completed_in_days
0,00HD,216.0,17.0,A Young Man Stands Before a Great Door at the ...,829.0,"\nHis once bright blue eyes, are now clouded o...",0,936.0,1,936,,0,0.0
1,00HD,272.0,17.0,The Doctor\'s Exploration of Homestuck/Sburb a...,920.0,Your name is JhFYURe685987yiGF&^TOIYGBua34\xe2...,0,,1,1007,1008.0,0,
2,00HD,278.0,7.0,The Longfall of Gamzee Makara: a tale of insan...,696.0,Your head is killing you.You walk down the hal...,0,,3,1014,1014.0,0,
3,00HD,293.0,10.0,Three Years on a War Ship,5459.0,Davesprite was alone.He sat at the edge of the...,4,,45,1076,1046.0,0,
4,00HD,311.0,21.0,Dumb Kids,6166.0,You feel sick the whole way home. Every time s...,0,,12,935,924.0,0,


In [23]:
# Number of columns in the aggregated frame.
# Needs df_agg in the kernel namespace: on a fresh kernel, run the cells that
# build it first, otherwise this raises NameError.
df_agg.shape[1]

NameError: name 'df_agg' is not defined

In [18]:
# Read the aggregated TSV back as raw text lines (each keeps its line ending).
# NOTE(review): no cell shown here writes 'homestuck_agg.tsv' -- confirm where
# the file is produced.
with open('homestuck_agg.tsv','r') as tsv_file:
    test = list(tsv_file)

In [21]:
# Number of tab-separated fields on each raw line (tab count + 1).
length = [raw_line.count('\t') + 1 for raw_line in test]

In [30]:
# Print every raw line that splits into exactly 6 tab-separated fields,
# i.e. contains exactly 5 tab characters.
for raw_line in test:
    if raw_line.count('\t') == 5:
        print(raw_line)


 him Yerkir"", Mituna responds, ""Hey if uh, you don\'t mind me asking... where\'s your sign?""\n""M-my sign?"" You had asked mother the same question before, but she had always avoided the topic of what she called the Hemospectrum. ""I don\'t think I have one actually""\n""You don\'t have one? What are you a signless?"" Mituna jokes, ""Next you\'re going to tell me that you\'re not even on the Hemospectrum""\n""Uhh"" You\'re not sure what to say at this point, are you in trouble? Is it bad that you\'re not on the Hemospectrum? Surely mother would have told you, considering that she\'s so protective of you.\n""HAHAHA YOU SHIT....I\'m sorry, I\'m kidding. I don\'t mean to pry hehehehe""\n""That\'s ok I suppose"", you respond, ""Do you want to play?""\nThe three of you then proceed to play a game that the author of this fic is much too lazy write out. It is getting late and daylight is drawing near. Mituna then notices another troll approaching your small opening.\n""Hey who\'s that? Do 

In [24]:
from collections import Counter

In [26]:
# Tally how many raw lines have each tab-separated field count.
Counter(length)

Counter({6: 5, 8: 5, 13: 35526})