In [101]:
import pandas as pd
from datetime import date
import numpy as np

#### Some features are recorded per chapter, but the Kudos and Hits data are recorded per work.
#### So aggregate the chapter-wise features up to one row per work.

In [90]:
# Read the tab-separated input file into `df`.
df = pd.read_csv(
    '../../data/shakespare_william_works_preprocessed.tsv',
    sep='\t',
)

In [91]:
# List every column name available in the loaded frame.
df.columns.values

       'Category', 'ChapterIndex', 'Chapters', 'Characters', 'Comments',
       'CompleteDate', 'Fandoms', 'Hits', 'Kudos', 'Language', 'Notes',
       'PublishDate', 'Rating', 'Relationship', 'Summary', 'Text', 'Title',
       'URL', 'UpdateDate', 'Words'], dtype=object)

#### Fields we're interested in

In [92]:
# Restrict the frame to the fields of interest.
fields_of_interest = [
    'Author', 'Bookmarks', 'Chapters', 'Comments', 'CompleteDate', 'Hits', 'Kudos',
    'PublishDate', 'Text', 'Title', 'UpdateDate', 'Words',
]
df = df[fields_of_interest]

In [93]:
# Preview the first two rows of the reduced frame.
df.head(2)

Unnamed: 0,Author,Bookmarks,Chapters,Comments,CompleteDate,Hits,Kudos,PublishDate,Text,Title,UpdateDate,Words
0,DaughterofProspero,0,1,0,2016-01-31,18.0,1.0,2016-01-31,\nCelia!\nWhere has she got to? Rebellious gir...,The Knowledge of My Fault,,786
1,veronasowl,0,1,0,2015-11-05,40.0,5.0,2015-11-05,\nHe heard his aunt scream as they carried him...,Death and the Capulets,,1234


In [94]:
def date_today(cell, today=None):
    """Return the number of days between a ``YYYY-MM-DD`` date and today.

    Parameters
    ----------
    cell : object
        A date string in ``YYYY-MM-DD`` form. Anything that is not a string
        containing ``'-'`` (NaN, None, numbers, ...) is treated as missing.
    today : datetime.date, optional
        Reference date to measure from. Defaults to ``date.today()``; pass a
        fixed date to make the result reproducible.

    Returns
    -------
    int or None
        Absolute difference in whole days, or ``None`` for a missing cell.

    Raises
    ------
    ValueError
        If a string contains ``'-'`` but is not a valid ``YYYY-MM-DD`` date.
    """
    # Check the type itself rather than `str(cell)`: a non-string such as
    # -1.0 contains '-' once stringified but has no `.split` method.
    if not isinstance(cell, str) or '-' not in cell:
        return None
    if today is None:
        today = date.today()
    y, m, d = cell.split('-')
    # abs() keeps the result non-negative even for dates after `today`.
    return abs(today - date(int(y), int(m), int(d))).days


#### Convert each date into the number of days between that date and today

In [95]:
# Replace each date column with the value date_today returns for it.
# Series.map applies date_today to the column directly, instead of building a
# row object per record with DataFrame.apply(axis=1).
# The numeric guard makes this cell safe to re-run: date_today returns None
# for anything that is not a date string, so converting an already-converted
# (numeric) column a second time would silently wipe it. A column that is
# entirely NaN is numeric as well and is simply left as NaN.
for date_col in ['PublishDate', 'UpdateDate', 'CompleteDate']:
    if not pd.api.types.is_numeric_dtype(df[date_col]):
        df[date_col] = df[date_col].map(date_today)

In [128]:
# Collapse the rows to one row per (Author, Hits, Kudos, Title, Words) group.
# Aggregations are given by name ('sum', 'min', 'max'): equivalent to the
# NumPy callables here, without the deprecation warning newer pandas emits.
# NOTE(review): groupby drops rows whose key is NaN by default; Hits and Kudos
# display as floats, so confirm no works are lost at this step.
# NOTE(review): Bookmarks/Chapters/Comments are summed across rows - verify
# they are per-chapter values and not work-level totals repeated on each row.
df_agg = (
    df.groupby(['Author', 'Hits', 'Kudos', 'Title', 'Words'])
      .agg({
          'Bookmarks': 'sum',
          'Chapters': 'sum',
          'Comments': 'sum',
          # The date columns hold day counts relative to today at this point.
          'CompleteDate': 'min',
          'PublishDate': 'max',
          'UpdateDate': 'max',
          # Skip missing text so str.join does not raise TypeError on NaN.
          'Text': lambda x: ','.join(x.dropna()),
      })
      .reset_index()
)

In [136]:
# Both columns hold day counts relative to today, so their difference is the
# number of days between publication and completion.
df_agg['completed_in_days'] = df_agg['PublishDate'] - df_agg['CompleteDate']

In [137]:
# Row count before aggregation.
len(df)

3586

In [138]:
# Row count after aggregation.
len(df_agg)

1720

In [139]:
# Show a preview only; displaying the whole aggregated frame bloats the
# saved notebook and buries the narrative.
df_agg.head(10)

Unnamed: 0,Author,Hits,Kudos,Title,Words,UpdateDate,Comments,Bookmarks,PublishDate,CompleteDate,Text,Chapters,completed_in_days
0,0emma0,143.0,3.0,Drown,169,,0,0,831,831.0,"\n""Good Horatio!"" Hamlet, prince of Denmark, s...",1,0.0
1,1f_this_be_madness,64.0,2.0,As Just A Man,2919,,0,0,670,453.0,"\xe2\x80\x98What\xe2\x80\x99ll it be, love?\xe...",12,217.0
2,1f_this_be_madness,65.0,2.0,Good Friends He Hath; And Loyal Too,671,,0,0,588,588.0,\nHoratio comes to visit his greatest and dear...,1,0.0
3,1f_this_be_madness,103.0,1.0,"Good Day, Sweet Scholar",2120,,0,0,708,708.0,"\nTime has passed, but only the years have fad...",1,0.0
4,1f_this_be_madness,146.0,6.0,Hamlet and the Pirates,11155,,0,6,616,616.0,"Characters in the Play:\t(At Sea)HAMLET, Princ...",36,0.0
5,221b_hound,458.0,54.0,Forget Your Evil,783,,34,2,438,438.0,\nThe case in Leicester was difficult and the ...,1,0.0
6,221b_hound,765.0,52.0,Proclaim a Pardon,2034,,29,4,708,708.0,\nIt was not by chance that John left the news...,1,0.0
7,221b_hound,891.0,61.0,Wondrous Strange,5232,,56,4,724,724.0,\nThree weeks it had been since Richard had wo...,1,0.0
8,221b_hound,1089.0,76.0,Boundless as the Sea,4735,,56,5,786,786.0,"\nFor once, Sherlock was not complaining of bo...",1,0.0
9,221b_hound,1118.0,75.0,The Sword of Heaven,5633,,59,4,806,806.0,\nIt was cold in the glade. Richard\xe2\x80\x9...,1,0.0
