In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from time import time
import pandas as pd
import re
import sys
sys.path.append('../../util/')
import sgt
from collections import Counter
from nltk.stem.snowball import EnglishStemmer

In [2]:
# Module-level Snowball stemmer; create_unigram_freq_dict calls st.stem() on every token it keeps.
st = EnglishStemmer()

In [3]:
# Build the list of cleaned document strings that is handed to sklearn
def create_corpus_for_voc(df):
    """Return one cleaned, lower-cased string per entry of ``df.Text``.

    Cleaning steps, applied in this order to each text:
      1. collapse 'aA' / 'aa' into a single 'a';
      2. delete literal escape residue (a backslash followed by 'xe2' plus the
         next eight characters, backslash + 'xc', backslash + 'xa',
         backslash + 'n') as well as digits, '*' and '_';
      3. lower-case the result.
    """
    def _clean(raw_text):
        collapsed = re.sub(r'aA|aa', 'a', raw_text)
        stripped = re.sub(r'\\xe2........|\\xc|\\xa|\\n|[0123456789*_]', '', collapsed)
        return stripped.lower()

    return [_clean(raw_text) for raw_text in df.Text.tolist()]

In [4]:
# Get a vocabulary using sklearn's filtering
def get_voc(corpus, ngram, mindf):
    """Return the set of n-gram features kept by a CountVectorizer.

    Parameters
    ----------
    corpus : iterable of str
        Documents to fit the vectorizer on.
    ngram : int
        n-gram size; used as both the lower and upper bound of ``ngram_range``.
    mindf : int or float
        Passed through unchanged as CountVectorizer's ``min_df``.

    Returns
    -------
    set of str
        The feature names learned by the vectorizer (English stop words removed).
    """
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(ngram, ngram), min_df=mindf)
    # Only the learned vocabulary is used; the document-term matrix is not needed here.
    vectorizer.fit(corpus)
    # get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
    # and fall back to the old name so older installations keep working.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    return set(names)

In [5]:
# Per-document unigram counts, using the same text clean-up as the corpus builder
def create_unigram_freq_dict(df, voc):
    """Return a list with one ``{stem: count}`` dict per entry of ``df.Text``.

    Each text is cleaned with the same two substitutions and lower-casing used
    by create_corpus_for_voc, split into tokens of two or more word characters,
    restricted to tokens present in ``voc``, and the surviving tokens are
    stemmed with the module-level stemmer ``st`` before being counted.

    NOTE(review): tokens are tested against ``voc`` *before* stemming, so the
    dict keys are stems and are not guaranteed to be members of ``voc``
    themselves. calc_sgt looks keys up by vocabulary word -- confirm this
    mismatch is intended.
    """
    freq_dicts = []
    for raw_text in df.Text.tolist():
        cleaned = re.sub(r'aA|aa', 'a', raw_text)
        cleaned = re.sub(r'\\xe2........|\\xc|\\xa|\\n|[0123456789*_]', '', cleaned).lower()
        tokens = re.findall(u'(?u)\\b\\w\\w+\\b', cleaned)
        stem_counts = Counter(st.stem(token) for token in tokens if token in voc)
        freq_dicts.append(dict(stem_counts))
    return freq_dicts

In [6]:
def create_timelist(df):
    """Return the sorted, de-duplicated 7-character prefixes of ``df.PublishDate``.

    Every distinct PublishDate value is converted with ``str()`` and cut to its
    first seven characters (the 'YYYY-MM' part of a 'YYYY-MM-DD' date string).
    """
    distinct_dates = df.PublishDate.drop_duplicates().tolist()
    month_keys = {str(date)[:7] for date in distinct_dates}
    return sorted(month_keys)

In [7]:
def create_df_time(df, time):
    """Return the rows of ``df`` whose PublishDate string starts with ``time``.

    ``time`` is compared against the first seven characters of PublishDate,
    i.e. it is expected to be one of the keys produced by create_timelist.
    """
    month_prefix = df.PublishDate.str[:7]
    in_month = month_prefix == time
    return df.loc[in_month]

In [49]:
# Unigram probabilities for one document via Simple Good-Turing smoothing.
# input:  unigram-frequency dict of the document, plus the vocabulary
# output: list of probabilities, one per vocabulary word (one row of a
#         document-term-like matrix)
def calc_sgt(line_dict, voc):
    """Return one probability per word of ``voc`` for a single document.

    ``sgt.simpleGoodTuringProbs(line_dict)`` is called once. Element 0 of its
    result is used as a word -> probability mapping and element 1 as the
    probability mass reserved for unseen words (per the notebook author's
    notes; the sgt module itself is not visible here).

    * word is a key of ``line_dict``  -> its probability from element 0
    * otherwise                       -> element 1 divided evenly among the
                                         vocabulary words missing from the document

    The output follows the iteration order of ``voc``.
    """
    smoothed = sgt.simpleGoodTuringProbs(line_dict)
    absent_count = len(voc - set(line_dict))
    return [
        smoothed[0][word] if word in line_dict else smoothed[1] / float(absent_count)
        for word in voc
    ]

In [9]:
def calc_kl(p, q):
    """Kullback-Leibler divergence D(p || q), in bits.

    Parameters
    ----------
    p, q : sequence of float
        Probabilities over the same ordered set of outcomes. Only the first
        ``len(p)`` entries of ``q`` are read.

    Returns
    -------
    numpy.float64
        ``sum(p[i] * log2(p[i] / q[i]))`` taken over the entries where
        ``p[i] != 0``.

    Entries with ``p[i] == 0`` contribute nothing (the usual 0 * log 0 = 0
    convention). Evaluating them literally gives ``0 * -inf = nan``, which
    turned the whole result into NaN.
    """
    p_arr = np.asarray(p, dtype=float)
    q_arr = np.asarray(q, dtype=float)[:len(p_arr)]
    # Leave out zero-probability outcomes instead of letting them produce NaN.
    nonzero = p_arr != 0
    return np.sum(p_arr[nonzero] * np.log2(p_arr[nonzero] / q_arr[nonzero]))

In [10]:
# Load the preprocessed works table; the file is tab-separated.
df = pd.read_csv('../../data/shakespare_william_works_preprocessed.tsv', sep = '\t')

In [11]:
# Debugging aid: uncomment to run the notebook on only the first 100 works.
# df = df.head(100)

In [62]:
# Keep only the works written by one author, for the same-author KL measurement.
df_sameauthor = df[df.Author == 'LydiaOfNarnia']

In [63]:
# Number of works by this author in the dataset.
len(df_sameauthor)

60

In [61]:
# Preview the first rows of the full dataset.
df.head(50)

Unnamed: 0,AdditionalTags,ArchiveWarnings,Author,Bookmarks,Category,ChapterIndex,Chapters,Characters,Comments,CompleteDate,...,Notes,PublishDate,Rating,Relationship,Summary,Text,Title,URL,UpdateDate,Words
0,"Crazy, Descent into Madness, Father-Daughter R...",No Archive Warnings Apply,DaughterofProspero,0,,,1,Duke Frederick (As You Like It),0,2016-01-31,...,,2016-01-31,General Audiences,,"""Thou art a fool: she robs thee of thy name;<b...",\nCelia!\nWhere has she got to? Rebellious gir...,The Knowledge of My Fault,http://archiveofourown.org/works/5862967,,786
1,"Character Death, a last goodbye, Crossover",Creator Chose Not To Use Archive Warnings,veronasowl,0,Gen,,1,"Tybalt, Juliet Capulet, Tod - Character, A Hal...",0,2015-11-05,...,,2015-11-05,Not Rated,,"Tybalt is dying, mortally wounded by Romeo. Sl...",\nHe heard his aunt scream as they carried him...,Death and the Capulets,http://archiveofourown.org/works/5147783?view_...,,1234
2,"Teenagers, Pre-Slash, Jealousy",No Archive Warnings Apply,skazka,0,Gen,,1,"Henry IV of England, Richard II of England",0,2016-04-25,...,Some two-year-old fic for you all! I wrote thi...,2016-04-25,General Audiences,,Young Henry knows he\'s playing with fire.,"\nHenry knows he\'s playing with fire, simply ...",The Blow That Envy Gave,http://archiveofourown.org/works/6654889,,390
3,"Alternate Universe - Modern Setting, Grief/Mou...",No Archive Warnings Apply,talefeathers,0,Gen,,1,"Valentine (Romeo and Juliet), Escalus (Romeo a...",2,2016-02-14,...,,2016-02-14,General Audiences,,"It\'s Valentine\'s Day again, which means Vale...","\nValentine tried to sleep in this year, but t...",Valentine\'s Next Day,http://archiveofourown.org/works/6009313,,693
4,"Genealogy, Pre-Canon",No Archive Warnings Apply,Liadt,0,Gen,,1,Richard Plantagenet Duke of York,0,2015-09-22,...,"Set roughly in early 1424 when Richard was 12,...",2015-09-22,General Audiences,,A young Richard contemplates his family tree.,"\nAs the sun went down, Richard was sitting on...",Empty Branches,http://archiveofourown.org/works/4776590,,541
5,"Poetry, Shakespearean Language, Fae & Fairies,...",Major Character Death,Masked_Man_2,0,,,1,Puck (Midsummer Night\'s Dream),0,2016-01-21,...,"Author\xe2\x80\x99s Note: Hello, fellow Shakes...",2016-01-21,Teen And Up Audiences,,"""Gather, glorious fellows all: this is the nig...","\n\nThe Elder Call\n\n\xc2\xa0\n\nGather, glor...",The Elder Call,http://archiveofourown.org/works/5781373,,628
6,"Parent-Child Relationship, Loss of Parent(s), ...",Major Character Death,MercutioLives,1,Gen,,1,"Mercutio (Romeo and Juliet), Valentine (Romeo ...",2,2015-11-21,...,For talefeathers.,2015-11-21,Teen And Up Audiences,,"\n <em>""Becoming an orphan, as it happens, is...",\nIt\'s nothing either of them ever expected t...,that heaven finds means to kill your joys with...,http://archiveofourown.org/works/5257730,,1214
7,"Multiple Partners, POV Third Person, Prostitut...",No Archive Warnings Apply,DaughterofProspero,1,F/M,,1,"Benedick (Much Ado About Nothing), Beatrice (M...",0,2016-01-15,...,,2016-01-15,Teen And Up Audiences,"Beatrice/Benedick, Beatrice/Benedick (Much Ado...","""That I neither feel how she should be loved n...","\n\nShe was his first, but he wasn\xe2\x80\x99...",How She Should be Loved,http://archiveofourown.org/works/5725711,,696
8,"Early Days, Friendship, Misunderstandings, Fir...",No Archive Warnings Apply,Fabrisse,1,F/M,,1,"Beatrice (Much Ado About Nothing), Benedick (M...",2,2015-12-11,...,For fiftysevenacademics (rapiddescent).,2015-12-11,Mature,Beatrice/Benedick,They meet for the first time at the funeral of...,\nHiding in the cupboard under the stairs wasn...,Smiles and Skirmishes,http://archiveofourown.org/works/5404376?view_...,,3224
9,Alternate Universe - High School,No Archive Warnings Apply,accio_spaceman,0,F/M,,1,"Beatrice (Much Ado About Nothing), Benedick (M...",2,2016-05-02,...,Benedick writes Beatrice a love letter and Bea...,2016-05-02,General Audiences,"Beatrice/Benedick, Beatrice/Benedick (Much Ado...",Benedick writes Beatrice a love letter and Bea...,\nBenedick frowned as an envelope fluttered ou...,Alterations,http://archiveofourown.org/works/6727771,,650


In [64]:
# Minimum document frequency handed to CountVectorizer. It is otherwise first
# assigned in a cell further down the notebook, so on a fresh kernel
# (Restart & Run All) this cell failed with NameError.
min_df = 2

# Pipeline output: a list of lists, one per work, each holding the smoothed
# probability of every vocabulary word (same word order in every row, because
# every row iterates over the same `vocab` set).
corp = create_corpus_for_voc(df_sameauthor)
vocab = get_voc(corp, 1, min_df)
unigram_dict = create_unigram_freq_dict(df_sameauthor, vocab)
sgt_list = [calc_sgt(freqs, vocab) for freqs in unigram_dict]

# calculate kl.
# std: "standard work", the column-wise mean of the probability matrix
# kl_sameauthor: KL of each work by this author against that standard work
sgt_array = np.asarray(sgt_list)
std = np.mean(sgt_array, axis=0)
kl_sameauthor = [calc_kl(row, std) for row in sgt_array]

In [65]:
# Mean KL over the same-author works, leaving out any work whose KL is NaN.
np.average([i for i in kl_sameauthor if not np.isnan(i)])

0.5914131416907018

In [66]:
# Random baseline: 60 works drawn from the whole dataset. The draw is seeded so
# that re-running the notebook reproduces the same sample and the same KL figure.
df_rand = df.sample(60, random_state=42)

In [67]:
# Minimum document frequency handed to CountVectorizer. It is otherwise first
# assigned in a cell further down the notebook, so on a fresh kernel
# (Restart & Run All) this cell failed with NameError.
min_df = 2

# Pipeline output: a list of lists, one per sampled work, each holding the
# smoothed probability of every vocabulary word (same word order in every row,
# because every row iterates over the same `vocab` set).
corp = create_corpus_for_voc(df_rand)
vocab = get_voc(corp, 1, min_df)
unigram_dict = create_unigram_freq_dict(df_rand, vocab)
sgt_list = [calc_sgt(freqs, vocab) for freqs in unigram_dict]

# calculate kl.
# std: "standard work", the column-wise mean of the probability matrix
# kl_rand: KL of each randomly sampled work against that standard work
sgt_array = np.asarray(sgt_list)
std = np.mean(sgt_array, axis=0)
kl_rand = [calc_kl(row, std) for row in sgt_array]



In [68]:
# Mean KL over the randomly sampled works, leaving out any work whose KL is NaN.
np.average([i for i in kl_rand if not np.isnan(i)])

0.81746036474618333

In [12]:
# Average per-work KL for every publication month.
# NOTE: kl_all gets one entry per month that passes the size filter below, so
# it is not index-aligned with timelist when months are skipped.
kl_all = []
t0 = time()
min_df = 2
timelist = create_timelist(df)

for t in timelist:
    df_t = create_df_time(df, t)

    # len(df_t) must > min_df
    # tune this for filtering?
    if len(df_t) > min_df*5:

        # output of the following pipeline:
        # a list of lists, each list containing sgt word probability
        # word order is the same in every row because every row iterates
        # over the same `vocab` set
        corp = create_corpus_for_voc(df_t)
        vocab = get_voc(corp, 1, min_df)
        unigram_dict = create_unigram_freq_dict(df_t, vocab)
        sgt_list = [calc_sgt(freqs, vocab) for freqs in unigram_dict]

        # calculate kl.
        # std: "standard work", average of the numpy matrix
        # calculate kl of each work - std work in each month
        # then use the average as kl of the month
        sgt_array = np.asarray(sgt_list)
        std = np.mean(sgt_array, axis=0)
        kl_month = [calc_kl(row, std) for row in sgt_array]
        # Average this month's values. The previous code averaged kl_rand here,
        # which appended the same random-baseline number for every month.
        kl_all.append(np.average([k for k in kl_month if not np.isnan(k)]))

print("done in %0.3fs." % (time() - t0))





KeyboardInterrupt: 