In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from time import time
import pandas as pd
import re
import sys
sys.path.append('../../util/')
import sgt
from collections import Counter
from nltk.stem.snowball import EnglishStemmer

In [2]:
# Shared Snowball English stemmer instance; create_unigram_freq_dict calls
# st.stem() on every token it keeps.
st = EnglishStemmer()

In [3]:
# Build the cleaned corpus that is handed to sklearn.
def create_corpus_for_voc(df):
    """Return one cleaned, lower-cased string per row of ``df.Text``.

    Parameters
    ----------
    df : object exposing a ``Text`` column of strings (e.g. a DataFrame).

    Returns
    -------
    list of str
        The texts in row order after the clean-up below.
    """
    # 'aA' and 'aa' are both collapsed to a single 'a'.
    double_a = re.compile(r'aA|aa')
    # Removes literal escape residue left in the text (a backslash followed by
    # 'xe2' plus the next eight characters, backslash-'xc', backslash-'xa',
    # backslash-'n') as well as digits, '*' and '_'.
    noise = re.compile(r'\\xe2........|\\xc|\\xa|\\n|[0123456789*_]')
    return [
        noise.sub('', double_a.sub('a', raw)).lower()
        for raw in df.Text.tolist()
    ]

In [4]:
# Get a vocabulary using sklearn's filtering
def get_voc(corpus, ngram, mindf):
    """Return the set of n-grams that sklearn's CountVectorizer keeps for ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to scan.
    ngram : int
        N-gram length; used as both ends of ``ngram_range``.
    mindf : int or float
        Passed through as CountVectorizer's ``min_df``.

    Returns
    -------
    set of str
        The feature names (vocabulary terms) after English stop-word removal
        and ``min_df`` filtering.
    """
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(ngram, ngram), min_df=mindf)
    # Only the learned vocabulary is needed, so the document-term matrix is
    # not kept.
    vectorizer.fit(corpus)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old releases that lack it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    # Normalise to plain str so membership tests behave the same whichever
    # accessor supplied the names.
    return {str(name) for name in names}

In [5]:
# Per-document unigram counts, cleaned the same way as the vocabulary corpus
# and restricted to tokens found in the vocabulary.
def create_unigram_freq_dict(df, voc):
    """Count stemmed unigrams for every row of ``df.Text``.

    Parameters
    ----------
    df : object exposing a ``Text`` column of strings.
    voc : container of str
        Tokens are kept only if ``token in voc``.

    Returns
    -------
    list of dict
        One ``{stem: count}`` dict per row, in row order.
    """
    # Words of two or more word-characters.
    token_pattern = re.compile(u'(?u)\\b\\w\\w+\\b')
    counts_per_doc = []
    for raw in df.Text.tolist():
        cleaned = re.sub(r'aA|aa', 'a', raw)
        cleaned = re.sub(r'\\xe2........|\\xc|\\xa|\\n|[0123456789*_]', '', cleaned)
        tokens = token_pattern.findall(cleaned.lower())
        # NOTE(review): tokens are matched against voc *before* stemming, so
        # the resulting keys are stems that need not be members of voc, while
        # calc_sgt looks voc words up in this dict -- confirm this is intended.
        stems = (st.stem(tok) for tok in tokens if tok in voc)
        counts_per_doc.append(dict(Counter(stems)))
    return counts_per_doc

In [6]:
def create_timelist(df):
    """Return the sorted distinct 7-character prefixes of ``df.PublishDate``.

    Each distinct PublishDate value is converted with ``str()`` and cut to its
    first seven characters (presumably a year-month key -- depends on the
    date format in the data).
    """
    unique_dates = df.PublishDate.drop_duplicates().tolist()
    return sorted({str(stamp)[:7] for stamp in unique_dates})

In [7]:
def create_df_time(df, time):
    """Return the rows of ``df`` whose PublishDate string begins with ``time``.

    Only the first seven characters of each PublishDate are compared, so
    ``time`` is expected to be one of the keys produced by create_timelist.
    """
    date_prefix = df.PublishDate.str.slice(stop=7)
    matches = date_prefix == time
    return df[matches]

In [8]:
# Calculate unigram probabilities with simple Good-Turing smoothing.
# input: unigram-frequency dict of one document
# output: list of probabilities, one per vocabulary word (one row of a
#         document-term-matrix look-alike)
def calc_sgt(line_dict, voc):
    """Map every word of ``voc`` to a smoothed probability for one document.

    Parameters
    ----------
    line_dict : dict
        ``{word: count}`` for the document.
    voc : set of str
        Vocabulary; the output follows this set's iteration order.

    Returns
    -------
    list
        For a word present in ``line_dict``: its entry in element [0] of the
        sgt result. For an absent word: element [1] of the sgt result divided
        evenly over all vocabulary words absent from the document.

    ``sgt.simpleGoodTuringProbs`` is project code not shown here; element [0]
    is assumed to be a word->probability mapping and element [1] the total
    probability reserved for unseen words -- TODO confirm.
    """
    sgt_result = sgt.simpleGoodTuringProbs(line_dict)
    unseen_count = len(voc - set(line_dict))
    # The division is only evaluated for absent words, so unseen_count is
    # never zero when it is used as a divisor.
    return [
        sgt_result[0][term] if term in line_dict else sgt_result[1] / float(unseen_count)
        for term in voc
    ]

In [9]:
def calc_kl(p, q):
    """Kullback-Leibler divergence D(p || q), in bits.

    Parameters
    ----------
    p, q : 1-D sequences or arrays of equal length
        Probability vectors over the same ordered set of outcomes.

    Returns
    -------
    numpy.float64
        ``sum(p[i] * log2(p[i] / q[i]))``, where terms with ``p[i] == 0``
        contribute 0 (the usual 0*log(0) = 0 convention). Without that
        convention a single zero in ``p`` makes the whole result NaN.

    Raises
    ------
    ValueError
        If ``p`` and ``q`` do not have the same shape.
    """
    p_arr = np.asarray(p, dtype=float)
    q_arr = np.asarray(q, dtype=float)
    if p_arr.shape != q_arr.shape:
        raise ValueError("p and q must have the same shape, got %s and %s"
                         % (p_arr.shape, q_arr.shape))
    # Skip exact zeros only; NaN entries still propagate so bad input stays visible.
    support = p_arr != 0
    p_nz = p_arr[support]
    return np.sum(p_nz * np.log2(p_nz / q_arr[support]))

In [10]:
# Load the preprocessed works table (tab-separated). Later cells read its
# Text and PublishDate columns.
df = pd.read_csv('../../data/shakespare_william_works_preprocessed.tsv', sep = '\t')

In [11]:
# Keep only the first 100 rows for this run. This rebinds df, so the full
# table is no longer available until the load cell is re-run.
df = df.head(100)

In [12]:
# Average KL divergence per month, plus the month key of each entry.
# kl_months[i] is the month that kl_all[i] belongs to; months dropped by the
# size filter below appear in neither list.
kl_all = []
kl_months = []
t0 = time()
min_df = 2
# A month is analysed only when it has more than min_df * 5 documents.
# TODO: tune this factor for filtering?
min_docs = min_df * 5
timelist = create_timelist(df)

for t in timelist:
    df_t = create_df_time(df, t)

    # Too few documents for the min_df vocabulary filter to be meaningful.
    if len(df_t) <= min_docs:
        continue

    # Output of the following pipeline: a list of lists, each inner list
    # holding the sgt word probabilities of one document. Every row iterates
    # the same `vocab` set, so word order is the same across rows.
    corp = create_corpus_for_voc(df_t)
    vocab = get_voc(corp, 1, min_df)
    unigram_dict = create_unigram_freq_dict(df_t, vocab)
    sgt_list = [calc_sgt(doc_counts, vocab) for doc_counts in unigram_dict]

    # "std" is the "standard work": the column-wise mean of the month's
    # probability rows. KL is computed between every work and this standard
    # work, and the month's value is the average of those KLs.
    sgt_array = np.asarray(sgt_list)
    std = np.mean(sgt_array, axis=0)
    kl_month = [calc_kl(row, std) for row in sgt_array]

    # NaN divergences are left out of the average; if nothing is left the
    # month is recorded as NaN explicitly instead of averaging an empty list.
    valid = [kl for kl in kl_month if not np.isnan(kl)]
    kl_all.append(np.average(valid) if valid else float('nan'))
    kl_months.append(t)

print("done in %0.3fs." % (time() - t0))

{1: 172, 2: 38, 3: 22, 4: 6, 5: 6, 6: 2, 7: 2, 20: 1, 24: 1}
{1: 125, 2: 34, 3: 15, 4: 2, 5: 2, 6: 2, 8: 1, 10: 1, 11: 1, 20: 1, 24: 1, 29: 1}
{1: 153, 2: 63, 3: 26, 4: 6, 5: 9, 6: 4, 7: 4, 8: 3, 9: 1, 10: 1, 12: 1, 13: 1, 14: 1, 15: 1, 19: 2, 20: 1}
{1: 88, 2: 12, 3: 3, 5: 1}
{1: 209, 2: 94, 3: 47, 4: 29, 5: 20, 6: 14, 7: 19, 8: 7, 9: 4, 10: 4, 11: 5, 12: 1, 13: 2, 14: 2, 15: 2, 17: 1, 18: 2, 19: 1, 21: 1, 22: 1, 23: 2, 24: 1, 25: 2, 26: 1, 29: 1, 35: 1, 36: 1, 43: 1, 136: 1}
{1: 139, 2: 38, 3: 15, 4: 7, 5: 3, 6: 3, 7: 2, 16: 1}
{1: 47, 2: 11}
{1: 215, 2: 84, 3: 44, 4: 39, 5: 10, 6: 11, 7: 13, 8: 6, 9: 6, 10: 4, 11: 3, 12: 4, 14: 2, 15: 2, 84: 1, 22: 1, 106: 1, 62: 1}
{1: 116, 2: 34, 3: 5, 4: 4}
{1: 165, 2: 46, 3: 17, 4: 7, 5: 4, 7: 2, 12: 1, 20: 1}
{1: 126, 2: 33, 3: 17, 4: 12, 5: 3, 6: 4, 7: 1, 8: 1, 10: 1}
{1: 154, 2: 56, 3: 18, 4: 12, 5: 3, 6: 2, 7: 2, 8: 4, 9: 1, 10: 1, 12: 1, 13: 1, 14: 1}
{1: 87, 2: 16, 3: 9, 4: 6, 5: 1, 8: 1, 9: 1}
{1: 88, 2: 16, 3: 14, 4: 6, 5: 2, 6: 4, 7: 3,



In [13]:
# Show the average KL value recorded for each month that passed the size filter.
kl_all

[0.5419282705453593, 0.69854270178875033, 0.4595797595418778]