In [79]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from time import time
import pandas as pd
import re
import sys
sys.path.append('../../util/')
import sgt
from collections import Counter
from nltk.stem.snowball import EnglishStemmer

In [36]:
# Shared Snowball stemmer for English (nltk); create_unigram_freq_dict calls
# st.stem() on every token it keeps.
st = EnglishStemmer()

In [15]:
# Build the cleaned corpus that is handed to sklearn's vectorizer.
def create_corpus_for_voc(df):
    """Return one cleaned, lower-cased string per entry of ``df.Text``.

    Parameters
    ----------
    df : object exposing a ``Text`` column with a ``tolist()`` method
        (a pandas DataFrame in this notebook); every entry must be a string.

    Returns
    -------
    list of str
        The cleaned documents, in the same order as ``df.Text``.
    """
    def _clean(raw):
        # Collapse the two-character sequences 'aA' / 'aa' into a single 'a'.
        collapsed = re.sub(r'aA|aa', 'a', raw)
        # Drop textual escape residue (a literal backslash + 'xe2' plus the
        # eight characters after it, literal backslash + 'xc' / 'xa' / 'n'),
        # as well as digits, '*' and '_'.
        stripped = re.sub(r'\\xe2........|\\xc|\\xa|\\n|[0123456789*_]', '', collapsed)
        return stripped.lower()

    return [_clean(raw) for raw in df.Text.tolist()]

In [16]:
# Get a vocabulary using sklearn's filtering
def get_voc(corpus, ngram, mindf):
    """Fit a CountVectorizer on ``corpus`` and return its vocabulary as a set.

    Parameters
    ----------
    corpus : iterable of str
        Documents to fit the vectorizer on.
    ngram : int
        n-gram size; used as both ends of ``ngram_range`` so only n-grams of
        exactly this length are kept.
    mindf : int or float
        Forwarded to CountVectorizer as ``min_df``.

    Returns
    -------
    set
        The feature names retained by the vectorizer (English stop words are
        removed via ``stop_words='english'``).
    """
    vectorizer = CountVectorizer(stop_words='english',
                                 ngram_range=(ngram, ngram),
                                 min_df=mindf)
    # Only the fitted vocabulary is needed; the document-term matrix that
    # fit_transform would return was never used.
    vectorizer.fit(corpus)
    # Newer scikit-learn releases expose get_feature_names_out() and have
    # dropped get_feature_names(); support both so the cell runs on either.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    # A set has no order, so sorting before building it served no purpose.
    return set(names)

In [49]:
# Per-document unigram counts, built with the same cleaning steps as the corpus
# and restricted to tokens that appear in the vocabulary.
def create_unigram_freq_dict(df, voc):
    """Return a list with one ``{stem: count}`` dict per entry of ``df.Text``.

    Each text is cleaned, lower-cased and split into tokens of two or more
    word characters. Tokens are kept only if the *unstemmed* token is in
    ``voc``; the kept tokens are then stemmed with the module-level ``st``
    before counting. The returned keys are therefore stems and are not
    guaranteed to be members of ``voc`` themselves.

    Parameters
    ----------
    df : object exposing a ``Text`` column with ``tolist()``.
    voc : container supporting ``in`` (a set in this notebook).

    Returns
    -------
    list of dict
        One frequency dict per document, in document order.
    """
    freq_dicts = []
    for raw in df.Text.tolist():
        cleaned = re.sub(r'aA|aa', 'a', raw)
        cleaned = re.sub(r'\\xe2........|\\xc|\\xa|\\n|[0123456789*_]', '', cleaned)
        tokens = re.findall(u'(?u)\\b\\w\\w+\\b', cleaned.lower())
        # Membership is tested on the surface form, counting is done on stems.
        counts = Counter(st.stem(token) for token in tokens if token in voc)
        freq_dicts.append(dict(counts))
    return freq_dicts

In [18]:
def create_timelist(df):
    """Return the sorted, de-duplicated 7-character prefixes of ``df.PublishDate``.

    Every distinct ``PublishDate`` value is converted with ``str()`` and cut to
    its first seven characters; the unique prefixes are returned in ascending
    (lexicographic) order.
    """
    unique_dates = df.PublishDate.drop_duplicates().tolist()
    prefixes = {str(stamp)[:7] for stamp in unique_dates}
    return sorted(prefixes)

In [19]:
def create_df_time(df, time):
    """Return the rows of ``df`` whose ``PublishDate`` starts with ``time``.

    The comparison is made between the first seven characters of each
    ``PublishDate`` string and ``time``; the original index is preserved.
    """
    date_prefix = df.PublishDate.str[:7]
    in_period = date_prefix == time
    return df[in_period]

In [52]:
# Calculate unigram probabilities by Simple Good-Turing smoothing.
# input: unigram-frequency dict of one document, plus the vocabulary
# output: list of probabilities, one per vocabulary entry (one row of a
#         document-term-like matrix)
# if the unigram is in this doc, prob = the unigram prob calculated by sgt
# otherwise, prob = the probability given to "all unknown unigrams" by sgt
def calc_sgt(line_dict, voc):
    """Return the SGT probability of every entry of ``voc`` for one document.

    Parameters
    ----------
    line_dict : dict
        Unigram -> count for a single document; passed unchanged to
        ``sgt.simpleGoodTuringProbs``.
    voc : iterable
        Vocabulary entries; the output follows its iteration order, so pass
        the same object (or an ordered sequence) for rows that must align.

    Returns
    -------
    list
        For each vocabulary entry: the smoothed probability if the entry is a
        key of ``line_dict``, otherwise the second value returned by
        ``sgt.simpleGoodTuringProbs``.
    """
    sgt_result = sgt.simpleGoodTuringProbs(line_dict)
    seen_probs = sgt_result[0]
    unseen_prob = sgt_result[1]
    # NOTE(review): if unseen_prob is the *total* mass reserved for all unseen
    # unigrams, giving it to every missing entry makes the row sum exceed 1 --
    # confirm against the sgt module before interpreting the KL values.
    #
    # Test membership on the dict itself: on Python 2 ``.keys()`` builds a
    # list, which turned every lookup into a linear scan.
    return [seen_probs[word] if word in line_dict else unseen_prob
            for word in voc]

In [58]:
def calc_kl(p, q):
    """Kullback-Leibler divergence D(p || q) in bits (log base 2).

    Parameters
    ----------
    p, q : array-like of numbers, same shape
        The two distributions; values are converted to float, so integer
        inputs no longer suffer Python 2 integer division.

    Returns
    -------
    float
        ``sum_i p[i] * log2(p[i] / q[i])``, summed along the first axis.
        Terms with ``p[i] == 0`` contribute 0 (the usual 0*log 0 convention)
        instead of producing NaN. If ``q[i] == 0`` where ``p[i] > 0`` the
        result is infinite.

    Raises
    ------
    ValueError
        If ``p`` and ``q`` do not have the same shape.
    """
    p_arr = np.asarray(p, dtype=float)
    q_arr = np.asarray(q, dtype=float)
    if p_arr.shape != q_arr.shape:
        raise ValueError("p and q must have the same shape, got %r and %r"
                         % (p_arr.shape, q_arr.shape))
    # Only entries with p > 0 carry a non-zero term.
    support = p_arr > 0
    terms = np.zeros_like(p_arr)
    terms[support] = p_arr[support] * np.log2(p_arr[support] / q_arr[support])
    return terms.sum(axis=0)

In [130]:
# Load the preprocessed works table from a tab-separated file. The cells below
# read its `Text` and `PublishDate` columns.
df = pd.read_csv('../../data/shakespare_william_works_preprocessed.tsv', sep = '\t')

In [131]:
# Optional: uncomment the next line to restrict the run to the first 100 rows.
# df = df.head(100)

In [132]:
kl_all = []      # average KL divergence of each processed period
kl_months = []   # period label (7-character PublishDate prefix) for each kl_all entry
t0 = time()
min_df = 2
# A period is analysed only if it has more than this many documents.
# tune this for filtering?
min_docs = min_df * 5
timelist = create_timelist(df)

for t in timelist:
    df_t = create_df_time(df, t)

    # Skip periods with too few documents; kl_months records which were kept.
    if len(df_t) <= min_docs:
        continue

    # output of the following pipeline:
    # a list of lists, each list containing sgt word probabilities
    # in the same word order
    corp = create_corpus_for_voc(df_t)
    vocab = get_voc(corp, 1, min_df)
    unigram_dict = create_unigram_freq_dict(df_t, vocab)

    # The frequency dicts are keyed by *stems*, whereas `vocab` holds surface
    # forms. Looking surface forms up in the dicts made every inflected word
    # count as unseen, so index the rows by the stemmed vocabulary instead.
    # Sorting fixes one column order for every row of the period.
    stem_vocab = sorted(set(st.stem(word) for word in vocab))
    sgt_list = [calc_sgt(freqs, stem_vocab) for freqs in unigram_dict]

    # calculate kl.
    # std: "standard work", the column-wise mean of the probability matrix
    # calculate kl of each work against the std work of the period,
    # then use the average as the kl of the period
    sgt_array = np.asarray(sgt_list)
    std = np.mean(sgt_array, axis=0)
    kl_month = [calc_kl(row, std) for row in sgt_array]
    kl_all.append(np.average(kl_month))
    kl_months.append(t)

print("done in %0.3fs." % (time() - t0))

p0 = 0.729730
Regression: log(z) = -2.879208*log(r) + 3.860462
p0 = 0.337374
Regression: log(z) = -2.442765*log(r) + 5.379962
p0 = 0.274534
Regression: log(z) = -2.079185*log(r) + 5.470238
p0 = 0.561224
Regression: log(z) = -3.148506*log(r) + 4.979897
p0 = 0.359725
Regression: log(z) = -2.460851*log(r) + 5.651221
p0 = 0.417661
Regression: log(z) = -2.467944*log(r) + 5.259759
p0 = 0.510373
Regression: log(z) = -2.106726*log(r) + 4.629486
p0 = 0.416370
Regression: log(z) = -2.287358*log(r) + 4.488452
p0 = 0.505495
Regression: log(z) = -2.349884*log(r) + 4.020270
p0 = 0.267054
Regression: log(z) = -2.176573*log(r) + 5.478525
p0 = 0.600000
Regression: log(z) = -2.517675*log(r) + 4.422405
p0 = 0.463277
Regression: log(z) = -2.583693*log(r) + 5.149276
p0 = 0.666667
Regression: log(z) = -2.098498*log(r) + 2.649886
p0 = 0.370675
Regression: log(z) = -1.946017*log(r) + 5.149455
p0 = 0.159204
Regression: log(z) = -2.056676*log(r) + 6.394903
p0 = 0.290909
Regression: log(z) = -2.312409*log(r) + 5

Exception KeyboardInterrupt in 'zmq.backend.cython.message.Frame.__dealloc__' ignored
ERROR: Internal Python error in the inspect module.
Below is the traceback from this internal error.


Unfortunately, your original traceback can not be constructed.


  File "/Users/jingy/anaconda/envs/py27/lib/python2.7/site-packages/IPython/core/ultratb.py", line 970, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/Users/jingy/anaconda/envs/py27/lib/python2.7/site-packages/IPython/core/ultratb.py", line 233, in wrapped
    return f(*args, **kwargs)
  File "/Users/jingy/anaconda/envs/py27/lib/python2.7/site-packages/IPython/core/ultratb.py", line 267, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/Users/jingy/anaconda/envs/py27/lib/python2.7/inspect.py", line 1049, in getinnerframes
    framelist.append((tb.tb_frame,) + getframeinfo(tb, context))
  File "/Users/jingy/anaconda/envs/py27/lib/python2.7/inspect.py", line 1009, in getframeinfo
    filename = getsourcefile(frame) or getfile(frame)
  File "/Users/jingy/anaconda/envs/py27/lib/python2.7/inspect.py", line 451, in getsourcefile
    if os.path.exists(filename):
  File "/Users




TypeError: 'NoneType' object is not iterable