In [38]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from time import time
import pandas as pd
import re
from scipy import sparse

In [60]:
def get_distribution_matrix(corpus, ngram):
    vectorizer = CountVectorizer(stop_words='english', ngram_range = (ngram,ngram))
    f = vectorizer.fit_transform(corpus)
    l = len(vectorizer.get_feature_names())
    return np.divide(f, l)

In [21]:
def calc_kl(p, q):
    return sum([p[i]*(np.log2(p[i]/q[i])) for i in range(len(p))])

In [32]:
def create_df_time(df, time):
    return df[df.PublishDate.str[:7] == time]

In [34]:
df = pd.read_csv('../../data/shakespare_william_works_preprocessed.tsv', sep = '\t')

In [35]:
timelist = df.PublishDate.drop_duplicates().tolist()
timelist = [str(i)[:7] for i in timelist]
timelist = sorted(list(set(timelist)))

In [98]:
cnt = 0
kl_all = []
t0 = time()
for t in timelist:
    df_t = create_df_time(df, t)
#     if len(df_t) > 1:
    doc = []
    for i in df_t.Text.tolist():
        #Remove some non-ascii characters and 'aa's
        i = re.sub(r'aA|aa', 'a', i)
        i = re.sub(r'\\xe2........|\\xc|\\xa|\\n|[0123456789*_]', '', i)
        doc.append(i)  
    m = get_distribution_matrix(doc,2)
    std = sparse.csr_matrix.mean(m, axis=0)
    std = np.asarray(std)[0]+1
    kl_month = []
    for row in m.toarray():
        kl = calc_kl(row+1, std)
        if not np.isnan(kl):
            kl_month.append(kl)
    kl_all.append(np.average(kl_month))
print("done in %0.3fs." % (time() - t0))

done in 526.889s.


In [100]:
print (len(timelist), len(kl_all))

102 102


In [101]:
kl_all

[0.0,
 6.774010259664216e-06,
 0.00010066387720741865,
 0.0,
 0.0,
 1.1592331577791542e-06,
 4.6246928024224381e-06,
 0.00030266309129331481,
 0.00010809637913791748,
 0.0,
 0.00013539512578101634,
 0.0,
 0.00010377850423898272,
 0.00012491303009245133,
 5.057089394988612e-05,
 5.8282525328523516e-05,
 0.00026684066940979578,
 3.0837672802412819e-06,
 4.2518657594905106e-05,
 0.00037919694216587141,
 0.00058032625565718199,
 0.00011161100129714052,
 9.6387002513205936e-05,
 0.00014931284301172854,
 6.6027983467220608e-05,
 4.4436183575506195e-05,
 0.0,
 4.2154836929750306e-05,
 4.2141362258454884e-07,
 2.8068167722191086e-06,
 1.5770993330136668e-05,
 3.014318083429668e-05,
 3.3072672167838714e-05,
 2.7185597184046597e-06,
 0.0017957185550147581,
 7.0993265568894779e-06,
 1.923903090476353e-06,
 2.6413556193167182e-06,
 0.00022508868880583011,
 3.8484031126432127e-05,
 1.0189396274138458e-05,
 2.438841406835271e-06,
 1.2288170492106265e-06,
 1.4519419564094072e-05,
 2.6160114603104394e

In [81]:
test = ['cat cat dog',
       'sheep cat dog']

In [82]:
mt = get_distribution_matrix(test,1)

In [83]:
stdt = sparse.csr_matrix.mean(mt, axis=0)

In [84]:
print(mt.toarray())

[[ 0.66666667  0.33333333  0.        ]
 [ 0.33333333  0.33333333  0.33333333]]


In [85]:
print(mt)

  (0, 0)	0.666666666667
  (0, 1)	0.333333333333
  (1, 0)	0.333333333333
  (1, 1)	0.333333333333
  (1, 2)	0.333333333333


In [86]:
for row in mt.toarray():
    kl = calc_kl(row, np.asarray(stdt)[0])
    if not np.isnan(kl):
        print(kl)

0.138345833093


In [88]:
np.asarray(stdt)[0]+1

array([ 1.5       ,  1.33333333,  1.16666667])