# Bag of words modeling of time

This is a quick and dirty baseline, not optimal or polished.

We have three readers doing disjoint sets of books. Our strategy will be to cross-validate on two readers and test on the third.

In [42]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from pathlib import Path
import glob
import json
import math
from scipy.stats import pearsonr

In [4]:
files = glob.glob('/Users/tunder/Dropbox/python/time/results/*.json')
readers = []
durations = []
texts = []
for afile in files: 
    with open(afile, encoding = 'utf-8') as f:
        for line in f:
            j = json.loads(line)
            if 'underwood' in afile:
                reader = 'u'
            elif 'mercado' in afile:
                reader = 'm'
            elif 'lee' in afile:
                reader = 'l'
            else:
                print('error')
            for docid, content in j.items():
                for seg in content['segments']:
                    text = seg['text']
                    minutes = seg['narratedtime']
                    readers.append(reader)
                    texts.append(text)
                    durations.append(math.log(minutes + .1))

In [25]:
vectorizer = CountVectorizer(max_features = 5000)
sparse_wordcounts = vectorizer.fit_transform(texts)
wordcounts = sparse_wordcounts.toarray()
docterm = pd.DataFrame(wordcounts, columns = vectorizer.get_feature_names())
docterm.head()

Unnamed: 0,000,10,13,1904,22,23,25,abandoned,abate,abbey,...,younger,youngest,your,yours,yourself,yourselves,youth,youthful,yu,zat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [26]:
textlengths = [len(x) for x in texts]
wordfreqs = docterm.divide(textlengths, axis = 'rows')
wordfreqs.head()

Unnamed: 0,000,10,13,1904,22,23,25,abandoned,abate,abbey,...,younger,youngest,your,yours,yourself,yourselves,youth,youthful,yu,zat
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000728,0.000728,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00071,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
featurecounts = docterm.sum(axis = 'rows')

In [36]:
featurecounts.shape

(5000,)

In [37]:
sortedcounts = sorted(list(featurecounts))

In [52]:
training = wordfreqs.loc[np.array(readers) != 'm', : ]
training.shape

(976, 5000)

In [19]:
readers = np.array(readers)
durations = np.array(durations)

In [53]:
testset = wordfreqs.loc[readers == 'm', : ]
testset.shape

(192, 5000)

In [56]:
trainscaler = StandardScaler()
trainXscaled = trainscaler.fit_transform(training)
trainXscaled = pd.DataFrame(trainXscaled, columns = training.columns)

In [57]:
testscaler = StandardScaler()
testXscaled = testscaler.fit_transform(testset)
testXscaled = pd.DataFrame(testXscaled, columns = testset.columns)

In [54]:
testy = durations[readers == 'm']
trainy = durations[readers != 'm']

In [58]:
 for numwords in [500, 1000, 2000, 3000]:
    trainX = trainXscaled.loc[ : , featurecounts > sortedcounts[-numwords]]
    for alpha in [100, 1000, 2000, 5000, 10000]:
        model = Ridge(alpha = alpha, max_iter = 1000) 
        results = cross_validate(model, trainX, trainy, cv = 5)
        print('wordcount', numwords, 'alpha', alpha)
        print('Mean r2:', np.mean(results['test_score']))
        print()

wordcount 500 alpha 100
Mean r2: -0.01982894547939207

wordcount 500 alpha 1000
Mean r2: 0.27402318706774975

wordcount 500 alpha 2000
Mean r2: 0.27255562330050925

wordcount 500 alpha 5000
Mean r2: 0.22241027060585133

wordcount 500 alpha 10000
Mean r2: 0.16289327637396714

wordcount 1000 alpha 100
Mean r2: -0.025387438598027102

wordcount 1000 alpha 1000
Mean r2: 0.2834710888624623

wordcount 1000 alpha 2000
Mean r2: 0.2878811132433792

wordcount 1000 alpha 5000
Mean r2: 0.24629319447376735

wordcount 1000 alpha 10000
Mean r2: 0.18924309289467006

wordcount 2000 alpha 100
Mean r2: 0.08993389963863732

wordcount 2000 alpha 1000
Mean r2: 0.25397522763004243

wordcount 2000 alpha 2000
Mean r2: 0.27112494643302754

wordcount 2000 alpha 5000
Mean r2: 0.2482800574963307

wordcount 2000 alpha 10000
Mean r2: 0.20121263001901757

wordcount 3000 alpha 100
Mean r2: 0.1747405253842262

wordcount 3000 alpha 1000
Mean r2: 0.24573576591079113

wordcount 3000 alpha 2000
Mean r2: 0.259513641463269

w

In [41]:
math.sqrt(.165)

0.406201920231798

In [61]:
testX = testXscaled.loc[ : , featurecounts > sortedcounts[-1000]]
trainX = trainXscaled.loc[ : , featurecounts > sortedcounts[-1000]]
ridge = Ridge(alpha = 2000, max_iter = 1000) 
ridge.fit(trainX, trainy)

# Now apply it to test

predictions = ridge.predict(testX)
pearsonr(testy, predictions)

(0.3763021998350018, 7.482366177243692e-08)

In [62]:
# u as test 0.5847
# l as test 0.5169
# m as test 0.3763

In [63]:
(0.5847 + 0.5169 + 0.3763) / 3

0.4926333333333333