# Topics that inversely correlate

In [9]:
import sys, csv, math, random
import numpy as np
import pandas as pd
from matplotlib import pyplot 
%matplotlib inline
from scipy.stats import pearsonr

In [2]:
# A useful function

def getdoc(anid):
    '''
    Gets the docid part of a character id
    '''

    if '|' in anid:
        thedoc = anid.split('|')[0]
    else:
        print('error', anid)
        thedoc = anid

    return thedoc

In [3]:
# Initialize some variables

# We're going to do this for a 200-topic model.

doctopic_path = '../fic200/fic200_doctopics.txt'
veclen = 200

# Two variables that will hold a list of CVs and means 
# for each topic. Each item in the list represents a
# coefficient of variation, or a mean, for a different
# document.

doc_cvs = dict()
doc_means= dict()

for i in range(veclen):
    doc_means[i] = []
    doc_cvs[i] = []



In [4]:
names = ['theindex', 'charid']
names.extend(["topic" + str(x) for x in range (200)])
dtm = pd.read_csv(doctopic_path, sep = '\t', names = names)

In [5]:
dtm = dtm.assign(docid = dtm.charid.map(getdoc))

In [43]:
alldocs = set(dtm.docid)
randomsample = random.sample(alldocs, 200)

In [44]:
topicbytopic = dict()
for i in range(veclen):
    topicbytopic[i] = dict()

In [45]:
for idx, d in enumerate(randomsample):
    if idx % 10 == 1:
        print(idx)
    group = dtm.loc[dtm.docid == d, :]
    if len(group) < 3:
        continue
    for i in range(veclen):
        for j in range(veclen):
            if j not in topicbytopic[i]:
                topicbytopic[i][j] = []
            ival = group['topic' + str(i)]
            jval = group['topic' + str(j)]
            r, p = pearsonr(ival, jval)
            topicbytopic[i][j].append((r, np.mean(ival) + np.mean(jval)))


1
11
21
31
41
51
61
71
81
91
101
111
121
131
141
151
161
171
181
191


In [46]:
keypath = '../fic200/fic200_keys.txt'

keys = []
boring = {'said', 'had', 'was'}

with open(keypath, encoding = 'utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        text = fields[2]
        words = text.split()[1 : ]
        interestingwords = []
        for w in words:
            if w not in boring:
                interestingwords.append(w)
            if len(interestingwords) > 15:
                break
        keys.append(interestingwords)

keys = [' '.join(x) for x in keys]

In [47]:
tuples = []
for i in range(veclen):
    for j in range(i + 1, veclen):
        weight_tuples =  topicbytopic[i][j]
        vector, weights = list(zip(*weight_tuples))
        mean = np.average(vector, weights = weights)
        tuples.append((mean, i, j))

tuples.sort()
for t in tuples[0: 12]:
    mean, i, j = t
    print(mean)
    print(i, keys[i])
    print(j, keys[j])
    print()

-0.414284941012
57 was-miss nd said-lh ty said-tw vo said-ba lh said-md said-jr said-mrs thought m.d or l looked
131 replied said-sir hand companion eyes continued added said-de was-cried said-replied said-though head said-lord said-give said-indeed said-dear

-0.409789294622
29 said-hy said-heen was-cried heen was-replied said-hythe said-sir said-lord said-hack said-hetter hy said-yon hut said-cried said-lady cried
131 replied said-sir hand companion eyes continued added said-de was-cried said-replied said-though head said-lord said-give said-indeed said-dear

-0.404947298925
94 said-knight said-king said-lancelot said-knights said-hee said-unto said-fair said-ever said-lady was-sir said-lord said-doe said-therefore said-sword said-god said-bee
131 replied said-sir hand companion eyes continued added said-de was-cried said-replied said-though head said-lord said-give said-indeed said-dear

-0.402040236433
121 said-beth said-ﬁrst said-ﬁnd said-miss godmother said-rose said-mrs ﬁnd ﬁnge

In [48]:
topicmeans = dict()
for i in range(200):
    topicmeans[i] = []

for idx, d in enumerate(randomsample):
    if idx % 10 == 1:
        print(idx)
    group = dtm.loc[dtm.docid == d, :]
    if len(group) < 1:
        continue
    for i in range(veclen):
        ival = group['topic' + str(i)]
        topicmeans[i].append(np.mean(ival))

1
11
21
31
41
51
61
71
81
91
101
111
121
131
141
151
161
171
181
191


In [52]:
bookcorrelations = dict()

for i in range(200):
    if i not in bookcorrelations:
        bookcorrelations[i] = dict()
    for j in range(i + 1, 200):
        bookcorrelations[i][j], p = pearsonr(topicmeans[i], topicmeans[j])  

In [59]:
tuples = []
for i in range(veclen):
    for j in range(i + 1, veclen):
        weight_tuples =  topicbytopic[i][j]
        vector, weights = list(zip(*weight_tuples))
        mean = np.average(vector, weights = weights)
        if mean < 0 and bookcorrelations[i][j] > 0.2:
            tuples.append((mean, bookcorrelations[i][j], i, j))

tuples.sort()
for t in tuples[0: 12]:
    mean, bookc, i, j = t
    print(mean, bookc)
    print(i, keys[i])
    print(j, keys[j])
    print()

-0.148163974954 0.240298258068
29 said-hy said-heen was-cried heen was-replied said-hythe said-sir said-lord said-hack said-hetter hy said-yon hut said-cried said-lady cried
165 said-wo said-why said-game said-take said-thing said-oh said-look said-play said-back said-boy said-big said-give said-enough said-fellow said-better said-guess

-0.139549492338 0.227229758496
4 looked head nodded smiled asked shook turned said-want eyes hand face laughed said-yes stood said-why took
47 said-lady said-ladyship carriage said-lord maid said-dear husband woman was-cried party lord friends said-miss house has was-exclaimed

-0.138524999246 0.595019133799
29 said-hy said-heen was-cried heen was-replied said-hythe said-sir said-lord said-hack said-hetter hy said-yon hut said-cried said-lady cried
176 felt said-dear said-miss was-replied thought friend said-quite said-wish looked mind manner said-lady said-indeed said-mrs feelings said-sure

-0.128795885417 0.297329165015
124 said-ai said-reckon said-

In [30]:
topicbytopic[1][2]

[(0.0074688356697326002, 292119    0.001817
  292120    0.001202
  292121    0.057931
  292122    0.005395
  292123    0.003333
  292124    0.006079
  292125    0.005717
  292126    0.005108
  292127    0.002758
  292128    0.024106
  292129    0.031029
  292130    0.002095
  292131    0.004034
  292132    0.004211
  292133    0.267645
  dtype: float64), (-0.050140191173976258, 43580    0.005419
  43581    0.001202
  43582    0.009408
  43583    0.003871
  43584    0.020379
  43585    0.000242
  43586    0.004211
  43587    0.001394
  43588    0.000701
  43589    0.000480
  43590    0.002608
  43591    0.003721
  43592    0.012776
  43593    0.000794
  43594    0.003018
  43595    0.001927
  43596    0.000347
  43597    0.000096
  43598    0.005108
  43599    0.005717
  43600    0.000593
  43601    0.030632
  43602    0.000419
  43603    0.000821
  43604    0.000468
  43605    0.000384
  43606    0.002758
  43607    0.000636
  43608    0.006079
  43609    0.000233
  43610    0.000775
 