# Make topic data at book level

Here we aggregate the chunk-level topic data to the book (title) level, averaging each book's chunk vectors weighted by the chunk's token count, and combine the result with metadata that may be useful later.

In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_doctopics(filename, parsemeta, docs2get):
    """Collapse chunk-level topic vectors into one weighted-average vector per document.

    Each non-blank line of ``filename`` is split on whitespace. The second
    field is taken as the chunk id, the part of that field before its first
    underscore as the document id, and every field from the third onward is
    parsed as a float to form the chunk's topic vector.

    Parameters
    ----------
    filename : str
        Path of the doc-topics file; it is opened as UTF-8 text.
    parsemeta : pandas.DataFrame
        Must be indexed by chunk id and have a 'tokens' column; that value is
        used as the chunk's weight in the average.
    docs2get : iterable of hashable
        Document ids to keep. Chunks belonging to any other document are skipped.

    Returns
    -------
    dict
        Maps each retained document id to a numpy array: the average of that
        document's chunk vectors, weighted by each chunk's 'tokens' value.

    Raises
    ------
    KeyError
        If a retained chunk id is not present in ``parsemeta``'s index.
    ZeroDivisionError
        Raised by ``np.average`` if all weights for a document sum to zero.
    """
    # Membership is tested once per line of the file; a set makes each test
    # O(1) instead of a linear scan of the caller's list.
    wanted = set(docs2get)

    chunks = dict()
    weights = dict()
    with open(filename, encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank line (e.g. a trailing newline at end of file): nothing to parse.
                continue

            chunkid = fields[1]
            docid = chunkid.split('_')[0]

            if docid not in wanted:
                continue

            if docid not in chunks:
                chunks[docid] = []
                weights[docid] = []

            vector = np.array([float(x) for x in fields[2:]])
            chunks[docid].append(vector)
            weights[docid].append(parsemeta.at[chunkid, 'tokens'])

    docs = dict()

    for docid, vectors in chunks.items():
        # axis=0 averages across chunks, keeping one value per topic.
        docs[docid] = np.average(vectors, axis=0, weights=weights[docid])

    return docs

In [3]:
# Chunk-level parsing metadata; get_doctopics reads its 'tokens' column as averaging weights.
parsemeta = pd.read_csv('../getEF/parsing_metadata3.tsv', sep='\t')

In [4]:
# Table whose 'docid' column determines which books get aggregated further down.
liwc = pd.read_csv(
    '../liwc/liwc_w_meta.tsv',
    sep='\t',
    low_memory=False,
)
liwc.shape

(10835, 89)

In [6]:
# Index by chunk id so get_doctopics can look up token counts via .at[chunkid, 'tokens'].
# Guarded and non-inplace: once 'id' has become the index it is no longer a column,
# so an unconditional set_index would raise KeyError if this cell were run a second time.
if 'id' in parsemeta.columns:
    parsemeta = parsemeta.set_index('id')

In [7]:
# Restrict aggregation to the books that appear in the liwc table.
wanted_docids = liwc.docid.tolist()
docs = get_doctopics(
    '../modelselection/final/k200doctopics.txt',
    parsemeta,
    wanted_docids,
)

In [8]:
# One row per book (index = docid), one column per topic: t0 ... t199.
topic_columns = ['t{}'.format(k) for k in range(200)]
docdf = pd.DataFrame.from_dict(docs, orient='index', columns=topic_columns)

In [9]:
# Corpus metadata table; the demographic columns are pulled from it in the next step.
meta = pd.read_csv(
    '../metadata/corpus4.tsv',
    sep='\t',
    low_memory=False,
)

In [11]:
# Keep only metadata rows for books that have topic vectors, and only the
# columns needed downstream; then derive the author's age at first publication.
demographic_columns = [
    'docid', 'birthyear', 'firstpub', 'hathi_author',
    'hathi_title', 'us_national', 'authof3ormore',
]
has_topics = meta.docid.isin(docdf.index)
demograph = (
    meta.loc[has_topics, demographic_columns]
        .assign(age=lambda frame: frame.firstpub - frame.birthyear)
)

In [12]:
# Sanity check: 8 columns are expected (the 7 selected metadata columns plus 'age').
demograph.shape

(10830, 8)

In [13]:
# Sanity check: the row count should match demograph's; 200 columns, one per topic.
docdf.shape

(10830, 200)

In [17]:
# Inner join of topic vectors (docdf, indexed by docid) with demographic metadata
# (demograph, keyed by its 'docid' column).
# validate='one_to_one' makes pandas raise MergeError if a docid repeats on either
# side, instead of silently emitting a duplicated row for that book; the row-count
# checks alone cannot detect that case.
bookdata = docdf.merge(
    demograph,
    right_on='docid',
    left_index=True,
    validate='one_to_one',
)
bookdata.shape

(10830, 208)

In [18]:
# Preview the merged table: topic columns first, then the metadata columns and 'age'.
bookdata.head()

Unnamed: 0,t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,...,t198,t199,docid,birthyear,firstpub,hathi_author,hathi_title,us_national,authof3ormore,age
19420,6e-06,3.1e-05,0.011298,4e-06,0.000193,5e-06,0.000212,0.000165,0.0004,0.000138,...,0.004745,2e-06,inu.30000112046630,1911.0,1957,"Abbe, George",The winter house,False,True,46.0
21760,0.000243,3.5e-05,0.007245,4e-06,1.1e-05,0.002074,3e-06,9e-06,8e-06,8e-06,...,0.035885,2e-06,uc1.$b799882,1911.0,1967,"Abbe, George",The funeral,False,True,56.0
21989,6e-06,5e-06,0.012275,3e-06,0.000755,0.009934,2e-06,8e-06,0.000583,0.000278,...,4e-06,2e-06,uc1.$b149331,1911.0,1968,"Abbe, George",Yonderville,False,True,57.0
19228,0.000215,0.004015,0.00768,0.001625,0.000167,0.002289,0.050386,0.000305,7e-06,0.002727,...,0.001661,2e-06,uc1.32106007981415,1927.0,1956,"Abbey, Edward",The brave cowboy : an old tale in a new time,True,True,29.0
20601,6e-06,5e-06,9.5e-05,0.007116,0.007377,0.000231,0.062383,0.000149,0.000289,0.001048,...,3.3e-05,2e-06,inu.39000001136287,1927.0,1962,"Abbey, Edward",Fire on the mountain,True,True,35.0


In [19]:
# Write the book-level table as TSV; the row index is dropped because 'docid' is already a column.
bookdata.to_csv(
    'bookleveltopicdata.tsv',
    sep='\t',
    index=False,
)

In [21]:
# Count US-national books first published strictly after 1889 and strictly before 1990.
published_after_1889 = bookdata.firstpub > 1889
published_before_1990 = bookdata.firstpub < 1990
sum(bookdata.us_national & published_after_1889 & published_before_1990)

5572