# Make topic data at book level

Here we aggregate the chunk-level topic data to the book (title) level, averaging each book's chunk vectors weighted by the number of tokens in each chunk, and combine the result with metadata that may be useful later.

In [4]:
import pandas as pd
import numpy as np

In [5]:
def get_doctopics(filename, parsemeta, docs2get):
    """Aggregate chunk-level topic vectors into one vector per document.

    Each non-blank line of ``filename`` is split on whitespace. The second
    field is the chunk id; the text before its first underscore is taken as
    the document id; every field from the third onward is parsed as a float
    and together they form the chunk's topic vector. The first field is
    ignored.

    Parameters
    ----------
    filename : str or path-like
        Doc-topics file, read as UTF-8.
    parsemeta : pandas.DataFrame
        Indexed by chunk id, with a ``tokens`` column. That value is used as
        the weight of the chunk when averaging.
    docs2get : iterable of str
        Document ids to keep; chunks belonging to any other document are
        skipped.

    Returns
    -------
    dict
        Maps each document id to a numpy array holding the token-weighted
        average of that document's chunk vectors.

    Raises
    ------
    KeyError
        If a selected chunk id is not in the index of ``parsemeta``.
    ZeroDivisionError
        If the token weights of a document sum to zero (from ``np.average``).
    """
    # A set makes the per-line membership test O(1); callers typically pass
    # a list, which would otherwise be scanned once for every line read.
    wanted = set(docs2get)

    chunks = dict()
    weights = dict()
    with open(filename, encoding = 'utf-8') as infile:
        for line in infile:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at end of file).
                continue

            chunkid = fields[1]
            docid = chunkid.split('_')[0]

            if docid not in wanted:
                continue

            vector = np.array([float(x) for x in fields[2: ]])
            chunks.setdefault(docid, []).append(vector)
            weights.setdefault(docid, []).append(parsemeta.at[chunkid, 'tokens'])

    # Longer chunks contribute proportionally more to the document vector.
    return {
        docid: np.average(vectors, axis = 0, weights = weights[docid])
        for docid, vectors in chunks.items()
    }

In [7]:
# Per-chunk parsing metadata (tab-separated).
parsemeta = pd.read_csv(
    '../get_texts/parsing_metadata3.tsv',
    delimiter = '\t',
)

In [8]:
# Sanity check: (rows, columns) of the parsing metadata.
parsemeta.shape

(154883, 5)

In [39]:
# Book-level metadata (tab-separated); its 'docid' column selects the
# documents to aggregate.
shortmeta = pd.read_csv(
    '../shortstories/shortvsnovelmeta.tsv',
    delimiter = '\t',
)

In [10]:
# Index the parsing metadata by chunk id so that get_doctopics can look up
# token counts with parsemeta.at[chunkid, 'tokens'].
# Guarded so that re-running this cell is a no-op rather than a KeyError
# (once 'id' is the index it is no longer a column).
if parsemeta.index.name != 'id':
    parsemeta = parsemeta.set_index('id')

In [40]:
# Build one token-weighted topic vector per document listed in shortmeta.
docs = get_doctopics(
    '../modelselection/final/k200doctopics.txt',
    parsemeta,
    shortmeta['docid'].tolist(),
)

In [41]:
# One row per document id, one column per topic (t0 ... t199).
docdf = pd.DataFrame.from_dict(
    docs,
    orient = 'index',
    columns = ['t%d' % topic for topic in range(200)],
)

In [42]:
# Keep only the metadata columns carried forward, and add 'age' as the
# difference between 'firstpub' and 'birthyear'.
demograph = (
    shortmeta[['docid', 'genre', 'birthyear', 'firstpub', 'hathi_author',
               'hathi_title', 'us_national', 'authof3ormore']]
    .assign(age = lambda meta: meta['firstpub'] - meta['birthyear'])
)

In [43]:
# Sanity check: the eight selected metadata columns plus the derived 'age'.
demograph.shape

(1420, 9)

In [44]:
# Sanity check: one row per aggregated document, one column per topic.
docdf.shape

(1420, 200)

In [45]:
# Attach the metadata to the topic vectors: docdf's index holds the document
# ids, which are matched against demograph's 'docid' column.
bookdata = pd.merge(docdf, demograph, left_index = True, right_on = 'docid')
bookdata.shape

(1420, 209)

In [35]:
# Preview the first rows of the merged topic + metadata table.
bookdata.head()

Unnamed: 0,t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,...,t199,docid,genre,birthyear,firstpub,hathi_author,hathi_title,us_national,authof3ormore,age
985,0.000243,3.5e-05,0.007245,4e-06,1.1e-05,0.002074,3e-06,9e-06,8e-06,8e-06,...,2e-06,uc1.$b799882,long,1911.0,1967,"Abbe, George",The funeral,False,True,56.0
1188,0.0001,5e-06,0.001766,0.048662,0.02235,0.000591,2.1e-05,0.001594,0.000121,0.031007,...,2e-06,mdp.39015012918861,long,1919.0,1957,"Abrahams, Peter","This island, now",False,True,38.0
285,6e-06,0.01879,0.023701,4e-06,0.000635,0.0231,3e-06,0.003576,0.000692,8e-06,...,0.000775,mdp.39015059384274,short,1889.0,1925,"Aiken, Conrad",Bring! bring! and other stories,True,True,36.0
946,0.000186,0.003024,0.007866,4.9e-05,0.0001,0.026803,2e-06,0.052632,0.001825,7e-06,...,2e-06,mdp.39015063779485,long,1892.0,1946,"Aldington, Richard","The romance of Casanova, a novel",False,True,54.0
1106,0.000807,7.8e-05,4.7e-05,4e-06,0.001467,0.001645,2e-06,0.000118,0.001355,7e-06,...,2e-06,mdp.39015063740230,long,1881.0,1926,"Aldrich, Bess Streeter",The Cutters,True,True,45.0


In [46]:
# Write the merged table as tab-separated text, without the index column.
bookdata.to_csv(
    'shortvsnoveldata.tsv',
    index = False,
    sep = '\t',
)

In [36]:
# NOTE(review): this cell's execution count (36) is lower than that of the
# cell that loads the current shortmeta (39), so it appears to come from an
# earlier run with different inputs. Under Restart & Run All it would write
# the same bookdata as the 'shortvsnoveldata.tsv' cell; confirm whether this
# output is still wanted and, if so, which metadata file it should be built from.
bookdata.to_csv('shortstorytopicdata.tsv', sep = '\t', index = False)

In [37]:
# Number of rows labelled 'short'. Series.sum() counts the True values in a
# vectorised way instead of iterating the Series with the builtin sum();
# int() keeps the displayed value a plain Python integer.
int((bookdata['genre'] == 'short').sum())

710

In [38]:
# Number of rows labelled 'long'. Series.sum() counts the True values in a
# vectorised way instead of iterating the Series with the builtin sum();
# int() keeps the displayed value a plain Python integer.
int((bookdata['genre'] == 'long').sum())

709