# Predictive modeling of subject/genre categories

Code to infer the strength of the boundary that separates a category of fiction from the rest of the works in our matched dataset. While we're treating the dataset as "background" here, we don't mean to imply that it's a vanilla or absolutely random sample. On the contrary, these are all works that were collected by academic libraries and reviewed by at least one periodical, so very obscure works of fiction (or those with an ephemeral popular audience) are likely to have been excluded.

In [1]:
import pandas as pd
import numpy as np
import os, ast, string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import cross_validate

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Translation table that maps every punctuation mark (the ASCII set plus a
# few typographic dashes and quotation marks) to a single space.
punctuationstring = string.punctuation + '—‘’“”'
punctzapper = str.maketrans(punctuationstring, ' ' * len(punctuationstring))

# Integer review id (first column of the file) -> text of the third column
# with punctuation replaced by spaces.
bpodict = {}

with open('../../filtered/all_fic_reviews.txt', encoding = 'utf-8') as reviewfile:
    for rawline in reviewfile:
        columns = rawline.strip().split('\t')
        reviewid = int(columns[0])
        bpodict[reviewid] = columns[2].translate(punctzapper)

In [3]:
def get_vols_4_category(metapath, books, texts, classids, thisclass):
    """Append one category's volumes to three parallel accumulator lists.

    Reads the tab-separated metadata file at ``metapath``, indexed by its
    ``bookid`` column. Each row's ``bpoids`` cell is parsed as a Python
    literal, and every id it contains is looked up in the module-level
    ``bpodict``. For a book with at least one id, the looked-up texts are
    joined with single spaces and the book is recorded by appending, in
    place, its id to ``books``, the joined text to ``texts`` and
    ``thisclass`` to ``classids``. Books with an empty id list are skipped.

    Nothing is returned; the three lists are mutated.
    """
    metadata = pd.read_csv(metapath, sep = '\t', index_col = 'bookid')

    for bookid, idliteral in metadata['bpoids'].items():
        reviews = [bpodict[reviewid] for reviewid in ast.literal_eval(idliteral)]

        if not reviews:
            continue

        books.append(bookid)
        texts.append(' '.join(reviews))
        classids.append(thisclass)
                   
def get_genretexts(gname):
    """Build a labelled DataFrame for the genre called ``gname``.

    Two metadata files under ``../../genremeta/`` are read through
    ``get_vols_4_category``: ``<gname>.tsv``, whose books are given
    classid 1, and ``<gname>_contrast.tsv``, whose books are given
    classid 0. The result has the book ids as its index and two columns,
    ``classid`` and ``text``.
    """
    root = '../../genremeta/'
    bookids, reviewtexts, labels = [], [], []

    # genre members first (class 1), then the contrast set (class 0)
    for suffix, label in (('.tsv', 1), ('_contrast.tsv', 0)):
        get_vols_4_category(root + gname + suffix, bookids, reviewtexts, labels, label)

    return pd.DataFrame({'classid': labels, 'text': reviewtexts}, index = bookids)

In [4]:
def test_train_split(genredf, n_per_class = 100):
    """Split ``genredf`` into a class-balanced training set and a test set.

    Parameters
    ----------
    genredf : DataFrame with a ``classid`` column holding 1 (in the genre)
        or 0 (contrast set).
    n_per_class : number of rows randomly sampled, without replacement,
        from each class for training. Defaults to 100. ``DataFrame.sample``
        raises ValueError if a class has fewer rows than this.

    Returns
    -------
    (traindf, testdf) : both in shuffled row order. ``testdf`` holds every
        row of ``genredf`` whose index label was not drawn for training.
    """
    positive = genredf.loc[genredf['classid'] == 1, : ]
    negative = genredf.loc[genredf['classid'] == 0, : ]

    postrain = positive.sample(n = n_per_class)
    negtrain = negative.sample(n = n_per_class)

    # Here we should really have some code to ensure that authors
    # are represented in train OR in test. But this is a first pass.

    traindf = pd.concat([postrain, negtrain])

    # Select the held-out rows with a boolean mask. Passing a set of labels
    # to .loc was deprecated in pandas 1.5 and is rejected by pandas 2.0.
    heldout = ~genredf.index.isin(traindf.index)
    testdf = genredf.loc[heldout, : ]

    # sample(frac = 1) shuffles the rows of each frame
    return traindf.sample(frac = 1), testdf.sample(frac = 1)


# The name starts with "test_", so pytest would otherwise try to collect
# this helper as a test case if the module is ever imported by a test file.
test_train_split.__test__ = False
    

In [5]:
def grid_search(traindf):
    """Grid-search feature count and regularization for a logistic model.

    Parameters
    ----------
    traindf : DataFrame with a ``text`` column (documents) and a ``classid``
        column (labels).

    The texts are turned into word counts (vocabulary capped at 10000),
    each row is divided by its total count, and every column is
    standardized. Features are ranked by their summed relative frequency
    across the training texts, ties broken by descending word order. For
    every combination of "top-N features" and regularization constant C, a
    logistic regression is scored with 4-fold cross-validation.

    Returns
    -------
    (results, featureindices, vectorizer, scaler, best_model)
        results : up to 20 best ``(accuracy, 1/featurelen, featurelen, C)``
            tuples, best first.
        featureindices : column indices of the features used by the best
            setting, in rank order.
        vectorizer, scaler : the fitted CountVectorizer and StandardScaler.
        best_model : LogisticRegression refit on all training rows with the
            best setting.
    """

    # first count words in all the texts
    vectorizer = CountVectorizer(max_features = 10000).fit(traindf['text'])

    # this returns a sparse matrix which will need transformation
    sparse_matrix = vectorizer.transform(traindf['text'])

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
    else:
        feature_names = list(vectorizer.get_feature_names())

    # we don't really have to name the columns, but we do need to transform
    # the sparse matrix to a normal one
    termdoc = pd.DataFrame(sparse_matrix.toarray(), columns = feature_names)

    # normalize by dividing by the number of words in each text; a text with
    # no counted words keeps an all-zero row instead of becoming 0/0 = NaN
    rowsums = termdoc.sum(axis=1).replace(0, 1)
    normalized_termdoc = termdoc.div(rowsums, axis = 0)

    colsums = normalized_termdoc.sum(axis = 0)

    # also scale by the mean and stdev of each feature
    scaler = StandardScaler().fit(normalized_termdoc)
    scaled_matrix = scaler.transform(normalized_termdoc)

    # rank features by summed relative frequency, most frequent first.
    # Carrying the column index through the sort avoids a list.index()
    # lookup per feature; names are unique, so the index never affects order.
    ranked = sorted(zip(colsums, feature_names, range(len(feature_names))),
                    reverse = True)
    featureindices = [colindex for freq, feature, colindex in ranked]

    y = traindf['classid']

    # now we're prepared for gridsearch

    results = []

    for featurelen in [10, 50, 100, 200, 400, 800, 1600, 2400, 3200]:
        for C in [.0001, .001, .01, .1, 1, 10, 100, 1000]:

            X = scaled_matrix[ : , featureindices[0 : featurelen]]
            modeler = LogisticRegression(C = C)
            scores = cross_validate(modeler, X, y, cv = 4, n_jobs = 6)
            accuracy = np.mean(scores['test_score'])
            results.append((accuracy, 1/featurelen, featurelen, C))

            # the reason for 1/featurelen is that in cases of an accuracy tie
            # I want the sorting to choose the model with the smallest feature
            # count!

    results.sort(reverse = True)

    acc, inversetosort, featurelen, C = results[0]

    X = scaled_matrix[ : , featureindices[0 : featurelen]]
    best_model = LogisticRegression(C = C).fit(X, y)

    return results[0: 20], featureindices[0 : featurelen], vectorizer, scaler, best_model

In [6]:
def apply_to_testset(testdf, features, vectorizer, scaler, best_model):
    """Score a fitted model on held-out texts and return its accuracy.

    Parameters
    ----------
    testdf : DataFrame with ``text`` and ``classid`` columns.
    features : column indices to keep after scaling.
    vectorizer, scaler : already-fitted objects; only ``transform`` is
        called on each.
    best_model : fitted classifier; only ``predict`` is called.

    Returns
    -------
    The fraction of rows whose prediction equals ``classid``. The accuracy
    and the number of test rows are also printed.
    """

    sparse_matrix = vectorizer.transform(testdf['text'])
    termdoc = pd.DataFrame(sparse_matrix.toarray())

    # normalize by dividing by the number of words in each text. A text that
    # contains none of the vocabulary has a row sum of zero; dividing by 1
    # instead leaves it as an all-zero row rather than 0/0 = NaN, which
    # would otherwise reach predict().
    rowsums = termdoc.sum(axis=1).replace(0, 1)
    normalized_termdoc = termdoc.div(rowsums, axis = 0)

    # and scale using the saved scaler
    scaled_matrix = scaler.transform(normalized_termdoc)
    X = scaled_matrix[ : , features]
    y = testdf['classid']

    predictions = best_model.predict(X)

    # compare position by position against the raw label values
    testacc = (predictions == y.values).sum() / len(y)
    print('test accuracy: ', testacc, testdf.shape[0])
    print()

    return testacc
    

def repeatedly_validate(gname, n_trials = 3):
    """Run the split / grid-search / held-out test cycle several times.

    Parameters
    ----------
    gname : genre name, passed to ``get_genretexts`` to load the data.
    n_trials : number of independent random train/test splits to evaluate.
        Defaults to 3 and must be at least 1.

    Returns
    -------
    (mean test accuracy, list of per-trial test accuracies,
     mean best cross-validation accuracy, mean selected feature count)

    Raises
    ------
    ValueError if ``n_trials`` is less than 1, since the means would be
    undefined.
    """
    if n_trials < 1:
        raise ValueError('n_trials must be at least 1')

    print(gname)
    print()

    genredf = get_genretexts(gname)
    traintrials = []
    validations = []
    featurelens = []

    for _ in range(n_trials):
        train, test = test_train_split(genredf)

        results, features, vectorizer, scaler, best_model = grid_search(train)

        # results[0] is the best grid cell; its second element is only a
        # sort key (1/featurelen) and is not needed here
        acc, _, featurelen, C = results[0]
        print(gname + ' cross-validation acc: ', acc, featurelen, C)
        traintrials.append(acc)
        featurelens.append(featurelen)

        validacc = apply_to_testset(test, features, vectorizer, scaler, best_model)
        validations.append(validacc)

    return np.mean(validations), validations, np.mean(traintrials), np.mean(featurelens)

In [7]:
def try_all_genres(genre_names):
    """Validate every genre in ``genre_names`` and log one TSV row per genre.

    The file ``predictivereviewresults1.tsv`` in the current directory is
    first overwritten with a header line. Then, for each genre in order,
    ``repeatedly_validate`` is called and its four results are appended as
    one tab-separated row: genre, mean validation accuracy, the list of
    validation accuracies, mean cross-validation accuracy, mean feature
    count.
    """
    outpath = 'predictivereviewresults1.tsv'

    with open(outpath, mode = 'w', encoding = 'utf-8') as outfile:
        outfile.write('genre\tmeanvalidation\tvalidations\tmean_cv\tmean_features\n')

    for gname in genre_names:
        meanvalid, validations, mean_cv, mean_features = repeatedly_validate(gname)
        cells = [gname, str(meanvalid), str(validations), str(mean_cv), str(mean_features)]

        # the file is reopened in append mode and closed again for every
        # genre, so each row is written as soon as that genre finishes
        with open(outpath, mode = 'a', encoding = 'utf-8') as outfile:
            outfile.write('\t'.join(cells) + '\n')
        

In [9]:
# Genres to model, in the order they are processed and written out.
genre_names = [
    'biography',
    'britain',
    'englishfiction',
    'folklore',
    'history',
    'juvenile',
    'northamerica',
    'novel',
    'random',
    'romance',
    'social',
    'stories',
    'unmarked',
    'war',
]

# Run the full sweep only when executed as the main program.
if __name__ == '__main__':
    try_all_genres(genre_names)

biography

biography cross-validation acc:  0.6599999999999999 1600 0.01
test accuracy:  0.6533333333333333 150

biography cross-validation acc:  0.655 2400 0.0001
test accuracy:  0.64 150

biography cross-validation acc:  0.6000000000000001 400 0.001
test accuracy:  0.5866666666666667 150

britain

britain cross-validation acc:  0.6000000000000001 50 0.01
test accuracy:  0.5652173913043478 138

britain cross-validation acc:  0.625 100 1
test accuracy:  0.4927536231884058 138

britain cross-validation acc:  0.5700000000000001 800 0.0001
test accuracy:  0.5797101449275363 138

englishfiction

englishfiction cross-validation acc:  0.55 800 1000
test accuracy:  0.545 200

englishfiction cross-validation acc:  0.5900000000000001 1600 0.001
test accuracy:  0.485 200

englishfiction cross-validation acc:  0.61 10 0.0001
test accuracy:  0.5 200

folklore

folklore cross-validation acc:  0.9349999999999999 2400 100
test accuracy:  0.9 80

folklore cross-validation acc:  0.9199999999999999 2400