# Testing a fiction filter

This very simply loads some training data and trains a regularized logistic regression on it, using gridsearch to find an optimal number of features and regularization constant. We optimize on a precision-weighted F-measure (the `f05` score computed in `onepass` below) rather than on plain F1.

There are more sophisticated feature-selection strategies than this, but if I used them I would also need a more sophisticated validation strategy to avoid fooling myself; e.g. a validation set separate from the test set.

In [8]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler
import pickle

In [33]:
# Load the tab-separated training table; the trailing expression displays
# its first rows in the notebook.
rawdata = pd.read_csv('trainingdata.tsv', sep = '\t')
rawdata.head()

Unnamed: 0,sequenceID,genrecode,#matchquality,#rareword,the,of,and,a,is,to,...,air,title,neither,four,hope,especially,able,y,used,somewhat
0,159-3,y,2.64087,0.494662,0.003559,0.003559,0.003559,0.003559,0.003559,0.003559,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,208-1,y,3.153019,0.535714,0.004464,0.004464,0.004464,0.004464,0.004464,0.004464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,533,y,2.260767,0.778061,0.001276,0.001276,0.001276,0.001276,0.001276,0.001276,...,0.001276,0.0,0.0,0.0,0.001276,0.001276,0.0,0.001276,0.0,0.0
3,352,y,2.654242,0.459016,0.008197,0.008197,0.008197,0.008197,0.008197,0.008197,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,433,y,3.043333,0.564748,0.003597,0.003597,0.003597,0.003597,0.003597,0.003597,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003597


This is the complete dataset as it exists on disk. We may not use all the features. Notice how the most common feature is '#rareword', i.e., any English word that falls outside this limited vocabulary.

We're going to select only the columns for features, leaving out the first two metadata columns.

In [34]:
# Keep the 400 columns at positions 2 through 401 as the feature matrix;
# the trailing expression displays its (rows, columns) shape.
termdoc = rawdata.iloc[ : , 2 : 402]
termdoc.shape

(505, 400)

Let's create a vector that maps genrecode (is it fiction n/y) to an integer code.

In [35]:
# Encode the labels as integers: 'n' becomes 0 and 'y' becomes 1.
classvec = rawdata.genrecode.map({'n' : 0, 'y': 1})

Now the function that actually trains a logistic model.

In [36]:
def onepass(cval, numfeatures, termdoc, classvec):
    '''
    Crossvalidate a regularized logistic regression, and also fit the
    same model on all the data.

    cval is the regularization constant (passed to the model as C)
    numfeatures the number of features to use in the model; the first
        numfeatures columns of termdoc are the ones used
    termdoc is X, aka the feature matrix
    classvec is y, aka the vector of class integers to be predicted

    Returns the tuple (f05, precision, recall, predictions):
    precision and recall are means over tenfold crossvalidation,
    f05 is a precision-weighted F-measure computed from those two
    means, and predictions are the in-sample predictions of a model
    trained on all the data.
    '''

    # Note that we scale and center the columns of the feature matrix.
    # The scaler is fit on every row before crossvalidation splits the
    # data, so held-out folds contribute to the scaling statistics.
    scaler = StandardScaler()
    data = scaler.fit_transform(termdoc.iloc[ : , 0 : numfeatures])

    model = LogisticRegression(C = cval)

    def mean_cv_score(metric):
        # Mean of the named metric across tenfold crossvalidation.
        scores = cross_val_score(model, data, classvec,
                                 scoring = metric, cv = 10)
        return sum(scores) / len(scores)

    # Only precision and recall feed the returned score, so those are
    # the only crossvalidation runs performed.
    precision = mean_cv_score('precision')
    recall = mean_cv_score('recall')

    # F-beta is (1 + b^2) * P * R / (b^2 * P + R). The constants here
    # correspond to b^2 = 0.5 (b is roughly 0.71), which favors
    # precision over recall; the conventional F0.5 would instead use
    # b^2 = 0.25.
    denominator = (.5 * precision) + recall
    if denominator == 0:
        # Precision and recall are both zero; report 0.0 rather than a
        # NaN that would not sort sensibly among other scores.
        f05 = 0.0
    else:
        f05 = 1.5 * (precision * recall) / denominator

    # The crossvalidation runs work on clones, so the model itself is
    # still unfitted here. Train it on all the data.
    model.fit(data, classvec)
    predictions = model.predict(data)

    # We return the crossvalidated scores together with the
    # predictions of the model trained on all the data.

    return f05, precision, recall, predictions

Grid search across a range of feature numbers and regularization constants. Notice that for regularization, we iterate across an integer range but then divide by ten thousand. So, for example, a printed value of 100 is actually a regularization constant of .01.

In [37]:
# Exhaustive sweep: every feature count is paired with every
# regularization step, and each result is both printed and recorded.
bestscores = []
for features in range(120, 400, 10):
    for cval in range(50, 300, 10):
        # cval is an integer step; the model itself receives cval / 10000.
        f05, precision, recall, predictions = onepass(
            cval / 10000, features, termdoc, classvec)
        record = (f05, cval, features)
        print(*record)
        bestscores.append(record)
        

0.85683704973 50 120
0.861683416155 60 120
0.858961325994 70 120
0.857602791495 80 120
0.856370414361 90 120
0.855232812036 100 120
0.854915235916 110 120
0.851610037983 120 120
0.852932623746 130 120
0.855852534695 140 120
0.855370144368 150 120
0.856631519174 160 120
0.854998611147 170 120
0.854998611147 180 120
0.859094302128 190 120
0.856821836083 200 120
0.858045871754 210 120
0.856732712898 220 120
0.857921654117 230 120
0.857921654117 240 120
0.857921654117 250 120
0.857921654117 260 120
0.857921654117 270 120
0.859166350946 280 120
0.860393096469 290 120
0.842814356705 50 130
0.850442234559 60 130
0.850652474582 70 130
0.847433497516 80 130
0.842653638268 90 130
0.842723894971 100 130
0.847116179664 110 130
0.847116179664 120 130
0.846698084797 130 130
0.849531058092 140 130
0.851833768059 150 130
0.851833768059 160 130
0.851790384493 170 130
0.851790384493 180 130
0.854484237221 190 130
0.855809644841 200 130
0.857546665287 210 130
0.858540734701 220 130
0.859791648499 230 130

0.865887395041 160 260
0.865887395041 170 260
0.862775439313 180 260
0.861476573761 190 260
0.862915463729 200 260
0.864339370162 210 260
0.867063841791 220 260
0.865872164983 230 260
0.86725438371 240 260
0.86725438371 250 260
0.86725438371 260 260
0.865487228389 270 260
0.865234871287 280 260
0.865234871287 290 260
0.855895665795 50 270
0.851513925193 60 270
0.853307364128 70 270
0.855893645269 80 270
0.858857803629 90 270
0.859503086069 100 270
0.863094285756 110 270
0.864125886479 120 270
0.864125886479 130 270
0.862070340035 140 270
0.860253948262 150 270
0.858820156741 160 270
0.860286077582 170 270
0.860286077582 180 270
0.864723525999 190 270
0.866138629931 200 270
0.864497397792 210 270
0.864497397792 220 270
0.864497397792 230 270
0.862738849596 240 270
0.864228166131 250 270
0.865534453287 260 270
0.865558324504 270 270
0.868231762652 280 270
0.868231762652 290 270
0.851810447292 50 280
0.855878116206 60 280
0.853934498454 70 280
0.855586848222 80 280
0.85962701785 90 280
0.

In [38]:
# Tuples sort on their first element (the f05 score) first, so after an
# ascending in-place sort the last entry is the highest-scoring
# (f05, cval, features) combination.
bestscores.sort()
bestscores[-1]

(0.88353218900242325, 170, 320)

Test the best model more specifically. Note that precision is good, which is important.

In [39]:
# Re-run the top-ranked combination from the grid search: integer step
# 170 (i.e. C = .0170) with the first 320 features.
best_c = .0170
best_width = 320
f05, precision, recall, predictions = onepass(best_c, best_width,
                                              termdoc, classvec)
# sum(predictions) counts how many rows the all-data model labels as 1.
print(f05, precision, recall, sum(predictions))

0.883532189002 0.949222876241 0.776111111111 322


### Produce a model to export

Ultimately we have to make a model to use, and this can't be crossvalidated.

In [30]:
# Fit the scaler and the model that get exported, using all the rows.
# NOTE(review): these settings (330 features, C = .0260) are not the
# combination the grid search output above ranks first (320 features,
# C = .0170) -- confirm which one is meant to be exported.
export_width = 330
export_c = .0260

scaler = StandardScaler()
data = scaler.fit_transform(termdoc.iloc[:, :export_width])
model = LogisticRegression(C = export_c).fit(data, classvec)
print('Model trained.')

Model trained.


In [34]:
# Persist the fitted scaler and model. Both files are pickles, so they
# should only ever be loaded back from a trusted location.
with open('fictionreview_scaler.pkl', mode = 'wb') as f:
    pickle.dump(scaler, f)

with open('fictionreview_model.pkl', mode = 'wb') as f:
    pickle.dump(model, f)

# Write the vocabulary in column order, one feature name per line. Only
# the leading columns the model was actually trained on are written, so
# the file's length always matches the width the model expects.
words = list(termdoc.columns)
n_model_features = model.coef_.shape[1]
with open('fictionreview_vocab.txt', mode = 'w', encoding = 'utf-8') as f:
    f.writelines(w + '\n' for w in words[0 : n_model_features])
    
    

In [31]:
# Feature names, in the same order as the columns of the feature matrix.
words = list(termdoc.columns)

In [32]:
# Pair each model coefficient with its column name and list the pairs
# from the most negative weight to the most positive. zip stops at the
# shorter input, so only names that have a coefficient are shown.
features = sorted(zip(model.coef_[0], words))
for weight, word in features:
    print(word, weight)

voyages -0.267336648243
letters -0.219930648761
subject -0.184078101237
year -0.141330701794
came -0.136963180258
set -0.134250743054
#rareword -0.128230321524
every -0.123960378007
history -0.120813234593
sense -0.118523043698
years -0.111317361256
have -0.111218886601
under -0.105723492365
here -0.104435215001
state -0.096682472146
no -0.0928452440297
#romannumeral -0.0924782114431
i -0.0922178266268
think -0.0893390356446
something -0.0890621066751
during -0.085242941688
knowledge -0.0821307371905
h -0.0819647243647
must -0.0811553627844
among -0.0762262592473
own -0.0753208020356
even -0.074267723266
those -0.0735561095232
age -0.0727081230924
volume -0.072214208641
makes -0.0716348365576
cannot -0.0706946825306
each -0.0696362384984
so -0.0695204954841
me -0.0684433284252
co -0.0661531470648
my -0.0638831792641
till -0.063664709479
matter -0.0631487038202
also -0.0630160086945
p -0.0624107517832
this -0.0593533480628
j -0.0579417904779
ill -0.0574710404869
natural -0.0571052468132

In [52]:
# Pull the row at position 1 of the feature matrix out as a plain array,
# to run a single example through the exported scaler and model.
vector = np.array(termdoc.iloc[1 , ])


In [55]:
# Keep exactly as many leading features as the scaler was fit on; a
# hard-coded width raises a shape error whenever it differs from the
# fitted scaler's feature count. reshape(1, -1) turns the vector into a
# single-row matrix, which is what transform expects.
n_scaler_features = scaler.mean_.shape[0]
data = vector[0 : n_scaler_features].reshape(1, -1)
vecscaled = scaler.transform(data)

In [57]:
# Display the model's class probabilities for the single scaled row.
model.predict_proba(vecscaled)

array([[ 0.25452809,  0.74547191]])

In [58]:
# Display the recorded class for the same row, to compare against the
# predicted probabilities.
classvec[1]

1