In [1]:
import pickle

def pickle_load(n, features_dir='./features/Model2_no_stopwords'):
    """Load the pickled fiction and non-fiction feature sets for one size.

    Reads ``fiction_<n>.pickle`` and ``non_fiction_<n>.pickle`` from
    ``features_dir``.

    Parameters
    ----------
    n : int or str
        Suffix identifying the feature set (e.g. 50, 100, 500, 1000, 3000).
    features_dir : str, optional
        Directory holding the pickle files. Defaults to the directory this
        notebook has always read from.

    Returns
    -------
    tuple
        ``(fic, nonfic)`` -- the unpickled fiction and non-fiction objects.

    Raises
    ------
    IOError / OSError
        If either pickle file cannot be opened.
    """
    import os  # local import keeps this cell runnable on its own

    def _load(prefix):
        # Build "<features_dir>/<prefix>_<n>.pickle" and unpickle it.
        path = os.path.join(features_dir, prefix + '_' + str(n) + '.pickle')
        with open(path, 'rb') as f:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point this at feature files produced by a trusted pipeline.
            return pickle.load(f)

    fic = _load('fiction')
    nonfic = _load('non_fiction')
    return fic, nonfic

# Load the (fiction, non-fiction) feature pair for every feature-set size,
# in ascending order of size, and bind each pair to its own pair of names.
(
    (fic50, nonfic50),
    (fic100, nonfic100),
    (fic500, nonfic500),
    (fic1000, nonfic1000),
    (fic3000, nonfic3000),
) = map(pickle_load, (50, 100, 500, 1000, 3000))

In [2]:
'''
Different hyper-parameters in Logistic Regression.
'''

from sklearn.linear_model import LogisticRegression

def my_LogReg(X, labels=None):
    """Grid-search LogisticRegression hyper-parameters, then cross-validate the winner.

    For every metric in ``metrics`` (currently only 'f1') a 10-fold
    GridSearchCV is run over ``tuned_parameters``. The best estimator found is
    then scored with 10-fold cross-validation on precision, f1, accuracy and
    recall.

    Parameters
    ----------
    X : feature matrix
        One row per document. Must expose ``.shape`` or support ``len``.
    labels : sequence, optional
        One class label per row of ``X`` (1 is for Fiction, 0 is for
        NonFiction). When omitted, the original corpus layout is used:
        4558 Fiction rows followed by 4558 NonFiction rows.

    Returns
    -------
    list of dict
        One dict per tuning metric. Each dict maps an evaluation metric name
        ('precision', 'f1', 'accuracy', 'recall') to a numpy array of the
        cross-validation scores of the best model.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows in ``X``.
    """
    if labels is None:
        # Getting labels ready: 1 is for Fiction, 0 is for NonFiction
        labels = [1]*4558 + [0]*4558

    # Fail fast with a clear message instead of a late error deep inside the
    # grid search when X and the labels disagree.
    n_rows = X.shape[0] if hasattr(X, 'shape') else len(X)
    if len(labels) != n_rows:
        raise ValueError("X has " + str(n_rows) + " rows but " +
                         str(len(labels)) + " labels were given")

    # Imported locally so the function no longer relies on a later notebook
    # cell having been run first. GridSearchCV is taken from
    # sklearn.model_selection (the maintained home, and the same module
    # cross_val_score comes from) rather than the deprecated
    # sklearn.grid_search.
    import numpy as np
    from sklearn.model_selection import GridSearchCV, cross_val_score

    tuned_parameters = [{'C': [1, 10, 100, 1000], 'penalty': ['l1'], 'solver': ['liblinear']},
                        {'C': [1, 10, 100, 1000], 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'sag']}]
    # Metrics the grid search optimises for (one best model per entry).
    metrics = ['f1']
    # Metrics each best model is evaluated on afterwards.
    m = ['precision', 'f1', 'accuracy', 'recall']

    # One dictionary per tuning metric, holding all the results for that
    # particular best model.
    models = []

    for score in metrics:
        model = {}
        lr = LogisticRegression()
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("Running for  " + score)
        clf = GridSearchCV(lr, tuned_parameters, cv=10, scoring=score, verbose=1)
        clf.fit(X, labels)
        print("\nBest parameters for " + score + ": " + str(clf.best_estimator_))
        print("Best score achieved for " + score + ": " + str(clf.best_score_))
        best_lr = clf.best_estimator_

        for s in m:
            print("Running the best " + score + " model for " + s + "..")
            model[s] = np.array(cross_val_score(best_lr, X, labels, cv=10, scoring=s))

        print("For  " + score)
        print(model)
        print("\n\n")
        models.append(model)

    return models

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.grid_search import GridSearchCV
import numpy as np
from sklearn.linear_model import LogisticRegression

def _join_tokens(docs):
    """Join each tokenised document into one space-separated string."""
    return [' '.join(doc) for doc in docs]


def run_logreg_experiment(fiction_docs, nonfiction_docs, size):
    """Vectorise one feature set and run the logistic-regression evaluation.

    Parameters
    ----------
    fiction_docs, nonfiction_docs : iterable of iterable of str
        Documents as token sequences; each is joined with spaces so that
        CountVectorizer can consume it.
    size : int
        Feature-set size, used only in the progress message.

    Returns
    -------
    tuple
        ``(countvec_fiction, countvec_nonfiction, fiction_plus_nonfiction,
        vectorizer, X, results)`` -- the joined fiction strings, the joined
        non-fiction strings, their concatenation, the fitted CountVectorizer,
        the document-term matrix and the value returned by ``my_LogReg``.
    """
    # Getting it ready for Count Vectorizer:
    countvec_fiction = _join_tokens(fiction_docs)
    countvec_nonfiction = _join_tokens(nonfiction_docs)

    # Fiction rows first, then non-fiction: my_LogReg's default labels are
    # laid out in that order.
    fiction_plus_nonfiction = countvec_fiction + countvec_nonfiction

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(fiction_plus_nonfiction)

    results = my_LogReg(X)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Done for %d\n" % size)
    print("######################################################\n\n")

    return (countvec_fiction, countvec_nonfiction, fiction_plus_nonfiction,
            vectorizer, X, results)


# For 50:
(countvec_fiction50, countvec_nonfiction50, fiction_plus_nonfiction50,
 vectorizer50, X50, results_50) = run_logreg_experiment(fic50, nonfic50, 50)

# For 100:
(countvec_fiction100, countvec_nonfiction100, fiction_plus_nonfiction100,
 vectorizer100, X100, results_100) = run_logreg_experiment(fic100, nonfic100, 100)

# For 500:
(countvec_fiction500, countvec_nonfiction500, fiction_plus_nonfiction500,
 vectorizer500, X500, results_500) = run_logreg_experiment(fic500, nonfic500, 500)

# For 1000:
(countvec_fiction1000, countvec_nonfiction1000, fiction_plus_nonfiction1000,
 vectorizer1000, X1000, results_1000) = run_logreg_experiment(fic1000, nonfic1000, 1000)

# For 3000:
(countvec_fiction3000, countvec_nonfiction3000, fiction_plus_nonfiction3000,
 vectorizer3000, X3000, results_3000) = run_logreg_experiment(fic3000, nonfic3000, 3000)



Running for  f1
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:  2.1min finished



Best parameters for f1: LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)
Best score achieved for f1: 0.953743629563
Running the best f1 model for precision..
Running the best f1 model for f1..
Running the best f1 model for accuracy..
Running the best f1 model for recall..
For  f1
{'recall': array([ 0.89254386,  0.98684211,  0.98245614,  0.99561404,  0.95175439,
        0.95175439,  0.94517544,  0.96710526,  0.90989011,  0.87032967]), 'f1': array([ 0.92710706,  0.97932535,  0.97180043,  0.97950378,  0.96124031,
        0.95911602,  0.95248619,  0.95973885,  0.93559322,  0.91139241]), 'precision': array([ 0.96445498,  0.97192225,  0.96137339,  0.96390658,  0.97091723,
        0.96659243,  0.95991091,  0.9524838 ,  0.9627907 ,  0.95652174]), 'accuracy': array([ 0.92982456,  0.97916

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:  3.6min finished



Best parameters for f1: LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)
Best score achieved for f1: 0.957330346503
Running the best f1 model for precision..
Running the best f1 model for f1..
Running the best f1 model for accuracy..
Running the best f1 model for recall..
For  f1
{'recall': array([ 0.89912281,  0.99342105,  0.99122807,  0.99122807,  0.95175439,
        0.95394737,  0.95175439,  0.96710526,  0.91648352,  0.86153846]), 'f1': array([ 0.92865232,  0.99016393,  0.9826087 ,  0.97941495,  0.96017699,
        0.96559378,  0.95594714,  0.96078431,  0.94450736,  0.90531178]), 'precision': array([ 0.96018735,  0.9869281 ,  0.97413793,  0.96788009,  0.96875   ,
        0.97752809,  0.96017699,  0.95454545,  0.97429907,  0.95377129]), 'accuracy': array([ 0.93092105,  0.9

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed: 28.1min finished



Best parameters for f1: LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)
Best score achieved for f1: 0.965762314073
Running the best f1 model for precision..
Running the best f1 model for f1..
Running the best f1 model for accuracy..
Running the best f1 model for recall..
For  f1
{'recall': array([ 0.91008772,  0.99561404,  0.99342105,  0.99780702,  0.95614035,
        0.96052632,  0.94517544,  0.97368421,  0.92967033,  0.87912088]), 'f1': array([ 0.94640821,  0.99018539,  0.9869281 ,  0.98698482,  0.97104677,
        0.96688742,  0.96205357,  0.97368421,  0.95377678,  0.91954023]), 'precision': array([ 0.98574822,  0.98481562,  0.98051948,  0.97639485,  0.98642534,
        0.97333333,  0.97954545,  0.97368421,  0.97916667,  0.96385542]), 'accuracy': array([ 0.94846491,  0.99

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed: 61.7min finished



Best parameters for f1: LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)
Best score achieved for f1: 0.965480676235
Running the best f1 model for precision..
Running the best f1 model for f1..
Running the best f1 model for accuracy..
Running the best f1 model for recall..
For  f1
{'recall': array([ 0.91666667,  0.99122807,  0.99342105,  0.99561404,  0.95394737,
        0.9627193 ,  0.95614035,  0.96491228,  0.92747253,  0.86593407]), 'f1': array([ 0.94570136,  0.98905908,  0.98585419,  0.98803047,  0.97098214,
        0.97555556,  0.96460177,  0.97345133,  0.94938133,  0.91203704]), 'precision': array([ 0.97663551,  0.98689956,  0.97840173,  0.98056156,  0.98863636,
        0.98873874,  0.97321429,  0.98214286,  0.97235023,  0.96332518]), 'accuracy': array([ 0.94736842,  0.989

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed: 267.1min finished



Best parameters for f1: LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Best score achieved for f1: 0.964867161461
Running the best f1 model for precision..
Running the best f1 model for f1..
Running the best f1 model for accuracy..
Running the best f1 model for recall..
For  f1
{'recall': array([ 0.91447368,  0.98903509,  0.99342105,  0.99780702,  0.95833333,
        0.95833333,  0.95175439,  0.96929825,  0.92747253,  0.86593407]), 'f1': array([ 0.94677237,  0.98579235,  0.9826087 ,  0.98803047,  0.96895787,
        0.97009967,  0.96222222,  0.9726776 ,  0.94903737,  0.9228972 ]), 'precision': array([ 0.97423888,  0.97830803,  0.98264642,  0.97639485,  0.97972973,
        0.97544643,  0.97959184,  0.97161572,  0.97882353,  0.98743719]), 'accuracy': array([ 0.94407895,  0