In [1]:
import pickle
def pickle_load(n, features_dir='./features/Model2_no_stopwords'):
    """Load the pickled fiction and non-fiction features for one feature set.

    Parameters
    ----------
    n : int or str
        Identifier of the feature set (50, 100, 500, 1000 and 3000 are used in
        this notebook). It is converted with ``str`` and spliced into the file
        names ``fiction_<n>.pickle`` and ``non_fiction_<n>.pickle``.
    features_dir : str, optional
        Directory that holds the pickle files. Defaults to the location that
        was previously hard-coded, so existing one-argument calls are unchanged.

    Returns
    -------
    tuple
        ``(fic, nonfic)``: the objects unpickled from the fiction file and the
        non-fiction file, in that order.

    Raises
    ------
    IOError / OSError
        If either file cannot be opened.
    """
    import os  # local import keeps this cell self-contained

    loaded = []
    for prefix in ('fiction_', 'non_fiction_'):
        path = os.path.join(features_dir, prefix + str(n) + '.pickle')
        with open(path, 'rb') as f:
            # NOTE(review): unpickling can execute arbitrary code; only point
            # this at feature files produced by this project.
            loaded.append(pickle.load(f))

    fic, nonfic = loaded
    return fic, nonfic

# Load every feature set used below. pickle_load returns a
# (fiction, non-fiction) pair, and the sets are read in increasing order.
(
    (fic50, nonfic50),
    (fic100, nonfic100),
    (fic500, nonfic500),
    (fic1000, nonfic1000),
    (fic3000, nonfic3000),
) = map(pickle_load, (50, 100, 500, 1000, 3000))

In [2]:
'''
Different smoothing parameters in Naive Bayes.
'''

# New better version:
def my_NB(X, labels=None):
    """Tune the MultinomialNB smoothing parameter and cross-validate the winners.

    For each metric in ``['precision', 'f1', 'accuracy', 'recall']`` a 10-fold
    ``GridSearchCV`` selects the ``alpha`` that maximises that metric. The
    selected estimator is then scored again with 10-fold ``cross_val_score``
    on all four metrics, so every selected model has a full set of scores.

    Parameters
    ----------
    X : feature matrix
        One row per document, passed straight to scikit-learn. When ``labels``
        is omitted the rows must be 4558 fiction documents followed by 4558
        non-fiction documents.
    labels : sequence, optional
        Class label for each row of ``X`` (1 = fiction, 0 = non-fiction).
        Defaults to ``[1]*4558 + [0]*4558``, the corpus layout used in this
        notebook.

    Returns
    -------
    list of dict
        One dict per optimised metric, in the order precision, f1, accuracy,
        recall. Each dict maps a metric name to a numpy array holding the
        cross-validation scores of the model selected for that metric.
    """
    if labels is None:
        # Getting labels ready: 1 is for Fiction, 0 is for NonFiction
        labels = [1] * 4558 + [0] * 4558

    # NOTE(review): recent scikit-learn releases appear to clamp very small
    # alpha values with a warning, which would make 1e-12 and 1e-18
    # equivalent -- confirm against the installed version.
    tuned_parameters = {'alpha': [1.0, 1e-3, 1e-6, 1e-12, 1e-18]}

    metrics = ['precision', 'f1', 'accuracy', 'recall']

    # One dictionary per optimised metric, holding all scores of that best model.
    models = []

    for score in metrics:
        model = {}
        nb = MultinomialNB()
        print("Running for  " + score)
        clf = GridSearchCV(nb, tuned_parameters, cv=10, scoring=score, verbose=1)
        clf.fit(X, labels)
        print("\nBest parameters for " + score + ": " + str(clf.best_estimator_))
        print("Best score achieved for " + score + ": " + str(clf.best_score_))
        best_nb = clf.best_estimator_
        # With the best alpha for this metric known, re-run Naive Bayes with
        # it to obtain the cross-validated values of every metric.
        for s in metrics:
            print("Running the best " + score + " model for " + s + "..")
            model[s] = np.array(cross_val_score(best_nb, X, labels, cv=10, scoring=s))

        print("For  " + score)
        print(model)
        print("\n\n")
        models.append(model)

    return models

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.grid_search import GridSearchCV
import numpy as np


def _build_count_matrix(fiction_docs, nonfiction_docs):
    """Turn tokenised documents into a bag-of-words count matrix.

    Every document (an iterable of token strings) is joined with single
    spaces so that CountVectorizer can consume it. Fiction documents are
    placed before non-fiction documents, and a fresh CountVectorizer is
    fitted on the combined list.

    Returns
    -------
    tuple
        ``(fiction_strings, nonfiction_strings, combined_strings,
        vectorizer, X)`` where ``X`` is the result of
        ``vectorizer.fit_transform(combined_strings)``.
    """
    fiction_strings = [' '.join(tokens) for tokens in fiction_docs]
    nonfiction_strings = [' '.join(tokens) for tokens in nonfiction_docs]
    combined_strings = fiction_strings + nonfiction_strings

    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(combined_strings)
    return fiction_strings, nonfiction_strings, combined_strings, vectorizer, counts


# Row order matters: fiction rows come first, which is what the default
# labels in my_NB ([1]*4558 + [0]*4558) assume.

# For 50:
(countvec_fiction50, countvec_nonfiction50, fiction_plus_nonfiction50,
 vectorizer50, X50) = _build_count_matrix(fic50, nonfic50)

# For 100:
(countvec_fiction100, countvec_nonfiction100, fiction_plus_nonfiction100,
 vectorizer100, X100) = _build_count_matrix(fic100, nonfic100)

# For 500:
(countvec_fiction500, countvec_nonfiction500, fiction_plus_nonfiction500,
 vectorizer500, X500) = _build_count_matrix(fic500, nonfic500)

# For 1000:
(countvec_fiction1000, countvec_nonfiction1000, fiction_plus_nonfiction1000,
 vectorizer1000, X1000) = _build_count_matrix(fic1000, nonfic1000)

# For 3000:
(countvec_fiction3000, countvec_nonfiction3000, fiction_plus_nonfiction3000,
 vectorizer3000, X3000) = _build_count_matrix(fic3000, nonfic3000)


# DONE with all cases.

# Run the alpha grid search on each count matrix in turn. The results are
# kept under separate names because later cells read them individually.
print("Set I) 50 words:")
results_50 = my_NB(X50)
print("\n################################################################\n")

print("Set II) 100 words:")
results_100 = my_NB(X100)
print("\n################################################################\n")

print("Set III) 500 words:")
results_500 = my_NB(X500)
print("\n################################################################\n")

print("Set IV) 1000 words:")
results_1000 = my_NB(X1000)
print("\n################################################################\n")

print("Set V) 3000 words:")
results_3000 = my_NB(X3000)
print("\n################################################################\n")



Set I) 50 words:
Running for  precision
Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.9s finished



Best parameters for precision: MultinomialNB(alpha=1e-18, class_prior=None, fit_prior=True)
Best score achieved for precision: 0.957937540059
Running the best precision model for precision..
Running the best precision model for f1..
Running the best precision model for accuracy..
Running the best precision model for recall..
For  precision
{'recall': array([ 0.80921053,  0.91008772,  0.93201754,  0.85745614,  0.89692982,
        0.91447368,  0.89254386,  0.91008772,  0.85494505,  0.78241758]), 'f1': array([ 0.88066826,  0.93573844,  0.94130676,  0.90404624,  0.92117117,
        0.93707865,  0.9218573 ,  0.93258427,  0.91100703,  0.8588661 ]), 'precision': array([ 0.96596859,  0.96287703,  0.950783  ,  0.95599022,  0.94675926,
        0.96082949,  0.95316159,  0.9562212 ,  0.97493734,  0.95187166]), 'accuracy': array([ 0.89035088,  0.9375    ,  0.94188596,  0.90899123,  0.92324561,
        0.93859649,  0.92434211,  0.93421053,  0.91648352,  0.87142857])}



Running for  f1
Fitting 10 f

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.7s finished



Best parameters for f1: MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)
Best score achieved for f1: 0.935659929745
Running the best f1 model for precision..
Running the best f1 model for f1..
Running the best f1 model for accuracy..
Running the best f1 model for recall..
For  f1
{'recall': array([ 0.875     ,  0.97368421,  0.97368421,  0.95833333,  0.94078947,
        0.94298246,  0.92105263,  0.94078947,  0.88351648,  0.82197802]), 'f1': array([ 0.912     ,  0.96626768,  0.96208017,  0.95310796,  0.94182217,
        0.94818082,  0.93541203,  0.93975904,  0.91780822,  0.88      ]), 'precision': array([ 0.9522673 ,  0.95896328,  0.95074946,  0.94793926,  0.94285714,
        0.95343681,  0.95022624,  0.93873085,  0.95486936,  0.94683544]), 'accuracy': array([ 0.91557018,  0.96600877,  0.96162281,  0.95285088,  0.94188596,
        0.94846491,  0.93640351,  0.93969298,  0.92087912,  0.88791209])}



Running for  accuracy
Fitting 10 folds for each of 5 candidates, totalling 50

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.5s finished



Best parameters for accuracy: MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)
Best score achieved for accuracy: 0.937143483984
Running the best accuracy model for precision..
Running the best accuracy model for f1..
Running the best accuracy model for accuracy..
Running the best accuracy model for recall..
For  accuracy
{'recall': array([ 0.875     ,  0.97368421,  0.97368421,  0.95833333,  0.94078947,
        0.94298246,  0.92105263,  0.94078947,  0.88351648,  0.82197802]), 'f1': array([ 0.912     ,  0.96626768,  0.96208017,  0.95310796,  0.94182217,
        0.94818082,  0.93541203,  0.93975904,  0.91780822,  0.88      ]), 'precision': array([ 0.9522673 ,  0.95896328,  0.95074946,  0.94793926,  0.94285714,
        0.95343681,  0.95022624,  0.93873085,  0.95486936,  0.94683544]), 'accuracy': array([ 0.91557018,  0.96600877,  0.96162281,  0.95285088,  0.94188596,
        0.94846491,  0.93640351,  0.93969298,  0.92087912,  0.88791209])}



Running for  recall
Fitting 10 fold

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.1s finished



Best parameters for recall: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Best score achieved for recall: 0.935717419921
Running the best recall model for precision..
Running the best recall model for f1..
Running the best recall model for accuracy..
Running the best recall model for recall..
For  recall
{'recall': array([ 0.90570175,  0.98903509,  0.97807018,  0.97149123,  0.9495614 ,
        0.94298246,  0.92324561,  0.9495614 ,  0.8989011 ,  0.84835165]), 'f1': array([ 0.91371681,  0.96264674,  0.94591729,  0.94557097,  0.93318966,
        0.93174431,  0.92324561,  0.92918455,  0.90989989,  0.88027366]), 'precision': array([ 0.921875  ,  0.93762994,  0.91581109,  0.92099792,  0.91737288,
        0.92077088,  0.92324561,  0.90966387,  0.92117117,  0.91469194]), 'accuracy': array([ 0.91447368,  0.96162281,  0.94407895,  0.94407895,  0.93201754,
        0.93092105,  0.92324561,  0.92763158,  0.91098901,  0.88461538])}




##################################################

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.6s finished



Best parameters for precision: MultinomialNB(alpha=1e-18, class_prior=None, fit_prior=True)
Best score achieved for precision: 0.965461237398
Running the best precision model for precision..
Running the best precision model for f1..
Running the best precision model for accuracy..
Running the best precision model for recall..
For  precision
{'recall': array([ 0.79166667,  0.90570175,  0.92105263,  0.89254386,  0.89912281,
        0.89912281,  0.87280702,  0.91008772,  0.81978022,  0.74505495]), 'f1': array([ 0.86883273,  0.93650794,  0.9406495 ,  0.92605233,  0.92760181,
        0.9350057 ,  0.91810842,  0.9315376 ,  0.89127838,  0.84223602]), 'precision': array([ 0.96266667,  0.96948357,  0.9610984 ,  0.96217494,  0.95794393,
        0.97387173,  0.96836983,  0.95402299,  0.97643979,  0.96857143]), 'accuracy': array([ 0.88048246,  0.93859649,  0.94188596,  0.92872807,  0.92982456,
        0.9375    ,  0.92214912,  0.93311404,  0.9       ,  0.86043956])}



Running for  f1
Fitting 10 f

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.4s finished



Best parameters for f1: MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)
Best score achieved for f1: 0.932576481618
Running the best f1 model for precision..
Running the best f1 model for f1..
Running the best f1 model for accuracy..
Running the best f1 model for recall..
For  f1
{'recall': array([ 0.85745614,  0.98026316,  0.97149123,  0.97149123,  0.93201754,
        0.93421053,  0.90350877,  0.92763158,  0.87252747,  0.78461538]), 'f1': array([ 0.90300231,  0.97279652,  0.96514161,  0.96619411,  0.93509351,
        0.94352159,  0.92792793,  0.93274531,  0.91474654,  0.86440678]), 'precision': array([ 0.95365854,  0.96544276,  0.95887446,  0.96095445,  0.93818985,
        0.95302013,  0.9537037 ,  0.93791574,  0.96125908,  0.96226415]), 'accuracy': array([ 0.90789474,  0.97258772,  0.96491228,  0.96600877,  0.93530702,
        0.94407895,  0.92982456,  0.93311404,  0.91868132,  0.87692308])}



Running for  accuracy
Fitting 10 folds for each of 5 candidates, totalling 50

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.7s finished



Best parameters for accuracy: MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)
Best score achieved for accuracy: 0.934949539272
Running the best accuracy model for precision..
Running the best accuracy model for f1..
Running the best accuracy model for accuracy..
Running the best accuracy model for recall..
For  accuracy
{'recall': array([ 0.85745614,  0.98026316,  0.97149123,  0.97149123,  0.93201754,
        0.93421053,  0.90350877,  0.92763158,  0.87252747,  0.78461538]), 'f1': array([ 0.90300231,  0.97279652,  0.96514161,  0.96619411,  0.93509351,
        0.94352159,  0.92792793,  0.93274531,  0.91474654,  0.86440678]), 'precision': array([ 0.95365854,  0.96544276,  0.95887446,  0.96095445,  0.93818985,
        0.95302013,  0.9537037 ,  0.93791574,  0.96125908,  0.96226415]), 'accuracy': array([ 0.90789474,  0.97258772,  0.96491228,  0.96600877,  0.93530702,
        0.94407895,  0.92982456,  0.93311404,  0.91868132,  0.87692308])}



Running for  recall
Fitting 10 fold

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.9s finished



Best parameters for recall: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Best score achieved for recall: 0.925625274243
Running the best recall model for precision..
Running the best recall model for f1..
Running the best recall model for accuracy..
Running the best recall model for recall..
For  recall
{'recall': array([ 0.88596491,  0.98464912,  0.97368421,  0.97587719,  0.93640351,
        0.9495614 ,  0.90789474,  0.93859649,  0.8989011 ,  0.8043956 ]), 'f1': array([ 0.90178571,  0.95531915,  0.94468085,  0.94580234,  0.92025862,
        0.93520518,  0.91089109,  0.91452991,  0.90687361,  0.85814771]), 'precision': array([ 0.91818182,  0.92768595,  0.91735537,  0.91752577,  0.90466102,
        0.9212766 ,  0.91390728,  0.89166667,  0.91498881,  0.91959799]), 'accuracy': array([ 0.90350877,  0.95394737,  0.94298246,  0.94407895,  0.91885965,
        0.93421053,  0.91118421,  0.9122807 ,  0.90769231,  0.86703297])}




##################################################

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   10.8s finished



Best parameters for precision: MultinomialNB(alpha=1e-18, class_prior=None, fit_prior=True)
Best score achieved for precision: 0.97620558378
Running the best precision model for precision..
Running the best precision model for f1..
Running the best precision model for accuracy..
Running the best precision model for recall..
For  precision
{'recall': array([ 0.80482456,  0.92982456,  0.95614035,  0.94298246,  0.9122807 ,
        0.90570175,  0.87938596,  0.9122807 ,  0.78461538,  0.7010989 ]), 'f1': array([ 0.88327316,  0.94854586,  0.96996663,  0.96196868,  0.94117647,
        0.94077449,  0.92396313,  0.94011299,  0.87392901,  0.81377551]), 'precision': array([ 0.97866667,  0.96803653,  0.98419865,  0.98173516,  0.97196262,
        0.97867299,  0.97330097,  0.96969697,  0.98618785,  0.96960486]), 'accuracy': array([ 0.89364035,  0.9495614 ,  0.97039474,  0.9627193 ,  0.94298246,
        0.94298246,  0.92763158,  0.94188596,  0.88681319,  0.83956044])}



Running for  f1
Fitting 10 fo

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    9.0s finished



Best parameters for f1: MultinomialNB(alpha=1e-06, class_prior=None, fit_prior=True)
Best score achieved for f1: 0.924228663221
Running the best f1 model for precision..
Running the best f1 model for f1..
Running the best f1 model for accuracy..
Running the best f1 model for recall..
For  f1
{'recall': array([ 0.83114035,  0.97368421,  0.97149123,  0.96491228,  0.91885965,
        0.92324561,  0.89473684,  0.91447368,  0.8043956 ,  0.73846154]), 'f1': array([ 0.8907168 ,  0.96416938,  0.96619411,  0.96385542,  0.93736018,
        0.94288914,  0.92517007,  0.93602694,  0.8787515 ,  0.83686177]), 'precision': array([ 0.95949367,  0.95483871,  0.96095445,  0.96280088,  0.956621  ,
        0.96338673,  0.95774648,  0.95862069,  0.96825397,  0.96551724]), 'accuracy': array([ 0.89802632,  0.96381579,  0.96600877,  0.96381579,  0.93859649,
        0.94407895,  0.92763158,  0.9375    ,  0.88901099,  0.85604396])}



Running for  accuracy
Fitting 10 folds for each of 5 candidates, totalling 50

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    8.5s finished



Best parameters for accuracy: MultinomialNB(alpha=1e-12, class_prior=None, fit_prior=True)
Best score achieved for accuracy: 0.928916191312
Running the best accuracy model for precision..
Running the best accuracy model for f1..
Running the best accuracy model for accuracy..
Running the best accuracy model for recall..
For  accuracy
{'recall': array([ 0.81798246,  0.95175439,  0.9627193 ,  0.95614035,  0.91666667,
        0.90789474,  0.88157895,  0.91447368,  0.79340659,  0.71868132]), 'f1': array([ 0.88915375,  0.96124031,  0.97016575,  0.96566999,  0.94356659,
        0.93771234,  0.92413793,  0.94130926,  0.8783455 ,  0.82471627]), 'precision': array([ 0.97389034,  0.97091723,  0.97772829,  0.9753915 ,  0.97209302,
        0.96955504,  0.97101449,  0.96976744,  0.98365123,  0.96745562]), 'accuracy': array([ 0.89802632,  0.96162281,  0.97039474,  0.96600877,  0.94517544,
        0.93969298,  0.92763158,  0.94298246,  0.89010989,  0.84725275])}



Running for  recall
Fitting 10 fold

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    8.0s finished



Best parameters for recall: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Best score achieved for recall: 0.914436156209
Running the best recall model for precision..
Running the best recall model for f1..
Running the best recall model for accuracy..
Running the best recall model for recall..
For  recall
{'recall': array([ 0.87061404,  0.98684211,  0.97368421,  0.98026316,  0.9254386 ,
        0.93640351,  0.90131579,  0.93201754,  0.86593407,  0.77142857]), 'f1': array([ 0.89013453,  0.94043887,  0.93868922,  0.94603175,  0.91046386,
        0.92424242,  0.90131579,  0.91397849,  0.88738739,  0.82783019]), 'precision': array([ 0.91055046,  0.89820359,  0.90612245,  0.91411043,  0.89596603,
        0.91239316,  0.90131579,  0.89662447,  0.90993072,  0.89312977]), 'accuracy': array([ 0.89254386,  0.9375    ,  0.93640351,  0.94407895,  0.90899123,
        0.92324561,  0.90131579,  0.9122807 ,  0.89010989,  0.83956044])}




##################################################

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   23.0s finished



Best parameters for precision: MultinomialNB(alpha=1e-18, class_prior=None, fit_prior=True)
Best score achieved for precision: 0.977672465986
Running the best precision model for precision..
Running the best precision model for f1..
Running the best precision model for accuracy..
Running the best precision model for recall..
For  precision
{'recall': array([ 0.76754386,  0.9122807 ,  0.94736842,  0.91885965,  0.89912281,
        0.88157895,  0.85307018,  0.91008772,  0.79120879,  0.69010989]), 'f1': array([ 0.85995086,  0.94331066,  0.96428571,  0.94689266,  0.9382151 ,
        0.92626728,  0.91207503,  0.94104308,  0.87484812,  0.80823681]), 'precision': array([ 0.97765363,  0.97652582,  0.98181818,  0.97668998,  0.98086124,
        0.97572816,  0.97984887,  0.9741784 ,  0.97826087,  0.97515528]), 'accuracy': array([ 0.875     ,  0.94517544,  0.96491228,  0.94846491,  0.94078947,
        0.92982456,  0.91776316,  0.94298246,  0.88681319,  0.83626374])}



Running for  f1
Fitting 10 f

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   23.3s finished



Best parameters for f1: MultinomialNB(alpha=1e-06, class_prior=None, fit_prior=True)
Best score achieved for f1: 0.920308966329
Running the best f1 model for precision..
Running the best f1 model for f1..
Running the best f1 model for accuracy..
Running the best f1 model for recall..
For  f1
{'recall': array([ 0.82236842,  0.95175439,  0.97368421,  0.9627193 ,  0.91447368,
        0.89912281,  0.88377193,  0.91666667,  0.80659341,  0.72527473]), 'f1': array([ 0.88443396,  0.95911602,  0.9704918 ,  0.9627193 ,  0.93813273,
        0.9307605 ,  0.91695108,  0.93721973,  0.87589499,  0.82706767]), 'precision': array([ 0.95663265,  0.96659243,  0.96732026,  0.9627193 ,  0.9630485 ,
        0.96470588,  0.95271868,  0.9587156 ,  0.95822454,  0.96209913]), 'accuracy': array([ 0.89254386,  0.95942982,  0.97039474,  0.9627193 ,  0.93969298,
        0.93311404,  0.91995614,  0.93859649,  0.88571429,  0.84835165])}



Running for  accuracy
Fitting 10 folds for each of 5 candidates, totalling 50

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   19.8s finished



Best parameters for accuracy: MultinomialNB(alpha=1e-06, class_prior=None, fit_prior=True)
Best score achieved for accuracy: 0.925076788065
Running the best accuracy model for precision..
Running the best accuracy model for f1..
Running the best accuracy model for accuracy..
Running the best accuracy model for recall..
For  accuracy
{'recall': array([ 0.82236842,  0.95175439,  0.97368421,  0.9627193 ,  0.91447368,
        0.89912281,  0.88377193,  0.91666667,  0.80659341,  0.72527473]), 'f1': array([ 0.88443396,  0.95911602,  0.9704918 ,  0.9627193 ,  0.93813273,
        0.9307605 ,  0.91695108,  0.93721973,  0.87589499,  0.82706767]), 'precision': array([ 0.95663265,  0.96659243,  0.96732026,  0.9627193 ,  0.9630485 ,
        0.96470588,  0.95271868,  0.9587156 ,  0.95822454,  0.96209913]), 'accuracy': array([ 0.89254386,  0.95942982,  0.97039474,  0.9627193 ,  0.93969298,
        0.93311404,  0.91995614,  0.93859649,  0.88571429,  0.84835165])}



Running for  recall
Fitting 10 fold

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   18.5s finished



Best parameters for recall: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Best score achieved for recall: 0.915971917508
Running the best recall model for precision..
Running the best recall model for f1..
Running the best recall model for accuracy..
Running the best recall model for recall..
For  recall
{'recall': array([ 0.87719298,  0.98464912,  0.97587719,  0.97587719,  0.92982456,
        0.93421053,  0.91008772,  0.92324561,  0.86373626,  0.78461538]), 'f1': array([ 0.89485459,  0.94031414,  0.93980993,  0.94279661,  0.91675676,
        0.92307692,  0.90809628,  0.90732759,  0.88413948,  0.83802817]), 'precision': array([ 0.91324201,  0.8997996 ,  0.90631365,  0.91188525,  0.90405117,
        0.91220557,  0.90611354,  0.89194915,  0.90552995,  0.89924433]), 'accuracy': array([ 0.89692982,  0.9375    ,  0.9375    ,  0.94078947,  0.91557018,
        0.92214912,  0.90789474,  0.90570175,  0.88681319,  0.84835165])}




##################################################

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.7min finished



Best parameters for precision: MultinomialNB(alpha=1e-18, class_prior=None, fit_prior=True)
Best score achieved for precision: 0.973596442523
Running the best precision model for precision..
Running the best precision model for f1..
Running the best precision model for accuracy..
Running the best precision model for recall..
For  precision
{'recall': array([ 0.74780702,  0.88815789,  0.93421053,  0.90789474,  0.89692982,
        0.88377193,  0.84210526,  0.88815789,  0.78241758,  0.70549451]), 'f1': array([ 0.84720497,  0.93103448,  0.95195531,  0.94305239,  0.93485714,
        0.92537313,  0.90140845,  0.92360319,  0.87148103,  0.81575604]), 'precision': array([ 0.97707736,  0.97826087,  0.97038724,  0.98104265,  0.97613365,
        0.97108434,  0.96969697,  0.96199525,  0.98342541,  0.96686747]), 'accuracy': array([ 0.86513158,  0.93421053,  0.95285088,  0.94517544,  0.9375    ,
        0.92872807,  0.90789474,  0.92653509,  0.88461538,  0.84065934])}



Running for  f1
Fitting 10 f

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.2min finished



Best parameters for f1: MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)
Best score achieved for f1: 0.922501035865
Running the best f1 model for precision..
Running the best f1 model for f1..
Running the best f1 model for accuracy..
Running the best f1 model for recall..
For  f1
{'recall': array([ 0.83333333,  0.96929825,  0.97807018,  0.9627193 ,  0.92324561,
        0.92105263,  0.88815789,  0.92105263,  0.85054945,  0.75384615]), 'f1': array([ 0.88681447,  0.96296296,  0.96641387,  0.95747001,  0.94394619,
        0.93229745,  0.91525424,  0.92409241,  0.89791183,  0.83760684]), 'precision': array([ 0.94763092,  0.95670996,  0.95503212,  0.95227766,  0.96559633,
        0.94382022,  0.94405594,  0.92715232,  0.95085995,  0.94230769]), 'accuracy': array([ 0.89364035,  0.9627193 ,  0.96600877,  0.95723684,  0.94517544,
        0.93311404,  0.91776316,  0.92434211,  0.9032967 ,  0.85384615])}



Running for  accuracy
Fitting 10 folds for each of 5 candidates, totalling 50

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.6min finished



Best parameters for accuracy: MultinomialNB(alpha=1e-06, class_prior=None, fit_prior=True)
Best score achieved for accuracy: 0.926722246599
Running the best accuracy model for precision..
Running the best accuracy model for f1..
Running the best accuracy model for accuracy..
Running the best accuracy model for recall..
For  accuracy
{'recall': array([ 0.81578947,  0.95175439,  0.97368421,  0.95614035,  0.91666667,
        0.91666667,  0.88377193,  0.91666667,  0.82857143,  0.73846154]), 'f1': array([ 0.88151659,  0.96230599,  0.96732026,  0.96247241,  0.94250282,
        0.93512304,  0.91799544,  0.93095768,  0.89125296,  0.83271375]), 'precision': array([ 0.95876289,  0.97309417,  0.96103896,  0.96888889,  0.96983759,
        0.9543379 ,  0.9549763 ,  0.94570136,  0.96419437,  0.95454545]), 'accuracy': array([ 0.89035088,  0.9627193 ,  0.96710526,  0.9627193 ,  0.94407895,
        0.93640351,  0.92105263,  0.93201754,  0.8989011 ,  0.85164835])}



Running for  recall
Fitting 10 fold

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.6min finished



Best parameters for recall: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Best score achieved for recall: 0.930671347082
Running the best recall model for precision..
Running the best recall model for f1..
Running the best recall model for accuracy..
Running the best recall model for recall..
For  recall
{'recall': array([ 0.88377193,  0.98026316,  0.97807018,  0.98464912,  0.93859649,
        0.94517544,  0.92982456,  0.93201754,  0.90989011,  0.82417582]), 'f1': array([ 0.89258029,  0.93416928,  0.93501048,  0.94426919,  0.91747053,
        0.92094017,  0.91774892,  0.90811966,  0.90491803,  0.86805556]), 'precision': array([ 0.901566  ,  0.89221557,  0.89558233,  0.90707071,  0.89727463,
        0.89791667,  0.90598291,  0.88541667,  0.9       ,  0.91687042]), 'accuracy': array([ 0.89364035,  0.93092105,  0.93201754,  0.94188596,  0.91557018,
        0.91885965,  0.91666667,  0.90570175,  0.9043956 ,  0.87472527])}




##################################################

In [4]:
##### CHECKING THE IMPLEMENTATION, and PRINTING USEFUL VALUES #####

# my_NB returns one result dict per optimised metric, in the order
# ['precision', 'f1', 'accuracy', 'recall'], so results_50 holds four entries.
#
# We care about f1. Its model should always be the second entry
# (results_50[1]); print the mean f1 of every entry to confirm that the
# second one is not beaten by any other.
for model_scores in results_50:
    print(model_scores['f1'].mean())

# Once confirmed, the following cells concentrate on index 1, the model
# with the best f1.

0.914432422665
0.935643808771
0.935643808771
0.927538949509


"\nSo now that it's confirmed it's the second element in the list, concentrating on the model with the best f1:\n"

In [5]:
# Put this in the table: This is the best-f1-model scores.
F1_MODEL_INDEX = 1  # position of 'f1' in the metrics list used by my_NB

results_by_size = [
    (50, results_50),
    (100, results_100),
    (500, results_500),
    (1000, results_1000),
    (3000, results_3000),
]

for position, (n_words, results) in enumerate(results_by_size):
    best_f1_model = results[F1_MODEL_INDEX]

    # Every section after the first is preceded by two blank lines.
    print(("\n\n" if position else "") + "For " + str(n_words) + " words")
    print(best_f1_model)
    for metric in best_f1_model:
        fold_scores = best_f1_model[metric]
        print(metric + ": " + str(fold_scores.mean())
              + "    Std-Dev: " + str(fold_scores.std()))

For 50 words
{'recall': array([ 0.875     ,  0.97368421,  0.97368421,  0.95833333,  0.94078947,
        0.94298246,  0.92105263,  0.94078947,  0.88351648,  0.82197802]), 'f1': array([ 0.912     ,  0.96626768,  0.96208017,  0.95310796,  0.94182217,
        0.94818082,  0.93541203,  0.93975904,  0.91780822,  0.88      ]), 'precision': array([ 0.9522673 ,  0.95896328,  0.95074946,  0.94793926,  0.94285714,
        0.95343681,  0.95022624,  0.93873085,  0.95486936,  0.94683544]), 'accuracy': array([ 0.91557018,  0.96600877,  0.96162281,  0.95285088,  0.94188596,
        0.94846491,  0.93640351,  0.93969298,  0.92087912,  0.88791209])}
recall: 0.923181029497    Std-Dev: 0.0463576366778
f1: 0.935643808771    Std-Dev: 0.0247536028973
precision: 0.949687516258    Std-Dev: 0.00559388020523
accuracy: 0.937129120879    Std-Dev: 0.0223395235415


For 100 words
{'recall': array([ 0.85745614,  0.98026316,  0.97149123,  0.97149123,  0.93201754,
        0.93421053,  0.90350877,  0.92763158,  0.8725274