In [2]:
import pickle

def pickle_load(n):
    with open('./features/Model1_length1/fiction_' + str(n) + '.pickle', 'rb') as f:
        fic = pickle.load(f)

    with open('./features/Model1_length1/non_fiction_' + str(n) + '.pickle', 'rb') as f:
        nonfic = pickle.load(f)

    return fic, nonfic

fic50, nonfic50     = pickle_load(50)
fic100, nonfic100   = pickle_load(100)
fic500, nonfic500   = pickle_load(500)
fic1000, nonfic1000 = pickle_load(1000)
fic3000, nonfic3000 = pickle_load(3000)

## Multinomial Naive Bayes:

#### Model 1: All words len(word) > 1

NOTE: cross_val_score and grid_search use stratified k-fold.

In [2]:
'''
Different smoothing parameters in Naive Bayes.
'''

# New better version:
def my_NB(X):
    # Getting labels ready: 1 is for Fiction, 0 is for NonFiction
    labels = [1]*4558 + [0]*4558
    
    tuned_parameters = {'alpha': [1.0, 1e-3, 1e-6, 1e-12, 1e-18]}
    
    metrics = ['precision', 'f1', 'accuracy', 'recall']
    
    # List of 4 dictionaries, where each dictionary represents all the results for that particular best model.
    models = []
    
    for score in metrics:
        model = {}
        nb = MultinomialNB()
        print "Running for ", score
        clf = GridSearchCV(nb, tuned_parameters, cv=10, scoring=score, verbose=1)
        clf.fit(X, labels)
        print "\nBest parameters for " + score + ": " + str(clf.best_estimator_)
        print "Best score achieved for " + score + ": " + str(clf.best_score_)
        best_nb = clf.best_estimator_
        # Now that I have the best parameters for each metric, running SVM for those specific parameters to obtain 
        # all values.
        for s in metrics:
            print "Running the best " + score + " model for " + s + ".."
            model[s] = np.array(cross_val_score(best_nb, X, labels, cv=10, scoring=s))
        
        print "For ", score 
        print model
        print "\n\n"
        models.append(model)

    return models

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.grid_search import GridSearchCV
import numpy as np


'''
For 50:
'''
# Getting it ready for Count Vectorizer:
countvec_fiction50 = []
countvec_nonfiction50 = []

for doc in fic50:
    temp = ' '.join(doc)
    countvec_fiction50.append(temp)
    
for doc in nonfic50:
    temp = ' '.join(doc)
    countvec_nonfiction50.append(temp)

print len(countvec_fiction50) 
fiction_plus_nonfiction50 = countvec_fiction50 + countvec_nonfiction50
    
vectorizer50 = CountVectorizer()
X50 = vectorizer50.fit_transform(fiction_plus_nonfiction50)


'''
For 100:
'''
# Getting it ready for Count Vectorizer:
countvec_fiction100 = []
countvec_nonfiction100 = []

for doc in fic100:
    temp = ' '.join(doc)
    countvec_fiction100.append(temp)
    
for doc in nonfic100:
    temp = ' '.join(doc)
    countvec_nonfiction100.append(temp)
    
fiction_plus_nonfiction100 = countvec_fiction100 + countvec_nonfiction100

vectorizer100 = CountVectorizer()
X100 = vectorizer100.fit_transform(fiction_plus_nonfiction100)


'''
For 500:
'''
# Getting it ready for Count Vectorizer:
countvec_fiction500 = []
countvec_nonfiction500 = []

for doc in fic500:
    temp = ' '.join(doc)
    countvec_fiction500.append(temp)
    
for doc in nonfic500:
    temp = ' '.join(doc)
    countvec_nonfiction500.append(temp)
    
fiction_plus_nonfiction500 = countvec_fiction500 + countvec_nonfiction500
    
vectorizer500 = CountVectorizer()
X500 = vectorizer500.fit_transform(fiction_plus_nonfiction500)


'''
For 1000:
'''
# Getting it ready for Count Vectorizer:
countvec_fiction1000 = []
countvec_nonfiction1000 = []

for doc in fic1000:
    temp = ' '.join(doc)
    countvec_fiction1000.append(temp)
    
for doc in nonfic1000:
    temp = ' '.join(doc)
    countvec_nonfiction1000.append(temp)
    
fiction_plus_nonfiction1000 = countvec_fiction1000 + countvec_nonfiction1000

vectorizer1000 = CountVectorizer()
X1000 = vectorizer1000.fit_transform(fiction_plus_nonfiction1000)

'''
For 3000:
'''
# Getting it ready for Count Vectorizer:
countvec_fiction3000 = []
countvec_nonfiction3000 = []

for doc in fic3000:
    temp = ' '.join(doc)
    countvec_fiction3000.append(temp)
    
for doc in nonfic3000:
    temp = ' '.join(doc)
    countvec_nonfiction3000.append(temp)
    
fiction_plus_nonfiction3000 = countvec_fiction3000 + countvec_nonfiction3000
    
vectorizer3000 = CountVectorizer()
X3000 = vectorizer3000.fit_transform(fiction_plus_nonfiction3000)


# DONE with all cases.

# print "Set I) 50 words:"
# results_50 = my_NB(X50)
# print "\n################################################################\n"
# print "Set II) 100 words:"
# results_100 = my_NB(X100)
# print "\n################################################################\n"
# print "Set III) 500 words:"
# results_500 = my_NB(X500)
# print "\n################################################################\n"
# print "Set IV) 1000 words:"
# results_1000 = my_NB(X1000)
# print "\n################################################################\n"
# print "Set V) 3000 words:"
# results_3000 = my_NB(X3000)
# print "\n################################################################\n"

4558


In [24]:
##### CHECKING THE IMPLEMENTATION, and PRINTING USEFUL VALUES #####

# NOTE: Because SVM was taking a really long time to run, I only optimised f1. So, results_50 has only one element
#       in the list.
'''
This code should repeat the best values obtained for each metric.
'''
# metrics = ['precision', 'f1', 'accuracy', 'recall']
# for score in metrics:
#     print results_50[0][score].mean()

'''
But, we care about f1. It will always be the second element in the list resuls_50, but just confirming.
'''
for d in results_50:
    print d['f1'].mean()
'''
So now that it's confirmed it's the second element in the list, concentrating on the model with the best f1:
'''
for m in results_50[1]:
    print m + ": " + str(results_50[0][m].mean())


0.899606420458
0.901239274724
0.901239274724
0.896187898781
recall: 0.898174763833
f1: 0.899606420458
precision: 0.901848665776
accuracy: 0.900272074417


In [21]:
# Put this in the table: This is the best-f1-model scores.
print "For 50 words"
print results_50[1]
for m in results_50[1]:
    print m + ": " + str(results_50[1][m].mean()) + "    Std-Dev: " + str(results_50[1][m].std())
    
print "\n\nFor 100 words"
print results_100[1]
for m in results_100[1]:
    print m + ": " + str(results_100[1][m].mean()) + "    Std-Dev: " + str(results_100[1][m].std())
    
print "\n\nFor 500 words"
print results_500[1]
for m in results_500[1]:
    print m + ": " + str(results_500[1][m].mean()) + "    Std-Dev: " + str(results_500[1][m].std())
    
print "\n\nFor 1000 words"
print results_1000[1]
for m in results_1000[1]:
    print m + ": " + str(results_1000[1][m].mean()) + "    Std-Dev: " + str(results_1000[1][m].std())
    
print "\n\nFor 3000 words"
print results_3000[1]
for m in results_3000[1]:
    print m + ": " + str(results_3000[1][m].mean()) + "    Std-Dev: " + str(results_3000[1][m].std())

For 50 words
{'recall': array([ 0.88596491,  0.96929825,  0.97368421,  0.94298246,  0.93421053,
        0.93640351,  0.89473684,  0.94078947,  0.87692308,  0.81538462]), 'f1': array([ 0.89977728,  0.94545455,  0.9527897 ,  0.93784079,  0.92008639,
        0.92324324,  0.89571899,  0.91568837,  0.89162011,  0.848     ]), 'precision': array([ 0.91402715,  0.92275574,  0.93277311,  0.93275488,  0.90638298,
        0.91044776,  0.8967033 ,  0.89189189,  0.90681818,  0.88333333]), 'accuracy': array([ 0.90131579,  0.94407895,  0.95175439,  0.9375    ,  0.91885965,
        0.92214912,  0.89583333,  0.91337719,  0.89340659,  0.85384615])}
recall: 0.917037786775    Std-Dev: 0.0461005297322
f1: 0.913021941836    Std-Dev: 0.0292771566183
precision: 0.909788832405    Std-Dev: 0.0156655812364
accuracy: 0.913212116831    Std-Dev: 0.0273862775727


For 100 words
{'recall': array([ 0.88157895,  0.98245614,  0.96929825,  0.97149123,  0.93859649,
        0.94517544,  0.9254386 ,  0.94078947,  0.89450549