In [3]:
import pickle

def pickle_load(n, base_dir='./features/Model2_no_stopwords'):
    '''
    Load the pickled fiction and non-fiction objects stored under suffix n.

    Reads <base_dir>/fiction_<n>.pickle and <base_dir>/non_fiction_<n>.pickle.

    Parameters:
        n        -- value inserted into both file names via str(n)
                    (the notebook passes 50, 100, 500, 1000 and 3000).
        base_dir -- directory holding the pickle files; defaults to the
                    directory this notebook has always read from.

    Returns:
        (fic, nonfic) -- the two unpickled objects, fiction first.

    Raises:
        IOError / OSError if either file cannot be opened.
    '''
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file, so only point this at feature files you produced yourself.
    def _load(prefix):
        # Same path layout as before: <dir>/<prefix>_<n>.pickle
        path = base_dir + '/' + prefix + '_' + str(n) + '.pickle'
        with open(path, 'rb') as f:
            return pickle.load(f)

    fic = _load('fiction')
    nonfic = _load('non_fiction')
    return fic, nonfic

# Load every stored variant in one pass; pickle_load returns a
# (fiction, non_fiction) pair for each suffix, in the order listed here.
((fic50, nonfic50),
 (fic100, nonfic100),
 (fic500, nonfic500),
 (fic1000, nonfic1000),
 (fic3000, nonfic3000)) = [pickle_load(suffix) for suffix in (50, 100, 500, 1000, 3000)]

In [4]:
def my_SVM(X, n_fiction=4558, n_nonfiction=4558):
    '''
    Different hyper-parameters in SVM.

    Grid-searches SVC hyper-parameters on X (10-fold CV, optimising each
    entry of `metrics`), then cross-validates the best estimator once per
    scorer in `m` and collects the per-fold scores.

    SVC, GridSearchCV, cross_val_score and np are looked up as globals when
    the function is called, so the notebook's import cell must have run.

    Parameters:
        X            -- feature matrix; its rows must be all fiction samples
                        followed by all non-fiction samples.
        n_fiction    -- number of leading rows labelled 1 (Fiction).
        n_nonfiction -- number of trailing rows labelled 0 (NonFiction).
                        The defaults reproduce the previously hard-coded
                        4558 + 4558 split.

    Returns:
        A list with one dictionary per entry in `metrics` (currently only
        'f1'). Each dictionary maps 'precision', 'f1', 'accuracy' and
        'recall' to a numpy array of the scores returned by cross_val_score.

    Raises:
        ValueError if n_fiction + n_nonfiction does not match the number of
        rows in X.
    '''
    # Fail early with a clear message instead of deep inside the grid search.
    n_samples = X.shape[0] if hasattr(X, 'shape') else len(X)
    if n_fiction + n_nonfiction != n_samples:
        raise ValueError(
            "X has " + str(n_samples) + " rows but labels were requested for "
            + str(n_fiction) + " fiction + " + str(n_nonfiction)
            + " non-fiction samples")

    # Getting labels ready: 1 is for Fiction, 0 is for NonFiction
    labels = [1] * n_fiction + [0] * n_nonfiction

    tuned_parameters = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                        {'C': [1, 10, 100, 1000], 'gamma': ['auto', 0.001, 0.0001], 'kernel': ['rbf']}]

    # Metrics the grid search optimises, and the scorers reported for each best model.
    metrics = ['f1']
    m = ['precision', 'f1', 'accuracy', 'recall']

    # One dictionary per entry in `metrics`, holding all the results for that particular best model.
    models = []

    for score in metrics:
        model = {}
        svc = SVC()
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print("Running for  " + score)
        clf = GridSearchCV(svc, tuned_parameters, cv=10, scoring=score, verbose=2)
        clf.fit(X, labels)
        print("\nBest parameters for " + score + ": " + str(clf.best_estimator_))
        print("Best score achieved for " + score + ": " + str(clf.best_score_))
        best_svc = clf.best_estimator_
        # Now that I have the best parameters for each metric, running SVM for those specific parameters to obtain
        # all values.
        for s in m:
            print("Running the best " + score + " model for " + s + "..")
            model[s] = np.array(cross_val_score(best_svc, X, labels, cv=10, scoring=s))

        print("For  " + score)
        print(model)
        print("\n\n")
        models.append(model)

    return models

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.grid_search import GridSearchCV
import numpy as np
from sklearn.svm import SVC

def _vectorize_and_run(fic_docs, nonfic_docs, n):
    '''
    Build bag-of-words features for one fiction/non-fiction pair and run my_SVM.

    Every document (an iterable of string tokens) is joined with single
    spaces so CountVectorizer can consume it. Fiction documents are placed
    first and non-fiction documents second.

    Parameters:
        fic_docs    -- iterable of fiction documents.
        nonfic_docs -- iterable of non-fiction documents.
        n           -- only used in the "Done for <n>" progress message.

    Returns:
        (countvec_fiction, countvec_nonfiction, fiction_plus_nonfiction,
         vectorizer, X, results) -- the joined documents, the fitted
        CountVectorizer, the matrix it produced and my_SVM's return value.
    '''
    # Getting it ready for Count Vectorizer:
    countvec_fiction = [' '.join(doc) for doc in fic_docs]
    countvec_nonfiction = [' '.join(doc) for doc in nonfic_docs]
    fiction_plus_nonfiction = countvec_fiction + countvec_nonfiction

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(fiction_plus_nonfiction)

    results = my_SVM(X)
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("Done for " + str(n) + "\n")
    print("######################################################\n\n")
    return (countvec_fiction, countvec_nonfiction, fiction_plus_nonfiction,
            vectorizer, X, results)


# For 50:
(countvec_fiction50, countvec_nonfiction50, fiction_plus_nonfiction50,
 vectorizer50, X50, results_50) = _vectorize_and_run(fic50, nonfic50, 50)

# For 100:
(countvec_fiction100, countvec_nonfiction100, fiction_plus_nonfiction100,
 vectorizer100, X100, results_100) = _vectorize_and_run(fic100, nonfic100, 100)

# For 500:
(countvec_fiction500, countvec_nonfiction500, fiction_plus_nonfiction500,
 vectorizer500, X500, results_500) = _vectorize_and_run(fic500, nonfic500, 500)

# For 1000:
(countvec_fiction1000, countvec_nonfiction1000, fiction_plus_nonfiction1000,
 vectorizer1000, X1000, results_1000) = _vectorize_and_run(fic1000, nonfic1000, 1000)

# For 3000:
(countvec_fiction3000, countvec_nonfiction3000, fiction_plus_nonfiction3000,
 vectorizer3000, X3000, results_3000) = _vectorize_and_run(fic3000, nonfic3000, 3000)

In [10]:
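## Just experimenting with Lemmatization:
## NOTE: the code below is commented out, but it is the only place visible in this
## notebook where new_fic500 / new_nonfic500 are defined; the later "For 500" cell
## reads both, so this code has to be uncommented and run before that cell.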

# from nltk.stem.wordnet import WordNetLemmatizer

# lemm = WordNetLemmatizer()

# new_fic500 = []
# new_nonfic500 = []

# for sent in fic500:
#     temp = []
#     for w in sent:
#         temp.append(lemm.lemmatize(w))
#     new_fic500.append(temp)


# for sent in nonfic500:
#     temp = []
#     for w in sent:
#         temp.append(lemm.lemmatize(w))
#     new_nonfic500.append(temp)
    

# print len(fic500)
# print len(new_fic500)
# print len(nonfic500)
# print len(new_nonfic500)

In [17]:
# For 500, using new_fic500 / new_nonfic500 as input.
# NOTE: in the visible notebook those two names are only produced by the
# commented-out lemmatization code, so this cell needs that code to have run.
# It also rebinds the *500 names created by the earlier "For 500" run.

# Getting it ready for Count Vectorizer: one space-joined string per document.
countvec_fiction500 = [' '.join(tokens) for tokens in new_fic500]
countvec_nonfiction500 = [' '.join(tokens) for tokens in new_nonfic500]

# Fiction documents first, then non-fiction.
fiction_plus_nonfiction500 = countvec_fiction500 + countvec_nonfiction500

vectorizer500 = CountVectorizer()
X500 = vectorizer500.fit_transform(fiction_plus_nonfiction500)

results_500 = my_SVM(X500)
print("Done for 500\n")
print("######################################################\n\n")

Running for  f1
Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] kernel=rbf, C=10, gamma=auto ....................................
[CV] ........................... kernel=rbf, C=10, gamma=auto - 2.4min
[CV] kernel=rbf, C=10, gamma=auto ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min remaining:    0.0s


[CV] ........................... kernel=rbf, C=10, gamma=auto - 2.5min
[CV] kernel=rbf, C=10, gamma=auto ....................................
[CV] ........................... kernel=rbf, C=10, gamma=auto - 2.5min
[CV] kernel=rbf, C=10, gamma=auto ....................................
[CV] ........................... kernel=rbf, C=10, gamma=auto - 2.5min
[CV] kernel=rbf, C=10, gamma=auto ....................................
[CV] ........................... kernel=rbf, C=10, gamma=auto - 2.6min
[CV] kernel=rbf, C=10, gamma=auto ....................................
[CV] ........................... kernel=rbf, C=10, gamma=auto - 2.7min
[CV] kernel=rbf, C=10, gamma=auto ....................................
[CV] ........................... kernel=rbf, C=10, gamma=auto - 2.7min
[CV] kernel=rbf, C=10, gamma=auto ....................................
[CV] ........................... kernel=rbf, C=10, gamma=auto - 2.8min
[CV] kernel=rbf, C=10, gamma=auto ....................................
[CV] .

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 82.2min finished



Best parameters for f1: SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Best score achieved for f1: 0.967469280934
Running the best f1 model for precision..
Running the best f1 model for f1..
Running the best f1 model for accuracy..
Running the best f1 model for recall..
For  f1
{'recall': array([ 0.90789474,  0.99561404,  0.99342105,  0.99561404,  0.95394737,
        0.96929825,  0.9495614 ,  0.97149123,  0.92747253,  0.87692308]), 'f1': array([ 0.94628571,  0.99234973,  0.9869281 ,  0.99018539,  0.96989967,
        0.97464168,  0.96759777,  0.97469747,  0.95367232,  0.91829689]), 'precision': array([ 0.98806683,  0.98910675,  0.98051948,  0.98481562,  0.98639456,
        0.98004435,  0.98633257,  0.97792494,  0.98139535,  0.96376812]), 'accuracy': array([ 0.94846491,  0.99232456,  0.98684211,  0.99013158,  0.970

In [22]:
# Mean of the 'f1' score array stored in the first entry of results_500.
print(results_500[0]['f1'].mean())

0.967455471865
