# Basic simplest version
In this file, using tf-idf with SVM abd Naive Bayes.  With full training set

Working with nltk version 3.2.5 and scikit-learn version 0.20.1.

In [64]:
import nltk
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split

verbose = False

In [65]:
# for feature extraction
from sklearn.feature_extraction.text import CountVectorizer
# tf-idf
from sklearn.feature_extraction.text import TfidfTransformer
# SVM
from sklearn.linear_model import SGDClassifier
# NB
from sklearn.naive_bayes import MultinomialNB
# pipeline
from sklearn.pipeline import Pipeline
# Random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

### Support Vector Machine

In [66]:
# # step by step reference, no pipeline

# # count words
# count_vect = CountVectorizer()
# X_train_count = count_vect.fit_transform(X_train.text)
# X_train_count.shape

# # tf-idf
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
# X_train_tfidf.shape

# # NB
# clf = MultinomialNB().fit(X_train_tfidf, X_train.author)

## Pipeline
Using pipelines to help speed things up

In [67]:
svm_clf = Pipeline([("vect", CountVectorizer()),
                        ("tfidf", TfidfTransformer()),
                        ("clf-svm", SGDClassifier(loss = "hinge", penalty = "l2",
                                                tol = 1e-3, 
                                                random_state = 42)),])


### Multinomial NB

In [68]:
nb_clf = Pipeline([("vect", CountVectorizer()),
                    ("tfidf", TfidfTransformer()),
                    ("clf- nb", MultinomialNB(),)])


### Random Forest

In [69]:
rf_clf = Pipeline([("vect", CountVectorizer()),
                    ("tfidf", TfidfTransformer()),
                    ("clf-rf", RandomForestClassifier(random_state = 42, n_estimators = 10),)])

## Automate pipelines
Create lists and a dictionary to help iterate through the fit and predict parts. 

In [70]:
pipelines = [svm_clf, nb_clf, rf_clf]
pipe_dict = {0: "Support Vector Machine", 1: "Naive Bayes", 2: "Random Forest"}


Extracting features, using bag of words model. 

In [71]:
dataPath = os.path.join(os.getcwd(), "data", "")
train = pd.read_csv(dataPath + "train.csv")
if(verbose):
    print(train.head(5))
test = pd.read_csv(dataPath + "labeledTest.csv")


## Split data

In [72]:

X_train = train.text
y_train = train.author
X_test = test.text
y_test = train.author

X_train.append(X_train)
y_train.append(y_train)
X_test.append(X_test)
y_test.append(y_test)

0        EAP
1        HPL
2        EAP
3        MWS
4        HPL
5        MWS
6        EAP
7        EAP
8        EAP
9        MWS
10       MWS
11       EAP
12       HPL
13       HPL
14       EAP
15       MWS
16       EAP
17       MWS
18       EAP
19       HPL
20       EAP
21       HPL
22       EAP
23       EAP
24       EAP
25       EAP
26       EAP
27       EAP
28       HPL
29       HPL
        ... 
19549    MWS
19550    EAP
19551    EAP
19552    EAP
19553    EAP
19554    HPL
19555    EAP
19556    EAP
19557    EAP
19558    EAP
19559    HPL
19560    EAP
19561    HPL
19562    EAP
19563    MWS
19564    EAP
19565    EAP
19566    MWS
19567    EAP
19568    EAP
19569    MWS
19570    MWS
19571    HPL
19572    EAP
19573    MWS
19574    EAP
19575    EAP
19576    EAP
19577    EAP
19578    HPL
Name: author, Length: 39158, dtype: object

In [73]:
X_train = []
y_train = []
X_test = []
y_test = []

X_train.append(X50_train)
y_train.append(y50_train)
X_test.append(X50_test)
y_test.append(y50_test)
    
X_train.append(X80_train)
y_train.append(y20_train)
X_test.append(X80_test)
y_test.append(y20_test)
    
X_train.append(X90_train)
y_train.append(y10_train)
X_test.append(X90_train)
y_test.append(y10_train)

train_dict = {0: "50/50", 1: "80/20", 2: "90/10"}

In [74]:
accuracy = {}


In [75]:
for i, item in enumerate(X_train):
    
    print("\nLooking at",  train_dict[i], "split")
    accuracy[train_dict[i]] = {}
    for idx, pl in enumerate(pipelines):
        print("\nEstimator:",  pipe_dict[idx])
        pl.fit(X_train[i], y_train[i])
        y_pred = pl.predict(X_test[i])
        print('Test set accuracy score: %.3f ' % accuracy_score(y_test[i], y_pred))
        accuracy[train_dict[i]][pipe_dict[idx]] = accuracy_score(y_test[i], y_pred)
        # end inner for over pipeline
    #end outer for over data


Looking at 50/50 split

Estimator: Support Vector Machine
Test set accuracy score: 0.807 

Estimator: Naive Bayes
Test set accuracy score: 0.778 

Estimator: Random Forest
Test set accuracy score: 0.599 

Looking at 80/20 split

Estimator: Support Vector Machine
Test set accuracy score: 0.832 

Estimator: Naive Bayes
Test set accuracy score: 0.816 

Estimator: Random Forest
Test set accuracy score: 0.594 

Looking at 90/10 split

Estimator: Support Vector Machine
Test set accuracy score: 0.927 

Estimator: Naive Bayes
Test set accuracy score: 0.899 

Estimator: Random Forest
Test set accuracy score: 0.988 


In [76]:
accuracy2 = {}
for i, item in enumerate(X_train):
    
    print("\nLooking at",  train_dict[i], "split")
    accuracy2[train_dict[i]] = {}
    for idx, pl in enumerate(pipelines):
        print("\nEstimator:",  pipe_dict[idx])
        pl.fit(X_train[i], y_train[i])
        y_pred = pl.predict(X_test[i])
        print('Test set accuracy score: %.3f ' % accuracy_score(y_test[i], y_pred))
        accuracy2[train_dict[i]][pipe_dict[idx]] = accuracy_score(y_test[i], y_pred)
        # end inner for over pipeline
    #end outer for over data


Looking at 50/50 split

Estimator: Support Vector Machine
Test set accuracy score: 0.807 

Estimator: Naive Bayes
Test set accuracy score: 0.778 

Estimator: Random Forest
Test set accuracy score: 0.599 

Looking at 80/20 split

Estimator: Support Vector Machine
Test set accuracy score: 0.832 

Estimator: Naive Bayes
Test set accuracy score: 0.816 

Estimator: Random Forest
Test set accuracy score: 0.594 

Looking at 90/10 split

Estimator: Support Vector Machine
Test set accuracy score: 0.927 

Estimator: Naive Bayes
Test set accuracy score: 0.899 

Estimator: Random Forest
Test set accuracy score: 0.988 


In [77]:
accuracy = pd.DataFrame.from_dict(accuracy)
accuracy

Unnamed: 0,50/50,80/20,90/10
Naive Bayes,0.777937,0.815884,0.899495
Random Forest,0.59857,0.594484,0.987742
Support Vector Machine,0.806639,0.831716,0.926508


# Plots 

## Reference
Check nltk and sklearn versions: https://stackoverflow.com/questions/28501072/how-to-check-which-version-of-nltk-scikit-learn-installed
Pipeline help: https://www.kdnuggets.com/2017/12/managing-machine-learning-workflows-scikit-learn-pipelines-part-1.html