# Basic simplest version
In this file, using tf-idf with SVM abd Naive Bayes.  Comparing 50/50 80/20 90/10 data splits.

Working with nltk version 3.2.5 and scikit-learn version 0.20.1.

In [20]:
import nltk
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split

verbose = False

In [21]:
# for feature extraction
from sklearn.feature_extraction.text import CountVectorizer
# tf-idf
from sklearn.feature_extraction.text import TfidfTransformer
# SVM
from sklearn.linear_model import SGDClassifier
# NB
from sklearn.naive_bayes import MultinomialNB
# pipeline
from sklearn.pipeline import Pipeline
# Random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Pipeline
Using pipelines to help speed things up

In [22]:
svm_clf = Pipeline([("vect", CountVectorizer()),
                        ("tfidf", TfidfTransformer()),
                        ("clf-svm", SGDClassifier(loss = "hinge", penalty = "l2",
                                                tol = 1e-3, 
                                                random_state = 42)),])


### Multinomial NB

In [23]:
nb_clf = Pipeline([("vect", CountVectorizer()),
                    ("tfidf", TfidfTransformer()),
                    ("clf- nb", MultinomialNB(),)])


### Random Forest

In [24]:
rf_clf = Pipeline([("vect", CountVectorizer()),
                    ("tfidf", TfidfTransformer()),
                    ("clf-rf", RandomForestClassifier(random_state = 42, n_estimators = 10),)])

## Automate pipelines
Create lists and a dictionary to help iterate through the fit and predict parts. 

In [25]:
pipelines = [svm_clf, nb_clf, rf_clf]
pipe_dict = {0: "Support Vector Machine", 1: "Naive Bayes", 2: "Random Forest"}

Extracting features, using bag of words model. 

In [26]:
dataPath = os.path.join(os.getcwd(), "data", "")
train = pd.read_csv(dataPath + "train.csv")
test = pd.read_csv(dataPath + "labeledTest.csv")
if(verbose):
    print(train.head(5))
    print(test.head(5))


## Split data

In [27]:
X50_train, X50_test, y50_train, y50_test = train_test_split(train.text, train.author, test_size = .50)
X80_train, X80_test, y20_train, y20_test = train_test_split(train.text, train.author, test_size = .20) 
X90_train, X90_test, y10_train, y10_test = train_test_split(train.text, train.author, test_size = .10)
X_train = train.text
y_train = train.author
X_test = test.text
y_test = test.author

if(verbose):
    print("\nShape of X50_train ", X50_train.shape)
    print("Shape of y50_train ", y50_train.shape)
    print("Shape of X50_test ", X50_test.shape)
    print("Shape of y50_test ", y50_test.shape)
    
    print("\nShape of X80_train ", X80_train.shape)
    print("Shape of y20_train ", y20_train.shape)
    print("Shape of X80_test ", X80_test.shape)
    print("Shape of y20_test ", y20_test.shape)
    
    print("\nShape of X90_train ", X90_train.shape)
    print("Shape of y10_train ", y10_train.shape)
    print("Shape of X90_train ", X90_train.shape)
    print("Shape of y10_train ", y10_train.shape)
    
    print("\nShape of X_train ", X_train.shape)
    print("Shape of y_train ", y_train.shape)
    print("Shape of X_test ", X_test.shape)
    print("Shape of y_test ", y_test.shape)


In [28]:
X_train = []
y_train = []
X_test = []
y_test = []

X_train.append(X50_train)
y_train.append(y50_train)
X_test.append(X50_test)
y_test.append(y50_test)
    
X_train.append(X80_train)
y_train.append(y20_train)
X_test.append(X80_test)
y_test.append(y20_test)
    
X_train.append(X90_train)
y_train.append(y10_train)
X_test.append(X90_train)
y_test.append(y10_train)



train_dict = {0: "50/50", 1: "80/20", 2: "90/10", 3: "full set"}

In [29]:
accuracy = {}

In [30]:
for i, item in enumerate(X_train):
    
    print("\nLooking at",  train_dict[i], "split")
    accuracy[train_dict[i]] = {}
    for idx, pl in enumerate(pipelines):
        print("\nEstimator:",  pipe_dict[idx])
        pl.fit(X_train[i], y_train[i])
        y_pred = pl.predict(X_test[i])
        print('Test set accuracy score: %.3f ' % accuracy_score(y_test[i], y_pred))
        accuracy[train_dict[i]][pipe_dict[idx]] = accuracy_score(y_test[i], y_pred)
        # end inner for over pipeline
    #end outer for over data


Looking at 50/50 split

Estimator: Support Vector Machine
Test set accuracy score: 0.813 

Estimator: Naive Bayes
Test set accuracy score: 0.777 

Estimator: Random Forest
Test set accuracy score: 0.592 

Looking at 80/20 split

Estimator: Support Vector Machine
Test set accuracy score: 0.827 

Estimator: Naive Bayes
Test set accuracy score: 0.807 

Estimator: Random Forest
Test set accuracy score: 0.594 

Looking at 90/10 split

Estimator: Support Vector Machine
Test set accuracy score: 0.927 

Estimator: Naive Bayes
Test set accuracy score: 0.899 

Estimator: Random Forest
Test set accuracy score: 0.987 


In [31]:
accuracy2 = {}
for i, item in enumerate(X_train):
    
    print("\nLooking at",  train_dict[i], "split")
    accuracy2[train_dict[i]] = {}
    for idx, pl in enumerate(pipelines):
        print("\nEstimator:",  pipe_dict[idx])
        pl.fit(X_train[i], y_train[i])
        y_pred = pl.predict(X_test[i])
        print('Test set accuracy score: %.3f ' % accuracy_score(y_test[i], y_pred))
        accuracy2[train_dict[i]][pipe_dict[idx]] = accuracy_score(y_test[i], y_pred)
        # end inner for over pipeline
    #end outer for over data


Looking at 50/50 split

Estimator: Support Vector Machine
Test set accuracy score: 0.813 

Estimator: Naive Bayes
Test set accuracy score: 0.777 

Estimator: Random Forest
Test set accuracy score: 0.592 

Looking at 80/20 split

Estimator: Support Vector Machine
Test set accuracy score: 0.827 

Estimator: Naive Bayes
Test set accuracy score: 0.807 

Estimator: Random Forest
Test set accuracy score: 0.594 

Looking at 90/10 split

Estimator: Support Vector Machine
Test set accuracy score: 0.927 

Estimator: Naive Bayes
Test set accuracy score: 0.899 

Estimator: Random Forest
Test set accuracy score: 0.987 


In [32]:
accuracy = pd.DataFrame.from_dict(accuracy)
accuracy

Unnamed: 0,50/50,80/20,90/10
Naive Bayes,0.776609,0.806946,0.898814
Random Forest,0.592339,0.594484,0.987231
Support Vector Machine,0.813279,0.827375,0.927359


In [34]:
print("Enter sentence: ")
sent = input()
sentence = pd.DataFrame({'text' : [sent]})
print(sentence)

for idx, pl in enumerate(pipelines):
    print("\nEstimator:",  pipe_dict[idx])
    pl.fit(X_train, y_train)
    y_pred = pl.predict(sentence.text)
    print("According to", pipe_dict[idx], "the author is:", y_pred)
    # end inner for over pipeline
print("Done")

Enter sentence: 


 And by the shores of the river Zaire there is neither quiet nor silence.


                                                text
0  And by the shores of the river Zaire there is ...

Estimator: Support Vector Machine


AttributeError: 'Series' object has no attribute 'lower'

## Reference
Check nltk and sklearn versions: https://stackoverflow.com/questions/28501072/how-to-check-which-version-of-nltk-scikit-learn-installed
Pipeline help: https://www.kdnuggets.com/2017/12/managing-machine-learning-workflows-scikit-learn-pipelines-part-1.html