#**Author Attribution**

*The NLP task is to identify the author of a document, given samples of author's work* <br>

*The data set used is a collection of Federalist Papers from Project Gutenberg*






###Read in CSV File Using Pandas

In [21]:
import pandas as pd
df = pd.read_csv('federalist.csv')

# convert the author column to categorical data
df.author = df.author.astype('category')

In [22]:
# display the first few rows
print('The first few rows:\n')
df.head()

The first few rows:



Unnamed: 0,author,text
0,HAMILTON,FEDERALIST. No. 1 General Introduction For the...
1,JAY,FEDERALIST No. 2 Concerning Dangers from Forei...
2,JAY,FEDERALIST No. 3 The Same Subject Continued (C...
3,JAY,FEDERALIST No. 4 The Same Subject Continued (C...
4,JAY,FEDERALIST No. 5 The Same Subject Continued (C...


In [23]:
#display the counts by author
print('Counts by author:\n')
df.author.value_counts()

Counts by author:



HAMILTON                49
MADISON                 15
HAMILTON OR MADISON     11
JAY                      5
HAMILTON AND MADISON     3
Name: author, dtype: int64

##Create Train and Test Data Using Sklearn

In [26]:
from sklearn.model_selection import train_test_split

# 80/20 with random state = 1234
# X = predictor variables (text)
# y = target variable (author)
X_train, X_test, y_train, y_test = train_test_split(df.text, df.author, test_size=0.2, random_state=1234)

print('The shape of train data:\t', X_train.shape)
print('The shape of test data: \t', X_test.shape)


The shape of train data:	 (66,)
The shape of test data: 	 (17,)


##Remove Stop Words and Perform Tf-Idf Vectoriztion

In [None]:
import nltk
nltk.download('stopwords')

In [28]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords)

# fit to the training data only
X_train_v = tfidf_vectorizer.fit_transform(X_train)
X_test_v = tfidf_vectorizer.transform(X_test)

print('The shape of the vectorized train data:\t', X_train_v.shape)
print('The shape of the vectorized test data: \t', X_test_v.shape)

The shape of the vectorized train data:	 (66, 7876)
The shape of the vectorized test data: 	 (17, 7876)


##Bernoulli Naïve Bayes

In [30]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

bnb.fit(X_train_v, y_train)
print('Accuracy on the test set: ', bnb.score(X_train_v, y_train))

Accuracy on the test set:  0.7727272727272727


### Redo Tf-Idf Vectoriztion to Improve Accuracy

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords, max_features=1000, ngram_range=(1,2))

X_train_v2 = tfidf_vectorizer.fit_transform(X_train)
X_test_v2 = tfidf_vectorizer.transform(X_test)

print('The shape of the new vectorized train data:\t', X_train_v2.shape)
print('The shape of the new vectorized test data: \t', X_test_v2.shape)

The shape of the new vectorized train data:	 (66, 1000)
The shape of the new vectorized test data: 	 (17, 1000)


In [32]:
bnb.fit(X_train_v2, y_train)
print('Accuracy on the new test set: ', bnb.score(X_train_v2, y_train))

Accuracy on the new test set:  1.0


##Logistic Regression v1
> no parameters

In [53]:
from sklearn.metrics._plot.confusion_matrix import confusion_matrix
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_v2, y_train)

# for evaluating
lr_pred = lr.predict(X_test_v2)

###Classification Report

In [54]:
from sklearn.metrics import classification_report
print(classification_report(y_test, lr_pred))

                     precision    recall  f1-score   support

           HAMILTON       0.59      1.00      0.74        10
HAMILTON OR MADISON       0.00      0.00      0.00         3
                JAY       0.00      0.00      0.00         2
            MADISON       0.00      0.00      0.00         2

           accuracy                           0.59        17
          macro avg       0.15      0.25      0.19        17
       weighted avg       0.35      0.59      0.44        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


###Confusion Matrix

In [55]:
print(confusion_matrix(y_test, lr_pred))

[[10  0  0  0]
 [ 3  0  0  0]
 [ 2  0  0  0]
 [ 2  0  0  0]]


###Overall Accuracy

In [56]:
import numpy as np
print(np.mean(lr_pred == y_test))

0.5882352941176471


##Logistic Regression v2
> solver = 'lbfgs'<br>
> multi_class = 'multinomial'<br>
> class_weight = 'balanced'

In [66]:
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', class_weight='balanced')
lr.fit(X_train_v2, y_train)

# for evaluation
lr_pred = lr.predict(X_test_v2)

###Classification Report

In [67]:
print(classification_report(y_test, lr_pred))

                     precision    recall  f1-score   support

           HAMILTON       0.83      1.00      0.91        10
HAMILTON OR MADISON       0.67      0.67      0.67         3
                JAY       1.00      0.50      0.67         2
            MADISON       0.00      0.00      0.00         2

           accuracy                           0.76        17
          macro avg       0.62      0.54      0.56        17
       weighted avg       0.73      0.76      0.73        17



###Confusion Matrix

In [68]:
print(confusion_matrix(y_test, lr_pred))

[[10  0  0  0]
 [ 0  2  0  1]
 [ 1  0  1  0]
 [ 1  1  0  0]]


###Overall Accuracy

In [69]:
print(np.mean(lr_pred == y_test))

0.7647058823529411


##Neural Network v1
> hidden_layer_sizes = (1,1)<br>
> solvee = 'lbfgs'<br>

In [140]:
nn = MLPClassifier(hidden_layer_sizes=(1,1), solver='lbfgs')
nn.fit(X_train_v2, y_train)

# for evaluation
nn_pred = nn.predict(X_test_v2)

###Classification Report

In [144]:
print(classification_report(y_test, nn_pred))

                     precision    recall  f1-score   support

           HAMILTON       0.59      1.00      0.74        10
HAMILTON OR MADISON       0.00      0.00      0.00         3
                JAY       0.00      0.00      0.00         2
            MADISON       0.00      0.00      0.00         2

           accuracy                           0.59        17
          macro avg       0.15      0.25      0.19        17
       weighted avg       0.35      0.59      0.44        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


###Confustion Matrix

In [145]:
print(confusion_matrix(y_test, nn_pred))

[[10  0  0  0]
 [ 3  0  0  0]
 [ 2  0  0  0]
 [ 2  0  0  0]]


###Overall Accuracy

In [146]:
print(np.mean(nn_pred == y_test))

0.5882352941176471


##Neural Network v2
> hidden_layer_sizes = (3,1)<br>
> solver = 'lbfgs'

In [147]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(hidden_layer_sizes=(3,1), solver='lbfgs')
nn.fit(X_train_v2, y_train)

# for evaluation
nn_pred = nn.predict(X_test_v2)

###Classification Report

In [148]:
print(classification_report(y_test, nn_pred))

                     precision    recall  f1-score   support

           HAMILTON       0.91      1.00      0.95        10
HAMILTON OR MADISON       1.00      0.33      0.50         3
                JAY       0.00      0.00      0.00         2
            MADISON       0.40      1.00      0.57         2

           accuracy                           0.76        17
          macro avg       0.58      0.58      0.51        17
       weighted avg       0.76      0.76      0.72        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


###Confustion Matrix

In [149]:
print(confusion_matrix(y_test, nn_pred))

[[10  0  0  0]
 [ 0  1  0  2]
 [ 1  0  0  1]
 [ 0  0  0  2]]


###Overall Accuracy

In [150]:
print(np.mean(nn_pred == y_test))

0.7647058823529411
