**Part 1**

In [33]:
import pandas as pd
import numpy as np
import io

In [4]:
# Colab-specific: prompt for a local file to upload into this runtime session.
from google.colab import files
# Keep the returned mapping; the CSV bytes are looked up by filename when the data is read.
uploaded = files.upload()

Saving federalist.csv to federalist.csv


In [34]:
# Parse the uploaded CSV straight from memory (no path on disk is involved)
csv_bytes = uploaded['federalist.csv']
df = pd.read_csv(io.BytesIO(csv_bytes))

In [35]:
# The author column holds a handful of repeated labels, so store it as a categorical
df['author'] = df['author'].astype('category')

In [36]:
df.info() # Column summary: non-null counts, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83 entries, 0 to 82
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   author  83 non-null     category
 1   text    83 non-null     object  
dtypes: category(1), object(1)
memory usage: 1.0+ KB


In [37]:
df.head(8) # Preview the first 8 rows (author label plus the start of each paper's text)


Unnamed: 0,author,text
0,HAMILTON,FEDERALIST. No. 1 General Introduction For the...
1,JAY,FEDERALIST No. 2 Concerning Dangers from Forei...
2,JAY,FEDERALIST No. 3 The Same Subject Continued (C...
3,JAY,FEDERALIST No. 4 The Same Subject Continued (C...
4,JAY,FEDERALIST No. 5 The Same Subject Continued (C...
5,HAMILTON,FEDERALIST No. 6 Concerning Dangers from Disse...
6,HAMILTON,FEDERALIST. No. 7 The Same Subject Continued (...
7,HAMILTON,FEDERALIST No. 8 The Consequences of Hostiliti...


In [38]:
# How many papers carry each author label; note how unevenly the classes are represented
df['author'].value_counts()

HAMILTON                49
MADISON                 15
HAMILTON OR MADISON     11
JAY                      5
HAMILTON AND MADISON     3
Name: author, dtype: int64

**Part 2**

In [39]:
from sklearn.model_selection import train_test_split

# Features are the paper texts; the target is the author label.
X, y = df['text'], df['author']

# Hold out 20% of the papers for testing; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)


In [40]:
# Displaying the shape of the train and test splits, starting with the training texts
X_train.shape

(66,)

In [41]:
X_test.shape  # Held-out test texts

(17,)

In [42]:
y_train.shape  # Training labels; length should match X_train

(66,)

In [43]:
y_test.shape  # Test labels; length should match X_test

(17,)

**Part 3**

In [44]:
import nltk
# Fetch the NLTK stop-word corpus; per the log output this is skipped when already up to date
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
# Import the NLTK corpus reader under its own alias. The previous form,
# `stopwords = set(stopwords.words('english'))`, overwrote the reader with the
# word collection, so running this cell a second time raised AttributeError.
from nltk.corpus import stopwords as nltk_stopwords

# The name `stopwords` is kept for the word collection because later cells pass
# it to TfidfVectorizer. A list is used instead of a set because scikit-learn
# documents `stop_words` as 'english', a list, or None.
stopwords = list(nltk_stopwords.words('english'))
vectorizer = TfidfVectorizer(stop_words=stopwords)

In [46]:
# Applying the vectorizer to the training and test data.
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that same fitted vocabulary.
# NOTE(review): X_train / X_test are overwritten with the TF-IDF matrices, so this
# cell is not re-runnable on its own -- the train/test split must be redone first.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [47]:
X_train.shape  # (training documents, vocabulary terms) after TF-IDF vectorization

(66, 7876)

In [48]:
X_test.shape  # Same column count as X_train, since the fitted vocabulary is reused

(17, 7876)

**Part 4**

In [49]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes with default settings. Per the repr below (binarize=0.0),
# any TF-IDF weight above zero is treated simply as "term present".
naive_bayes = BernoulliNB()
naive_bayes.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [50]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict an author for every held-out paper, then cross-tabulate the
# true labels (rows) against the predicted labels (columns).
pred = naive_bayes.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=pred)

array([[10,  0,  0,  0],
       [ 3,  0,  0,  0],
       [ 2,  0,  0,  0],
       [ 2,  0,  0,  0]])

In [51]:
# Share of test papers whose predicted author matches the true label
test_accuracy = accuracy_score(y_test, pred)
print('Accuracy Score: ', test_accuracy)

Accuracy Score:  0.5882352941176471


**Part 5**


In [52]:
# Cap the vocabulary at the 1000 most frequent terms and add
# bigrams alongside the single-word features
vectorizer = TfidfVectorizer(
    stop_words=stopwords,
    max_features=1000,
    ngram_range=(1, 2),
)

In [56]:
# Rebuild the raw text/label split: X_train and X_test currently hold
# TF-IDF matrices, not text, so they cannot be vectorized again as-is
X, y = df['text'], df['author']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

In [58]:
# Applying the modified vectorizer to the training and test data again.
# Fit on the training texts only; the test texts reuse the fitted vocabulary.
# NOTE(review): overwrites X_train / X_test in place, so the split cell must be
# re-run before this cell can be executed a second time.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [59]:
X_train.shape  # Column count should now equal the max_features cap

(66, 1000)

In [60]:
X_test.shape  # Test matrix shares the capped vocabulary of the training matrix

(17, 1000)

In [61]:
# Refit a fresh Bernoulli Naive Bayes model on the reduced unigram + bigram features
naive_bayes = BernoulliNB()
naive_bayes.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [62]:
# Score the refitted model on the held-out papers (rows: true author, columns: predicted)
pred = naive_bayes.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=pred)

array([[10,  0,  0,  0],
       [ 0,  3,  0,  0],
       [ 1,  0,  1,  0],
       [ 0,  0,  0,  2]])

In [63]:
# Test-set accuracy for the Naive Bayes model trained on the reduced feature set
test_accuracy = accuracy_score(y_test, pred)
print('Accuracy Score: ', test_accuracy)

Accuracy Score:  0.9411764705882353


Note: After applying max_features of 1000 and bigrams to the dataset, the accuracy improved from 0.588 to 0.941.

**Part 6**

In [65]:
# LogisticRegression is imported from the public `sklearn.linear_model` package.
# The private `sklearn.linear_model.logistic` module path was deprecated in
# scikit-learn 0.22 and later removed, so importing from it fails on current releases.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


In [89]:
# Start again from the raw text and labels so the logistic-regression
# experiment uses the same 80/20 partition (same seed) as the earlier parts
X, y = df['text'], df['author']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

In [90]:
# Same feature setup as Part 5: stop words removed, top 1000 terms, unigrams + bigrams
vectorizer = TfidfVectorizer(
    stop_words=stopwords,
    max_features=1000,
    ngram_range=(1, 2),
)

In [91]:
# Applying the modified vectorizer to the training and test data again.
# Fit on the training texts only; the test texts reuse the fitted vocabulary.
# NOTE(review): overwrites X_train / X_test in place, so the split cell must be
# re-run before this cell can be executed a second time.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [92]:
# Baseline: logistic regression with every hyperparameter left at its default
log_regression = LogisticRegression()
log_regression.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [93]:
# Evaluate the baseline logistic regression (rows: true author, columns: predicted)
pred = log_regression.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=pred)

array([[10,  0,  0,  0],
       [ 3,  0,  0,  0],
       [ 2,  0,  0,  0],
       [ 2,  0,  0,  0]])

In [94]:
# Test-set accuracy of the baseline logistic regression
test_accuracy = accuracy_score(y_test, pred)
print('Accuracy Score: ', test_accuracy)

Accuracy Score:  0.5882352941176471


In [124]:
# Changing three defaults at once: one-vs-rest multiclass handling, the
# newton-cg solver, and 'balanced' class weights to offset the uneven author counts.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases --
# confirm against the installed version.
log_regression = LogisticRegression(multi_class='ovr', solver='newton-cg',class_weight='balanced')
log_regression.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [122]:
# Evaluate the re-parameterised logistic regression (rows: true author, columns: predicted)
pred = log_regression.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=pred)

array([[10,  0,  0,  0],
       [ 0,  2,  0,  1],
       [ 2,  0,  0,  0],
       [ 1,  0,  0,  1]])

In [123]:
# Test-set accuracy of the re-parameterised logistic regression
test_accuracy = accuracy_score(y_test, pred)
print('Accuracy Score: ', test_accuracy)

Accuracy Score:  0.7647058823529411


When the Logistic Regression model is run without any parameters, it gives an accuracy of 0.588. <br>
However, when Logistic Regression is fitted with multi_class='ovr', a solver suited to
multi-class problems (newton-cg), and class_weight='balanced', the accuracy goes up to 0.765. <br>
This is still less than the accuracy that the Bernoulli Naive Bayes classifier achieved (0.941).

**Part 7 - Neural Network**

In [162]:
# Multiple Topologies
from sklearn.neural_network import MLPClassifier


In [189]:
# Reset to the raw text and labels so the neural network is trained and
# tested on the same 80/20 partition (same seed) as the other models
X, y = df['text'], df['author']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

In [190]:
# Same feature setup as the previous two parts: top 1000 unigram/bigram terms, stop words removed
vectorizer = TfidfVectorizer(
    stop_words=stopwords,
    max_features=1000,
    ngram_range=(1, 2),
)

In [191]:
# Applying the modified vectorizer to the training and test data again.
# Fit on the training texts only; the test texts reuse the fitted vocabulary.
# NOTE(review): overwrites X_train / X_test in place, so the split cell must be
# re-run before this cell can be executed a second time.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [199]:
# Multi-layer perceptron with three hidden layers (15, 10 and 6 units) trained
# with the L-BFGS solver; random_state is pinned so the fit is repeatable
mlp_classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15, 10, 6), random_state=1)
mlp_classifier.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(15, 10, 6), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [200]:
# Evaluate the neural network on the held-out papers (rows: true author, columns: predicted)
pred = mlp_classifier.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=pred)

array([[10,  0,  0,  0],
       [ 0,  3,  0,  0],
       [ 2,  0,  0,  0],
       [ 0,  0,  0,  2]])

In [201]:
# Test-set accuracy of the neural network
test_accuracy = accuracy_score(y_test, pred)
print('Accuracy Score: ', test_accuracy)

Accuracy Score:  0.8823529411764706


After experimenting with the 3 different neural networks that sklearn offers, I decided to use MLPClassifier, as it gave the best accuracy. <br>
## Accuracy
My final accuracy for the neural network was 0.882, a big jump from the 0.588 I got when I used the default parameters of my neural network. The primary reason for this jump was increasing the size of the hidden layers from (12, 2) to three hidden layers with sizes (15, 10, 6).