In [None]:
import pandas as pd

# Project 2: Predicting Authorship of the Disputed Federalist Papers

The Federalist Papers are a collection of 85 essays written by James Madison, Alexander Hamilton, and John Jay under the collective pseudonym "Publius" to promote the ratification of the United States Constitution.

<img src="images/the_federalist_papers.jpg" width="200" height="50" alt="The Federalist Papers" />

Authorship of most of the papers was revealed some years later by Hamilton, though his claim to authorship of 12 of the papers was disputed for nearly 200 years.

| Author | Papers |
| :- | -: | 
| Jay | 2, 3, 4, 5, 64
| Madison | 10, 14, 37-48
| Hamilton | 1, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 21-36, 59, 60, 61, 65-85
| Hamilton and Madison | 18, 19, 20
| Disputed | 49-58, 62, 63

The goal of this project is to use NLP and Naive Bayes to predict the author of the disputed papers.

Table of Contents

- [Getting and processing the data](#1.-Getting-and-processing-the-data)
- [Logistic Regression](#2.-Logistic-Regression)
- [Naive Bayes](#3.-Naive-Bayes-Classification)
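- [Disputed Federalist Papers](#4.-Disputed-Federalist-Papers)
- [How does Naive Bayes choose between Hamilton and Madison](#5.-How-does-Naive-Bayes-choose-between-Hamilton-and-Madison)
- [Parameter tuning using grid search](#Parameter-tuning-using-grid-search)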

## 1. Getting and processing the data

Retrieve an electronic version of the Federalist Papers from the [Gutenberg project](http://www.gutenberg.org/). Use the search facility to search for the Federalist Papers. Several versions are available. 
We'll use the plain text version [1404-8.txt](http://www.gutenberg.org/cache/epub/1404/pg1404.txt), saved locally as `Data/papers.txt`.

First, we'll build a dictionary that maps each Federalist paper's number to its text. We'll use the phrase `To the People of the State of New York` to identify the beginning of a paper, and the word `PUBLIUS` to identify the end of a paper (the word PUBLIUS marks the end of all papers except 37; we'll need to insert PUBLIUS at the end of Paper 37 manually).

In [None]:
from re import match

In [None]:
# Input file and the two marker phrases that delimit a paper.
path = 'Data/papers.txt'
Fed_dict = {}  # paper counter -> text of that paper
opening = 'To the People of the State of New York'
closing = 'PUBLIUS'

counter = 0  # how many opening phrases have been seen so far
paper = ''   # text gathered since the most recent opening phrase

# Scan the text file once, line by line, and fill Fed_dict.
# `match` only succeeds when the phrase is at the very start of the line.
with open(path) as fh:
    for line in fh:
        if match(opening, line):
            # a new paper starts here: next number, empty text
            counter, paper = counter + 1, ''
        # drop the end-of-line symbol \n and append the line after a single space
        paper += ' ' + line.replace('\n', '')
        if match(closing, line):
            # signature line reached: store everything gathered for this paper
            Fed_dict[counter] = paper

In [None]:
len(Fed_dict)

In [None]:
# put the Federalist Papers into a DataFrame:
# one row per dictionary entry, indexed by the dictionary key, with the text in column 'paper'
papers = pd.DataFrame.from_dict(Fed_dict, orient='index',columns=['paper'])
# preview the first 5 rows
papers.head(5)

In [None]:
# authorship function
def author(paper_num):
    """Return the author label of a Federalist Paper.

    Parameters
    ----------
    paper_num : int
        Number of the paper.

    Returns
    -------
    str or None
        One of 'Jay', 'Hamilton', 'Madison', 'Hamilton+Madison' or
        'Disputed'; None when the number belongs to none of the groups.
    """
    # (label, paper numbers) pairs; the groups do not overlap,
    # so the order in which they are checked does not matter
    attribution = (
        ('Jay', (2, 3, 4, 5, 64)),
        ('Hamilton', (1, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17,
                      *range(21, 37), 59, 60, 61, *range(65, 86))),
        ('Madison', (10, 14, *range(37, 49))),
        ('Hamilton+Madison', (18, 19, 20)),
        ('Disputed', (*range(49, 59), 62, 63)),
    )
    for label, numbers in attribution:
        if paper_num in numbers:
            return label
    # unknown paper number
    return None

In [None]:
# add column author to DataFrame by applying author() to every index value
# (the index holds the paper counter used as key in Fed_dict)
papers['author'] = papers.index.map(author)
papers.head(5)

In [None]:
papers.author.value_counts()

**Step 1:** train/test split

In [None]:
# training set: papers attributed to exactly one known author
# (papers labelled 'Hamilton+Madison' end up in neither set)
papers_train = papers[papers.author.isin(['Hamilton','Madison','Jay'])]
# "test" set: the disputed papers, for which no label is available
papers_test = papers[papers.author=='Disputed']

In [None]:
len(papers_train), len(papers_test)

**Step 2:** extract feature matrix and target vector

In [None]:
X_train = papers_train.paper
y_train = papers_train.author

In [None]:
X_test = papers_test.paper

**Step 3:** CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# word-count vectorizer configured to drop scikit-learn's built-in English stop words
vect = CountVectorizer(stop_words='english')

In [None]:
# learn training data vocabulary
vect.fit(X_train)
# create document-term matrix
X_train_dtm = vect.transform(X_train)
# the disputed papers are encoded with the vocabulary learned from the training papers only
X_test_dtm = vect.transform(X_test)

## 2. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# NOTE(review): max_iter is set to 2000, presumably so the solver converges on this data — confirm
log_clf = LogisticRegression(max_iter=2000)

# fit on the training document-term matrix and the known authors
log_clf.fit(X_train_dtm,y_train)

**Model evaluation**

Problem: we don't have labels for the test set

**Option 1** (not recommended): train and test on the same set

In [None]:
y_train_pred = log_clf.predict(X_train_dtm) 

In [None]:
from sklearn import metrics
metrics.confusion_matrix(y_train,y_train_pred)

In [None]:
metrics.accuracy_score(y_train,y_train_pred)

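The prediction function has an accuracy rate of 1 on the training set. This figure is overly optimistic, because the model is scored on the same papers it was fit to.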

**Option 2** (recommended): use [cross validation](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation)

In [None]:
from sklearn.model_selection import cross_val_score
# 4-fold cross-validated accuracy of the logistic regression on the training papers
# NOTE: X_train_dtm was built with a vocabulary fitted on all training papers, so the folds share that vocabulary
scores = cross_val_score(log_clf,X_train_dtm,y_train,cv=4,scoring='accuracy') # we'll use a small number of folds (cv)
# display the per-fold scores
scores

## 3. Naive Bayes Classification

In [None]:
from sklearn.naive_bayes import MultinomialNB
# multinomial naive Bayes classifier with its default parameters
nb_clf = MultinomialNB()

# fit on the same document-term matrix and labels as the logistic regression
nb_clf.fit(X_train_dtm,y_train)

**Model evaluation**

In [None]:
y_train_pred = nb_clf.predict(X_train_dtm) 

In [None]:
# option 1: compare the naive Bayes predictions on the training set with the training labels
from sklearn import metrics
metrics.confusion_matrix(y_train,y_train_pred)

In [None]:
metrics.accuracy_score(y_train,y_train_pred)

In [None]:
# option 2: 4-fold cross-validated accuracy of the naive Bayes classifier
# NOTE: `scores` is reused here and replaces the logistic regression scores
scores = cross_val_score(nb_clf,X_train_dtm,y_train,cv=4,scoring='accuracy') # we'll use a small number of folds (cv)
scores

In [None]:
scores.mean()

## 4. Disputed Federalist Papers

In [None]:
# logistic regression prediction
y_test_pred = log_clf.predict(X_test_dtm)
y_test_pred

In [None]:
# naive Bayes prediction
# NOTE: `y_test_pred` is reused here and replaces the logistic regression predictions
y_test_pred = nb_clf.predict(X_test_dtm)
y_test_pred

## 5. How does Naive Bayes choose between Hamilton and Madison

In [None]:
# store the vocabulary of X_train (one entry per column of the document-term matrix)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use its replacement `get_feature_names_out()` when it exists and fall back
# to the old method on older versions. `.tolist()` keeps the result a plain list.
if hasattr(vect, 'get_feature_names_out'):
    X_train_words = vect.get_feature_names_out().tolist()
else:
    X_train_words = vect.get_feature_names()

In [None]:
# examine the first 50 words
print(X_train_words[:50])

In [None]:
# examine the last 50 words
print(X_train_words[-50:])

In [None]:
# Naive Bayes counts the number of times each word appears in each class
nb_clf.feature_count_

In [None]:
# rows represent classes, in the order listed by nb_clf.classes_
# (scikit-learn sorts the labels: Hamilton, Jay, Madison); columns represent words
nb_clf.feature_count_.shape

In [None]:
nb_clf.classes_

In [None]:
# Rows of feature_count_ follow the order of nb_clf.classes_, so look each
# author's row up by label instead of hard-coding its position.
class_labels = list(nb_clf.classes_)
# number of times each word appears across all Hamilton's papers
Hamilton_word_count = nb_clf.feature_count_[class_labels.index('Hamilton'),:]
# number of times each word appears across all Madison's papers
Madison_word_count = nb_clf.feature_count_[class_labels.index('Madison'),:]

In [None]:
# create a DataFrame of words with their separate Hamilton and Madison counts
words = pd.DataFrame({'word' : X_train_words, 'Hamilton' : Hamilton_word_count, 'Madison' : Madison_word_count}).set_index('word')
words.head()

In [None]:
# examine 5 random DataFrame rows
words.sample(5)

In [None]:
# add 1 to Hamilton and Madison counts to avoid dividing by 0
# NOTE: this cell updates `words` in place, so every re-run adds another 1;
# rebuild `words` before running it again
words.Hamilton = words.Hamilton+1
words.Madison = words.Madison+1

In [None]:
# convert the Hamilton and Madison counts into frequencies
# (each column is divided by its own total)
# NOTE: this cell also updates `words` in place
words.Hamilton = words.Hamilton/words.Hamilton.sum()
words.Madison = words.Madison/words.Madison.sum()

In [None]:
words.sample(5)

In [None]:
# calculate the ratio of Hamilton-to-Madison and Madison-to-Hamilton frequencies for each word
# (the two ratios are reciprocals of each other)
words['Hamilton_ratio'] = words.Hamilton/words.Madison
words['Madison_ratio'] = words.Madison/words.Hamilton

In [None]:
words.sample(5)

In [None]:
# top 10 Hamiltonian words
words.sort_values(by='Hamilton_ratio', ascending=False).head(10)

In [None]:
# top 10 Madisonian words
words.sort_values(by='Madison_ratio', ascending=False).head(10)

## Parameter tuning using grid search

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# pipeline that takes raw text: word counts ('vect') followed by multinomial naive Bayes ('naive_bayes')
pipe = Pipeline(steps=
               [('vect', CountVectorizer()),
                # optional TF-IDF step, currently disabled:
                #('tfidf', TfidfTransformer()),
                ('naive_bayes', MultinomialNB())])
pipe

In [None]:
# parameter dictionary
# keys are '<pipeline step name>__<parameter name>'; 2 x 2 x 5 = 20 combinations in total
param_dict = {'vect__ngram_range': [(1, 1), (1, 2)], # (1,1) : use 1-grams (words); (1,2) : use 1 and 2 grams
              'vect__stop_words' : ['english',None],
              # grid entry for the disabled TF-IDF step:
              #'tfidf__use_idf': (True, False),
              'naive_bayes__alpha' : [0.0001, 0.001, 0.01,0.1, 1]} 

In [None]:
# grid search: try every combination in param_dict on the pipeline,
# scoring each one by accuracy with 3-fold cross validation
grid = GridSearchCV(pipe, param_dict, cv=3, scoring='accuracy')

In [None]:
grid.fit(X_train,y_train)

In [None]:
grid.best_score_

In [None]:
grid.best_params_

In [None]:
# `grid.estimator` is only the template pipeline handed to GridSearchCV, with
# none of the tuned parameters. The pipeline with the best parameter
# combination, refit on the whole training set (refit=True is the default),
# is exposed as `grid.best_estimator_`.
best_predictor = grid.best_estimator_
best_predictor

In [None]:
# fit `best_predictor` on all training papers (raw text, the pipeline vectorizes it itself),
# then predict the authors of the disputed papers
best_predictor.fit(X_train,y_train)
# NOTE: `y_test_pred` is reused here and replaces the earlier predictions
y_test_pred = best_predictor.predict(X_test)
y_test_pred