# Loading modules

In [1]:
import numpy as np
from numpy import *
import json
from sklearn.feature_extraction import text
import scipy
from matplotlib import pyplot as plt
import math
from sklearn.naive_bayes import MultinomialNB
# from sklearn.naive_bayes import BernoulliNB

# Loading data

In [2]:
# Read the JSON-encoded corpus. The context manager closes the file handle as
# soon as the text has been read (a bare open(...).read() leaves it open until
# garbage collection).
with open('fedpapers_split.txt') as corpus_file:
    x = corpus_file.read()
papers = json.loads(x)

# The top-level JSON value is indexed by position: one list of papers per group.
papersH = papers[0] # papers by Hamilton
papersM = papers[1] # papers by Madison
papersD = papers[2] # disputed papers

# Number of papers in each group; used below to slice the count matrix.
nH, nM, nD = len(papersH), len(papersM), len(papersD)

# Bag-of-Words model

In [3]:
# This allows you to ignore certain common words in English.
# You may want to experiment by choosing the second option or your own
# list of stop words, but be sure to keep 'hamilton' and 'madison' in
# this list at a minimum, as their names appear in the text of the papers
# and leaving them in could lead to unpredictable results.
# Entries must be lowercase: the vectorizer lowercases the text
# (lowercase=True) before stop words are filtered out.
my_stop_words = text.ENGLISH_STOP_WORDS.union({'hamilton','madison'})
#my_stop_words = {'hamilton','madison'}

## Form bag of words model, keeping only words that occur in at least 10 papers
## (min_df is a document-frequency threshold, not a total usage count).
# stop_words is passed as a list: recent scikit-learn releases validate the
# parameter and reject a set/frozenset. Sorting keeps the list deterministic.
vectorizer = text.CountVectorizer(lowercase=True,stop_words=sorted(my_stop_words),min_df=10)
# Fit the vocabulary on all papers at once so every group shares the same columns
X = vectorizer.fit_transform(papersH+papersM+papersD).toarray()
# Split word counts into separate matrices (rows are ordered Hamilton, Madison, disputed)
XH, XM, XD = X[:nH,:], X[nH:nH+nM,:], X[nH+nM:,:]
# Label for XH, XM: 0 = Hamilton, 1 = Madison
YH = np.zeros(nH)
YM = np.ones(nM)
# generate Training DataSet and Training Label
X_train = np.r_[XH, XM]
Y_train = np.r_[YH, YM]

In [4]:
# Show the full list of words remaining after filtering out stop words and
# words that fall below the min_df threshold (comment this cell out to hide it).
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Convert to plain Python strings so the cell displays a simple list
[str(word) for word in feature_names]

['1787',
 '1788',
 '28',
 'able',
 'abolish',
 'absolute',
 'absolutely',
 'abuse',
 'according',
 'accordingly',
 'account',
 'accumulation',
 'accurate',
 'acquainted',
 'acquire',
 'acquired',
 'act',
 'active',
 'activity',
 'acts',
 'actual',
 'actually',
 'add',
 'added',
 'addition',
 'additional',
 'address',
 'adequate',
 'administer',
 'administered',
 'administration',
 'admission',
 'admit',
 'admitted',
 'adopted',
 'adoption',
 'advanced',
 'advantage',
 'advantages',
 'adversaries',
 'advice',
 'advocates',
 'affairs',
 'affect',
 'afford',
 'afforded',
 'agency',
 'agree',
 'agreed',
 'aid',
 'aim',
 'alarm',
 'alleged',
 'allowed',
 'altogether',
 'ambassadors',
 'ambition',
 'ambitious',
 'america',
 'american',
 'amounts',
 'ample',
 'anarchy',
 'ancient',
 'answer',
 'answered',
 'appeal',
 'appear',
 'appearance',
 'appeared',
 'appears',
 'applicable',
 'application',
 'applied',
 'apply',
 'appoint',
 'appointed',
 'appointing',
 'appointment',
 'appointments',
 

# Train the model with scikit-learn's MultinomialNB

In [5]:
# Fit scikit-learn's multinomial Naive Bayes on the papers of known authorship
naive = MultinomialNB()
classifier = naive.fit(X_train, Y_train.ravel())

# Training accuracy: share of training papers whose predicted label matches the true one
predictTrain = classifier.predict(X_train)
correct = np.count_nonzero(predictTrain == Y_train)
rate = float(correct / len(Y_train))
print("The correctly classfied rate is: ",rate)

# Predicted label for each disputed paper (0 = Hamilton, 1 = Madison)
predictY = classifier.predict(XD)
print(predictY)

The correctly classfied rate is:  1.0
[1. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1.]


# Train the model by MLE (calculate the posterior)

In [6]:
def priorProbability(dataSet,labelList): # calculate prior probability
    """Return the empirical prior of class 1.

    dataSet   -- the training documents; only its length is used.
    labelList -- one numeric label per document; the labels are summed, so
                 with 0/1 labels the sum is the number of class-1 documents.

    Returns sum(labelList) divided by the number of documents.
    """
    totalDocs = float(len(dataSet))
    labelTotal = sum(labelList)
    return labelTotal / totalDocs

def conditionProbability(dataSet,labelList, pClass1):  # calculate condition probability
    """Estimate Laplace-smoothed log word likelihoods for each class.

    Parameters
    ----------
    dataSet : 2-D array-like
        Word counts, one row per document and one column per vocabulary word.
    labelList : sequence
        One label per document. A label equal to 0 puts the document in
        class 0; any other value puts it in class 1.
    pClass1 : float
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    (p1Vect, p0Vect) : tuple of 1-D arrays
        For each class c and word j:
        log((1 + count of word j in class c) / (number of words in vocabulary
        + total word count in class c)).
        Note the order: the class-1 vector comes first.
    """
    counts = np.asarray(dataSet, dtype=float)
    dimNum = counts.shape[1]  # vocabulary size (number of features)
    # Only the first len(dataSet) labels are consulted, one per document
    isClass0 = np.asarray(labelList)[:counts.shape[0]] == 0

    # Column sums over the documents of each class replace the per-document loop
    p0Num = counts[isClass0].sum(axis=0)    # occurrences of each word in class 0
    p1Num = counts[~isClass0].sum(axis=0)   # occurrences of each word in class 1
    p0Denom = p0Num.sum()                   # total number of words in class 0
    p1Denom = p1Num.sum()                   # total number of words in class 1

    # Laplace (add-one) smoothing keeps unseen words from producing log(0)
    p1Vect = np.log((1 + p1Num) / (dimNum + p1Denom))
    p0Vect = np.log((1 + p0Num) / (dimNum + p0Denom))
    return p1Vect,p0Vect
    
def naiveBayes(testData,dataSet,labelList):
    """Classify each document in testData with a hand-written Naive Bayes model.

    The class-1 prior and the per-class log word likelihoods are estimated from
    (dataSet, labelList). Each test document is then scored for both classes as
    sum(word counts * log likelihoods) + log(prior).

    Returns a list with one entry per test document: 1 when the class-1 score
    is strictly greater than the class-0 score, otherwise 0.
    """
    prior1 = priorProbability(dataSet, labelList)
    logLik1, logLik0 = conditionProbability(dataSet, labelList, prior1)

    def _classify(doc):
        # Unnormalised log posterior of each class for one document
        score1 = sum(doc * logLik1) + log(prior1)
        score0 = sum(doc * logLik0) + log(1 - prior1)
        return 1 if score1 > score0 else 0

    return [_classify(doc) for doc in testData]

In [7]:
# Classify the disputed papers with the hand-written Naive Bayes classifier,
# trained on the Hamilton/Madison papers (label 0 = Hamilton, 1 = Madison).
predictLabel = naiveBayes(XD, X_train, Y_train.ravel())
# Display the predicted labels, one per disputed paper
predictLabel

[1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1]