### Naive Bayes

In [None]:
%%writefile NaiveBayes/NaiveBayesModel.py
#!/usr/bin/env python
from math import log
from math import exp

class NaiveBayesModel(object):
    """Two-class Naive Bayes model loaded from a tab-separated model file.

    Every line of the model file has the form ``<word> TAB <s0>,<s1>,<s2>,<s3>``.
    For ordinary words, entries 2 and 3 are the per-class factors
    Pr(word | Class=0) and Pr(word | Class=1) that the classify methods
    combine.  The special record ``ClassPriors`` carries the class counts
    followed by the prior probabilities Pr(Class=0) and Pr(Class=1).
    """

    def __init__(self, modelFile):
        """Read ``modelFile`` into ``self.model`` and extract the class priors.

        Raises:
            KeyError: if the file has no ``ClassPriors`` record.
            ValueError: if a non-blank line is not ``word TAB comma-separated floats``.
        """
        self.model = {}
        # 'with' closes the handle even when parsing raises.
        with open(modelFile) as handle:
            for rawLine in handle:
                record = rawLine.split('\n')[0]
                if not record.strip():
                    continue  # tolerate blank / trailing empty lines
                word, statsStr = record.split('\t')
                # A real list (not a lazy map object) so it can be indexed later.
                self.model[word] = [float(value) for value in statsStr.split(",")]
        # Class priors: counts and probs (Pr(Class=0) and Pr(Class=1))
        self.c0, self.c1, self.prClass0, self.prClass1 = self.model["ClassPriors"]

    @staticmethod
    def _safeLog(probability):
        """Return log(probability), mapping an exact 0 to -inf instead of raising."""
        if probability != 0:
            return log(probability)
        return float("-inf")

    def classify(self, doc):
        """Return the unnormalized posteriors [Pr(Class=0|doc), Pr(Class=1|doc)].

        Naive Bayes inference:
        Pr(Class=c | doc) ~ Pr(Class=c) * Pr(word1 | Class=c) * Pr(word2 | Class=c) ...
        Words that are absent from the model carry no evidence and are skipped
        (previously they raised KeyError).
        """
        PrClass0GivenDoc = self.prClass0
        PrClass1GivenDoc = self.prClass1
        for word in doc:
            stats = self.model.get(word)
            if stats is None:
                continue
            PrClass0GivenDoc *= stats[2]
            PrClass1GivenDoc *= stats[3]
        return [PrClass0GivenDoc, PrClass1GivenDoc]

    # the natural log based version of this
    # helps avoid underflow issues
    def classifyInLogs(self, doc):
        """Return [log Pr(Class=0|doc), log Pr(Class=1|doc)], unnormalized.

        Same inference as ``classify`` but summing natural logs.  A zero prior
        or zero word probability yields -inf for that class; words absent from
        the model are skipped.
        """
        # NOTE: Improvement: on loading one should convert probs to log probs!
        logPosteriors = [self._safeLog(self.prClass0), self._safeLog(self.prClass1)]
        for word in doc:
            stats = self.model.get(word)
            if stats is None:
                continue
            # Adding -inf keeps the running score at -inf, matching the old
            # "assign -inf" behaviour for zero probabilities.
            logPosteriors[0] += self._safeLog(stats[2])
            logPosteriors[1] += self._safeLog(stats[3])
        return logPosteriors

    def printModel(self):
        """Print the priors and every model record to standard output."""
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print("NaiveBayes Model starts here\n----------------")
        print("PRIORS: prClass0=%04.3f, prClass1=%04.3f" % (self.prClass0, self.prClass1))
        for word, stats in self.model.items():
            print("Pr( %s | Class) %s" % (word, stats))  # all stats for this word
        print("NaiveBayes Model ENDS here\n----------------")

In [None]:
%%writefile NaiveBayes/mapper_classify.py
#!/usr/bin/env python
from NaiveBayesModel import NaiveBayesModel
import sys, re, string
from math import exp
# Init mapper phase 

In [None]:
# Load the toy model from disk and dump its contents for inspection.
NBModel = NaiveBayesModel("NaiveBayes/model1.txt")
NBModel.printModel()

# One tab-separated sample record: document id, class label, then the text.
# Only the first two tabs are split off; the remaining text is tokenized on
# any whitespace (which includes the tab left inside it).
line = "D5\t0\tChinese Chinese\tChinese Tokyo Japan"
docID, docClass, text = line.split("\t", 2)
words = text.split()

# Posteriors obtained by multiplying raw probabilities.
PrClass0GivenDoc, PrClass1GivenDoc = NBModel.classify(words)
print("Pr(Class=0| Doc=%s) is %6.5f" % (docID, PrClass0GivenDoc))
print("Pr(Class=1| Doc=%s) is %6.5f" % (docID, PrClass1GivenDoc))

# Same posteriors computed in log space, shown both exponentiated and raw.
PrClass0GivenDoc, PrClass1GivenDoc = NBModel.classifyInLogs(words)
print("Pr(Class=0| Doc=D5) = %6.5f, log(Pr(Class=0| Doc=D5)) = %f" % (exp(PrClass0GivenDoc), PrClass0GivenDoc))
print("Pr(Class=1| Doc=D5) = %6.5f, log(Pr(Class=1| Doc=D5)) = %f" % (exp(PrClass1GivenDoc), PrClass1GivenDoc))

In [None]:
%%writefile NaiveBayes/mapper_model.py
#!/usr/bin/env python
import sys, re, string

# Init mapper phase 
# define regex for punctuation removal
regex = re.compile('[%s]' % re.escape(string.punctuation))


# START STUDENT CODE HW221MAPPER
def main(stdin=None, stdout=None):
    """Training mapper: emit per-word counts and per-class document counts.

    Input records are ``docID TAB docClass [TAB subject [TAB body]]``.
    For every token of the lower-cased, punctuation-stripped subject+body the
    mapper writes ``docClass TAB word TAB 1``.  After the last record it writes
    two ``ClassPriors`` records holding the number of class-0 and class-1
    documents seen by this mapper.

    Args:
        stdin: iterable of input lines (defaults to ``sys.stdin``).
        stdout: object with ``write`` (defaults to ``sys.stdout``).
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout

    # number of documents seen per class label
    class_counter = {0: 0, 1: 0}
    skipped = 0

    # inner loop mapper phase: process each record
    for line in stdin:
        # remove leading and trailing whitespace, then split into fields
        parts = line.strip().split("\t")
        if len(parts) < 2:
            # blank line or record without a class label: nothing usable
            skipped += 1
            continue
        # strip() drops trailing tabs, so a document with empty subject and
        # body arrives with only two fields; treat the subject as empty
        # instead of crashing on the unpack below.
        parts += [""] * (3 - len(parts))
        docID, docClass, title = parts[0:3]

        try:
            label = int(docClass)
        except ValueError:
            skipped += 1
            continue

        # update class counters (labels other than 0/1 are not counted)
        if label in class_counter:
            class_counter[label] += 1

        # check if there's a text body
        body = parts[3] if len(parts) == 4 else ""

        # replace each punctuation character with a space; split() already
        # treats any run of whitespace as one delimiter
        words = regex.sub(' ', title.lower() + " " + body.lower()).split()

        # tab-delimited; the trivial word count is 1
        for word in words:
            stdout.write("%s\t%s\t%s\n" % (docClass, word, 1))

    stdout.write("%s\t%s\t%s\n" % (0, 'ClassPriors', class_counter[0]))
    stdout.write("%s\t%s\t%s\n" % (1, 'ClassPriors', class_counter[1]))

    if skipped:
        # report on stderr so the data stream on stdout stays clean
        sys.stderr.write("mapper_model: skipped %d malformed line(s)\n" % skipped)


if __name__ == "__main__":
    main()
# END STUDENT CODE HW221MAPPER   

# define regex for punctuation removal

# increase counters
# write the results to STDOUT (standard output);
# what we output here will be the input for the
# Reduce step, i.e. the input for reducer.py
#
# tab-delimited; the trivial word count is 1
        
# END STUDENT CODE HW231MAPPER_MODEL


In [None]:
%%writefile NaiveBayes/reducer_model.py
#!/usr/bin/env python
import sys
from collections import defaultdict
# START STUDENT CODE HW231REDUCER_MODEL


PRIOR_KEY = 'ClassPriors'


def _ratio(count, total):
    """Return count/total as a float, or 0.0 when total is 0."""
    return float(count) / total if total else 0.0


def main(stdin=None, stdout=None):
    """Training reducer: turn ``class TAB word TAB count`` records into a model.

    Output lines have the form ``word TAB count0,count1,Pr0,Pr1``.  The first
    line is the ``ClassPriors`` record (document counts per class and their
    relative frequencies); the remaining lines, sorted by word, give each
    word's per-class count and its share of that class's total word count.
    Nothing is written when the input contains no records.

    Args:
        stdin: iterable of input lines (defaults to ``sys.stdin``).
        stdout: object with ``write`` (defaults to ``sys.stdout``).
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout

    class0_count = defaultdict(int)
    class1_count = defaultdict(int)
    vocab = set()  # set membership is O(1); the old list made this quadratic

    for line in stdin:
        if not line.strip():
            continue  # ignore blank lines
        docClass, word, count = line.split('\t')
        label = int(docClass)
        if label == 1:
            class1_count[word] += int(count)
        elif label == 0:
            class0_count[word] += int(count)
        vocab.add(word)

    if not vocab:
        return

    # Pull the prior counts out of the dictionaries so they do not take part
    # in the per-class word totals below.
    prior0 = class0_count.pop(PRIOR_KEY, 0)
    prior1 = class1_count.pop(PRIOR_KEY, 0)
    vocab.discard(PRIOR_KEY)

    sum_class = prior0 + prior1
    stdout.write('%s\t%s,%s,%s,%s\n' % (
        PRIOR_KEY, prior0, prior1,
        _ratio(prior0, sum_class), _ratio(prior1, sum_class)))

    # total word count for each class; a class without words gets
    # probability 0.0 for every word instead of a ZeroDivisionError
    class0_wc = sum(class0_count.values())
    class1_wc = sum(class1_count.values())

    for word in sorted(vocab):
        count0 = class0_count.get(word, 0)
        count1 = class1_count.get(word, 0)
        stdout.write('%s\t%s,%s,%s,%s\n' % (
            word, count0, count1,
            _ratio(count0, class0_wc), _ratio(count1, class1_wc)))


if __name__ == "__main__":
    main()
    
    
# input comes from STDIN

# parse the input we got from mapper.py

# convert count and spam flag (currently a string) to int


# handle msgID - store all IDs, since there are not too many of them
# this is not the best way to get the priors; a two-level MapReduce job (ID - word) would be optimal
    
# calculate NB parameters, and write the dictionary to a file for the classification job
# prior probabilities

# conditional probability
    
# END STUDENT CODE HW231REDUCER_MODEL

In [None]:
%%writefile NaiveBayes/mapper_classify.py
#!/usr/bin/env python
from NaiveBayesModel import NaiveBayesModel
import sys, re, string
from math import exp
# Init mapper phase 

# read the MODEL into memory
# The model file resides on the local disk (make sure to ship it home from HDFS).


# Location of the trained model on the local disk.  If the model file is
# shipped under another name (the original notes mention "NaiveBayes.txt"),
# change this path accordingly.
MODEL_PATH = "NaiveBayes/model.txt"

# define regex for punctuation removal
regex = re.compile('[%s]' % re.escape(string.punctuation))


# START STUDENT CODE HW231MAPPER_CLASSIFY
def main(stdin=None, stdout=None, model=None):
    """Classification mapper: label every input document with the NB model.

    Input records are ``docID TAB docClass [TAB subject [TAB body]]``.  For
    each record the mapper writes ``docID TAB docClass TAB predictedClass``
    where ``predictedClass`` is 1 when the class-1 log posterior is strictly
    larger than the class-0 log posterior, else 0.

    Args:
        stdin: iterable of input lines (defaults to ``sys.stdin``).
        stdout: object with ``write`` (defaults to ``sys.stdout``).
        model: object providing ``classifyInLogs(words)``; when omitted the
            model is loaded from ``MODEL_PATH``.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    if model is None:
        # read the MODEL into memory
        model = NaiveBayesModel(MODEL_PATH)

    skipped = 0
    # inner loop mapper phase: process each record
    for line in stdin:
        # remove leading and trailing whitespace, then split into fields
        parts = line.strip().split("\t")
        if len(parts) < 2:
            # blank line or record without a class label
            skipped += 1
            continue
        # strip() drops trailing tabs, so a document with empty subject and
        # body has only two fields; treat the subject as empty.
        parts += [""] * (3 - len(parts))
        docID, docClass, title = parts[0:3]
        body = parts[3] if len(parts) == 4 else ""

        # use subject and body, lower-cased; punctuation becomes whitespace
        # and split() handles runs of whitespace
        words = regex.sub(' ', title.lower() + " " + body.lower()).split()

        logPrClass0, logPrClass1 = model.classifyInLogs(words)
        # Compare the log scores directly: exponentiating them first
        # underflows to 0.0 for long documents, which made every such
        # document fall into class 0.
        docClassClassify = 1 if logPrClass1 > logPrClass0 else 0

        stdout.write('%s\t%s\t%s\n' % (docID, docClass, docClassClassify))

    if skipped:
        # report on stderr so the data stream on stdout stays clean
        sys.stderr.write("mapper_classify: skipped %d malformed line(s)\n" % skipped)


if __name__ == "__main__":
    main()
    


# END STUDENT CODE HW231MAPPER_CLASSIFY

In [None]:
%%writefile NaiveBayes/reducer_classify.py
#!/usr/bin/env python
from operator import itemgetter
import sys, operator, math


# Report template; the text is kept exactly as emitted before.
REPORT_FORMAT = ('Multinomial Naive Bayes Classifier Results '
                 '[number of records, number of missed classified, error rate] are \n'
                 ' %d,%d,%3.2f')


def main(stdin=None, stdout=None):
    """Evaluation reducer: count misclassified documents and report the error rate.

    Input records are ``docID TAB trueLabel TAB predictedLabel``.  A single
    summary line pair is written with the number of records, the number of
    misclassifications and the error rate (misclassified / records; 0.0 when
    there are no records).

    Args:
        stdin: iterable of input lines (defaults to ``sys.stdin``).
        stdout: object with ``write`` (defaults to ``sys.stdout``).
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout

    numberOfRecords = 0
    NumberOfMisclassifications = 0

    # START STUDENT CODE HW231REDUCER_CLASSIFY
    for line in stdin:
        if not line.strip():
            continue  # ignore blank lines
        docID, trueLabel, predLabel = line.split('\t')
        numberOfRecords += 1
        if int(trueLabel) != int(predLabel):
            NumberOfMisclassifications += 1

    # This ratio is the error rate (not the accuracy); guard against an
    # empty input, which used to raise ZeroDivisionError.
    if numberOfRecords:
        errorRate = NumberOfMisclassifications / float(numberOfRecords)
    else:
        errorRate = 0.0
    # END STUDENT CODE HW231REDUCER_CLASSIFY

    stdout.write(REPORT_FORMAT % (numberOfRecords, NumberOfMisclassifications, errorRate) + "\n")


if __name__ == "__main__":
    main()
