In [72]:
import io
import os

import numpy
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

def handleReadFile( strPath ) :
    """Yield ``(path, message)`` for every file found under ``strPath``.

    The directory tree is walked recursively. Each file is decoded as
    latin1. Lines up to and including the first blank line are skipped;
    every line after it is collected as the message body.

    Parameters
    ----------
    strPath : str
        Root directory to scan.

    Yields
    ------
    tuple
        ``(path, message)`` where ``path`` is ``root`` joined with the file
        name and ``message`` is the collected body lines joined with a
        newline. Each collected line keeps its own terminator, so
        consecutive body lines end up separated by two newlines. A file
        without a blank line yields an empty message.
    """
    for root, dirnames, filenames in os.walk( strPath ) :
        # os.walk returns entries in arbitrary order. Sorting (in place for
        # dirnames, so the walk itself follows it) makes the row order, and
        # therefore any seeded split downstream, reproducible.
        dirnames.sort()
        for filename in sorted( filenames ) :
            path = os.path.join( root, filename )

            inBody = False
            lines = []
            # The context manager closes the handle even if reading raises.
            with io.open( path, 'r', encoding = 'latin1' ) as f :
                for line in f :
                    if inBody :
                        lines.append( line )
                    elif line == '\n' :
                        # First blank line: everything after it is body.
                        inBody = True
            message = '\n' . join( lines )
            yield path, message

def handleDataFrameFromDirectory( strPath, strClassification ) :
    """Build a DataFrame with one row per file found under ``strPath``.

    Each row holds the text yielded by ``handleReadFile`` in the 'message'
    column and ``strClassification`` in the 'class' column. The frame is
    indexed by the path yielded alongside each message.
    """
    entries = list( handleReadFile( strPath ) )

    paths = [ path for path, _ in entries ]
    records = [
        { 'message' : body, 'class' : strClassification }
        for _, body in entries
    ]

    return DataFrame( records, index = paths )


# Load both corpora and stack them into one labelled frame, indexed by file path.
# pandas.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; one concat also avoids copying the
# frame once per append.
data = pd.concat( [
    handleDataFrameFromDirectory( 'emails/spam/', 'spam' ),
    handleDataFrameFromDirectory( 'emails/ham/', 'ham' ),
] )

# Fail here with a clear message rather than later inside the vectorizer
# when the email directories are missing or empty.
assert not data.empty, "no emails found under 'emails/spam/' or 'emails/ham/'"

data.head()

Unnamed: 0,class,message
emails/spam/00001.7848dde101aa985090474a91ec93fcf0,spam,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr..."
emails/spam/00002.d94f1b97e48ed3b553b3508d116e6a09,spam,1) Fight The Risk of Cancer!\n\nhttp://www.adc...
emails/spam/00003.2ee33bc6eacdb11f38d052c44819ba6c,spam,1) Fight The Risk of Cancer!\n\nhttp://www.adc...
emails/spam/00004.eac8de8d759b7e74154f142194282724,spam,##############################################...
emails/spam/00005.57696a39d7d84318ce497886896bf90d,spam,I thought you might like these:\n\n1) Slim Dow...


In [73]:
# Now we will use CountVectorizer: it will split each message and add all the words to a list of counts. Then we will pass that to
# MultinomialNB and call its fit() function to create the spam classifier.

# Labels: the 'class' column as a plain array, one entry per message.
targets = data['class'].values

# Features: fit the vectorizer on every message and get the word-count matrix.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform( data['message'].values )

# Train on the whole corpus. fit() is left as the last expression so the
# fitted estimator is displayed as the cell output.
classifier = MultinomialNB()
classifier.fit( counts, targets )

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [81]:
# Quick sanity check on two hand-written messages.
examples = [
    "loan car",
    "hi, cricket tomrrow?",
]

# transform (not fit_transform): reuse the already-fitted vectorizer as-is.
examples_counts = vectorizer.transform( examples )
predictions = classifier.predict( examples_counts )

predictions

array(['spam', 'ham'],
      dtype='|S4')

In [75]:
# Hold out 30% of the messages for testing; the fixed random_state keeps the
# split the same on every run for the same input order.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'].values,
    data['class'].values,
    test_size = 0.30,
    random_state = 42
)

In [80]:
# Disclaimer: I am not yet sure whether my way of splitting the data and then training and testing works properly; I am still in the learning phase with this material.
# Let's use this as an exercise and train on the training segment we created above.

# Re-fit the vectorizer on the training messages only, so the held-out
# messages play no part in building the features.
# NOTE: this overwrites the fitted state of the shared `vectorizer` (and, below,
# rebinds `classifier`), so any cell run after this one uses the train-only model.
train_counts = vectorizer.fit_transform( X_train )

# Fresh classifier trained on the training split and its labels. The previous
# `targets = data['class'].values` line was removed: it was never used here and
# wrongly suggested the full-corpus labels were involved.
classifier = MultinomialNB()
classifier.fit( train_counts, y_train )

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [82]:
# Predict on the held-out segment.
# transform (not fit_transform): the test messages are encoded with the
# vectorizer exactly as it was last fitted.
test_counts = vectorizer.transform( X_test )
predictions = classifier.predict( test_counts )

predictions

array(['ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam',
       'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham',
       'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'ham', 'ham',
       'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham',
       'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam',
       'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam',
       'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', '