# Prepare: tokenizing text

In [1]:
# A sample sentence used below to demonstrate tokenization
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'

In [2]:
# Split the sentence on whitespace only; punctuation stays attached to the
# neighbouring word (see 'M.L.' and 'upon.' in the output below)
mySent.split()

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

In [3]:
# To convert the string into a word vector we also need to split on punctuation.
# The regular-expression class \W matches any character that is not a letter,
# digit or underscore, so splitting on \W+ cuts at every run of such characters.
import re
# Raw string literal: in a normal string '\W' is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning since Python 3.12).
listofTokens = re.split(r'\W+', mySent)
listofTokens

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon',
 '']

In [4]:
# Discard the empty strings left over from the regex split
# (an empty string is the only falsy token)
listofTokens = [token for token in listofTokens if token]
listofTokens

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [5]:
# Normalise case so that e.g. 'This' and 'this' are treated as the same word
listofTokens = list(map(str.lower, listofTokens))
listofTokens

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'm',
 'l',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [6]:
# Now let's see this in action with a full email from our email dataset.
# Forward slashes are accepted on Windows as well as POSIX systems, unlike a
# hard-coded backslash path, and match the paths used by spamTest().
with open('email/ham/6.txt') as f:
    # split the whole message on runs of non-word characters; the raw string
    # avoids the invalid '\W' escape-sequence warning
    listofTokens = re.split(r'\W+', f.read())
listofTokens

['Hello',
 'Since',
 'you',
 'are',
 'an',
 'owner',
 'of',
 'at',
 'least',
 'one',
 'Google',
 'Groups',
 'group',
 'that',
 'uses',
 'the',
 'customized',
 'welcome',
 'message',
 'pages',
 'or',
 'files',
 'we',
 'are',
 'writing',
 'to',
 'inform',
 'you',
 'that',
 'we',
 'will',
 'no',
 'longer',
 'be',
 'supporting',
 'these',
 'features',
 'starting',
 'February',
 '2011',
 'We',
 'made',
 'this',
 'decision',
 'so',
 'that',
 'we',
 'can',
 'focus',
 'on',
 'improving',
 'the',
 'core',
 'functionalities',
 'of',
 'Google',
 'Groups',
 'mailing',
 'lists',
 'and',
 'forum',
 'discussions',
 'Instead',
 'of',
 'these',
 'features',
 'we',
 'encourage',
 'you',
 'to',
 'use',
 'products',
 'that',
 'are',
 'designed',
 'specifically',
 'for',
 'file',
 'storage',
 'and',
 'page',
 'creation',
 'such',
 'as',
 'Google',
 'Docs',
 'and',
 'Google',
 'Sites',
 'For',
 'example',
 'you',
 'can',
 'easily',
 'create',
 'your',
 'pages',
 'on',
 'Google',
 'Sites',
 'and',
 'share',


# Naive Bayes Classifier

In [7]:
# let's make the naive bayes classifier again

import numpy as np

def createVocabList(dataSet):
    """Collect every distinct token that occurs in any document.

    dataSet is an iterable of documents, each document being an iterable of
    hashable tokens. The result is a list without duplicates; its order is
    whatever iterating the underlying set yields.
    """
    uniqueWords = set()
    for doc in dataSet:
        # merge this document's tokens into the running vocabulary
        uniqueWords = uniqueWords.union(set(doc))
    return list(uniqueWords)

def bagofWords2Vector(vocabList, inputDoc):
    """Encode one document as a bag-of-words count vector.

    Parameters
    ----------
    vocabList : list
        Vocabulary of hashable tokens; element i of the result counts how
        often vocabList[i] occurs in inputDoc.
    inputDoc : iterable
        Tokens of the document to encode.

    Returns
    -------
    numpy.ndarray
        Array with len(vocabList) entries holding the occurrence counts.

    A token that is not in vocabList is reported with a printed message and
    otherwise ignored.
    """
    # Build the word -> position table once instead of scanning the list with
    # vocabList.index() for every token of the document.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        # setdefault keeps the first position, matching list.index semantics
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0]*len(vocabList)
    for word in inputDoc:
        index = wordIndex.get(word)
        if index is None:
            print("The word {} is not contained in the vocabList".format(word))
        else:
            returnVec[index] += 1
    return np.array(returnVec)

def trainNB(trainMatrix, categoryList):
    """Estimate the naive Bayes parameters from bag-of-words training vectors.

    Parameters
    ----------
    trainMatrix : sequence of equal-length count vectors
        One bag-of-words vector per training document.
    categoryList : sequence of int
        Label of each document: 1 for spam, 0 for not spam.

    Returns
    -------
    p1Vec : numpy.ndarray
        Log of the smoothed per-word probabilities for class 1.
    p0Vec : numpy.ndarray
        Log of the smoothed per-word probabilities for class 0.
    pSpam : float
        Fraction of training documents labelled 1.
    """
    numofWords = len(trainMatrix[0])
    numofDocuments = len(trainMatrix)
    # Smoothing: start every word count at 1 and every denominator at 2 so that
    # a word unseen in one class does not produce log(0).
    p1Num = np.ones(numofWords); p0Num = np.ones(numofWords)
    p1Denom = 2.0; p0Denom = 2.0
    for i in range(numofDocuments):
        if categoryList[i] == 1:
            p1Num += trainMatrix[i]
            # total number of tokens in THIS document only (a scalar), not the
            # column sums of the whole training matrix
            p1Denom += np.sum(trainMatrix[i])
        elif categoryList[i] == 0:
            p0Num += trainMatrix[i]
            p0Denom += np.sum(trainMatrix[i])
    # work in log space so that products of many small probabilities become
    # sums and do not underflow
    p1Vec = np.log(p1Num/p1Denom)
    p0Vec = np.log(p0Num/p0Denom)
    pSpam = sum(categoryList)/numofDocuments
    return p1Vec, p0Vec, pSpam

def classifyNB(vector2classify, p1Vec, p0Vec, pSpam):
    """Pick the more likely class for one bag-of-words vector.

    p1Vec and p0Vec are the per-word log-probability vectors for class 1 and
    class 0, and pSpam is the prior probability of class 1. Returns 1 when the
    class-1 log score is strictly greater than the class-0 log score, else 0.
    """
    logPriorSpam = np.log(pSpam)
    logPriorHam = np.log(1-pSpam)
    # log-likelihood of the observed counts plus the log prior, per class
    scoreSpam = sum(vector2classify * p1Vec) + logPriorSpam
    scoreHam = sum(vector2classify * p0Vec) + logPriorHam
    return 1 if scoreSpam > scoreHam else 0
        

In [8]:
import re
import random


# This function takes a big string and parses it into a list of word tokens.
# It drops every token of two characters or fewer and converts the rest to lowercase.
def textParse(bigString):
    """Tokenize bigString into lowercase words longer than two characters.

    The string is split on runs of non-word characters (regex \\W+), tokens
    with len(token) <= 2 are discarded, and the remaining tokens are
    lower-cased so that e.g. 'Google' and 'google' map to the same word.
    """
    # raw string: '\W' in a normal literal is an invalid escape sequence
    listofTokens = [token.lower() for token in re.split(r'\W+', bigString) if len(token) > 2]
    return listofTokens

# This function automates one run of the naive Bayes spam experiment
# (hold-out validation) and returns the error rate on the held-out emails.
def spamTest():
    """Train on 40 emails, test on the remaining 10, and return the error rate.

    Reads email/spam/1.txt..25.txt (label 1) and email/ham/1.txt..25.txt
    (label 0), builds the vocabulary from all 50 documents, holds out 10
    distinct, randomly chosen documents for testing and trains on the other 40.

    Returns
    -------
    float
        Fraction of the held-out documents that were misclassified.
    """
    # docList stores the tokenized documents
    # categoryList stores each document's category (1 = spam, 0 = not spam)
    docList = []; categoryList = []
    for i in range(1, 26): # because there are 25 files per category
        with open('email/spam/%d.txt' % i) as f:
            docList.append(textParse(f.read()))
        categoryList.append(1)
        with open('email/ham/%d.txt' % i) as f:
            docList.append(textParse(f.read()))
        categoryList.append(0)
    vocabList = createVocabList(docList)

    # Split the document indices into a test set of 10 and a training set of 40.
    allIndices = np.arange(len(docList))
    # replace=False: without it the same document can be drawn several times,
    # leaving fewer than 10 distinct test documents and more than 40 training ones
    testSet = np.random.choice(allIndices, 10, replace=False)
    # allIndices[k] == k, so deleting by position removes exactly the test documents
    trainSet = np.delete(allIndices, testSet)

    # Build the training matrix and the matching list of labels
    trainMat = [bagofWords2Vector(vocabList, docList[i]) for i in trainSet]
    trainCatList = [categoryList[i] for i in trainSet]
    # trainNB returns the class-1 vector first, then the class-0 vector
    p1Vec, p0Vec, pSpam = trainNB(trainMat, trainCatList)

    # Now calculate the error rate on the held-out documents
    errorCount = 0 # increases by 1 for every misclassified document
    for i in testSet:
        wordVec = bagofWords2Vector(vocabList, docList[i])
        if categoryList[i] != classifyNB(wordVec, p1Vec, p0Vec, pSpam):
            errorCount += 1
    errorRatio = errorCount/len(testSet)
    return float(errorRatio)

In [9]:
# Run one random hold-out split; the displayed value is the fraction of test emails misclassified
spamTest()

{'EMS', 'MBA', 'Microsoft', 'price', 'prototype', 'dusty', 'your', 'Just', 'two', 'PERMANANTLY', 'pavilion', 'message', 'reliever', 'These', 'issues', 'starting', 'inspired', 'computing', 'keep', 'art', 'Check', 'Could', 'enjoy', 'hotels', 'Sorry', 'yeah', '100', 'softwares', 'only', '5mg', 'because', 'Brands', 'welcome', 'Of_PenisEn1argement', 'designed', 'pill', 'VISA', 'there', 'finance', 'Sounds', 'Wilmott', 'Safest', 'comment', 'then', 'Zolpidem', 'plugin', 'UPS', 'foaming', 'model', 'fine', 'sites', 'titles', 'meet', 'view', 'too', 'page', 'signed', 'guy', 'Most', 'Shipment', 'follow', 'Brand', 'access', 'others', 'spaying', 'fans', 'Cost', 'retirement', '750', 'right', 'latest', 'NaturalPenisEnhancement', 'All', 'year', 'recieve', 'let', 'serial', 'risk', 'Experts', 'Amazing', 'butt', 'Can', 'Benoit', 'either', 'below', 'http', 'china', 'extended', 'Mandelbrot', 'supporting', '138', 'edit', 'Codeine', 'OEM', 'sky', 'level', 'Millions', '291', 'job', '588', 'York', 'MoneyBack', '

0.1