In [1]:
import pandas as pd
import numpy as npb

# 1.create train data

In [24]:
def loadDataSet():
    """
    创建数据集
    :return: 单词列表postingList, 所属类别classVec
    """
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], #[0,0,1,1,1......]
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec
X,Y=loadDataSet()
X,Y

([['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
  ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
  ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
  ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
  ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
  ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']],
 [0, 1, 0, 1, 0, 1])

# 2.create Vocab List

In [19]:
def createVocabList(dataSet):
    """
    获取所有单词的集合
    :param dataSet: 数据集
    :return: 所有单词的集合(即不含重复元素的单词列表)
    """
    vocabSet = set([])  # create empty set
    for document in dataSet:
        # 操作符 | 用于求两个集合的并集
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)
voc_ls=createVocabList(X)
voc_ls

['I',
 'buying',
 'food',
 'has',
 'quit',
 'cute',
 'posting',
 'so',
 'dalmation',
 'steak',
 'ate',
 'help',
 'maybe',
 'my',
 'not',
 'stupid',
 'please',
 'to',
 'stop',
 'how',
 'is',
 'problems',
 'him',
 'love',
 'mr',
 'flea',
 'take',
 'worthless',
 'licks',
 'garbage',
 'park',
 'dog']

# 3.word to vec

In [20]:
def setOfWords2Vec(vocabList, inputSet):
    """
    遍历查看该单词是否出现，出现该单词则将该单词置1
    :param vocabList: 所有单词集合列表
    :param inputSet: 输入数据集
    :return: 匹配列表[0,1,0,1...]，其中 1与0 表示词汇表中的单词是否出现在输入的数据集中
    """
    # 创建一个和词汇表等长的向量，并将其元素都设置为0
    returnVec = [0] * len(vocabList)# [0,0......]
    # 遍历文档中的所有单词，如果出现了词汇表中的单词，则将输出的文档向量中的对应值设为1
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print ("the word is not in my Vocabulary!")
    return returnVec


In [25]:
for i in range(len(X)):
    X[i]=setOfWords2Vec(voc_ls,X[i])
X=np.array(X)
X

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 0, 1, 1],
       [1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
        1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 1]])

# 4.build model 

In [26]:
from sklearn.naive_bayes import BernoulliNB
clf_obj=BernoulliNB()
clf=clf_obj.fit(X,Y)

# 5.test

In [52]:
def test(input_ls):
    vec=setOfWords2Vec(voc_ls,input_ls)
    vec=np.array(vec)
    vec=vec.reshape((1,-1))
    Y_pre=clf.predict(vec)
    print('classified as %s'%Y_pre)

In [53]:
test1_Entry = ['love', 'my', 'dalmation']
test(test1_Entry)

classified as [0]


In [54]:
test2_Entry = ['stupid', 'garbage']
test(test2_Entry)

classified as [1]


# project2--Filtering spam 

# 1.sentence to word list

In [213]:
def textParse(bigString):
    '''
    Desc:
        接收一个大字符串并将其解析为字符串列表
    Args:
        bigString -- 大字符串
    Returns:
        去掉少于 2 个字符的字符串，并将所有字符串转换为小写，返回字符串列表
    '''
    import re
    # 使用正则表达式来切分句子，其中分隔符是除单词、数字外的任意字符串
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

In [214]:
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
textParse(mySent)

  return _compile(pattern, flags).split(string, maxsplit)


['this',
 'book',
 'the',
 'best',
 'book',
 'python',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

# 2.get dataset

In [245]:
X_word = []
Y = []
for i in range(1, 20):
    a=open("spam/%s.txt"%i).read()
    b=open("ham/%s.txt"%i).read()
    wordList = textParse(a)
    X_word.append(wordList)
    Y.append(1)
    wordList2 = textParse(b)
    X_word.append(wordList2)
    Y.append(0)

  return _compile(pattern, flags).split(string, maxsplit)


In [246]:
len(X_word)


38

In [247]:
voc_ls = createVocabList(X_word)
voc_ls

['things',
 'fast',
 'only',
 'you',
 'http',
 'naturalpenisenhancement',
 'his',
 'severepain',
 'codeine',
 'guaranteeed',
 'pictures',
 'kerry',
 'rock',
 'free',
 'trip',
 'members',
 'service',
 'said',
 'superb',
 '119',
 'cold',
 'bargains',
 'herbal',
 'per',
 'favorite',
 'discount',
 'all',
 'how',
 '156',
 'fine',
 'tabs',
 '138',
 'interesting',
 'done',
 '322',
 'peter',
 'via',
 'february',
 '25mg',
 'need',
 'from',
 'well',
 '562',
 'opioid',
 'program',
 '0nline',
 'supporting',
 'encourage',
 '129',
 'placed',
 'tokyo',
 '570',
 'storedetailview_98',
 'over',
 'jewerly',
 '15mg',
 'status',
 'fda',
 'jquery',
 '100',
 'style',
 'pro',
 'cards',
 'major',
 'amex',
 'winter',
 'opportunity',
 'strategy',
 'microsoft',
 'mailing',
 'view',
 'sent',
 'inches',
 'photoshop',
 'come',
 'since',
 'vuitton',
 'time',
 'pain',
 'here',
 'gucci',
 '366',
 'finder',
 'mail',
 'assistance',
 'fundamental',
 'away',
 'courier',
 '100m',
 'narcotic',
 'far',
 'shipment',
 'inform',

### tips : there are some numbers might be a problem

In [248]:
X_word

[['codeine',
  '15mg',
  'for',
  '203',
  'visa',
  'only',
  'codeine',
  'methylmorphine',
  'narcotic',
  'opioid',
  'pain',
  'reliever',
  'have',
  '15mg',
  '30mg',
  'pills',
  '15mg',
  'for',
  '203',
  '15mg',
  'for',
  '385',
  '15mg',
  'for',
  '562',
  'visa',
  'only'],
 ['peter',
  'with',
  'jose',
  'out',
  'town',
  'you',
  'want',
  'meet',
  'once',
  'while',
  'keep',
  'things',
  'going',
  'and',
  'some',
  'interesting',
  'stuff',
  'let',
  'know',
  'eugene'],
 ['hydrocodone',
  'vicodin',
  'brand',
  'watson',
  'vicodin',
  '750',
  '195',
  '120',
  '570',
  'brand',
  'watson',
  '750',
  '195',
  '120',
  '570',
  'brand',
  'watson',
  '325',
  '199',
  '120',
  '588',
  'noprescription',
  'required',
  'free',
  'express',
  'fedex',
  'days',
  'delivery',
  'for',
  'over',
  '200',
  'order',
  'major',
  'credit',
  'cards',
  'check'],
 ['yay',
  'you',
  'both',
  'doing',
  'fine',
  'working',
  'mba',
  'design',
  'strategy',
  'c

In [249]:
for i in range(len(X_word)):
    X_word[i]=setOfWords2Vec(voc_ls,X_word[i])
X_train=X_word[:-3][:]
X=np.array(X_train)
len(X)

35

In [250]:
Y=Y[:-3]
Y

[1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1]

# 3.train model

In [251]:
clf_obj2=BernoulliNB()
clf2=clf_obj2.fit(X,Y)

# 4.test

In [252]:
test1=open('ham/10.txt').read()
test1

'Ryan Whybrew commented on your status.\n\nRyan wrote:\n"turd ferguson or butt horn."\n'

In [259]:
def test(sentence):
    test_word=textParse(sentence)
    vec=setOfWords2Vec(voc_ls,test_word)
    test_entry=np.array(vec).reshape((1,-1))
    print(clf2.predict(test_entry))

In [260]:
test(test1)

[0]


  return _compile(pattern, flags).split(string, maxsplit)


In [261]:
test2=open('spam/10.txt').read()
test2

'OrderCializViagra Online & Save 75-90%\n\n0nline Pharmacy NoPrescription required\nBuy Canadian Drugs at Wholesale Prices and Save 75-90%\nFDA-Approved drugs + Superb Quality Drugs only!\nAccept all major credit cards'

In [263]:
test(test2)

[1]


  return _compile(pattern, flags).split(string, maxsplit)


In [264]:
test3=open('spam/1.txt').read()
test3

'--- Codeine 15mg -- 30 for $203.70 -- VISA Only!!! --\n\n-- Codeine (Methylmorphine) is a narcotic (opioid) pain reliever\n-- We have 15mg & 30mg pills -- 30/15mg for $203.70 - 60/15mg for $385.80 - 90/15mg for $562.50 -- VISA Only!!! ---'

In [265]:
test(test3)

[1]


  return _compile(pattern, flags).split(string, maxsplit)


In [266]:
test4=open('spam/25.txt').read()
test4

'Experience with BiggerPenis Today! Grow 3-inches more\n\nThe Safest & Most Effective Methods Of_PenisEn1argement.\nSave your time and money!\nBetterErections with effective Ma1eEnhancement products.\n\n#1 Ma1eEnhancement Supplement. Trusted by Millions. Buy Today!'

In [267]:
test(test3)

[1]


  return _compile(pattern, flags).split(string, maxsplit)
