## Edit Distance
- pip install editdistance

In [1]:
! pip install editdistance

Collecting editdistance
  Downloading editdistance-0.4-cp36-cp36m-win_amd64.whl
Installing collected packages: editdistance
Successfully installed editdistance-0.4


In [2]:
import editdistance
editdistance.eval('banana', 'bahama')

2

In [None]:
'banana' # n => h
'bahana' # n => m
'bahama'

In [3]:
import editdistance
editdistance.eval('machine', 'macinae')

2

In [None]:
'machine' #=> delete h
'macine'  #=> insert a
'macinae'

## Bag of words

In [8]:
?CountVectorizer

In [5]:
content = ['How to format my hard disk', 'Hard disk format problems']

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(content)
X

<2x7 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [7]:
print(vectorizer.get_feature_names())
X.toarray()

['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']


array([[1, 1, 1, 1, 1, 0, 1],
       [1, 1, 1, 0, 0, 1, 0]], dtype=int64)

In [9]:
X.toarray().transpose()

array([[1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0]], dtype=int64)

## 讀取 toy 資料

In [17]:
import os
path = 'data/toy'
for f in os.listdir(path):
    print(open(os.path.join(path, f)).read())

This is a toy post about machine learning. Actually, it contains not much interesting stuff.
Imaging databases provide storage capabilities.
Most imaging databases safe images permanently.
Imaging databases store data.
Imaging databases store data. Imaging databases store data. Imaging databases store data.


In [20]:
posts = [open(os.path.join(path, f)).read() for f in os.listdir(path)]
posts

['This is a toy post about machine learning. Actually, it contains not much interesting stuff.',
 'Imaging databases provide storage capabilities.',
 'Most imaging databases safe images permanently.',
 'Imaging databases store data.',
 'Imaging databases store data. Imaging databases store data. Imaging databases store data.']

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X= vectorizer.fit_transform(posts)
X

<5x25 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [47]:
X.toarray()

array([[1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1],
       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0],
       [0, 0, 0, 0, 3, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
        0, 0, 0]], dtype=int64)

In [23]:
print(vectorizer.get_feature_names())
X.toarray()

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'this', 'toy']


array([[1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1],
       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0],
       [0, 0, 0, 0, 3, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
        0, 0, 0]], dtype=int64)

In [26]:
new_post = 'imaging database qoo'
new_post_vec = vectorizer.transform([new_post])
new_post_vec

<1x25 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

## Euclidean Distance

In [35]:
import scipy as sp
import numpy as np
a = np.array([1,1,0,0,1])
b = np.array([0,1,1,0,1])

# method 1
sum((a - b) ** 2) ** (1/2)

# method 2
#np.array([a,b])
sp.linalg.norm(np.array([a,b]))


2.4494897427831779

### 計算imaging database 與每篇文章的距離

In [56]:
def dist(v1, v2):
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())

In [57]:
import sys
best_doc = None
best_dist = 999
best_i = None
num_samples = len(posts)
for i in range(0, num_samples):
    post = posts[i]
    if post==new_post:
        continue
    post_vec = X.getrow(i)
    d = dist(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s"%(i, d, post))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))


=== Post 0 with dist=3.87: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=2.00: Imaging databases provide storage capabilities.
=== Post 2 with dist=2.24: Most imaging databases safe images permanently.
=== Post 3 with dist=1.73: Imaging databases store data.
=== Post 4 with dist=5.57: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3 with dist=1.73


### 使用sklearn 的 euclidean distance

In [58]:
from sklearn.metrics.pairwise import euclidean_distances
ed = euclidean_distances(new_post_vec, X)
ed

array([[ 3.87298335,  2.        ,  2.23606798,  1.73205081,  5.56776436]])

In [46]:
#pos = ed.argsort()[0][0]
pos = ed.argsort().flatten()[0]
posts[pos]
#posts[pos]

'Imaging databases store data.'

### 更改距離計算方式

In [64]:
import scipy as sp
import numpy as np
a = np.array([1,1,0,0,1])
b = np.array([0,1,1,0,1])
c = np.array([1,1,0,0,1,0,0,0,0,0,0,0])
d = np.array([1,1,0,0,1,1,1,1,1,1])
a
sp.linalg.norm([d])

2.8284271247461903

In [83]:
def dist(v1, v2):
    v1_normalized  = v1 / sp.linalg.norm(v1) 
    v2_normalized  = v2 / sp.linalg.norm(v2)
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta)

In [84]:
import sys
best_doc = None
best_dist = 999
best_i = None
num_samples = len(posts)
for i in range(0, num_samples):
    post = posts[i]
    if post==new_post:
        continue
    post_vec = X.getrow(i).toarray()
    d = dist(post_vec, new_post_vec.toarray())
    print("=== Post %i with dist=%.2f: %s"%(i, d, post))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))


=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=1.05: Imaging databases provide storage capabilities.
=== Post 2 with dist=1.09: Most imaging databases safe images permanently.
=== Post 3 with dist=1.00: Imaging databases store data.
=== Post 4 with dist=1.00: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3 with dist=1.00


In [90]:
#vectorizer = CountVectorizer()
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(posts)
vectorizer.get_feature_names()

['actually',
 'capabilities',
 'contains',
 'data',
 'databases',
 'images',
 'imaging',
 'interesting',
 'learning',
 'machine',
 'permanently',
 'post',
 'provide',
 'safe',
 'storage',
 'store',
 'stuff',
 'toy']

In [93]:
list(vectorizer.get_stop_words())[0:10]

['afterwards',
 'hundred',
 'them',
 'more',
 'name',
 'against',
 'beside',
 'indeed',
 'the',
 'about']

## 安裝 nltk
- pip install nltk

In [97]:
import nltk.stem
s = nltk.stem.SnowballStemmer('english')
s.stem('graphics')

'graphic'

In [98]:
s.stem('imaging')

'imag'

In [99]:
s.stem('image')

'imag'

In [100]:
s.stem('imagination')

'imagin'

In [101]:
s.stem('imagine')

'imagin'

In [102]:
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        # retunr lambda doc : (analyzer(doc))
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

In [103]:
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

In [104]:
X = vectorizer.fit_transform(posts)

In [106]:
print(vectorizer.get_feature_names())

['actual', 'capabl', 'contain', 'data', 'databas', 'imag', 'interest', 'learn', 'machin', 'perman', 'post', 'provid', 'safe', 'storag', 'store', 'stuff', 'toy']


In [108]:
X.toarray()

array([[1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1],
       [0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0]], dtype=int64)

In [111]:
new_post = 'imaging database qoo'
new_post_vec = vectorizer.transform([new_post])
new_post_vec

<1x17 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [112]:
def dist(v1, v2):
    v1_normalized  = v1 / sp.linalg.norm(v1) 
    v2_normalized  = v2 / sp.linalg.norm(v2)
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta)

In [113]:
import sys
best_doc = None
best_dist = 999
best_i = None
num_samples = len(posts)
for i in range(0, num_samples):
    post = posts[i]
    if post==new_post:
        continue
    post_vec = X.getrow(i).toarray()
    d = dist(post_vec, new_post_vec.toarray())
    print("=== Post %i with dist=%.2f: %s"%(i, d, post))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))


=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist=0.63: Most imaging databases safe images permanently.
=== Post 3 with dist=0.77: Imaging databases store data.
=== Post 4 with dist=0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 2 with dist=0.63


In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(posts)
print(vectorizer.get_feature_names())
X.toarray()

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'this', 'toy']


array([[ 0.26726124,  0.26726124,  0.        ,  0.26726124,  0.        ,
         0.        ,  0.        ,  0.        ,  0.26726124,  0.26726124,
         0.26726124,  0.26726124,  0.26726124,  0.        ,  0.26726124,
         0.26726124,  0.        ,  0.26726124,  0.        ,  0.        ,
         0.        ,  0.        ,  0.26726124,  0.26726124,  0.26726124],
       [ 0.        ,  0.        ,  0.52451722,  0.        ,  0.        ,
         0.29550385,  0.        ,  0.29550385,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.52451722,  0.        ,
         0.52451722,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.26169047,  0.46449871,  0.26169047,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.46449871,  0.        ,
         0.        ,  0.46449871,  0.        ,  0

In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer,self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))


In [126]:
vectorizer = StemmedTfidfVectorizer(min_df=1,stop_words='english')
X = vectorizer.fit_transform(posts)

In [127]:
vectorizer.get_feature_names()

['actual',
 'capabl',
 'contain',
 'data',
 'databas',
 'imag',
 'interest',
 'learn',
 'machin',
 'perman',
 'post',
 'provid',
 'safe',
 'storag',
 'store',
 'stuff',
 'toy']

In [128]:
new_post = 'imaging database qoo'
new_post_vec = vectorizer.transform([new_post])
new_post_vec

<1x17 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [129]:
import sys
best_doc = None
best_dist = 999
best_i = None
num_samples = len(posts)
for i in range(0, num_samples):
    post = posts[i]
    if post==new_post:
        continue
    post_vec = X.getrow(i).toarray()
    d = dist(post_vec, new_post_vec.toarray())
    print("=== Post %i with dist=%.2f: %s"%(i, d, post))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))


=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=1.08: Imaging databases provide storage capabilities.
=== Post 2 with dist=0.86: Most imaging databases safe images permanently.
=== Post 3 with dist=0.92: Imaging databases store data.
=== Post 4 with dist=0.92: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 2 with dist=0.86


### 使用cosine distance 計算

In [132]:
from sklearn.metrics.pairwise import cosine_distances
cs = cosine_distances(new_post_vec,X)
cs.argsort()[::-1]

array([[2, 3, 4, 1, 0]], dtype=int64)

### 同義字處理 

In [139]:
# '柯文哲/柯P'

import jieba
jieba.load_userdict('userdict.txt')

a = ['柯文哲為了大巨蛋一事找趙藤雄算帳', '柯P將不在大巨蛋舉辦世運會']

corpus = [' '.join(jieba.cut(s)) for s in a]
corpus

['柯文哲 為了 大巨蛋 一事 找 趙藤雄 算帳', '柯P 將 不在 大巨蛋 舉辦 世運會']

In [142]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(X.toarray())

['一事', '不在', '世運會', '大巨蛋', '柯p', '柯文哲', '為了', '算帳', '舉辦', '趙藤雄']
[[1 0 0 1 0 1 1 1 0 1]
 [0 1 1 1 1 0 0 0 1 0]]


### 建立同義詞字典 (從維基百科取得同義詞)

In [146]:
import requests
from bs4 import BeautifulSoup
term = '柯文哲'
res = requests.get('https://zh.wikipedia.org/wiki/{}'.format(term))
soup = BeautifulSoup(res.text, 'lxml')
'/'.join([w.text for w in soup.select_one('.mw-parser-output p').select('b')])

'柯文哲/柯P/KP'

### 將同義詞輸入成Python 字典

In [156]:
synonym_dic = {}
for s in open('synonym.txt'):
    synonym = s.strip().split('/')
    for w in synonym[1:]:
        synonym_dic[w.lower()]  = synonym[0]
synonym_dic

{'kp': '柯文哲', '柯p': '柯文哲', '特朗普': '川普', '趙大雄': '趙藤雄'}

### 產生 SynonymCountVectorizer

In [157]:
import nltk.stem

class SynonymCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(SynonymCountVectorizer, self).build_analyzer()
        return lambda doc: (synonym_dic.get(w, w) for w in analyzer(doc))

In [158]:
vectorizer = SynonymCountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

['一事', '不在', '世運會', '大巨蛋', '柯文哲', '為了', '算帳', '舉辦', '趙藤雄']


In [159]:
X.toarray()

array([[1, 0, 0, 1, 1, 1, 1, 0, 1],
       [0, 1, 1, 1, 1, 0, 0, 1, 0]], dtype=int64)

### 設定中文停用詞

In [161]:
stopwords = ['為了', '一事', '不在']

In [162]:
vectorizer = SynonymCountVectorizer(stop_words=stopwords)
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

['世運會', '大巨蛋', '柯文哲', '算帳', '舉辦', '趙藤雄']


## 英文資料分類 (垃圾郵件)

In [3]:
with open('data/email3/easy_ham/01251.793e5c04967cb90191e805dfa619c55a', 'r') as f:
    msg = f.readlines()
    first_blank_index = msg.index('\n')
    msg = msg[(first_blank_index + 1): ]
    #print(''.join(msg))

In [4]:
#?open

def get_msg(path):
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        msg = f.readlines()
        first_blank_index = msg.index('\n')
        msg = msg[(first_blank_index + 1): ]
        return ''.join(msg)

In [5]:
get_msg('data/email3/easy_ham/01251.793e5c04967cb90191e805dfa619c55a')

'Matthias Haase wrote:\n> RH ships the code with the bytecode hinter disabled which makes \n> non-AA fonts really ugly.\n> This reqiures only a small change for include/freetype/config/ftoption.h,\n> it is very well documented.\n\nRed Hat 8.0 ships with the bytecode hinter enabled; I think 7.3 may have \nas well.\n\nThe enabling change to "ftoption.h" is made by Red Hat\'s SRPM before \nbuilding.  Take a look at "freetype-2.1.1-enable-ft2-bci.patch" from the \nSRPM; it\'s pretty clear that this does exactly what needs to be done.\n\nSo if your fonts look ugly, lack of bytecode hinting is *not* the cause.\n\n\n_______________________________________________\nRPM-List mailing list <RPM-List@freshrpms.net>\nhttp://lists.freshrpms.net/mailman/listinfo/rpm-list\n\n\n'

In [6]:
import os
filelist = os.listdir('data/email3/easy_ham')
filelist = [f for f in filelist if f != 'cmds']


In [7]:
import os
path = 'data/email3/easy_ham/'

def get_msgdir(path):
    msg_ary = []
    for f in os.listdir(path):
        if f != 'cmds':
            msg_ary.append(get_msg(os.path.join(path, f)))
    return msg_ary


In [8]:
train_spam_messages    = get_msgdir('data/email3/spam')
train_easyham_messages = get_msgdir('data/email3/easy_ham')

In [208]:
#train_easyham_messages[0]

In [9]:
test_spam_messages    = get_msgdir('data/email3/spam_2')
test_easyham_messages = get_msgdir('data/email3/easy_ham_2')

### 使用re 去除掉特殊字元


In [14]:
#train_spam_messages[0]
import re
a = 'Iabc3D4563D789I'
re.sub('3D', '', a)

'Iabc456789I'

In [29]:
msg = ' '.join(re.sub('<(.|\n)*?>', '',train_spam_messages[0]).split())
msg = re.sub('&\w+;', ' ', msg)
msg = re.sub('_+', '_', msg)

msg


"Save up to 70% on Life Insurance. Why Spend More Than You Have To? Life Quote Savings Ensurin= g your family's financial security is very important. Life Quote Savings ma= kes buying life insurance simple and affordable. We Provide FREE Access = to The Very Best Companies and The Lowest Rates. Life Quote Savings is FAST, EAS= Y and SAVES you money! Let us help you get started with the best val= ues in the country on new coverage. You can SAVE hundreds or even tho= usands of dollars by requesting a FREE quote from Lifequote Savings. = Our service will take you less than 5 minutes to complete. Shop an= d compare. SAVE up to 70% on all types of Life insurance! Click Here For Your= Free Quote! Protecting your family is the best investment you'll eve= r make! If you are in receipt of this= email in error and/or wish to be removed from our list, PLEASE CLICK HERE AND TYPE = REMOVE. If you reside in any state which prohibits e-mail solicitations for insuran= ce, please disregard this email."

In [37]:
#[w for w in msg.lower().split() if re.search('[a-zA-Z]', w)]

In [46]:
' '.join([w for w in re.split('=|\.|!| +|,|\?|/',msg.lower()) \
 if re.search('[a-zA-Z]', w) and len(w) > 1])

"save up to on life insurance why spend more than you have to life quote savings ensurin your family's financial security is very important life quote savings ma kes buying life insurance simple and affordable we provide free access to the very best companies and the lowest rates life quote savings is fast eas and saves you money let us help you get started with the best val ues in the country on new coverage you can save hundreds or even tho usands of dollars by requesting free quote from lifequote savings our service will take you less than minutes to complete shop an compare save up to on all types of life insurance click here for your free quote protecting your family is the best investment you'll eve make if you are in receipt of this email in error and or wish to be removed from our list please click here and type remove if you reside in any state which prohibits e-mail solicitations for insuran ce please disregard this email"

### 建立資料清理函數

In [49]:
def get_msg_words(msg):
    msg = re.sub('3D', '', msg)
    
    msg = re.sub('<(.|\n)*?>', ' ', msg)
    msg = re.sub('&\w+;', ' ', msg)

    msg = re.sub('_+', '_', msg)
    msg = ' '.join(msg.split())
    msg = ' '.join([w for w in re.split('=|\.|!| +|,|\?|/',msg.lower()) \
         if re.search('[a-zA-Z]', w) and len(w) > 1])
    return msg


In [50]:
get_msg_words(train_spam_messages[0])

"save up to on life insurance why spend more than you have to life quote savings ensurin your family's financial security is very important life quote savings ma kes buying life insurance simple and affordable we provide free access to the very best companies and the lowest rates life quote savings is fast eas and saves you money let us help you get started with the best val ues in the country on new coverage you can save hundreds or even tho usands of dollars by requesting free quote from lifequote savings our service will take you less than minutes to complete shop an compare save up to on all types of life insurance click here for your free quote protecting your family is the best investment you'll eve make if you are in receipt of this email in error and or wish to be removed from our list please click here and type remove if you reside in any state which prohibits e-mail solicitations for insuran ce please disregard this email"

In [53]:
train_set_spam = [get_msg_words(w) for w in train_spam_messages]
train_set_ham = [get_msg_words(w) for w in train_easyham_messages]

### 建立資料集

In [54]:
corpus = []
tags   = []

for message in train_set_spam:
    corpus.append(message)
    tags.append('spam')

for message in train_set_ham:
    corpus.append(message)
    tags.append('ham')

In [56]:
len(corpus), len(tags)

(3000, 3000)

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer,self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

In [72]:
#vectorizer = StemmedTfidfVectorizer(stop_words='english')
vectorizer = StemmedCountVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
X

<3000x48946 sparse matrix of type '<class 'numpy.int64'>'
	with 272752 stored elements in Compressed Sparse Row format>

### 將資料分成訓練與測試資料集

In [73]:
from sklearn.model_selection import train_test_split
train_data, test_data, train_tag, test_tag = train_test_split(X, tags, test_size = 0.33, random_state = 42)

### 建立分類模型

In [74]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha = 0.1)
clf.fit(train_data, train_tag)


MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

### 驗證預測準確度

In [75]:
predicted = clf.predict(test_data)

In [76]:
from sklearn.metrics import accuracy_score
accuracy_score(test_tag, predicted)

0.98282828282828283

In [77]:
from sklearn.metrics import confusion_matrix
print(clf.classes_)
confusion_matrix(test_tag, predicted)

['ham' 'spam']


array([[817,   1],
       [ 16, 156]])