# ●ナイーブベイズ演習 迷惑メールの振り分け

In [1]:
import pandas as pd
df = pd.read_table('SMSSpamCollection', sep='\t' , header = None , names=['label', 'message'])

In [2]:
df.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label      5572 non-null object
message    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


### ●前処理

In [4]:
df['label'] = df.label.map({'ham':0 , 'spam':1})
df.head(10)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


### ●Bag of wordsを体験する

In [5]:
messages = ['Thank you for calling.',
            'Thank you for your inquiry',
            'Thanks for keeping in touch.',
            'Thanks for getting in touch with me?']

### -Bag of words 実装

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

count_vec = CountVectorizer()
count_vec

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
count_vec.fit(messages)
count_vec.vocabulary_   # fitメソッドはドキュメントにもある通り、データを読み込み、単語に区切った上で辞書形式にします。

{'calling': 0,
 'for': 1,
 'getting': 2,
 'in': 3,
 'inquiry': 4,
 'keeping': 5,
 'me': 6,
 'thank': 7,
 'thanks': 8,
 'touch': 9,
 'with': 10,
 'you': 11,
 'your': 12}

### -サンプル 行列形式に変形する transform

In [27]:
data = count_vec.transform(messages)   # fitさせた、CountVectorizerは、transformメソッドを使用することで、行列形式に変換することができます。
data.todense()   # 変換結果はtodence()メソッドを使用することで確認することができます。

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

### ●データセットを分割する

In [35]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(df['message'],df['label'],test_size=0.2,random_state=1)

### ●X_train、X_testをBag of wordsに置き換える

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

count_vec = CountVectorizer(lowercase=True)

count_vec.fit(X_train,X_test)
count_vec.vocabulary_

{'hi': 3426,
 'where': 7445,
 'are': 1053,
 'you': 7677,
 'we': 7373,
 're': 5567,
 'at': 1121,
 'and': 965,
 'they': 6797,
 'not': 4816,
 'keen': 3909,
 'to': 6892,
 'go': 3156,
 'out': 4988,
 'kind': 3943,
 'of': 4875,
 'am': 939,
 'but': 1581,
 'feel': 2798,
 'shouldn': 6080,
 'so': 6233,
 'can': 1635,
 'tomo': 6913,
 'don': 2411,
 'mind': 4477,
 'do': 2380,
 'if': 3596,
 'home': 3472,
 'then': 6785,
 'come': 1911,
 'down': 2436,
 'within': 7514,
 'min': 4474,
 'when': 7441,
 'guys': 3281,
 'getting': 3119,
 'back': 1199,
 'said': 5848,
 'were': 7422,
 'thinking': 6807,
 'about': 761,
 'staying': 6411,
 'for': 2930,
 'mcr': 4395,
 'tell': 6713,
 'my': 4653,
 'bad': 1202,
 'character': 1741,
 'which': 7451,
 'dnt': 2379,
 'lik': 4111,
 'in': 3636,
 'me': 4396,
 'll': 4159,
 'try': 7010,
 'change': 1733,
 'lt': 4246,
 'gt': 3257,
 'add': 815,
 'tat': 6674,
 'new': 4746,
 'year': 7653,
 'resolution': 5711,
 'waiting': 7311,
 'ur': 7150,
 'reply': 5691,
 'be': 1273,
 'frank': 2966,
 'go

### ●transformメソッドを使用して、X_train,X_testを変換しましょう。

In [37]:
X_train = count_vec.transform(X_train)
X_test = count_vec.transform(X_test)

In [42]:
X_train.todense()
X_test.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

### ●モデル実装・学習

In [43]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### ●評価

In [61]:
# accuracy_score(正解率)

from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)

score = accuracy_score(y_test,y_pred)
('正解率 {}'.format(score))

'正解率 0.9901345291479821'

In [60]:
# Precision(適合率) Recall(再現率) F1 Score(精度と再現率の調和平均)

print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.99      1.00      0.99       968
          1       0.98      0.95      0.96       147

avg / total       0.99      0.99      0.99      1115



In [62]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test,y_pred)

array([[965,   3],
       [  8, 139]])