### SPAM Detection - The Naive Bayes Algorithm in Python with Scikit-Learn 
D. Shahrokhian
https://stackabuse.com/the-naive-bayes-algorithm-in-python-with-scikit-learn/

In [62]:
import pandas as pd

# SMS Spam Collection Data Set
# https://archive.ics.uci.edu/ml/datasets/sms+spam+collection
df = pd.read_table('tabla5.txt',  
                   sep='\t', 
                   header=None,
                   names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,yes,Taipei Taiwan
1,yes,Macao Taiwan Shanghai
2,no,Japan Sapporo
3,no,Sapporo Osaka Taiwan
4,yes,Chinese Beijing Chinese


In [63]:
df['label'] = df.label.map({'yes': 0, 'no': 1})
df['message'] = df.message.map(lambda x: x.lower())
df['message'] = df.message.str.replace('[^\w\s]', '')
df.head()

Unnamed: 0,label,message
0,0,taipei taiwan
1,0,macao taiwan shanghai
2,1,japan sapporo
3,1,sapporo osaka taiwan
4,0,chinese beijing chinese


In [64]:
# https://www.nltk.org/ Natural Language Toolkit
# Punkt Sentence Tokenizer https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt
import nltk
nltk.download('punkt')

df['message'] = df['message'].apply(nltk.word_tokenize)
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vsmurilloso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,label,message
0,0,"[taipei, taiwan]"
1,0,"[macao, taiwan, shanghai]"
2,1,"[japan, sapporo]"
3,1,"[sapporo, osaka, taiwan]"
4,0,"[chinese, beijing, chinese]"


In [65]:
# https://www.nltk.org/api/nltk.stem.html
#https://tartarus.org/martin/PorterStemmer/
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
df['message'] = df['message'].apply(lambda x: [stemmer.stem(y) for y in x]) 
df.head()

Unnamed: 0,label,message
0,0,"[taipei, taiwan]"
1,0,"[macao, taiwan, shanghai]"
2,1,"[japan, sapporo]"
3,1,"[sapporo, osaka, taiwan]"
4,0,"[chines, beij, chines]"


In [66]:
# Converts the list of words into space-separated strings
df['message'] = df['message'].apply(lambda x: ' '.join(x))
df.head()

Unnamed: 0,label,message
0,0,taipei taiwan
1,0,macao taiwan shanghai
2,1,japan sapporo
3,1,sapporo osaka taiwan
4,0,chines beij chines


In [67]:
# Convert a collection of text documents to a matrix of token counts
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# to allow one letter words count_vect = CountVectorizer(token_pattern = r"(?u)\b\w+\b")
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(token_pattern = r"(?u)\b\w+\b") 
counts = count_vect.fit_transform(df['message'])  
print counts

  (0, 8)	1
  (0, 7)	1
  (1, 6)	1
  (1, 3)	1
  (1, 8)	1
  (2, 5)	1
  (2, 2)	1
  (3, 4)	1
  (3, 5)	1
  (3, 8)	1
  (4, 0)	1
  (4, 1)	2
  (5, 1)	2
  (5, 6)	1
  (6, 1)	1
  (6, 3)	1
  (7, 9)	1
  (7, 1)	1
  (7, 2)	1


In [68]:
counts.shape

(8, 10)

In [72]:
# https://stackoverflow.com/questions/28064634/random-state-pseudo-random-numberin-scikit-learn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1, random_state=69) 

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(X_train, y_train)  

import numpy as np
predicted = model.predict(X_test)
print(np.mean(predicted == y_test))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predicted))

1.0
[[1]]


In [73]:
per = 0
for i in range(0,10):
    X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1) 
    model = MultinomialNB().fit(X_train, y_train)  

    import numpy as np
    predicted = model.predict(X_test)
    print(np.mean(predicted == y_test))
    per += np.mean(predicted == y_test)

    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test, predicted))

print "average perfromance"
print per/10.0

0.0
[[0 1]
 [0 0]]
0.0
[[0 0]
 [1 0]]
1.0
[[1]]
0.0
[[0 0]
 [1 0]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
0.0
[[0 0]
 [1 0]]
1.0
[[1]]
0.0
[[0 0]
 [1 0]]
average perfromance
0.5


In [74]:
per = 0
for i in range(0,10):
    X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.2) 
    model = MultinomialNB().fit(X_train, y_train)  

    import numpy as np
    predicted = model.predict(X_test)
    print(np.mean(predicted == y_test))
    per += np.mean(predicted == y_test)

    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test, predicted))

print "average perfromance"
print per/10.0

0.5
[[1 0]
 [1 0]]
1.0
[[2]]
1.0
[[2]]
0.5
[[0 0]
 [1 1]]
1.0
[[2]]
0.0
[[0 0]
 [2 0]]
1.0
[[1 0]
 [0 1]]
0.5
[[1 1]
 [0 0]]
0.5
[[0 0]
 [1 1]]
1.0
[[2]]
average perfromance
0.7


In [76]:
per = 0
for i in range(0,10):
    X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.5) 
    model = MultinomialNB().fit(X_train, y_train)  

    import numpy as np
    predicted = model.predict(X_test)
    print(np.mean(predicted == y_test))
    per += np.mean(predicted == y_test)

    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test, predicted))

print "average perfromance"
print per/10.0

1.0
[[3 0]
 [0 1]]
0.75
[[2 0]
 [1 1]]
0.5
[[2 1]
 [1 0]]
0.75
[[2 0]
 [1 1]]
0.25
[[1 3]
 [0 0]]
0.5
[[1 2]
 [0 1]]
0.75
[[2 0]
 [1 1]]
0.75
[[3 1]
 [0 0]]
0.25
[[1 2]
 [1 0]]
0.75
[[2 0]
 [1 1]]
average perfromance
0.625
