### SPAM Detection - The Naive Bayes Algorithm in Python with Scikit-Learn 
D. Shahrokhian
https://stackabuse.com/the-naive-bayes-algorithm-in-python-with-scikit-learn/

In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [45]:
biggie_df = pd.read_csv('./biggie_lyrics.csv', usecols=[1], encoding='latin-1', header=None)
biggie_df.columns = ["lyrics"]
biggie_df["lyrics"] = biggie_df["lyrics"].str.replace('[^\w\s]','')
biggie_df["lyrics"] = biggie_df["lyrics"].str.lower()

In [46]:
biggie_df.tail()

Unnamed: 0,lyrics
11,relax and take notes while i take tokes of the...
12,good evenin ladies and gentlemen\nhows everybo...
13,who shot ya\nseperate the weak from the obsole...
14,when i die fuck it i wanna go to hell\ncause i...
15,when the lala hits ya lyrics just splits ya\nh...


In [47]:
pac_df = pd.read_csv('./2pac_lyrics.csv', usecols=[1], encoding='latin-1', header=None)
pac_df.columns = ["lyrics"]
pac_df["lyrics"] = pac_df["lyrics"].str.replace('[^\w\s]','')
pac_df["lyrics"] = pac_df["lyrics"].str.lower()

In [48]:
pac_df.head()

Unnamed: 0,lyrics
0,little something for my godson elijah\nand a l...
1,yo mo bee mayn drop that shit\nyou know what t...
2,rest in peace to my motherfucker biggy smallz\...
3,makaveli in this killuminati\nall through your...
4,its just me against the world\nnothin to lose\...


In [49]:
biggie_lyrics = biggie_df["lyrics"].values
biggie_lyrics = [ song.split('\n') for song in biggie_lyrics]
biggie_lyrics = [line for song in biggie_lyrics for line in song]
pac_lyrics = pac_df["lyrics"].values
pac_lyrics = [ song.split('\n') for song in pac_lyrics]
pac_lyrics = [line for song in pac_lyrics for line in song]

rap_lines = [] 

for line in biggie_lyrics:
    if len(line.split()) > 3:
        rap_lines.append(np.array([0,str(line)]))
        
for line in pac_lyrics:
    if len(line.split()) > 3:
        rap_lines.append(np.array([1,str(line)]))
        
rap_lines = np.array(rap_lines)

In [50]:
rap_lines = pd.DataFrame(rap_lines)
rap_lines.columns = ["label","line"]
rap_lines.head()
rap_lines['label'] = rap_lines['label'].replace(['0','1'],[0,1])

In [51]:
# https://www.nltk.org/ Natural Language Toolkit
# Punkt Sentence Tokenizer https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt
import nltk
nltk.download('punkt')

rap_lines['line'] = rap_lines['line'].apply(nltk.word_tokenize)
rap_lines.head()

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


Unnamed: 0,label,line
0,0,"[fuck, all, you, hoes]"
1,0,"[get, a, grip, motherfucker]"
2,0,"[yeah, this, album, is, dedicated, to, all, th..."
3,0,"[id, never, amount, to, nothin, to, all, the, ..."
4,0,"[buildings, that, i, was, hustlin, in, front, ..."


In [52]:
# https://www.nltk.org/api/nltk.stem.html
#https://tartarus.org/martin/PorterStemmer/
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
rap_lines['line'] = rap_lines['line'].apply(lambda x: [stemmer.stem(y) for y in x]) 
rap_lines.head()

Unnamed: 0,label,line
0,0,"[fuck, all, you, hoe]"
1,0,"[get, a, grip, motherfuck]"
2,0,"[yeah, thi, album, is, dedic, to, all, the, te..."
3,0,"[id, never, amount, to, nothin, to, all, the, ..."
4,0,"[build, that, i, wa, hustlin, in, front, of, t..."


In [57]:
# Converts the list of words into space-separated strings
rap_lines['line'] = rap_lines['line'].apply(lambda x: ' '.join(x))
rap_lines.head()

Unnamed: 0,label,line
0,0,f u c k a l l y o u h o e
1,0,g e t a g r i p m o t h e r f u c k
2,0,y e a h t h i a l b u m i s d e d i c ...
3,0,i d n e v e r a m o u n t t o n o t h ...
4,0,b u i l d t h a t i w a h u s t l i n ...


In [59]:
# Convert a collection of text documents to a matrix of token counts
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# to allow one letter words count_vect = CountVectorizer(token_pattern = r"(?u)\b\w+\b")
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(token_pattern = r"(?u)\b\w+\b") 
counts = count_vect.fit_transform(rap_lines['line'])  
print counts

  (0, 14)	1
  (0, 17)	1
  (0, 24)	2
  (0, 34)	1
  (0, 21)	2
  (0, 10)	1
  (0, 20)	1
  (0, 12)	1
  (0, 30)	2
  (0, 15)	1
  (1, 22)	1
  (1, 25)	1
  (1, 18)	1
  (1, 27)	2
  (1, 29)	2
  (1, 16)	2
  (1, 14)	2
  (1, 17)	1
  (1, 24)	1
  (1, 10)	1
  (1, 20)	1
  (1, 12)	1
  (1, 30)	1
  (1, 15)	1
  (2, 13)	3
  :	:
  (1968, 24)	5
  (1968, 34)	1
  (1968, 21)	1
  (1968, 10)	8
  (1968, 20)	2
  (1968, 12)	2
  (1968, 15)	1
  (1969, 32)	2
  (1969, 23)	1
  (1969, 13)	1
  (1969, 28)	1
  (1969, 22)	1
  (1969, 25)	1
  (1969, 18)	3
  (1969, 27)	2
  (1969, 29)	3
  (1969, 14)	4
  (1969, 17)	3
  (1969, 24)	1
  (1969, 34)	1
  (1969, 21)	1
  (1969, 10)	2
  (1969, 20)	1
  (1969, 12)	2
  (1969, 30)	1


In [60]:
counts.shape

(1970, 36)

In [62]:
# https://stackoverflow.com/questions/28064634/random-state-pseudo-random-numberin-scikit-learn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(counts, rap_lines['label'], test_size=0.1, random_state=69) 

from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB().fit(X_train, y_train)  

import numpy as np
predicted = model.predict(X_test)
print(np.mean(predicted == y_test))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predicted))

0.5380710659898477
[[28 67]
 [24 78]]


In [8]:
per = 0
for i in range(0,10):
    X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1) 
    model = BernoulliNB().fit(X_train, y_train)  

    import numpy as np
    predicted = model.predict(X_test)
    print(np.mean(predicted == y_test))
    per += np.mean(predicted == y_test)

    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test, predicted))

print "average perfromance"
print per/10.0

1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
0.0
[[0 0]
 [1 0]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
0.0
[[0 0]
 [1 0]]
0.0
[[0 0]
 [1 0]]
average perfromance
0.7


In [63]:
per = 0
for i in range(0,10):
    X_train, X_test, y_train, y_test = train_test_split(counts, rap_lines['label'], test_size=0.2) 
    model = BernoulliNB().fit(X_train, y_train)  

    import numpy as np
    predicted = model.predict(X_test)
    print(np.mean(predicted == y_test))
    per += np.mean(predicted == y_test)

    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test, predicted))

print "average perfromance"
print per/10.0

0.6040609137055838
[[ 68 105]
 [ 51 170]]
0.6065989847715736
[[ 66 106]
 [ 49 173]]
0.5989847715736041
[[ 60 128]
 [ 30 176]]
0.6015228426395939
[[ 75 103]
 [ 54 162]]
0.6192893401015228
[[ 61 115]
 [ 35 183]]
0.6142131979695431
[[ 73  97]
 [ 55 169]]
0.5913705583756346
[[ 61 121]
 [ 40 172]]
0.5964467005076142
[[ 68 110]
 [ 49 167]]
0.5862944162436549
[[ 70 111]
 [ 52 161]]
0.5786802030456852
[[ 57 112]
 [ 54 171]]
average perfromance
0.5997461928934011


In [64]:
per = 0
for i in range(0,10):
    X_train, X_test, y_train, y_test = train_test_split(counts, rap_lines['label'], test_size=0.5) 
    model = BernoulliNB().fit(X_train, y_train)  

    import numpy as np
    predicted = model.predict(X_test)
    print(np.mean(predicted == y_test))
    per += np.mean(predicted == y_test)

    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test, predicted))

print "average perfromance"
print per/10.0

0.601015228426396
[[169 267]
 [126 423]]
0.5908629441624366
[[172 257]
 [146 410]]
0.5766497461928934
[[138 306]
 [111 430]]
0.5857868020304569
[[175 247]
 [161 402]]
0.6081218274111675
[[173 256]
 [130 426]]
0.5685279187817259
[[116 328]
 [ 97 444]]
0.5959390862944163
[[161 272]
 [126 426]]
0.6131979695431472
[[164 264]
 [117 440]]
0.5766497461928934
[[163 264]
 [153 405]]
0.5918781725888325
[[167 259]
 [143 416]]
average perfromance
0.5908629441624366
