In [223]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#from sklearn.feature_extraction.text import TfidfTransformer

In [224]:
# load data and show the first items
data = pd.read_csv('data/spam.csv', encoding = "ISO-8859-1", engine='python')
data.head()

Unnamed: 0,Class,Text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [225]:
# Unnamed:2, Unnamed:3, Unnamed:4 features not needed(insignificant features), 
dt=data[['Class','Text']]
dt.head(10)

Unnamed: 0,Class,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [226]:
len(dt)

5572

In [227]:
len(dt[dt.Class=='spam'])

747

In [228]:
len(dt[dt.Class=='ham'])

4825

In [229]:
dt['Class'] = dt.Class.map({'ham':0, 'spam':1})
dt.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Class,Text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [230]:
x=dt['Text']
y=dt['Class']

In [231]:
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will ÃŒ_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Text, Length: 5572, dtype: object

In [232]:
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Class, Length: 5572, dtype: int64

In [233]:
# SHOW THE SHAPE OF THE DATA
dt.shape

(5572, 2)

In [234]:
# remove the duplicate
dt.drop_duplicates(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [235]:
# check the shape after removing a duplicate
dt.shape

(5169, 2)

In [236]:
# show the number of missing data(NAN, NaN, na)
dt.isnull().sum()

Class    0
Text     0
dtype: int64

In [237]:
# download stopwords package
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sdjed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [238]:
# remove punctuation,stopwords
def processing_messages(message):
    # remove punctuation
 punc_removed = [c for c in message if c not in string.punctuation]
 punc_removed = ''.join(punc_removed)
    #remove stopwords
 msg_cleaned = [w for w in punc_removed.split() if w.lower() not in stopwords.words('english')]
#return the cleaned text
 return msg_cleaned

In [243]:
# tokenization
dt['Text'].head(7).apply(processing_messages)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
5    [FreeMsg, Hey, darling, 3, weeks, word, back, ...
6    [Even, brother, like, speak, treat, like, aids...
Name: Text, dtype: object

In [241]:
# extract features by converting a text into token count matrix of tokens got by calling a function "processing_messages"
count_v=CountVectorizer(analyzer = processing_messages)
x_train_cv=count_v.fit_transform(x)

In [244]:
# get the shape of vectorized data
x_train_cv.shape

(5572, 11304)

In [245]:
# show the vectorized data in array
arr = x_train_cv.toarray()

In [246]:
arr

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [247]:
arr[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [248]:
len(arr[0])

11304

In [249]:
count_v.inverse_transform(arr[0])

[array(['Available', 'Cine', 'Go', 'amore', 'buffet', 'bugis', 'crazy',
        'e', 'got', 'great', 'jurong', 'la', 'n', 'point', 'wat', 'world'],
       dtype='<U52')]

In [250]:
# split our data into train and test using vectorized data and "Class" y=dt['class'] as target 
x_train, x_test, y_train, y_test = train_test_split(x_train_cv, y, test_size=0.25, random_state=0)

In [251]:
#build and train the classifier model
model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [252]:
# prediction
predict = model.predict(x_train)
print(predict)

[0 0 1 ... 0 0 0]


In [253]:
# print the actual values
print(y_train.values)

[0 0 1 ... 0 0 0]


In [254]:
# evaluate the model on the train data
predict = model.predict(x_train)
print(predict)
print(classification_report(y_train, predict))
print(confusion_matrix(y_train, predict))
print(accuracy_score(y_train, predict))

[0 0 1 ... 0 0 0]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3629
           1       0.98      0.98      0.98       550

    accuracy                           1.00      4179
   macro avg       0.99      0.99      0.99      4179
weighted avg       1.00      1.00      1.00      4179

[[3620    9]
 [  11  539]]
0.9952141660684374


In [255]:
# on the test data
predict = model.predict(x_test)
print(predict)
print(y_test.values)

[0 0 0 ... 0 0 1]
[0 0 0 ... 0 0 1]


In [256]:
# evaluate the model on test data
predict = model.predict(x_test)
print(predict)
print(classification_report(y_test, predict))
print(confusion_matrix(y_test, predict))
print(accuracy_score(y_test, predict))

[0 0 0 ... 0 0 1]
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      1196
           1       0.81      0.94      0.87       197

    accuracy                           0.96      1393
   macro avg       0.90      0.95      0.92      1393
weighted avg       0.96      0.96      0.96      1393

[[1153   43]
 [  12  185]]
0.9605168700646087
