In [53]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [2]:
data = pd.read_csv('spam.csv', encoding='latin-1')

In [3]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [4]:
data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1,inplace=True)
data = data.rename(columns={'v1':'class','v2':'text'})

In [5]:
data.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data.groupby('class').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


### removing stop words and punctuations

In [16]:
def process_text(text):
    new_text = ''
    new_list = []
    # removing punctuations
    for i in text:
        if i not in string.punctuation:
            new_text = new_text+i
    # removing stop words
    for i in new_text.split():
        if i.lower() not in stopwords.words('english'):
            new_list.append(i)
    return new_list

In [29]:
X = data['text'].apply(process_text)

In [31]:
X

0       [Go, jurong, point, crazy, Available, bugis, n...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3           [U, dun, say, early, hor, U, c, already, say]
4       [Nah, dont, think, goes, usf, lives, around, t...
5       [FreeMsg, Hey, darling, 3, weeks, word, back, ...
6       [Even, brother, like, speak, treat, like, aids...
7       [per, request, Melle, Melle, Oru, Minnaminungi...
8       [WINNER, valued, network, customer, selected, ...
9       [mobile, 11, months, U, R, entitled, Update, l...
10      [Im, gonna, home, soon, dont, want, talk, stuf...
11      [SIX, chances, win, CASH, 100, 20000, pounds, ...
12      [URGENT, 1, week, FREE, membership, å£100000, ...
13      [Ive, searching, right, words, thank, breather...
14                                         [DATE, SUNDAY]
15      [XXXMobileMovieClub, use, credit, click, WAP, ...
16                                    [Oh, kim, watching]
17      [Eh, u

### splitting into test and train

In [34]:
X_train,X_test,y_train,y_test = train_test_split(data['text'],data['class'],test_size = 0.2)

### converting the strings to vectors

In [62]:
vectorizer = CountVectorizer(analyzer=process_text)
vectorizer.fit(X_train)
x = vectorizer.transform(X_train)

# pipeline = Pipeline([
#     ('bow',CountVectorizer(analyzer=process_text)), # converts strings to integer counts
# #     ('tfidf',TfidfTransformer()), # converts integer counts to weighted TF-IDF scores
#     ('classifier',MultinomialNB()) 
# ])



In [63]:
x.toarray()
# pipeline.fit(X_train,y_train)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [64]:
nb = MultinomialNB()
nb.fit(x,y_train)
# pred = pipeline.predict(X_test)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [65]:
x2 = vectorizer.transform(X_test)
pred = nb.predict(x2)

In [66]:
print(classification_report(y_test,pred))

             precision    recall  f1-score   support

        ham       0.98      0.99      0.99       961
       spam       0.95      0.90      0.93       154

avg / total       0.98      0.98      0.98      1115

