In [295]:
import pandas as pd
import sklearn
import nltk

### 1) Loading Data

In [296]:
# Load the labelled corpus from disk and preview it; the preview shows an
# `email` text column and a `label` column.
data = pd.read_csv(filepath_or_buffer="spam_or_not_spam.csv")
data.head(n=5)

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


### 2) Tokenization

In [297]:
# Split every e-mail on whitespace into a list of tokens.
# Missing e-mails (NaN) become an empty token list: the earlier str(x) coercion
# turned NaN into the literal token "nan", which would then be stemmed and
# vectorised as if it were a real word. Non-string values are still coerced
# with str, as before.
data['email'] = data['email'].fillna('').astype(str).str.split()

In [298]:
# Preview: `email` now holds a list of tokens per row.
data.head()

Unnamed: 0,email,label
0,"[date, wed, NUMBER, aug, NUMBER, NUMBER, NUMBE...",0
1,"[martin, a, posted, tassos, papadopoulos, the,...",0
2,"[man, threatens, explosion, in, moscow, thursd...",0
3,"[klez, the, virus, that, won, t, die, already,...",0
4,"[in, adding, cream, to, spaghetti, carbonara, ...",0


### 3) Stemming
_CHANGES (cleaning, cleaner, cleans) -> clean_

In [299]:
from nltk.stem.snowball import SnowballStemmer

In [300]:
# English Snowball stemmer; ignore_stopwords=False means stop words are
# stemmed like any other token.
stemmer = SnowballStemmer(language='english', ignore_stopwords=False)

In [301]:
# Replace every token of every e-mail with its Snowball stem.
data['email'] = data['email'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)

In [302]:
# Preview the stemmed tokens.
data.head()

Unnamed: 0,email,label
0,"[date, wed, number, aug, number, number, numbe...",0
1,"[martin, a, post, tasso, papadopoulo, the, gre...",0
2,"[man, threaten, explos, in, moscow, thursday, ...",0
3,"[klez, the, virus, that, won, t, die, alreadi,...",0
4,"[in, ad, cream, to, spaghetti, carbonara, whic...",0


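### 4) Lemmatization
_CHANGES (is, am, are, ...) -> be_ (this mapping requires the verb part of speech; the cell below lemmatizes with `pos='a'`, i.e. it treats every token as an adjective)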

In [303]:
from nltk.stem import WordNetLemmatizer
# WordNet-based lemmatizer. The `lemmitizer` spelling is kept because the
# lemmatization cell further down refers to it by this name.
lemmitizer = WordNetLemmatizer()

In [304]:
# Fetch the NLTK resources the following cells rely on.
nltk.download('wordnet')  # lexical database behind WordNetLemmatizer
nltk.download('stopwords')  # word lists served by nltk.corpus.stopwords

[nltk_data] Downloading package wordnet to C:\Users\V.L.S
[nltk_data]     RUTHWIK\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\V.L.S
[nltk_data]     RUTHWIK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [305]:
# Lemmatize each token with WordNet; pos='a' means every token is looked up
# as an adjective.
# NOTE(review): the preview after this cell shows the same first five rows as
# the preview after stemming, so this step may be changing very little -
# confirm that 'a' is the intended part of speech.
data['email'] = data['email'].apply(
    lambda tokens: [lemmitizer.lemmatize(token, pos='a') for token in tokens]
)

In [306]:
# Preview the tokens after lemmatization.
data.head()

Unnamed: 0,email,label
0,"[date, wed, number, aug, number, number, numbe...",0
1,"[martin, a, post, tasso, papadopoulo, the, gre...",0
2,"[man, threaten, explos, in, moscow, thursday, ...",0
3,"[klez, the, virus, that, won, t, die, alreadi,...",0
4,"[in, ad, cream, to, spaghetti, carbonara, whic...",0


### 5) Stopword removal
_Removes common words like (in, the, ...)_

In [307]:
from nltk.corpus import stopwords

In [308]:
# English stop words from NLTK's stopwords corpus; used below to filter tokens.
stop_words = stopwords.words('english')
#print(stop_words)

In [309]:
# Drop stop words from every token list.
# Membership is tested against a set built once up front: checking each token
# against the original sequence costs a full scan per token, whereas a set
# lookup is constant-time. The surviving tokens and their order are unchanged.
_stop_word_set = set(stop_words)
data['email'] = data['email'].apply(
    lambda tokens: [token for token in tokens if token not in _stop_word_set]
)

In [310]:
# Preview the tokens that remain after stop-word removal.
data.head()

Unnamed: 0,email,label
0,"[date, wed, number, aug, number, number, numbe...",0
1,"[martin, post, tasso, papadopoulo, greek, scul...",0
2,"[man, threaten, explos, moscow, thursday, augu...",0
3,"[klez, virus, die, alreadi, prolif, virus, eve...",0
4,"[ad, cream, spaghetti, carbonara, effect, past...",0


**Joining list to single text**

In [311]:
# Collapse each token list back into one space-separated string, the input
# format the vectorizer is given below.
data['email'] = data['email'].apply(' '.join)

In [312]:
# Preview: `email` is a single cleaned string per row again.
data.head()

Unnamed: 0,email,label
0,date wed number aug number number number numbe...,0
1,martin post tasso papadopoulo greek sculptor b...,0
2,man threaten explos moscow thursday august num...,0
3,klez virus die alreadi prolif virus ever klez ...,0
4,ad cream spaghetti carbonara effect pasta make...,0


### 6) Vectorization TF / TF-IDF
_Converts textual data into a numerical format: a matrix in which each column represents a feature (a term) and each row represents an individual email._

**There are two types of vectorization**  
**1) TF** _(Term frequency)_  

TF = (number of times the word appears in the text) / (total number of words in the text)  

**2) TF-IDF** _(Term frequency-Inverse document frequency)_  

IDF = log_e(total number of documents / number of documents containing term t)  

TF-IDF = TF × IDF

In [313]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings; it is fitted on the e-mails in the next cell.
tfidf = TfidfVectorizer()

In [314]:
# Targets: the `label` column as an array.
y = data['label'].to_numpy()

# Features: fit the TF-IDF vectorizer on the cleaned e-mails and transform
# them in a single step.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so held-out e-mails contribute to the vocabulary and IDF weights.
# Consider splitting first and fitting on the training portion only.
x = tfidf.fit_transform(data['email'])

In [315]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation (shuffled, fixed seed).
# `x_text` / `y_text` are the held-out features and labels used by the
# classifier cells below.
x_train, x_text, y_train, y_text = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

### 7) Classification using Logistic Regression

In [316]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a default logistic-regression model on the training split and score it
# on the held-out split.
clf = LogisticRegression()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_text)

# accuracy_score takes (y_true, y_pred); accuracy is the fraction of matching
# labels, expressed here as a percentage.
acc_log = 100 * accuracy_score(y_text, y_pred)
print("accuracy:", acc_log)

accuracy: 96.0


### 8) Classification using LinearSVC Accuracy

In [317]:
from sklearn.svm import LinearSVC

# Fit a linear SVM (fixed seed) on the training split and score it on the
# held-out split. accuracy_score comes from the import in the
# logistic-regression cell.
linear_svc = LinearSVC(random_state=0).fit(x_train, y_train)
y_pred = linear_svc.predict(x_text)

acc_linear_svc = 100 * accuracy_score(y_text, y_pred)
print("accuracy:", acc_linear_svc)

accuracy: 98.66666666666667


### Testing new data

In [362]:
def is_spam(x):
    """Classify a raw e-mail text as ``'spam'`` or ``'ham'``.

    The text is put through the same preprocessing that was applied to the
    training e-mails before ``tfidf`` was fitted: whitespace tokenization,
    Snowball stemming, adjective lemmatization, stop-word removal and
    re-joining. Without it, inflected words in new text (e.g. "threatens")
    would not match the stemmed vocabulary ("threaten") and would be
    silently ignored by the vectorizer.

    Parameters
    ----------
    x : str
        The raw message. It is coerced with ``str`` exactly as the training
        e-mails were.

    Returns
    -------
    str
        ``'spam'`` if the logistic-regression model ``clf`` predicts a truthy
        label for the message, otherwise ``'ham'``.
    """
    # Mirror the notebook's training pipeline step by step, in the same order.
    tokens = [stemmer.stem(word) for word in str(x).split()]
    tokens = [lemmitizer.lemmatize(word, pos='a') for word in tokens]
    tokens = [word for word in tokens if word not in stop_words]

    # transform() (not fit_transform) reuses the vocabulary learned in training.
    features = tfidf.transform([' '.join(tokens)])
    return 'spam' if clf.predict(features)[0] else 'ham'

In [382]:
# Classify one message typed at the prompt. This is interactive: the cell
# blocks until input is entered, so the shown result depends on what was typed.
is_spam(input())




'ham'