In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk

In [3]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
import numpy as np

In [5]:
import pandas as pd

# loading DATASET

In [6]:
ds = pd.read_csv("spam.csv" , encoding = 'Windows-1252')
ds.head(10)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [7]:
ds['spam'] = ds['type'].map( {'spam':1 , 'ham':0} ).astype(int)
ds.head(5)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
print('columns are:')
for col in ds.columns:
    print(col)

columns are:
type
text
spam


In [9]:
t=len(ds['type'])
print("NO OF ROWS IN REVIEW COLUMN:",t)

NO OF ROWS IN REVIEW COLUMN: 116


In [10]:
ds.head(5)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# Tokenization

In [11]:
# Before tokenization is apply
ds['text'][4]

"Nah I don't think he goes to usf, he lives around here though"

In [12]:
def tokenizer (text):
    return text.split()

In [13]:
ds['text']=ds['text'].apply(tokenizer)

In [14]:
# after tokenization is apply
ds['text'][4]

['Nah',
 'I',
 "don't",
 'think',
 'he',
 'goes',
 'to',
 'usf,',
 'he',
 'lives',
 'around',
 'here',
 'though']

# STEMMING

In [15]:
# before Stemming is apply
ds['text'][4]

['Nah',
 'I',
 "don't",
 'think',
 'he',
 'goes',
 'to',
 'usf,',
 'he',
 'lives',
 'around',
 'here',
 'though']

In [1]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer("english", ignore_stopwords=False)

In [17]:
def stem_it (text):
    return [porter.stem (word) for word in text]

In [18]:
ds['text']=ds['text'].apply(stem_it)

In [19]:
# after Stemming is apply
ds['text'][4]

['nah',
 'i',
 "don't",
 'think',
 'he',
 'goe',
 'to',
 'usf,',
 'he',
 'live',
 'around',
 'here',
 'though']

# LEMMITIZATION

In [20]:
# before Lemmitization is apply
ds['text'][21]

['iâ€˜m', 'go', 'to', 'tri', 'for', '2', 'month', 'ha', 'ha', 'onli', 'joke']

In [21]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [22]:
def lemmit_it (text):
    return [lemmatizer.lemmatize (word, pos="a") for word in text]

In [23]:
ds['text']=ds['text'].apply(lemmit_it)

In [24]:
# After Lemmitization is apply
ds['text'][4]

['nah',
 'i',
 "don't",
 'think',
 'he',
 'goe',
 'to',
 'usf,',
 'he',
 'live',
 'around',
 'here',
 'though']

# STOPWORD REMOVAL

In [25]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
from nltk.corpus import stopwords
stop_words = stopwords.words ('english')

In [27]:
def stop_it (text):
    review = [word for word in text if not word in stop_words ]
    return review

In [28]:
ds['text']=ds['text'].apply(stop_it)

In [29]:
# After STOPWORD is apply
ds['text'][4]

['nah', 'think', 'goe', 'usf,', 'live', 'around', 'though']

In [30]:
ds.head()

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0


In [31]:
ds['text'] = ds['text'].apply(' '.join)

In [32]:
ds.head()

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0


# Transform Text Data into TDF /TF-IDF Vectors

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
y = ds.spam.values
X =vectorizer.fit_transform(ds['text'])

In [34]:
from sklearn.model_selection import train_test_split
X_train,X_text,y_train,y_text=train_test_split(X,y,random_state=1, test_size=0.2, shuffle=False)

# Classification using Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression()
clf.fit(X_train,y_train)
y_pred=clf.predict(X_text)
from sklearn.metrics import accuracy_score
acc_log = accuracy_score(y_pred, y_text)*100
print("accuracy: ", acc_log)

accuracy:  87.5


# Classification using LinearSVC Accuracy

In [36]:
 from sklearn.svm import LinearSVC

linear_svc = LinearSVC (random_state=0)
linear_svc.fit(X_train, y_train)

y_pred = linear_svc.predict(X_text)

acc_linear_svc = accuracy_score(y_pred, y_text) * 100
print("accuracy: ", acc_linear_svc)

accuracy:  87.5
