In [1]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize
import sklearn.metrics as m

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
lemma=WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tirth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tirth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
dataset=pd.read_csv('spam.csv',encoding='latin-1')

In [4]:
dataset.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
dataset=dataset.rename(columns={'v2':'text','v1':'label'})

In [6]:
dataset['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
ham_words=""
spam_words=""

In [8]:
import re
for val in dataset[dataset['label']=='spam'].text:
    text=val.lower()
    text=re.sub('[^A-Za-z]',' ',text)
    text=re.sub(',',' ',text)
    tokens=nltk.word_tokenize(text)
    tokens=[lemma.lemmatize(i) for i in tokens if i not in stopwords.words('english')]
    for word in tokens:
        spam_words=spam_words+word+' '

for val in dataset[dataset['label']=='ham'].text:
    text=val.lower()
    text=re.sub('[^A-Za-z]',' ',text)
    text=re.sub(',',' ',text)
    tokens=nltk.word_tokenize(text)
    tokens=[lemma.lemmatize(i) for i in tokens if i not in stopwords.words('english')]
    for word in tokens:
        ham_words=ham_words+word+' '

In [9]:
dataset=dataset.replace(['ham','spam'],[0,1])

In [10]:
import re
def process_text(text):
    text=text.lower()
    text=re.sub('[^A-Za-z]',' ',text)
    text=re.sub(',',' ',text)
    words=word_tokenize(text)
    words=[lemma.lemmatize(i) for i in words if i not in stopwords.words('english')]
    text=' '.join(words)
    return text

In [11]:
dataset['processed_text']=dataset['text'].apply(process_text)
dataset

Unnamed: 0,label,text,Unnamed: 2,Unnamed: 3,Unnamed: 4,processed_text
0,0,"Go until jurong point, crazy.. Available only ...",,,,go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,,,,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor... U c already then say...,,,,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",,,,nah think go usf life around though
...,...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,,,,nd time tried contact u u pound prize claim ea...
5568,0,Will Ì_ b going to esplanade fr home?,,,,b going esplanade fr home
5569,0,"Pity, * was in mood for that. So...any other s...",,,,pity mood suggestion
5570,0,The guy did some bitching but I acted like i'd...,,,,guy bitching acted like interested buying some...


In [19]:
text=pd.DataFrame(dataset['processed_text'])
label=pd.DataFrame(dataset['label'])

In [20]:
text

Unnamed: 0,processed_text
0,go jurong point crazy available bugis n great ...
1,ok lar joking wif u oni
2,free entry wkly comp win fa cup final tkts st ...
3,u dun say early hor u c already say
4,nah think go usf life around though
...,...
5567,nd time tried contact u u pound prize claim ea...
5568,b going esplanade fr home
5569,pity mood suggestion
5570,guy bitching acted like interested buying some...


In [59]:
(dataset['processed_text'])

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry wkly comp win fa cup final tkts st ...
3                     u dun say early hor u c already say
4                     nah think go usf life around though
                              ...                        
5567    nd time tried contact u u pound prize claim ea...
5568                            b going esplanade fr home
5569                                 pity mood suggestion
5570    guy bitching acted like interested buying some...
5571                                       rofl true name
Name: processed_text, Length: 5572, dtype: object

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
tfidf=TfidfVectorizer(max_features=5000)

In [22]:
features=tfidf.fit_transform(dataset['processed_text'])

In [24]:
features

<5572x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 42917 stored elements in Compressed Sparse Row format>

In [26]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(features,dataset['label'],test_size=0.15,random_state=100)

In [32]:
from imblearn.over_sampling import SMOTE
from collections import Counter
smote=SMOTE(sampling_strategy={1:4000},random_state=100)

x_train,y_train=smote.fit_resample(x_train.astype('float'),y_train)
print("Before SMOTE:",Counter(y_train))

Before SMOTE: Counter({0: 4099, 1: 4000})


XGBoost

In [36]:
import xgboost as xgb
model=xgb.XGBClassifier()
model=model.fit(x_train,y_train)

In [38]:
from sklearn.metrics import accuracy_score
y_pred=model.predict(x_test)
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.9808612440191388

In [48]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#Naive Bayes

In [50]:
modelnb=MultinomialNB()
modelnb.fit(x_train,y_train)
y_pred=modelnb.predict(x_test)
accuracy=accuracy_score(y_test,y_pred)
accuracy
#accuracy

0.9677033492822966

#SVC

In [51]:
modelsvc=SVC()
modelsvc.fit(x_train,y_train)
y_pred=modelsvc.predict(x_test)
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.9820574162679426

#Decision Tree

In [52]:
modeldt=DecisionTreeClassifier()
modeldt.fit(x_train,y_train)
y_pred=modeldt.predict(x_test)
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.9605263157894737

#Random Forest

In [53]:
modelrf=RandomForestClassifier()
modelrf.fit(x_train,y_train)
y_pred=modelrf.predict(x_test)
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.9892344497607656

#Logistic Regression

In [54]:
modellr=LogisticRegression()
modellr.fit(x_train,y_train)
y_pred=modellr.predict(x_test)
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.9760765550239234

#Saving Best Model

In [57]:
import joblib
joblib.dump(modelrf,'best.pkl')
print('best model so far')
best=joblib.load('best.pkl')

best model so far


In [66]:
def find(p):
    if(p==0):
        print("ham")
    else:
        print("spam")

In [67]:
text1="As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"
text2="Had your contract mobile 11 Mnths? Latest Motorola, Nokia etc. all FREE! Double Mins & Text on Orange tariffs. TEXT YES for callback, no to remove from records."

text1=process_text(text1)
text1=text1.split('\n')
text2=process_text(text2)
text2=text2.split('\n')
print(text1)
print(text2)
f1=tfidf.transform(text1)
f2=tfidf.transform(text2)

p1=best.predict(f1)[0]
p2=best.predict(f2)[0]

find(p1)
find(p2)

['per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune']
['contract mobile mnths latest motorola nokia etc free double min text orange tariff text yes callback remove record']
ham
spam
