In [1]:
import os
# Switch to the folder that holds spam.csv (later cells open it by relative path).
# The path is a raw string so the backslash is taken literally: '\A' is not a valid
# escape sequence, and a plain literal triggers an invalid-escape warning on current
# Python versions.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
os.chdir(r'E:\AML')

In [2]:
import pandas as pd
import numpy as np
import chardet 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

# First look at the raw file; the frame is only displayed here, not kept.
pd.read_csv(filepath_or_buffer='spam.csv', encoding='Latin-1')

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [3]:
# Detect the file's character encoding from its raw bytes, then load the CSV with it.
with open("spam.csv", 'rb') as f:
    raw_bytes = f.read()  # whole file is read into memory
result = chardet.detect(raw_bytes)

detected_encoding = result['encoding']
print(detected_encoding)
df_raw = pd.read_csv('spam.csv', encoding=detected_encoding)

Windows-1252


In [4]:
# Keep only the class and message columns and give them descriptive names.
column_names = {'v1': 'label', 'v2': 'sms'}
df = df_raw[list(column_names)].rename(columns=column_names)
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Keep only the first occurrence of each distinct message text.
# The result is rebound to `df` instead of using inplace=True, so the frame object
# created in the previous cell is not mutated behind the reader's back.
df = df.drop_duplicates(subset="sms")
df.describe()

Unnamed: 0,label,sms
count,5169,5169
unique,2,5169
top,ham,"XXXMobileMovieClub: To use your credit, click ..."
freq,4516,1


In [32]:
# Features are the message texts, targets are the ham/spam labels.
X, y = df["sms"], df["label"]

# Hold out 30% of the messages for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [33]:
#Model 1 - Decision Tree
from sklearn.tree import DecisionTreeClassifier

In [34]:
# Entropy-criterion decision tree on TF-IDF features, scored with 5-fold
# cross-validation on the training split.
clf_entropy = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=15,
    min_samples_leaf=10,
    random_state=17,
)
dt_steps = [('vectorizer', TfidfVectorizer()), ('classifier', clf_entropy)]
pipeline_decision_tree = Pipeline(steps=dt_steps)
cv_dt = cross_val_score(estimator=pipeline_decision_tree, X=X_train, y=y_train, cv=5)
print("The Cross Validation Score is: ", cv_dt, sep="\n")

The Cross Validation Score is: 
[0.94060773 0.94751381 0.93370166 0.95303867 0.93905817]


In [35]:
# Fit on the full training split (fit returns the pipeline itself), then predict
# labels for the held-out messages.
y_pred_dt = pipeline_decision_tree.fit(X_train, y_train).predict(X_test)

In [36]:
# Share of held-out messages the decision tree labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_pred_dt)

0.9529335912314636

In [37]:
# Per-class precision / recall / F1 for the decision tree on the held-out split.
report_dt = classification_report(y_true=y_test, y_pred=y_pred_dt)
print(report_dt)

              precision    recall  f1-score   support

         ham       0.97      0.98      0.97      1367
        spam       0.84      0.75      0.79       184

   micro avg       0.95      0.95      0.95      1551
   macro avg       0.90      0.87      0.88      1551
weighted avg       0.95      0.95      0.95      1551



In [38]:
'''A decision tree classifier splits the data into subsets based on a decision at each node. Entropy measures the
uncertainty (impurity) of the class labels at a node. Each split is chosen to maximise the information gain, i.e. the reduction in entropy.'''

'A decision tree classifier splits the data into subsets based on a decision at each node. Entropy measures the\nuncertainty (impurity) of the class labels at a node. Each split is chosen to maximise the information gain, i.e. the reduction in entropy.'

In [39]:
# Model 2 - Naive Bayes
from sklearn.naive_bayes import BernoulliNB

In [41]:
# Bernoulli Naive Bayes on TF-IDF features, scored with 5-fold cross-validation
# on the training split.
# NOTE(review): BernoulliNB binarizes its input by default, so presumably only term
# presence/absence is used and the TF-IDF weights are ignored - confirm this is intended.
clf_nb = BernoulliNB()
nb_steps = [('vectorizer', TfidfVectorizer()), ('classifier', clf_nb)]
pipeline_nb = Pipeline(steps=nb_steps)
cv_nb = cross_val_score(estimator=pipeline_nb, X=X_train, y=y_train, cv=5)
print("The Cross Validation Score is: ", cv_nb, sep="\n")

The Cross Validation Score is: 
[0.96961326 0.96132597 0.96823204 0.97099448 0.96260388]


In [42]:
# Fit on the full training split (fit returns the pipeline itself), then predict
# labels for the held-out messages.
y_pred_nb = pipeline_nb.fit(X_train, y_train).predict(X_test)

In [43]:
# Share of held-out messages the Naive Bayes model labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_pred_nb)

0.97678916827853

In [44]:
# Per-class precision / recall / F1 for Naive Bayes on the held-out split.
report_nb = classification_report(y_true=y_test, y_pred=y_pred_nb)
print(report_nb)

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1367
        spam       0.99      0.81      0.89       184

   micro avg       0.98      0.98      0.98      1551
   macro avg       0.98      0.90      0.94      1551
weighted avg       0.98      0.98      0.98      1551



In [45]:
'''Naive Bayes classifiers use Bayes' theorem to estimate the probability that a data point belongs to a certain class.
Here we use the Bernoulli variant of the Naive Bayes classifier.'''

"Naive Bayes classifiers use Bayes' theorem to estimate the probability that a data point belongs to a certain class.\nHere we use the Bernoulli variant of the Naive Bayes classifier."

In [46]:
# Model 3 - SVM
from sklearn.svm import SVC

In [47]:
# Linear-kernel support vector classifier on TF-IDF features, scored with 5-fold
# cross-validation on the training split.
clf_svc = SVC(kernel='linear')
svc_steps = [('vectorizer', TfidfVectorizer()), ('classifier', clf_svc)]
pipeline_svc = Pipeline(steps=svc_steps)
cv_svc = cross_val_score(estimator=pipeline_svc, X=X_train, y=y_train, cv=5)
print("The Cross Validation Score is: ", cv_svc, sep="\n")

The Cross Validation Score is: 
[0.98066298 0.97928177 0.98066298 0.9820442  0.98476454]


In [48]:
# Fit on the full training split (fit returns the pipeline itself), then predict
# labels for the held-out messages.
y_pred_svc = pipeline_svc.fit(X_train, y_train).predict(X_test)

In [49]:
# Share of held-out messages the SVM labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_pred_svc)

0.9871050934880722

In [50]:
# Per-class precision / recall / F1 for the SVM on the held-out split.
report_svc = classification_report(y_true=y_test, y_pred=y_pred_svc)
print(report_svc)

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1367
        spam       0.98      0.91      0.94       184

   micro avg       0.99      0.99      0.99      1551
   macro avg       0.98      0.96      0.97      1551
weighted avg       0.99      0.99      0.99      1551



In [52]:
'''Support Vector Machines are classifiers that separate the classes with a maximum-margin hyperplane.
Here we use a linear kernel.'''

'Support Vector Machines are classifiers that separate the classes with a maximum-margin hyperplane.\nHere we use a linear kernel.'