In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
%matplotlib inline

In [72]:
# Load the labelled messages. The preview below shows an "Unnamed: 0" column,
# which looks like a row index that was saved into the CSV — TODO confirm.
df = pd.read_csv("mail_data.csv")

In [73]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [74]:
# Encode the target as an integer column: ham -> 0, spam -> 1.
df["label_no"] = df["Category"].map({"ham": 0, "spam": 1})

In [75]:
# Preview again to confirm the label_no column was added.
df.head()

Unnamed: 0,Category,Message,label_no
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [77]:
# Features are the raw message text; the target is the numeric label column.
x, y = df["Message"], df["label_no"]

In [78]:
# Hold out 30% of the messages for evaluation; the fixed seed makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=21,
)

In [79]:
# Bag-of-words vectorizer with default settings; fitted on the training
# messages in the next cell and reused for the test messages later.
count = CountVectorizer()

In [80]:
# Fit the vectorizer on the training messages only and encode them.
training_data = count.fit_transform(x_train)

In [81]:
# Sparse printout: each line is "(row, column)" followed by the stored count.
print(training_data)

  (0, 2967)	1
  (0, 4704)	1
  (0, 363)	1
  (1, 4607)	1
  (1, 6386)	1
  (1, 3950)	1
  (1, 848)	1
  (1, 1343)	1
  (1, 2377)	1
  (2, 4129)	1
  (2, 950)	1
  (2, 4753)	1
  (2, 1453)	1
  (2, 3522)	1
  (2, 5749)	1
  (2, 3139)	1
  (2, 4306)	1
  (2, 5001)	1
  (3, 1096)	1
  (3, 3949)	1
  (3, 7227)	2
  (3, 7051)	1
  (3, 6622)	1
  (3, 5270)	1
  (4, 3139)	1
  :	:
  (3897, 3372)	1
  (3897, 1265)	1
  (3897, 983)	1
  (3897, 1005)	1
  (3897, 6669)	1
  (3897, 3270)	1
  (3898, 6395)	1
  (3898, 1029)	1
  (3898, 3637)	1
  (3898, 6735)	1
  (3898, 6091)	1
  (3898, 7200)	1
  (3898, 4377)	1
  (3898, 6538)	1
  (3898, 6947)	1
  (3898, 3926)	1
  (3898, 3086)	1
  (3898, 2228)	1
  (3898, 4287)	1
  (3898, 4911)	1
  (3898, 2299)	1
  (3899, 7227)	1
  (3899, 6978)	1
  (3899, 7076)	1
  (3899, 7109)	1


In [82]:
# Encode the test messages with the already-fitted vectorizer (transform
# only, no refit), so train and test share the same columns.
testing_data = count.transform(x_test)

In [83]:
# Sparse printout: each line is "(row, column)" followed by the stored count.
print(testing_data)

  (0, 295)	1
  (0, 361)	1
  (0, 419)	1
  (0, 473)	1
  (0, 508)	1
  (0, 545)	1
  (0, 571)	1
  (0, 882)	1
  (0, 2034)	1
  (0, 2813)	1
  (0, 2978)	1
  (0, 3409)	1
  (0, 3964)	1
  (0, 4310)	1
  (0, 4377)	1
  (0, 4417)	1
  (0, 4848)	1
  (0, 5870)	1
  (0, 5881)	1
  (0, 6146)	1
  (0, 6753)	6
  (0, 7111)	1
  (0, 7126)	1
  (0, 7227)	1
  (1, 882)	1
  :	:
  (1669, 7021)	1
  (1669, 7176)	1
  (1670, 848)	1
  (1670, 2157)	1
  (1670, 2379)	1
  (1670, 3409)	1
  (1670, 3770)	1
  (1670, 3823)	1
  (1670, 4137)	1
  (1670, 4335)	1
  (1670, 4528)	1
  (1670, 4541)	1
  (1670, 4599)	1
  (1670, 5505)	1
  (1670, 5947)	1
  (1670, 6091)	1
  (1670, 6644)	1
  (1670, 6753)	1
  (1670, 6943)	1
  (1671, 2755)	1
  (1671, 3053)	1
  (1671, 3428)	1
  (1671, 3983)	1
  (1671, 4108)	1
  (1671, 6057)	1


In [84]:
# Decision tree that scores splits with the entropy criterion.
# NOTE(review): no random_state is passed; scikit-learn trees may break ties
# between equally good splits differently across runs, so the metrics below
# may not reproduce exactly — consider fixing a seed and re-running.
dt_model = DecisionTreeClassifier(criterion = "entropy")

In [85]:
# Train the tree on the training count matrix and its labels.
dt_model.fit(training_data,y_train)

DecisionTreeClassifier(criterion='entropy')

In [86]:
# Predict labels for the held-out messages.
dt_pred = dt_model.predict(testing_data)

In [87]:
# Inspect the predictions (0 = ham, 1 = spam, per the label_no mapping).
dt_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [88]:
def print_metrics(y_true, preds, model_name=None):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    preds : array-like
        Predicted labels, aligned element-wise with ``y_true``.
    model_name : str, optional
        Label inserted into every printed line. The default ``None`` is
        rendered as an empty label; previously it raised ``TypeError``
        because ``None`` was concatenated to a string.

    Returns
    -------
    None
        The scores are written to standard output only.
    """
    # Normalise the label once so string concatenation can never fail.
    label = '' if model_name is None else str(model_name)

    # Prefixes are kept exactly as before so existing output is unchanged.
    metrics = (
        ('Accuracy score for ', accuracy_score),
        ('Precision score ', precision_score),
        ('Recall score ', recall_score),
        ('F1 score ', f1_score),
    )
    for prefix, metric in metrics:
        print(prefix + label + ' :', format(metric(y_true, preds)))

In [89]:
# Report the decision tree's scores on the held-out set.
print_metrics(y_test, dt_pred, model_name='DT')

Accuracy score for DT : 0.9647129186602871
Precision score DT : 0.8857142857142857
Recall score DT : 0.8416289592760181
F1 score DT : 0.8631090487238979


In [90]:
def give_input(test_message):
    """Classify one or more raw messages with the trained decision tree.

    Parameters
    ----------
    test_message : str or iterable of str
        A single message, or a collection of messages. The value is
        wrapped in a ``pd.Series`` before vectorizing, so a lone string
        becomes a one-element batch.

    Returns
    -------
    numpy.ndarray
        Predicted ``label_no`` values (0 = ham, 1 = spam), one per message.

    Notes
    -----
    Reuses the notebook-level ``count`` vectorizer that was fitted on
    ``x_train`` and used to build the matrix ``dt_model`` was trained on.
    The previous version re-fitted a brand-new ``CountVectorizer`` on every
    call, which repeated the fit for no benefit and would silently produce
    mismatched feature columns if ``count`` were ever configured differently
    from the defaults.
    """
    features = count.transform(pd.Series(test_message))
    return dt_model.predict(features)

In [91]:
# Spot-check the classifier on the message stored at index label 5570.
give_input(df.loc[5570, "Message"])

array([0], dtype=int64)

In [92]:
# Ground-truth label for the same row, for comparison with the prediction.
df.loc[5570, "label_no"]

0