In [1]:
import numpy as np
import pandas as pd

In [36]:
from nltk.tokenize import word_tokenize

In [45]:
spam = pd.read_csv('spam.csv')

In [46]:
spam.columns

Index(['label', 'sms'], dtype='object')

In [47]:
spam.head(10)

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [48]:
spam.describe()

Unnamed: 0,label,sms
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


Let's tokenize them to get the word frequency

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

In [49]:
spam.label

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [50]:
spam.loc[:,'label'] = spam.label.map({'ham':0, 'spam':1})
print(spam.shape)

(5572, 2)


In [51]:
spam.head()

Unnamed: 0,label,sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Before vectorizing let's split them in testing and training data

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(spam['sms'], 
                                                    spam['label'],test_size=0.20, 
                                                    random_state=1)

In [53]:
X_train

1642                             Sleeping nt feeling well
2899    Come aftr  &lt;DECIMAL&gt; ..now i m cleaning ...
480                          Almost there, see u in a sec
3485                     Yeah, probably earlier than that
157     Hello, my love. What are you doing? Did you ge...
                              ...                        
905     Hey what's up charles sorry about the late reply.
5192    Oh oh... Den muz change plan liao... Go back h...
3980    Huh i cant thk of more oredi how many pages do...
235     I have printed it oh. So  &lt;#&gt;  come upst...
5157                              K k:) sms chat with me.
Name: sms, Length: 4457, dtype: object

In [54]:
y_train

1642    0
2899    0
480     0
3485    0
157     0
       ..
905     0
5192    0
3980    0
235     0
5157    0
Name: label, Length: 4457, dtype: int64

Let's implement Linear regression first

In [94]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score#Import scikit-learn metrics module for accuracy calculation

In [64]:
dt = DecisionTreeClassifier()

In [65]:
classification = dt.fit(training_data, y_train)

In [67]:
y_pred = classification.predict(testing_data)

In [68]:
accuracy = metrics.accuracy_score(y_test, y_pred)

In [107]:
print(f"Accuracy using Decision trees: {accuracy*100}%")
print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Precision_score: {precision_score(y_test, y_pred)}")
print(f"Accuracy_score: {accuracy_score(y_test, y_pred)}")

Accuracy using Decision trees: 96.8609865470852%
F1 score: 0.8701754385964912
Precision_score: 0.8493150684931506
Accuracy_score: 0.9668161434977578


Now, let's tune it a bit and see if we can acheive a higher accuracy

In [100]:
dt1 = DecisionTreeClassifier(criterion="entropy", max_depth=10)
classification1 = dt1.fit(training_data, y_train)
y_pred1 = classification1.predict(testing_data)
accuracy = metrics.accuracy_score(y_test, y_pred1)
print(f"Accuracy using Decision trees with maximum depth 10: {accuracy*100}%")
print(f"F1 score: {f1_score(y_test, y_pred1)}")
print(f"Precision_score: {precision_score(y_test, y_pred1)}")
print(f"Accuracy_score: {accuracy_score(y_test, y_pred1)}")

Accuracy using Decision trees with maximum depth 10: 96.05381165919282%
F1 score: 0.8482758620689654
F1 score: 0.8145695364238411
F1 score: 0.9605381165919282


In [110]:
dt1 = DecisionTreeClassifier(criterion="entropy", max_depth=15)
classification1 = dt1.fit(training_data, y_train)
y_pred1 = classification1.predict(testing_data)
accuracy = metrics.accuracy_score(y_test, y_pred1)
print(f"Accuracy using Decision trees with maximum depth 15: {accuracy*100}%")
print(f"F1 score: {f1_score(y_test, y_pred1)}")
print(f"Precision_score: {precision_score(y_test, y_pred1)}")
print(f"Accuracy_score: {accuracy_score(y_test, y_pred1)}")

Accuracy using Decision trees with maximum depth 15: 96.32286995515696%
F1 score: 0.8581314878892733
Precision_score: 0.8266666666666667
Accuracy_score: 0.9632286995515695


In [117]:
dt1 = DecisionTreeClassifier(criterion="entropy", max_depth=17)
classification1 = dt1.fit(training_data, y_train)
y_pred1 = classification1.predict(testing_data)
accuracy = metrics.accuracy_score(y_test, y_pred1)
print(f"Accuracy using Decision trees with maximum depth 17: {accuracy*100}%")
print(f"F1 score: {f1_score(y_test, y_pred1)}")
print(f"Precision_score: {precision_score(y_test, y_pred1)}")
print(f"Accuracy_score: {accuracy_score(y_test, y_pred1)}")

Accuracy using Decision trees with maximum depth 17: 96.32286995515696%
F1 score: 0.8581314878892733
Precision_score: 0.8266666666666667
Accuracy_score: 0.9632286995515695


In [119]:
dt1 = DecisionTreeClassifier(criterion="entropy", max_depth=19)
classification1 = dt1.fit(training_data, y_train)
y_pred1 = classification1.predict(testing_data)
accuracy = metrics.accuracy_score(y_test, y_pred1)
print(f"Accuracy using Decision trees with maximum depth 19: {accuracy*100}%")
print(f"F1 score: {f1_score(y_test, y_pred1)}")
print(f"Precision_score: {precision_score(y_test, y_pred1)}")
print(f"Accuracy_score: {accuracy_score(y_test, y_pred1)}")

Accuracy using Decision trees with maximum depth 19: 96.7713004484305%
F1 score: 0.8758620689655172
Precision_score: 0.8410596026490066
Accuracy_score: 0.967713004484305


In [120]:
dt1 = DecisionTreeClassifier(criterion="entropy", max_depth=20)
classification1 = dt1.fit(training_data, y_train)
y_pred1 = classification1.predict(testing_data)
accuracy = metrics.accuracy_score(y_test, y_pred1)
print(f"Accuracy using Decision trees with maximum depth 20: {accuracy*100}%")
print(f"F1 score: {f1_score(y_test, y_pred1)}")
print(f"Precision_score: {precision_score(y_test, y_pred1)}")
print(f"Accuracy_score: {accuracy_score(y_test, y_pred1)}")

Accuracy using Decision trees with maximum depth 20: 97.04035874439462%
F1 score: 0.8858131487889273
Precision_score: 0.8533333333333334
Accuracy_score: 0.9704035874439462


In [122]:
dt1 = DecisionTreeClassifier(criterion="entropy", max_depth=22)
classification1 = dt1.fit(training_data, y_train)
y_pred1 = classification1.predict(testing_data)
accuracy = metrics.accuracy_score(y_test, y_pred1)
print(f"Accuracy using Decision trees with maximum depth 22 {accuracy*100}%")
print(f"F1 score: {f1_score(y_test, y_pred1)}")
print(f"Precision_score: {precision_score(y_test, y_pred1)}")
print(f"Accuracy_score: {accuracy_score(y_test, y_pred1)}")

Accuracy using Decision trees with maximum depth 22 96.95067264573991%
F1 score: 0.880281690140845
Precision_score: 0.8620689655172413
Accuracy_score: 0.9695067264573991


### Like we can see, Decision tree with maximum depth of 20 is what gives us the best accuracy,  if we try to increase the accuracy a bit more it starts to overfit and the accuracy starts going down.