In [58]:
# Libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [18]:
# Load the raw spam/ham message dataset. The encoding is passed explicitly —
# presumably because the file is not valid UTF-8 (TODO confirm).
df = pd.read_csv(r"datasets/spam.csv", encoding="ISO-8859-1")
# Preview the first rows to see the raw column layout.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [19]:
# Inspect dtypes and non-null counts for every raw column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [20]:
# Discard the three unnamed columns in place; only v1 and v2 are kept.
df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], inplace=True)

In [21]:
# Give the columns descriptive names. The assignment is positional:
# the first column becomes "label", the second becomes "text".
df.columns = ["label", "text"]
df.head(2)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


### Exploratory Data Analysis

In [22]:
# Re-check dtypes and non-null counts of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [24]:
# Class balance of the target column (number of rows per label).
print(df["label"].value_counts())

label
ham     4825
spam     747
Name: count, dtype: int64


### Text Cleaning and Preprocessing


In [31]:
nltk.download("stopwords") # downloads the stop-word lists (and, the, with, etc.); the filtering itself happens in the cleaning loop
nltk.download("wordnet") # WordNet data used by WordNetLemmatizer to find lemmas
nltk.download("omw-1.4") # NOTE(review): presumably required alongside wordnet by the lemmatizer — confirm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yacan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yacan\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yacan\AppData\Roaming\nltk_data...


True

In [37]:
# Pull the raw messages out of the frame as a plain Python list.
text = df["text"].tolist()
# A single lemmatizer instance, created once up front.
lemmatizer = WordNetLemmatizer()
print(text[:5])

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though"]


In [43]:
# Build the cleaned corpus: one normalized, space-joined string per message.
#
# The English stop-word collection is built once, as a set, before the loop:
# it does not change between tokens, and a set gives a constant-time
# membership test instead of scanning a list for every word.
stop_words = set(stopwords.words("english"))

corpus = []
for message in text:
    letters_only = re.sub(r"[^a-zA-Z]", " ", message)              # replace every non-letter character with a space
    tokens = letters_only.lower().split()                          # lowercase, then split on whitespace
    tokens = [word for word in tokens if word not in stop_words]   # drop stop words
    # lemmatize() is called without a part-of-speech argument, so the
    # lemmatizer's default is used for every token.
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    corpus.append(" ".join(tokens))

In [45]:
# Spot-check the first five cleaned messages.
corpus[:5]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though']

In [47]:
# Pair each original label with its cleaned message text in a fresh frame.
new_df = pd.DataFrame({"label": df["label"], "text": corpus})

new_df.head(5)

Unnamed: 0,label,text
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


### Model Training and Evaluation

In [51]:
# Features are the cleaned messages; targets are the labels.
X, y = new_df["text"], new_df["label"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of every partition.
for caption, part in (
    ("Size of X_train : ", X_train),
    ("Size of X_test : ", X_test),
    ("Size of Y_train : ", y_train),
    ("Size of y_test : ", y_test),
):
    print(caption, len(part))

Size of X_train :  4457
Size of X_test :  1115
Size of Y_train :  4457
Size of y_test :  1115


In [54]:
# Bag Of Words
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)

In [57]:
# Fit a decision tree on the bag-of-words training matrix.
# random_state is pinned (same seed as the train/test split) so that a fresh
# Restart & Run All rebuilds the same tree and reports the same metrics;
# without it the fitted tree can differ from run to run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_cv, y_train)

# Encode the test messages with the already-fitted vectorizer
# (transform only — the vocabulary is not re-learned here).
X_test_cv = cv.transform(X_test)

In [60]:
# Predict labels for the held-out messages, then tabulate them against the true labels.
prediction = dt.predict(X_test_cv)
cfmt = confusion_matrix(y_true=y_test, y_pred=prediction)

In [62]:
# Accuracy = correctly classified samples / all samples.
# Correct predictions sit on the diagonal of the confusion matrix, so
# trace / sum equals adding the individual cells by hand for a 2x2 matrix,
# and it keeps working unchanged for any number of classes.
accuracy = cfmt.trace() / cfmt.sum()
print(accuracy)

0.9730941704035875
