In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

from pprint import pprint

from env import get_db_url
from prep import basic_clean, stem, lemmatize

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 250)



### Acquire data

In [2]:
df = pd.read_csv('spam.csv')

In [3]:
#Use function from env module to query data in MySQL server
url = get_db_url('spam_db')
df = pd.read_sql('SELECT * FROM spam', url)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5572 non-null   int64 
 1   label   5572 non-null   object
 2   text    5572 non-null   object
dtypes: int64(1), object(2)
memory usage: 130.7+ KB


Unnamed: 0,id,label,text
0,0,ham,"Go until jurong point, crazy.. Available only ..."
1,1,ham,Ok lar... Joking wif u oni...
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,ham,U dun say so early hor... U c already then say...
4,4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#Save data to csv for future easy access
df.to_csv('spam.csv', index=False)

### Clean data

In [5]:
# Condense df.text into strings and run cleaning function
all_text = ' '.join(df.text)
spam_text = ' '.join(df[df.label == 'spam'].text)
ham_text = ' '.join(df[df.label == 'ham'].text)
all_text = basic_clean(all_text)
spam_text = basic_clean(spam_text)
ham_text = basic_clean(ham_text)



In [6]:
all_freq = pd.Series(all_text.split()).value_counts()
ham_freq = pd.Series(ham_text.split()).value_counts()
spam_freq = pd.Series(spam_text.split()).value_counts()

In [7]:
df['clean_text'] = df.text.apply(basic_clean)

df.head()

Unnamed: 0,id,label,text,clean_text
0,0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don't think he goes to usf he lives arou...


In [8]:
df['lemmatized_text'] = df.clean_text.apply(lemmatize)

In [9]:
df.head()

Unnamed: 0,id,label,text,clean_text,lemmatized_text
0,0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,go until jurong point crazy available only in ...
1,1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joking wif u oni
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,u dun say so early hor u c already then say
4,4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don't think he goes to usf he lives arou...,nah i don't think he goes to usf he lives arou...


In [10]:
lemmatized_df = df.copy()[['label', 'lemmatized_text']]

lemmatized_df.head()

Unnamed: 0,label,lemmatized_text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don't think he goes to usf he lives arou...


In [11]:
X_lem = lemmatized_df[['lemmatized_text']]
y_lem = lemmatized_df.label

X_lem_train, X_lem_test, y_lem_train, y_lem_test = train_test_split(X_lem, y_lem, test_size=.2, random_state=302)
X_lem_train, X_lem_validate, y_lem_train, y_lem_validate  = train_test_split(X_lem_train, y_lem_train, test_size=.3, random_state=302)

print(X_lem_train.shape, X_lem_validate.shape, X_lem_test.shape)

(3119, 1) (1338, 1) (1115, 1)


In [12]:
X_lem_train.head()

Unnamed: 0,lemmatized_text
4614,sunshine quiz win a super sony dvd recorder if...
1575,sounds gd haha can wah u yan jiu so fast liao
5304,pls ask macho how much is budget for bb bold 2...
5261,i absolutely love south park i only recently s...
314,you made my day do have a great day too


In [27]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
import graphviz

cv = CountVectorizer()
X_lem_bow = cv.fit_transform(X_lem_train.lemmatized_text)

tree = DecisionTreeClassifier(max_depth=15)
tree.fit(X_lem_bow, y_lem_train)

tree.score(X_lem_bow, y_lem_train)



0.9852516832318051

In [28]:
X_lem_val_bow = cv.transform(X_lem_validate.lemmatized_text)


tree.score(X_lem_val_bow, y_lem_validate)

0.9484304932735426

In [15]:
def score_spam_trees(depth):
    cv = CountVectorizer()
    X_lem_bow = cv.fit_transform(X_lem_train.lemmatized_text)

    tree = DecisionTreeClassifier(max_depth=depth)
    tree.fit(X_lem_bow, y_lem_train)

    spam_tree_score = tree.score(X_lem_bow, y_lem_train)
    return spam_tree_score

print(score_spam_trees(17))

for i in range(1,20+1):
    tree_scores = []
    tree_scores.append(score_spam_trees(i))

tree_scores
    

0.986213529977557


[0.9890990702148125]

In [16]:
bc_df = df.copy()[['label', 'clean_text']]

bc_df.head()


Unnamed: 0,label,clean_text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don't think he goes to usf he lives arou...


In [17]:
X_bc = bc_df[['clean_text']]
y_bc = bc_df.label

X_bc_train, X_bc_test, y_bc_train, y_bc_test = train_test_split(X_bc, y_bc, test_size=.2, random_state=302)
X_bc_train, X_bc_validate, y_bc_train, y_bc_validate  = train_test_split(X_bc_train, y_bc_train, test_size=.3, random_state=302)

print(X_bc_train.shape, X_bc_validate.shape, X_bc_test.shape)

(3119, 1) (1338, 1) (1115, 1)


In [18]:
X_bc_bow = cv.fit_transform(X_bc_train.clean_text)

bc_tree = DecisionTreeClassifier(max_depth=15)
bc_tree.fit(X_bc_bow, y_bc_train)

bc_tree.score(X_bc_bow, y_bc_train)

0.9852516832318051

In [19]:
X_bc_val_bow = cv.transform(X_bc_validate.clean_text)

bc_tree.score(X_bc_val_bow, y_bc_validate)

0.9506726457399103

In [20]:
tfidf = TfidfVectorizer()

X_tf_lembow = tfidf.fit_transform(X_lem_train.lemmatized_text)

lem_tree = DecisionTreeClassifier(max_depth=15)
lem_tree.fit(X_tf_lembow, y_lem_train)

lem_tree.score(X_tf_lembow, y_lem_train)


0.989740301378647

In [21]:
X_tf_val_lembow = tfidf.transform(X_lem_validate.lemmatized_text)

lem_tree.score(X_tf_val_lembow, y_lem_validate)

0.9499252615844545

In [22]:
X_tf_bcbow = tfidf.fit_transform(X_bc_train.clean_text)

tfbc_tree = DecisionTreeClassifier(max_depth=15)
tfbc_tree.fit(X_tf_bcbow, y_bc_train)

tfbc_tree.score(X_tf_bcbow, y_bc_train)

0.989740301378647

In [23]:
X_tf_val_bcbow = tfidf.transform(X_bc_validate.clean_text)


tfbc_tree.score(X_tf_val_bcbow, y_bc_validate)

0.9506726457399103