# Setup Notebook

In [107]:
import IPython
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Data Preparation

In [108]:
msg_df = pd.read_csv('spam.csv', encoding='cp1252')
msg_df = msg_df.drop(msg_df.columns[[2, 3, 4]], axis=1)

msg_df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [109]:
# converting label to 0s and 1s 
for index, row in msg_df.iterrows():
        if row["v1"] == "ham":
            row["v1"] = 0
        else:
            row["v1"] = 1

msg_df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [110]:
# split data into feature and labels
label = msg_df['v1']
feature = msg_df['v2']

In [111]:
# split the data into training and testing (30% split)
train_feature, test_feature, train_label, test_label = train_test_split(feature, label, test_size = 0.3)

# uses TF-IDF to identify the features we will be using
cv = TfidfVectorizer(min_df = 1, stop_words="english")
train_feat_cv = cv.fit_transform(train_feature)
a = train_feat_cv.toarray()

# Create a Model Using Gaussian Naives Bayes Classifier

In [112]:
from sklearn.model_selection import cross_validate


# create a classifier
gnb = GaussianNB()
train_label = train_label.astype('int')

# fit the data
gnb.fit(train_feat_cv.todense(), train_label)

# identify the features in the testing data
test_feat_cv = cv.transform(test_feature)

# uses the model we created from the training 
# data to predict the testing dataset
pred = gnb.predict(test_feat_cv.toarray())

In [113]:
actual = np.array(test_label)
type(actual)

count = 0
for i in range(len(pred)): 
    if pred[i] == actual[i]:
        count = count + 1

count
print('accuracy:', count/len(pred))

accuracy: 0.8995215311004785


In [114]:
## calculate the log loss data
from sklearn.metrics import log_loss
log_loss(b, pred)

9.62637899597505

In [115]:
from sklearn.metrics import precision_score
precision_score(b, pred)

0.1323529411764706

# Create a Model Using Random Forrest Classifier

In [98]:
rf = RandomForestClassifier()
rf.fit(train_feat_cv.todense(), train_label)

rf_pred = rf.predict(test_feat_cv.toarray())

In [99]:
actual = np.array(test_label)
type(actual)

count = 0
for i in range(len(pred)): 
    if pred[i] == actual[i]:
        count = count + 1

count
print('accuracy:', count/len(pred))

accuracy: 0.9019138755980861
