# Setup Notebook

In [20]:
import IPython
import numpy as np
import pandas as pd
import matplotlib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Data Preparation

In [21]:
# Read spam.csv and discard the columns at positions 2, 3 and 4 in a single
# expression, so msg_df is bound exactly once to the trimmed frame.
msg_df = pd.read_csv('spam.csv', encoding='cp1252').pipe(
    lambda frame: frame.drop(frame.columns[[2, 3, 4]], axis=1)
)

msg_df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [22]:
# Convert the text labels in column v1 to integers: "ham" -> 0, anything else -> 1.
#
# The conversion is a single column assignment rather than a write into the
# rows yielded by iterrows(): those rows may be copies, so assigning to them is
# not guaranteed to change msg_df.
#
# An existing 0 is treated like "ham" so that re-running this cell on the
# already-converted column leaves it unchanged instead of turning every label
# into 1.
msg_df["v1"] = msg_df["v1"].map(lambda value: 0 if value in ("ham", 0) else 1)

msg_df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [23]:
# Separate the frame into the model input (message text, column v2) and the
# target (0/1 label, column v1).
label, feature = msg_df['v1'], msg_df['v2']

In [46]:
# Hold out 30% of the messages for testing. random_state pins the shuffle so
# the split, and every score computed from it, is the same on each run.
train_feature, test_feature, train_label, test_label = train_test_split(
    feature, label, test_size=0.3, random_state=0
)

# Learn the TF-IDF vocabulary and weights from the training messages only.
# The sparse result is converted to a dense array because GaussianNB, fitted
# on it below, does not accept sparse input.
cv = TfidfVectorizer(min_df=1, stop_words="english")
train_feat_cv = cv.fit_transform(train_feature).toarray()
train_label = train_label.astype('int')

# Re-use the fitted vocabulary for the test messages (transform, not fit), so
# nothing about the test set leaks into the features.
test_feat_cv = cv.transform(test_feature).toarray()
test_label = test_label.astype('int')

# Create a Model Using Gaussian Naive Bayes Classifier

In [47]:
# Build a Gaussian Naive Bayes classifier and train it on the TF-IDF training
# matrix; fit() returns the estimator, so both steps are chained.
gnb = GaussianNB().fit(train_feat_cv, train_label)

# Label the held-out test messages with the trained model.
gnb_pred = gnb.predict(test_feat_cv)

In [34]:
from sklearn.cross_validation import KFold, cross_val_score, cross_val_predict
k_fold = KFold(len(train_feat_cv), n_folds=10, shuffle=True, random_state=0)
gnb = GaussianNB()
train_label = train_label.astype('int')
print(cross_val_score(gnb, train_feat_cv, train_label, cv=k_fold, n_jobs=1))

[ 0.90512821  0.89487179  0.87948718  0.90512821  0.89487179  0.8974359
  0.88974359  0.85897436  0.85641026  0.86153846]


In [56]:
predictions = cross_val_predict(gnb, test_feat_cv, test_label)
k_fold_test = KFold(len(test_feat_cv), n_folds=10, shuffle=True, random_state=0)
print(cross_val_score(gnb, predictions.reshape(-1, 1), test_label, cv=k_fold_test, n_jobs=1))

[ 0.88095238  0.93452381  0.88622754  0.8502994   0.88023952  0.93413174
  0.92814371  0.8742515   0.8742515   0.85628743]


In [60]:
# Fraction of test messages the fitted Gaussian Naive Bayes model labelled correctly.
accuracy_score(y_true=test_label, y_pred=gnb_pred)

0.87081339712918659

In [61]:
# Fraction of test messages whose cross-validated (out-of-fold) prediction is correct.
accuracy_score(y_true=test_label, y_pred=predictions)

0.88995215311004783

# Create a Model Using Random Forest Classifier

In [29]:
# Hold out 30% of the messages for testing. random_state pins the shuffle so
# the split, and the accuracies reported below, are the same on each run.
train_feature, test_feature, train_label, test_label = train_test_split(
    feature, label, test_size=0.3, random_state=0
)

# Learn the TF-IDF vocabulary and weights from the training messages only and
# store the result as a dense array.
cv = TfidfVectorizer(min_df=1, stop_words="english")
train_feat_cv = cv.fit_transform(train_feature).toarray()
train_label = train_label.astype('int')

# Re-use the fitted vocabulary for the test messages (transform, not fit).
test_feat_cv = cv.transform(test_feature).toarray()
test_label = test_label.astype('int')

In [30]:
# Train a random forest on the TF-IDF training matrix. random_state fixes the
# forest's internal randomness so the reported accuracy does not change from
# one run to the next.
rf = RandomForestClassifier(random_state=0)
rf.fit(train_feat_cv, train_label)

# Label the held-out test messages with the trained forest.
rf_pred = rf.predict(test_feat_cv)

In [31]:
# Fraction of test messages the random forest labelled correctly.
accuracy_score(y_true=test_label, y_pred=rf_pred)

0.97607655502392343

# Create a Model Using Logistic Regression

In [32]:

# create the classifier
lr = LogisticRegression()

# fit the data
lr.fit(train_feat_cv, train_label)

# Predict on the TF-IDF test matrix that was built together with the training
# matrix. It is deliberately not recomputed here: rebinding test_feat_cv to the
# sparse output of cv.transform would leave a different kind of object under
# the same name for any cell that is run afterwards.
pred = lr.predict(test_feat_cv)

accuracy_score(test_label, pred)

0.9611244019138756