### Spam Detection using Text Mining and Naive Bayes Classifier

- Each SMS message is either spam or ham (legitimate).

In [None]:
# Import the libraries used for data management
import pandas as pd
import numpy as np

# Load the dataset 'SpamHamEmail.csv' (the path is relative to the current working directory)
dataset = pd.read_csv('SpamHamEmail.csv')

In [None]:
# Preview the first rows of the raw data
dataset.head()

In [None]:
# Count the messages in each class to check the ham/spam balance
dataset['Class'].value_counts()

### Data Preparation

In [None]:
# Convert the 'Class' column into a numeric 'Label' column: 'ham' as 0, 'spam' as 1
# NOTE: any value other than exactly 'ham' or 'spam' has no entry in the mapping
# and ends up as NaN in 'Label' (pandas Series.map behaviour with a dict).

dataset['Label'] = dataset['Class'].map({'ham':0, 'spam':1})

In [None]:
# Preview the data again to confirm the new 'Label' column
dataset.head()

In [None]:
# Split the frame into the model input and target:
# X is the raw message text, y is the numeric label (0 = ham, 1 = spam)
X, y = dataset['Text'], dataset['Label']

In [None]:
# Show the dimensions of X
# X is a single DataFrame column, so the shape is a 1-tuple holding the number of messages
# https://pandas.pydata.org/docs/reference/api/pandas.Series.shape.html
X.shape

In [None]:
# Import the CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Now we are ready to vectorize the data
# First, instantiate the vectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# lowercase=True        : convert all text to lowercase before tokenizing
# stop_words='english'  : drop scikit-learn's built-in English stop words
# min_df=0.02           : ignore terms that appear in fewer than 2% of the documents
# max_df=0.5            : ignore terms that appear in more than 50% of the documents
vectorizer = CountVectorizer(lowercase=True,stop_words='english',min_df=0.02, max_df=0.5)

In [None]:
# Learn the vocabulary from X and return the document-term matrix
# (one row per message, one column per vocabulary term).
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.fit_transform
X_vec = vectorizer.fit_transform(X)

In [None]:
# Print the terms kept in the vocabulary (one per column of X_vec).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# tolist() is used to print a plain Python list of strings as before.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.get_feature_names_out
print(vectorizer.get_feature_names_out().tolist())

In [None]:
# Number of terms in the vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
len(vectorizer.get_feature_names_out())

In [None]:
# Show the count of each term in each document
# (toarray() converts the sparse matrix into a dense NumPy array)
print(X_vec.toarray())

In [None]:
# Print the same matrix in its sparse form (only the stored, non-zero entries are listed)
print(X_vec)

### Model Building using Multinomial NB

In [None]:
#Import Multinomial Naive Bayes model from sklearn
from sklearn.naive_bayes import MultinomialNB

# Create a Multinomial Naive Bayes classifier, a model suited to discrete
# features such as the word counts in X_vec (it is also often used with tf-idf features)
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
mnb = MultinomialNB()

### Model Evaluation

In [None]:
# Now let's import cross validation
from sklearn.model_selection import cross_val_score,cross_val_predict

In [None]:
# Evaluate the classifier with 10-fold cross-validation
# score holds one value per fold, computed with the estimator's default scorer
# (mean accuracy for a classifier)
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
# NOTE(review): X_vec was built by fitting the vectorizer on the full dataset, so
# the vocabulary was learned from every fold; wrap the vectorizer and the classifier
# in a Pipeline if strict isolation between folds is required.
score = cross_val_score(mnb,X_vec,y,cv=10)


In [None]:
# Average of the per-fold scores
score.mean()

In [None]:
# import libraries for evaluation measures
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Get a cross-validated class prediction for every message: each message is
# predicted by a model trained on the folds that do not contain it
y_pred_class = cross_val_predict(mnb,X_vec,y,cv=10)

In [None]:
# Inspect the predicted labels (0 = ham, 1 = spam)
y_pred_class

In [None]:
# Compare the true labels with the cross-validated predictions
# In the confusion matrix, rows are the true classes and columns are the
# predicted classes, ordered 0 (ham) then 1 (spam)
print("Confusion Matrix:", "\n", confusion_matrix(y, y_pred_class))
print("Classification Report:","\n", classification_report(y, y_pred_class))

In [None]:
# y_pred_prob is the cross-validated probability estimate: one row per message
# and one column per class (column 0 = ham, column 1 = spam)
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB.predict_proba
y_pred_prob = cross_val_predict(mnb,X_vec,y,cv=10, method ='predict_proba')

In [None]:
# Inspect the predicted class probabilities
y_pred_prob

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# y is the true label; y_pred_prob[:,1] is the predicted probability of belonging to class 1 (spam), which is treated as the positive class
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
fpr, tpr, thresholds = roc_curve(y, y_pred_prob[:,1], pos_label=1)


In [None]:
# Area under the ROC curve
roc_auc=auc(fpr,tpr)
roc_auc

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Draw the cross-validated ROC curve together with the chance-level diagonal
plt.figure()
lw = 2

# ROC curve of the classifier; the legend entry reports the AUC
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    marker='o',
    linewidth=lw,
    label='ROC curve (area = %0.2f)' % roc_auc,
)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', linewidth=lw)

# Titles and axis labels
plt.title('Receiver operating characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

# Small margins so the markers at the corners stay visible
plt.xlim(-0.05, 1.0)
plt.ylim(0.0, 1.05)

plt.legend(loc="lower right")
plt.show()

In [None]:
# Fit the final model on the full dataset (all rows of X_vec and y)
mnb.fit(X_vec, y)

In [None]:
# Now let's see the ham messages that have been classified as spam (false positives)
# y_pred_class holds the cross-validated predictions from cross_val_predict,
# not predictions from the model fitted on the full dataset
X[(y == 0) & (y_pred_class==1)]

#### TODO
- Please try TfidfVectorizer later to see the model performance (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
- You need to use "from sklearn.feature_extraction.text import TfidfVectorizer" to import the TfidfVectorizer first