In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 

# Building an SMS spam detector

In [None]:
# read the tab-separated SMS dataset; header=None because the column names are supplied here
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/sms.tsv.txt'
sms = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

In [None]:
# preview the first five rows of the dataset
sms.head(n=5)

In [None]:
# show one spam message: the 101st spam row by position
print(sms.loc[sms['label'] == 'spam', 'message'].iloc[100])

In [None]:
# show one ham message: the 101st ham row by position
print(sms.loc[sms['label'] == 'ham', 'message'].iloc[100])

In [None]:
# class balance: number of messages per label
sms['label'].value_counts()

In [None]:
# inputs are the raw message texts, the target is the label column
X, y = sms['message'], sms['label']

In [None]:
# train/test split
from sklearn.model_selection import train_test_split
# fix the seed so the split, and every metric and error listing computed from it,
# is identical on each Restart & Run All
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# initialize the vectorizer (with default parameters);
# it is fitted on the training messages in the next cell
vect = CountVectorizer()

In [None]:
# learn the vocabulary from the training messages and encode them as a
# document-term matrix in a single call
X_train_dtm = vect.fit_transform(X_train)

In [None]:
# encode the test messages with the vocabulary already fitted on the training set
# (transform only, no refit, so the test data does not alter the vocabulary)
X_test_dtm = vect.transform(X_test)

In [None]:
# initialize a Multinomial Naive Bayes model
# (MultinomialNB is already imported in the notebook's first cell, so it is not re-imported here)
nb = MultinomialNB()

In [None]:
# fit Naive Bayes on the training document-term matrix and training labels
nb.fit(X=X_train_dtm, y=y_train)

In [None]:
# predicted label for every message in the test document-term matrix
y_test_pred = nb.predict(X=X_test_dtm)

## Evaluation of the performance on the test set

In [None]:
# evaluate the model
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# fraction of test messages whose predicted label matches the true label
accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# confusion matrix of true vs. predicted labels on the test set
# (keyword arguments make the argument order explicit)
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

In [None]:
# false positives: test messages labelled ham that were predicted as spam
X_test[y_test.eq('ham') & (y_test_pred == 'spam')]

In [None]:
# example of a false positive: take the first one found in the current split rather
# than a hard-coded row label, which is only present (and only misclassified) for
# one particular random split and otherwise raises KeyError
false_positives = X_test[(y_test == 'ham') & (y_test_pred == 'spam')]
# full text of the first false positive, or None when this split has none
false_positives.iloc[0] if not false_positives.empty else None

In [None]:
# false negatives: test messages labelled spam that were predicted as ham
X_test[y_test.eq('spam') & (y_test_pred == 'ham')]

In [None]:
# example of a false negative: take the first one found in the current split rather
# than a hard-coded row label, which is only present (and only misclassified) for
# one particular random split and otherwise raises KeyError
false_negatives = X_test[(y_test == 'spam') & (y_test_pred == 'ham')]
# full text of the first false negative, or None when this split has none
false_negatives.iloc[0] if not false_negatives.empty else None

## Compare Naive Bayes to Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# logistic regression classifier with default parameters, to compare against Naive Bayes
log_clf = LogisticRegression()

In [None]:
# fit logistic regression on the same count features used for Naive Bayes
log_clf.fit(X=X_train_dtm, y=y_train)

In [None]:
# NOTE: this rebinds y_test_pred, replacing the Naive Bayes predictions; re-run the
# Naive Bayes predict cell before re-inspecting the Naive Bayes errors above
y_test_pred = log_clf.predict(X=X_test_dtm)

In [None]:
# test-set accuracy of the logistic regression predictions
accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# confusion matrix of true vs. predicted labels for logistic regression
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

# From occurrences to frequencies

Occurrence count is a good start but there is an issue: longer documents will have higher average count values than shorter documents, even though they might talk about the same topics.

To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called **tf (for Term Frequencies)**.

Another refinement on top of tf is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus. This downscaling is called **tf–idf (for “Term Frequency times Inverse Document Frequency”)**.

Both tf and tf–idf can be computed using [TfidfTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer): pass `use_idf=False` to get plain tf, or keep the default `use_idf=True` to get tf–idf.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# term-frequency features only: use_idf=False switches off the idf reweighting
# NOTE(review): norm is left at its default; per the scikit-learn docs that is 'l2'
# (rows scaled to unit Euclidean length) rather than a division by the total word
# count as the text above describes -- confirm this is intended
tf_transformer = TfidfTransformer(use_idf=False)
# fit on the training counts and transform them in one call
X_train_tf = tf_transformer.fit_transform(X_train_dtm)
# apply the already-fitted transformer to the test counts
X_test_tf = tf_transformer.transform(X_test_dtm)

In [None]:
# refit the same estimator on the tf features (this replaces its earlier fit on raw
# counts) and overwrite y_test_pred with the new test-set predictions
y_test_pred = log_clf.fit(X_train_tf, y_train).predict(X_test_tf)

In [None]:
# test-set accuracy of logistic regression trained on tf features
accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# confusion matrix of true vs. predicted labels for logistic regression on tf features
confusion_matrix(y_true=y_test, y_pred=y_test_pred)