In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 

# Building an SMS spam detector

In [None]:
# read the tab-separated SMS file; it has no header row, so the column names are supplied here
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/sms.tsv.txt'
sms = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

In [None]:
# preview the first rows of the loaded frame (columns: label, message)
sms.head()

In [None]:
# spam example: the message at position 100 among the rows labelled 'spam'
print(sms.loc[sms.label == 'spam', 'message'].iloc[100])

In [None]:
# ham example: the message at position 100 among the rows labelled 'ham'
print(sms.loc[sms.label == 'ham', 'message'].iloc[100])

In [None]:
# number of messages carrying each label
sms['label'].value_counts()

In [None]:
# feature matrix/target vector: the raw message text is the input, its label the target
X, y = sms['message'], sms['label']

In [None]:
# train/test split
# random_state pins the shuffle, so the same rows end up in each split on every
# Restart & Run All and every downstream score is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# initialize the vectorizer with non-default settings: drop English stop words,
# cap the vocabulary at 1000 features, and ignore terms that appear in fewer than 10 documents
vect = CountVectorizer(stop_words='english',max_features=1000,min_df=10)

In [None]:
# learn the training vocabulary and build the training document-term matrix in one step
X_train_dtm = vect.fit_transform(X_train)

In [None]:
# transform testing data into a document-term matrix using the vocabulary already
# fitted on the training data (transform only, no re-fit, so the columns match X_train_dtm)
X_test_dtm = vect.transform(X_test)

## Naive Bayes model

In [None]:
# initialize a Multinomial Naive Bayes model with its default settings
# (MultinomialNB is already imported in the notebook's first cell, so it is not re-imported here)
nb_clf = MultinomialNB()

In [None]:
# train the model on the training document-term matrix and the training labels
nb_clf.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for X_test_dtm (one predicted label per test message)
y_test_pred = nb_clf.predict(X_test_dtm)

In [None]:
# evaluate the model
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# accuracy: fraction of test messages whose predicted label equals the true label
accuracy_score(y_test, y_test_pred)

In [None]:
# confusion matrix: rows are the true labels, columns are the predicted labels
confusion_matrix(y_test, y_test_pred)

In [None]:
# message text of the false positives (ham incorrectly classified as spam)
X_test.loc[(y_test == 'ham') & (y_test_pred == 'spam')]

In [None]:
# example of a false positive
# The row is picked by position within the false positives rather than by a hard-coded
# index label: which labels land in X_test depends on the train/test split, so a fixed
# label such as 4862 can raise KeyError or point at a correctly classified message.
false_positives = X_test[(y_test == 'ham') & (y_test_pred == 'spam')]
false_positives.iloc[0] if len(false_positives) else None

In [None]:
# message text of the false negatives (spam incorrectly classified as ham)
X_test.loc[(y_test == 'spam') & (y_test_pred == 'ham')]

In [None]:
# example of a false negative
# The row is picked by position within the false negatives rather than by a hard-coded
# index label: which labels land in X_test depends on the train/test split, so a fixed
# label such as 68 can raise KeyError or point at a correctly classified message.
false_negatives = X_test[(y_test == 'spam') & (y_test_pred == 'ham')]
false_negatives.iloc[0] if len(false_negatives) else None

## How does Naive Bayes choose between spam and ham

In [None]:
# store the vocabulary the vectorizer learned from X_train:
# one entry per column of the document-term matrices
words = vect.get_feature_names_out()
words

In [None]:
# vocabulary size (bounded by the max_features=1000 set on the vectorizer)
len(words)

In [None]:
# class labels known to the model; their order matches the rows of feature_count_
nb_clf.classes_

In [None]:
# Naive Bayes counts the number of times each word appears in each class.
# Rows represent classes (ham and spam, in the order of nb_clf.classes_),
# columns represent the words of the vocabulary.
nb_clf.feature_count_

In [None]:
# per-word occurrence totals, one row of feature_count_ per class:
# row 0 holds the counts over all ham messages, row 1 the counts over all spam messages
ham_word_count = nb_clf.feature_count_[0]
spam_word_count = nb_clf.feature_count_[1]

In [None]:
# create a DataFrame of words with their separate ham and spam counts, indexed by word.
# The index is taken straight from the vectorizer rather than from the `words` variable:
# this cell rebinds `words` to the DataFrame, so reading `words` as its own input would
# make the cell fail (or produce garbage) when it is run a second time.
words = pd.DataFrame(
    {'ham': ham_word_count, 'spam': spam_word_count},
    index=pd.Index(vect.get_feature_names_out(), name='word'),
)
words.head()

In [None]:
# add 1 to the column counts to avoid dividing by 0.
# The columns are rebuilt from the raw per-class counts instead of being incremented
# in place, so running this cell more than once does not keep adding 1.
words['ham'] = ham_word_count + 1
words['spam'] = spam_word_count + 1

In [None]:
# turn the ham and spam counts into frequencies: each column is divided by its own total
words['ham'] = words['ham'].div(words['ham'].sum())
words['spam'] = words['spam'].div(words['spam'].sum())
words.head()

In [None]:
# per-word ratios of the two frequencies: ham-to-spam and its inverse, spam-to-ham
words['ham_ratio'] = words['ham'].div(words['spam'])
words['spam_ratio'] = words['spam'].div(words['ham'])

In [None]:
# top 20 spam words: the 20 rows with the highest spam-to-ham ratio
words.sort_values('spam_ratio', ascending=False).iloc[:20]

In [None]:
# top 20 ham words: the 20 rows with the highest ham-to-spam ratio
words.sort_values('ham_ratio', ascending=False).iloc[:20]

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# two-step pipeline: raw text -> token counts ('vect') -> Naive Bayes classifier ('clf');
# the step names are the prefixes used to address each step's parameters in a grid search
pipe = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
pipe

In [None]:
# total number of messages in the dataset (before the train/test split)
len(X)

In [None]:
# hyperparameter grid for the vectorizer step; keys are '<step name>__<parameter>'
param_dic = {
    'vect__stop_words' : [None, 'english'],
    'vect__ngram_range' : [(1,1),(1,2)], # 1-grams (words) or 1 and 2 grams
    'vect__max_df' : [1.0,0.9,0.8],
    'vect__min_df' : [1,10,25,50],
    'vect__max_features' : [None,500,1000,2000,5000]
}

# 5-fold cross-validated search over every combination, scored on accuracy, using all cores;
# error_score='raise' makes a failing parameter combination raise instead of being silently scored
grid = GridSearchCV(pipe,
                    param_dic,
                    cv=5,
                    scoring='accuracy',
                    n_jobs=-1,
                    error_score='raise')

# Fit on the TRAINING split only. The test split must stay unseen during model selection;
# tuning on X_test and then evaluating on X_test leaks the test data and gives an
# optimistically biased estimate of performance.
grid.fit(X_train, y_train)

In [None]:
# parameter combination with the best mean cross-validated accuracy
grid.best_params_

In [None]:
# the pipeline refit with the best parameters on all the data passed to grid.fit
best_pipe = grid.best_estimator_

In [None]:
# confusion matrix of the best pipeline on the test split
# (the pipeline vectorizes raw text itself, so it takes X_test rather than X_test_dtm;
# note that y_test_pred is overwritten here)
y_test_pred = best_pipe.predict(X_test)
confusion_matrix(y_test, y_test_pred)

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# logistic regression classifier with default settings
log_clf = LogisticRegression()

In [None]:
# train on the same count-based document-term matrix used for Naive Bayes
log_clf.fit(X_train_dtm,y_train)

In [None]:
# class predictions of the logistic regression model (overwrites the earlier y_test_pred)
y_test_pred = log_clf.predict(X_test_dtm)

In [None]:
# test accuracy of the logistic regression model on count features
accuracy_score(y_test,y_test_pred)

In [None]:
# confusion matrix (rows: true labels, columns: predicted labels)
confusion_matrix(y_test,y_test_pred)

In [None]:
# show one false negative of the logistic regression model (spam predicted as ham)
# NOTE(review): position 5 assumes the split produced at least six false negatives;
# with fewer, .iloc[5] raises IndexError
print(X_test[(y_test == 'spam') & (y_test_pred == 'ham')].iloc[5])

In [None]:
# top coefficients
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2; it is also the call used earlier in this
# notebook to read the vocabulary.
coeffs = pd.DataFrame(data = log_clf.coef_.T, index=vect.get_feature_names_out(),columns=['coefficient'])
# horizontal bar chart of the 20 largest coefficients
coeffs.sort_values(by='coefficient').tail(20).plot.barh(figsize=(5,10))

In [None]:
# class labels as ordered by the model
log_clf.classes_

# From occurrences to frequencies

Occurrence count is a good start but there is an issue: longer documents will have higher average count values than shorter documents, even though they might talk about the same topics.

To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called **tf (for Term Frequencies)**.

Another refinement on top of tf is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus. This downscaling is called **tf–idf (for “Term Frequency times Inverse Document Frequency”)**.

Both tf and tf–idf can be computed using [TfidfTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer): pass `use_idf=False` for plain tf, or keep the default `use_idf=True` for tf–idf.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# term-frequency transformer: use_idf=False switches off the idf re-weighting
tf_transformer = TfidfTransformer(use_idf=False)
# fit on the training counts and transform them in one step
X_train_tf = tf_transformer.fit_transform(X_train_dtm)
# apply the already-fitted transformer to the test counts
X_test_tf = tf_transformer.transform(X_test_dtm)

In [None]:
# refit the same LogisticRegression object on the tf features (this replaces the
# earlier count-based fit) and predict the test split
log_clf.fit(X_train_tf,y_train)
y_test_pred = log_clf.predict(X_test_tf)

In [None]:
# test accuracy of the logistic regression model on tf features
accuracy_score(y_test,y_test_pred)

In [None]:
# confusion matrix for the tf-based model (rows: true labels, columns: predicted labels)
confusion_matrix(y_test,y_test_pred)