# Data Preparation for Model

This notebook explores which data preparation works best for a baseline spam-classification model.

## Setup

### Imports

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
import os

%matplotlib inline
sns.set_style("whitegrid")

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics

### Globals 

In [51]:
# Colour map shared by matplotlib images and the seaborn palette ("Greens" is an alternative).
COLOR_MAP = "viridis"

# Apply the seaborn theme first: set_theme() installs seaborn's style rcParams, which
# include an "image.cmap" entry, so the explicit override has to come afterwards to stick.
sns.set_theme(style="whitegrid", palette=COLOR_MAP)
plt.rcParams["image.cmap"] = COLOR_MAP

# NOTE: despite its name, `current_dir` is the *parent* of the working directory;
# the data folder is resolved relative to that parent.
current_dir = Path(os.getcwd()).resolve().parent
RAW_DATA_PATH = current_dir / "data" / "SMSSpamCollection.csv"

In [78]:
def print_scores(y_true, y_pred):
    """Print accuracy, confusion matrix, precision and recall for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Precision and recall are computed with scikit-learn's default settings,
    i.e. for the positive class of a binary problem. Nothing is returned;
    all scores are written to stdout.
    """
    print("=======Accuracy Score===========")
    print(f'{metrics.accuracy_score(y_true, y_pred):.3f}\n')

    # scikit-learn convention: rows are true classes, columns are predicted classes
    print("=======Confusion Matrix===========")
    print(f"{metrics.confusion_matrix(y_true, y_pred)}\n")

    print("=======Precision Score===========")
    print(f"{metrics.precision_score(y_true, y_pred):.3f}\n")

    print("=======Recall Score===========")
    print(f"{metrics.recall_score(y_true, y_pred):.3f}")

### Load

In [53]:
# The raw file is tab-separated and has no header row, so column names are supplied here.
# NOTE(review): with pandas' default quoting, unbalanced double quotes inside a message
# can merge rows — confirm the row count against the source file (quoting=csv.QUOTE_NONE
# would disable quote handling).
data = pd.read_csv(
    RAW_DATA_PATH,
    sep='\t',
    header=None,
    names=['label', 'text'],
    encoding='utf-8',
)

## Feature enrichment

### add length features: characters, words, sentences

In [54]:
# Add text-length features: characters, NLTK word tokens and NLTK sentences.
# The tokenisers are applied to the text column directly instead of a row-wise
# DataFrame.apply(axis=1), which builds a Series per row just to read one field.
data['nr_chars'] = data['text'].apply(len)
data['nr_words'] = data['text'].apply(lambda text: len(nltk.word_tokenize(text)))
data['nr_sentences'] = data['text'].apply(lambda text: len(nltk.sent_tokenize(text)))
data
 

Unnamed: 0,label,text,nr_chars,nr_words,nr_sentences
0,ham,"Go until jurong point, crazy.. Available only ...",111,24,2
1,ham,Ok lar... Joking wif u oni...,29,8,2
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2
3,ham,U dun say so early hor... U c already then say...,49,13,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,15,1
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,160,35,4
5568,ham,Will ü b going to esplanade fr home?,36,9,1
5569,ham,"Pity, * was in mood for that. So...any other s...",57,15,2
5570,ham,The guy did some bitching but I acted like i'd...,125,27,1


### clean, tokenize and lemmatize: 



In [55]:
lemmatizer = WordNetLemmatizer()

# Build the stop-word set once; previously it was re-read and rebuilt for every message.
STOP_WORDS = frozenset(stopwords.words("english"))


def clean_special_chars(text):
    """Normalise one message into a space-joined string of lemmatised tokens.

    Steps: lower-case, collapse runs of whitespace, tokenise with NLTK's
    ``word_tokenize``, drop English stop words, lemmatise every remaining token
    as a verb (``pos='v'``), and join the lemmas with single spaces.

    Note: despite the name, punctuation and other special characters are NOT
    removed here; punctuation tokens pass through unchanged.

    Args:
        text: Raw message string.

    Returns:
        The cleaned, lemmatised message as a single string.
    """
    normalised = ' '.join(text.lower().split())
    tokens = word_tokenize(normalised)
    lemmas = [
        lemmatizer.lemmatize(token, pos='v')
        for token in tokens
        if token not in STOP_WORDS
    ]
    return " ".join(lemmas)


data['lemmatized_text'] = data.text.apply(clean_special_chars)

data

Unnamed: 0,label,text,nr_chars,nr_words,nr_sentences,lemmatized_text
0,ham,"Go until jurong point, crazy.. Available only ...",111,24,2,"go jurong point , crazy .. available bugis n g..."
1,ham,Ok lar... Joking wif u oni...,29,8,2,ok lar ... joke wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,49,13,1,u dun say early hor ... u c already say ...
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,15,1,"nah n't think go usf , live around though"
...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,160,35,4,2nd time try 2 contact u. u £750 pound prize ....
5568,ham,Will ü b going to esplanade fr home?,36,9,1,ü b go esplanade fr home ?
5569,ham,"Pity, * was in mood for that. So...any other s...",57,15,2,"pity , * mood . ... suggestions ?"
5570,ham,The guy did some bitching but I acted like i'd...,125,27,1,guy bitch act like 'd interest buy something e...


### Train-Test-Split, Vectorizing

In [56]:
# label encoding 0=ham, 1=spam
# LabelEncoder assigns integer codes in sorted class order; the string 'label'
# column is overwritten in place with the encoded values.
label_encoder = LabelEncoder()
data["label"] = label_encoder.fit_transform(data["label"])

In [57]:
# Features: the cleaned text as a NumPy array of strings; target: the encoded label
# (kept as a pandas Series, so it retains the DataFrame index).
X = data['lemmatized_text'].to_numpy()
y = data['label']

In [58]:
# Two-stage split: hold out 25% of the rows, then divide that hold-out 80/20 into
# validation and test (roughly 75% / 20% / 5% of the data overall).
# NOTE(review): neither split passes `stratify`, so the ham/spam ratio may differ
# between the subsets — consider stratify=y / stratify=y_temp.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=90
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.2, shuffle=True, random_state=90
)
print(f'X_train: {X_train.shape}\n y_train: {y_train.shape} \n X_val: {X_val.shape} \n y_val: {y_val.shape}\n X_test: {X_test.shape} \n y_test: {y_test.shape}')

X_train: (4179,)
 y_train: (4179,) 
 X_val: (1114,) 
 y_val: (1114,)
 X_test: (279,) 
 y_test: (279,)


In [59]:
# Learn the vocabulary from the training split only, then reuse the fitted
# vectorizer for validation and test so no information leaks from those splits.
vector = CountVectorizer()
X_train_cv = vector.fit_transform(X_train)
X_val_cv, X_test_cv = (vector.transform(split) for split in (X_val, X_test))


## Baseline model NB

For the baseline, use the simplest vectorizer: CountVectorizer.
As a second step, compare it with TF-IDF.

In [60]:
# Multinomial Naive Bayes with scikit-learn's default hyper-parameters.
nb = MultinomialNB()


In [61]:
# Fit on the count features of the training split; %time reports how long the fit takes.
%time nb.fit(X_train_cv, y_train)

CPU times: user 6.53 ms, sys: 0 ns, total: 6.53 ms
Wall time: 5.7 ms


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [62]:
# Predict on the validation split (the test split is not touched here).
y_pred_class = nb.predict(X_val_cv)


In [79]:
# Validation metrics for the count-vectorizer Naive Bayes baseline.
print('Multinomial Naive Bayes')
print_scores(y_val, y_pred_class)


Multinominal Naive Bayes
0.986

[[965   3]
 [ 13 133]]

0.978

0.911


In [64]:
# False negatives: predicted 0 (ham) while the true label is 1 (spam), i.e. prediction < truth.
# The comparison yields a boolean mask that selects the matching validation messages.
X_val[y_pred_class < y_val]


array(["freemsg hey darling 's 3 week 's word back ! 'd like fun still ? tb ok ! xxx std chgs send , £1.50 rcv",
       'check choose babe videos @ sms.shsex.netun fgkslpopw fgkslpo',
       'date : two . start send text talk sport radio last week . connection think coincidence ?',
       'come take little time child afraid dark become teenager want stay night ?',
       'call 09090900040 & listen extreme dirty live chat go office right total privacy one know [ sic ] listen 60p min 24/7mp 0870753331018+',
       "interflora - \x93it 's late order interflora flower christmas call 0800 505060 place order midnight tomorrow .",
       'hello . need posh bird chap user trial prod champneys . put ? need address dob asap . ta r',
       "realize 40 years , 'll thousands old ladies run around tattoo ?",
       'block breaker come deluxe format new feature great graphics t-mobile . buy £5 reply get bbdeluxe take challenge',
       'hello darling today ? would love chat , dont tell look like sex

In [65]:
# False positives: predicted 1 (spam) while the true label is 0 (ham), i.e. prediction > truth.
X_val[y_pred_class > y_val]


array(['lay airtel line rest ?', 'k k : ) sms chat .',
       'gettin rdy ship comp'], dtype=object)

The accuracy is already quite high.
Next step: compare with TF-IDF features.

In [66]:
# Re-weight the existing count matrices with TF-IDF. The IDF weights are learned from
# the training split only and then applied unchanged to validation and test.
tfidf_transformer = TfidfTransformer()
X_train_tf = tfidf_transformer.fit_transform(X_train_cv)
X_val_tf = tfidf_transformer.transform(X_val_cv)
X_test_tf = tfidf_transformer.transform(X_test_cv)

In [67]:
# Second Naive Bayes model (same defaults), to be trained on the TF-IDF features.
nb_tf = MultinomialNB()  

In [68]:
# Fit on the TF-IDF features of the training split; %time reports how long the fit takes.
%time nb_tf.fit(X_train_tf, y_train)

CPU times: user 5.92 ms, sys: 1 ms, total: 6.92 ms
Wall time: 5.78 ms


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [69]:
# Predict on the TF-IDF-transformed validation split.
y_pred_tf = nb_tf.predict(X_val_tf)

In [81]:
# Validation metrics for Naive Bayes on TF-IDF features (compare with the count-based run).
print_scores(y_val, y_pred_tf)


0.964

[[968   0]
 [ 40 106]]

1.000

0.726


In [71]:
# Per-class log-probabilities of each vocabulary term as learned by the TF-IDF model
# (one row per class, one column per term).
nb_tf.feature_log_prob_

array([[-9.65120713, -9.65120713, -9.43857641, ..., -9.29686483,
        -9.47998969, -9.65120713],
       [-7.95512192, -7.31058937, -9.07623064, ..., -9.07623064,
        -9.07623064, -8.87220416]], shape=(2, 6573))

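Despite TF-IDF's superior handling of high-frequency words, the accuracy is lower (0.964 vs. 0.986). False positives drop from 3 to 0, but false negatives rise from 13 to 40, so recall falls from 0.911 to 0.726.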

### Baseline Log-Reg

In [82]:
# Logistic regression with scikit-learn defaults, trained on the count features
# so it is directly comparable with the count-based Naive Bayes baseline.
logreg = LogisticRegression()
%time logreg.fit(X_train_cv, y_train)

CPU times: user 814 ms, sys: 5.94 ms, total: 820 ms
Wall time: 175 ms


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [83]:
# Validation predictions and metrics for the count-feature logistic regression.
y_pred_log_cv = logreg.predict(X_val_cv)
print('logistic regression:')
print_scores(y_val, y_pred_log_cv)

logistic regression:
0.979

[[968   0]
 [ 23 123]]

1.000

0.842


In [46]:
# Learned coefficients of whichever model `logreg` currently refers to:
# one weight per vocabulary term, in a single row for this binary problem.
logreg.coef_


array([[ 0.48967183,  0.76193436, -0.02015072, ..., -0.16692595,
        -0.03363657,  0.10052975]], shape=(1, 6573))

In [85]:
logreg = LogisticRegression(class_weight='balanced')
%time logreg.fit(X_train_tf, y_train)
y_pred_log_tf = logreg.predict(X_val_tf)
print('Logistic Regression, bias balanced')
print_scores(y_val, y_pred_log_tf)

CPU times: user 787 ms, sys: 3.42 ms, total: 791 ms
Wall time: 173 ms
Logistic Regression, bias balanced
0.982

[[963   5]
 [ 15 131]]

0.963

0.897
