## Naive Bayes and LDA

In [None]:
# importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB, GaussianNB
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Reading Data from CSV file

In [None]:
# Load the SMS spam dataset into a DataFrame.
# NOTE(review): the path looks like a Google Drive mount inside Colab — confirm the drive is
# mounted before this cell runs, since no mount step is visible in this notebook.
df = pd.read_csv('/content/drive/MyDrive/SPAM text message 20170820 - Data.csv')

In [None]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Preprocessing 

#### Checking for Missing values

In [None]:
df.shape

(5572, 2)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


We can see that there are no missing values 

#### Checking class distribution

In [None]:
df['Category'].unique()

array(['ham', 'spam'], dtype=object)

In [None]:
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

The dataset is imbalanced: it contains far fewer 'spam' messages (747) than 'ham' (not spam) messages (4825).

In [None]:
df_clean = df.copy()

#### Converting to Lower Case

The messages are converted to lower case so that the same word written in upper and lower case is treated as a single token.

In [None]:
df_clean['Message'] = df_clean['Message'].apply(lambda x:x.lower())

#### Removing digits and words containing digits

Digits are removed from the message so that they are not considered as tokens

In [None]:
# Remove every token that contains at least one digit (e.g. "2", "4u", phone numbers).
# The pattern is a raw string so that `\w` and `\d` reach the regex engine verbatim instead
# of being parsed as (invalid) string escape sequences, which newer Python versions warn about.
df_clean['Message'] = df_clean['Message'].apply(lambda x: re.sub(r'\w*\d\w*', '', x))

#### Tokenize words

Convert the message into individual words

In [None]:
df_clean['text_clean'] = df_clean['Message'].apply(nltk.word_tokenize)

In [None]:
df_clean.head()

Unnamed: 0,Category,Message,text_clean
0,ham,"go until jurong point, crazy.. available only ...","[go, until, jurong, point, ,, crazy.., availab..."
1,ham,ok lar... joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]"
2,spam,free entry in a wkly comp to win fa cup final...,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, so, early, hor, ..., u, c, alrea..."
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, i, do, n't, think, he, goes, to, usf, ,,..."


#### Remove Stop Words

Since stop words are common in all messages irrespective of the class, these stop words are removed. 

In [None]:
stop_words=set(nltk.corpus.stopwords.words("english"))

In [None]:
df_clean['text_clean'] = df_clean['text_clean'].apply(lambda x: [item for item in x if item not in stop_words])

In [None]:
df_clean.head()

Unnamed: 0,Category,Message,text_clean
0,ham,"go until jurong point, crazy.. available only ...","[go, jurong, point, ,, crazy.., available, bug..."
1,ham,ok lar... joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]"
2,spam,free entry in a wkly comp to win fa cup final...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, early, hor, ..., u, c, already, ..."
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, n't, think, goes, usf, ,, lives, around,..."


#### Removing punctuation and special characters (only keeping words)

Since punctuation and special characters have no importance in class identification, they are removed 

In [None]:
regex = '[a-z]+'

In [None]:
# Keep only the tokens for which `regex` matches at the beginning of the token.
# NOTE(review): re.match anchors only at the start, so a token that merely *begins* with a
# letter still passes — e.g. "crazy.." and "n't" survive this filter. Use re.fullmatch if
# purely alphabetic tokens are intended.
df_clean['text_clean'] = df_clean['text_clean'].apply(lambda x: [item for item in x if re.match(regex, item)])

In [None]:
df_clean.head()

Unnamed: 0,Category,Message,text_clean
0,ham,"go until jurong point, crazy.. available only ...","[go, jurong, point, crazy.., available, bugis,..."
1,ham,ok lar... joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,free entry in a wkly comp to win fa cup final...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, n't, think, goes, usf, lives, around, th..."


#### Lemmatization

Converting the words to their root form.

In [None]:
lem = nltk.stem.wordnet.WordNetLemmatizer()

In [None]:
# Lemmatize every token, treating it as a verb (pos='v'), e.g. "joking" -> "joke", "goes" -> "go".
# Because only the verb part of speech is requested, other word forms are not reduced here.
df_clean['text_clean'] = df_clean['text_clean'].apply(lambda x: [lem.lemmatize(item, pos='v') for item in x])

In [None]:
df_clean.head()

Unnamed: 0,Category,Message,text_clean
0,ham,"go until jurong point, crazy.. available only ...","[go, jurong, point, crazy.., available, bugis,..."
1,ham,ok lar... joking wif u oni...,"[ok, lar, joke, wif, u, oni]"
2,spam,free entry in a wkly comp to win fa cup final...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, n't, think, go, usf, live, around, though]"


In [None]:
# Drop the raw text column now that `text_clean` holds the processed tokens.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it after 'Message' is already gone no longer raises KeyError.
df_clean = df_clean.drop(columns='Message', errors='ignore')

#### Final Dataframe after Preprocessing

In [None]:
df_clean.head()

Unnamed: 0,Category,text_clean
0,ham,"[go, jurong, point, crazy.., available, bugis,..."
1,ham,"[ok, lar, joke, wif, u, oni]"
2,spam,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"[nah, n't, think, go, usf, live, around, though]"


#### Label Encoding on Categorical variable

In [None]:
# Encode the target as integers (classes are sorted, so 'ham' -> 0 and 'spam' -> 1).
# The encoder is fitted on the untouched `df['Category']` strings rather than on the column
# it overwrites; `df_clean` is a row-for-row copy of `df`, so the result is identical on the
# first run, and re-running the cell keeps `enc.classes_` / `labels` as the original class
# names instead of the already-encoded integers.
enc = LabelEncoder()
df_clean['Category'] = enc.fit_transform(df['Category'])
labels = list(enc.classes_)

In [None]:
df_clean.head()

Unnamed: 0,Category,text_clean
0,0,"[go, jurong, point, crazy.., available, bugis,..."
1,0,"[ok, lar, joke, wif, u, oni]"
2,1,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"[nah, n't, think, go, usf, live, around, though]"


In [None]:
# Join each token list back into a single space-separated string for the vectorizer.
# Rows that are already strings are returned unchanged, so re-running this cell does not
# break previously joined text into space-separated characters.
df_clean['text_clean'] = df_clean['text_clean'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

#### Train Test Split

In [None]:
# Hold out 20% of the messages as a test set; random_state fixes the shuffle so the split
# is reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (4825 ham vs 747 spam) — consider passing stratify=df_clean['Category'].
X_train, X_test, y_train, y_test = train_test_split(df_clean['text_clean'], df_clean['Category'], test_size=0.2, random_state = 44)

In [None]:
X_train.shape

(4457,)

In [None]:
X_test.shape

(1115,)

#### Using CountVectorizer 

CountVectorizer converts each message into a vector of word counts (a bag-of-words representation), producing a sparse document-term matrix.

In [None]:
cv = CountVectorizer()

In [None]:
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [None]:
X_train_cv.shape

(4457, 5949)

We can see that the transformed dataset has 5949 features. Each feature represents a word  

#### Checking Sparsity of the Matrix

In [None]:
# Sparsity = fraction of zero entries in the document-term matrix.
# The non-zero count is taken from the sparse matrix itself, which avoids materialising the
# full dense (documents x words) array twice just to compute a single ratio.
sparsity = 1 - X_train_cv.count_nonzero() / float(X_train_cv.shape[0] * X_train_cv.shape[1])
print(f'Sparsity of matrix without removing low frequency words: {round(sparsity * 100, 2)}%')

Sparsity of matrix without removing low frequency words: 99.87%


In [None]:
word_freq_df = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names_out())

In [None]:
word_freq_df.head()

Unnamed: 0,aa,aah,aaniye,aaooooright,aathi,ab,abdomen,aberdeen,abi,ability,abiola,abj,able,abnormally,aboutas,absence,absolutely,absolutly,abstract,abt,abta,aburo,abuse,abusers,ac,academic,acc,accent,accenture,accept,access,accessible,accidant,accident,accidentally,accommodationvouchers,accordin,accordingly,account,accumulation,...,yogasana,yor,yorge,you,youi,young,younger,youphone,your,youre,yourinclusive,yourjob,youuuuu,youwanna,yowifes,yr,yrs,ystrday,ything,yummmm,yummy,yun,yunny,yuo,yuou,yup,yupz,zac,zaher,zealand,zebra,zed,zero,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
top_words_df = pd.DataFrame(word_freq_df.sum()).sort_values(0, ascending=False)

In [None]:
top_words_df

Unnamed: 0,0
get,565
call,528
go,462
ur,310
gt,264
...,...
jamz,1
janarige,1
janx,1
jaya,1


Out of 5949 words, words like 'get', 'call' etc are most frequent

### Training Naive Bayes Model 

In [None]:
naive = MultinomialNB() 

In [None]:
naive.fit(X_train_cv, y_train)

MultinomialNB()

### Training LDA

In [None]:
lda = LinearDiscriminantAnalysis()

In [None]:
lda.fit(X_train_cv.toarray(), y_train)

LinearDiscriminantAnalysis()

### Model Evaluation

In [None]:
def evaluate(y_train, y_pred):
    """Compute sensitivity and specificity for a binary classification result.

    Parameters
    ----------
    y_train : array-like
        Ground-truth labels. Despite the parameter name, any vector of true
        labels may be passed (the notebook passes the test labels).
    y_pred : array-like
        Predicted labels, aligned with ``y_train``.

    Returns
    -------
    tuple
        ``(TPR, TNR)``: sensitivity (true positive rate) and specificity
        (true negative rate). The class in the second row/column of the
        confusion matrix is treated as the positive class. A rate is ``nan``
        when its denominator is zero.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, i.e. the labels do not describe
        exactly two classes. Indexing a larger matrix would silently produce
        wrong rates, and a 1x1 matrix would fail with an opaque IndexError.
    """
    CM = confusion_matrix(y_train, y_pred)
    if CM.shape != (2, 2):
        raise ValueError(
            f'evaluate() expects exactly two classes, got a confusion matrix of shape {CM.shape}'
        )

    # Rows are true classes, columns are predicted classes.
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN) if (TP+FN) else float('nan')
    # Specificity or true negative rate
    TNR = TN/(TN+FP) if (TN+FP) else float('nan')

    return TPR, TNR

### Evaluating Naive Bayes model

In [None]:
# MultinomialNB was fitted on the sparse count matrix and can predict on sparse input
# directly, so the test matrix is passed as-is instead of being densified with .toarray().
y_pred = naive.predict(X_test_cv)

TPR, TNR = evaluate(y_test, y_pred)

print(f'Sensitivity: {round(TPR,2)}')
print(f'Specificity: {round(TNR,2)}')

Sensitivity: 0.92
Specificity: 0.99


In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       944
           1       0.95      0.92      0.93       171

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### Evaluating LDA model

In [None]:
y_pred = lda.predict(X_test_cv.toarray())

TPR, TNR = evaluate(y_test, y_pred)

print(f'Sensitivity: {round(TPR,2)}')
print(f'Specificity: {round(TNR,2)}')

Sensitivity: 0.78
Specificity: 0.96


In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       944
           1       0.77      0.78      0.78       171

    accuracy                           0.93      1115
   macro avg       0.87      0.87      0.87      1115
weighted avg       0.93      0.93      0.93      1115



From the above results we can see that Naive Bayes performs better than LDA

#### Using Count Vector and only Removing low frequency words

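CountVectorizer converts each message into a vector of word counts (a bag-of-words representation); here the vocabulary is additionally limited to the most frequent words.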

In [None]:
# Build a vectorizer whose vocabulary is capped at 5000 terms via max_features.
# NOTE(review): this rebinds `cv`, replacing the vectorizer fitted earlier in the notebook;
# earlier cells that call cv.get_feature_names_out() will pick up this object if re-run.
cv = CountVectorizer(max_features=5000)

The `max_features` parameter keeps only the 5000 most frequent words.

In [None]:
X_train_cv_rm = cv.fit_transform(X_train)
X_test_cv_rm = cv.transform(X_test)

In [None]:
X_train_cv_rm.shape

(4457, 5000)

We can see that the transformed dataset has 5000 features. Each feature represents a word  

#### Checking Sparsity of the Matrix

In [None]:
# Sparsity = fraction of zero entries in the reduced-vocabulary document-term matrix.
# The non-zero count is taken from the sparse matrix itself, which avoids materialising the
# full dense (documents x words) array twice just to compute a single ratio.
sparsity = 1 - X_train_cv_rm.count_nonzero() / float(X_train_cv_rm.shape[0] * X_train_cv_rm.shape[1])
print(f'Sparsity of matrix after removing low frequency words: {round(sparsity * 100, 2)}%')

Sparsity of matrix after removing low frequency words: 99.84%


### Training Naive Bayes Model 

In [None]:
naive = MultinomialNB() 

In [None]:
naive.fit(X_train_cv_rm, y_train)

MultinomialNB()

### Evaluating Naive Bayes model

In [None]:
# MultinomialNB was fitted on the sparse count matrix and can predict on sparse input
# directly, so the test matrix is passed as-is instead of being densified with .toarray().
y_pred = naive.predict(X_test_cv_rm)

TPR, TNR = evaluate(y_test, y_pred)

print(f'Sensitivity: {round(TPR,2)}')
print(f'Specificity: {round(TNR,2)}')

Sensitivity: 0.92
Specificity: 0.99


In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       944
           1       0.95      0.92      0.93       171

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### Training LDA

In [None]:
lda = LinearDiscriminantAnalysis()

In [None]:
lda.fit(X_train_cv_rm.toarray(), y_train)

LinearDiscriminantAnalysis()

### Evaluating LDA model

In [None]:
y_pred = lda.predict(X_test_cv_rm.toarray())

TPR, TNR = evaluate(y_test, y_pred)

print(f'Sensitivity: {round(TPR,2)}')
print(f'Specificity: {round(TNR,2)}')

Sensitivity: 0.81
Specificity: 0.97


In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       944
           1       0.82      0.81      0.82       171

    accuracy                           0.94      1115
   macro avg       0.89      0.89      0.89      1115
weighted avg       0.94      0.94      0.94      1115



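After restricting the vocabulary to the 5000 most frequent words, the Naive Bayes metrics are unchanged (accuracy 0.98, sensitivity 0.92), while LDA improves slightly (accuracy 0.93 → 0.94, sensitivity 0.78 → 0.81). Naive Bayes still performs better than LDA.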

### Conclusion



1.   Naive Bayes is a probabilistic algorithm and it assumes that features are independent of each other. 
2.   LDA performs dimensionality reduction and can also be used for classification.  

