In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw SMS dataset from the project-relative data folder and preview
# the first rows.
csv_path = './data/Spam_SMS.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.shape

(5574, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Class    5574 non-null   object
 1   Message  5574 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


Let's convert the `Class` column to the `category` dtype, as this will reduce the DataFrame's memory usage.

In [5]:
# Store the label column as a pandas categorical, then re-print the frame
# summary so the dtype and memory figures can be compared with the previous cell.
label_as_category = df['Class'].astype('category')
df['Class'] = label_as_category
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   Class    5574 non-null   category
 1   Message  5574 non-null   object  
dtypes: category(1), object(1)
memory usage: 49.2+ KB


In [6]:
df.duplicated().sum()

415

In [7]:
# Remove exact duplicate rows (first occurrence is kept). The result is
# re-bound to `df` instead of using inplace=True, which keeps the statement
# chainable and avoids in-place mutation of the loaded frame.
# The original row labels are preserved; the index is not reset.
df = df.drop_duplicates()

In [8]:
df['Class'].value_counts()

Class
ham     4518
spam     641
Name: count, dtype: int64

We can see that the classes are imbalanced: 4518 ham messages versus only 641 spam messages. We will use SMOTE to address this imbalance.

### Text Preprocessing

In [9]:
df['Message'].head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [10]:
# Lower-case every message so differently-cased spellings map to the same token.
lowered = df['Message'].str.lower()
df['Message'] = lowered
lowered.head()

0    go until jurong point, crazy.. available only ...
1                        ok lar... joking wif u oni...
2    free entry in 2 a wkly comp to win fa cup fina...
3    u dun say so early hor... u c already then say...
4    nah i don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [11]:
# removing punctuations
import string

# Translation table that deletes every character in string.punctuation.
# Built once here so it is not reconstructed for each message.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def rmv_punct(message):
    """Return ``message`` with all ``string.punctuation`` characters removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    any other symbol (for example ``£``) is left untouched.

    Parameters
    ----------
    message : str
        A single message.

    Returns
    -------
    str
        The message without ASCII punctuation characters.
    """
    return message.translate(_PUNCT_TABLE)

# Run the punctuation stripper over every message and preview the result.
no_punct = df['Message'].apply(rmv_punct)
df['Message'] = no_punct
no_punct.head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: Message, dtype: object

In [12]:
# stopwords removal
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, fetching them only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        # Fetch the list a single time and keep it as a set for O(1) lookups.
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def rmv_sw(messages):
    """Remove English stopwords from a single message.

    The message is tokenized with ``word_tokenize``; every token that appears
    in NLTK's English stopword list is dropped and the remaining tokens are
    joined back together with single spaces.

    Parameters
    ----------
    messages : str
        One message (despite the plural parameter name).

    Returns
    -------
    str
        The message without stopword tokens.

    Notes
    -----
    NOTE(review): in this notebook the function runs after punctuation
    removal, and tokens such as "dont" survive it — presumably because the
    stopword list only holds the apostrophised form; confirm whether the two
    steps should be reordered.
    """
    stop_set = _english_stopwords()
    kept = [word for word in word_tokenize(messages) if word not in stop_set]
    return " ".join(kept)

# Drop stopwords from every message and preview the result.
no_stopwords = df['Message'].apply(rmv_sw)
df['Message'] = no_stopwords
no_stopwords.head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                  u dun say early hor u c already say
4          nah dont think goes usf lives around though
Name: Message, dtype: object

In [13]:
# stemming
from nltk.stem import SnowballStemmer

ss = SnowballStemmer('english')

def stemming(messages):
    """Reduce every token of one message to its Snowball stem.

    The text is tokenized with ``word_tokenize``, each token is passed through
    the shared ``ss`` stemmer, and the stems are re-joined with single spaces.
    """
    stems = map(ss.stem, word_tokenize(messages))
    return " ".join(stems)

# Stem every message and preview the result.
stemmed = df['Message'].apply(stemming)
df['Message'] = stemmed
stemmed.head()

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri 2 wkli comp win fa cup final tkts 2...
3                  u dun say earli hor u c alreadi say
4            nah dont think goe usf live around though
Name: Message, dtype: object

In [14]:
df['Message'].sample(5)

984                           look fuckin time fuck think
3736                                          ‘ £6 get ok
919     hey gave photo regist drive ah tmr wan na meet...
4944    check maili mail varma kept copi regard member...
909                                 white fudg oreo store
Name: Message, dtype: object

### Splitting

In [15]:
# Target: the label column. Features: the message text, kept as a one-column
# DataFrame (the frame holds only `Class` and `Message`, so this is every
# column after the first).
y = df['Class']
X = df[['Message']]

In [16]:
X

Unnamed: 0,Message
0,go jurong point crazi avail bugi n great world...
1,ok lar joke wif u oni
2,free entri 2 wkli comp win fa cup final tkts 2...
3,u dun say earli hor u c alreadi say
4,nah dont think goe usf live around though
...,...
5569,2nd time tri 2 contact u u £750 pound prize 2 ...
5570,ü b go esplanad fr home
5571,piti mood soani suggest
5572,guy bitch act like id interest buy someth els ...


In [17]:
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5569    spam
5570     ham
5571     ham
5572     ham
5573     ham
Name: Class, Length: 5159, dtype: category
Categories (2, object): ['ham', 'spam']

In [18]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels with integer codes; in the recorded output below,
# ham maps to 0 and spam to 1.
le = LabelEncoder()
encoded_labels = le.fit_transform(y)
y = encoded_labels
y

array([0, 0, 1, ..., 0, 0, 0])

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the classes are
# imbalanced — consider passing stratify=y and re-running the cells below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [20]:
X_train.shape

(3611, 1)

In [21]:
X_test.shape

(1548, 1)

In [22]:
pd.Series(y_train).value_counts()

0    3164
1     447
Name: count, dtype: int64

### Feature Extraction

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# Three text representations sharing one vocabulary cap:
# unigram counts, unigram + bigram counts, and TF-IDF weights.
vocab_cap = 5000
cv1 = CountVectorizer(max_features=vocab_cap)
cv2 = CountVectorizer(max_features=vocab_cap, ngram_range=(1, 2))
tfidf = TfidfVectorizer(max_features=vocab_cap)

In [25]:
# Each vectorizer is fitted on the training messages only and then applied to
# the test messages. Every result is converted to a dense array with .toarray().
train_text = X_train['Message']
test_text = X_test['Message']

X_train_bow, X_test_bow = (
    cv1.fit_transform(train_text).toarray(),
    cv1.transform(test_text).toarray(),
)

X_train_bigrams, X_test_bigrams = (
    cv2.fit_transform(train_text).toarray(),
    cv2.transform(test_text).toarray(),
)

X_train_tfidf, X_test_tfidf = (
    tfidf.fit_transform(train_text).toarray(),
    tfidf.transform(test_text).toarray(),
)

In [26]:
X_train_bow.shape

(3611, 5000)

In [27]:
X_train_bigrams.shape

(3611, 5000)

In [28]:
X_train_tfidf.shape

(3611, 5000)

### Using SMOTE

In [29]:
from imblearn.over_sampling import SMOTE

ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Oversample the training data for each representation with one seeded SMOTE
# instance. Only the training arrays are resampled; the test arrays are not touched.
smote = SMOTE(random_state=42)

resampled = [
    smote.fit_resample(train_features, y_train)
    for train_features in (X_train_bow, X_train_bigrams, X_train_tfidf)
]
(
    (X_train_bow_resampled, y_train_bow_resampled),
    (X_train_bigrams_resampled, y_train_bigrams_resampled),
    (X_train_tfidf_resampled, y_train_tfidf_resampled),
) = resampled

In [None]:
pd.Series(y_train_bow_resampled).value_counts()

0    3164
1    3164
Name: count, dtype: int64

### Model training and Evaluation

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Gaussian Naive Bayes: one model per representation, fitted on the resampled
# training arrays and used to predict the test arrays.
# NOTE: the y_pred_* names are reused by the later model cells, so run the
# matching print_metrics cell right after this one.
gnb_bow = GaussianNB().fit(X_train_bow_resampled, y_train_bow_resampled)
y_pred_bow = gnb_bow.predict(X_test_bow)

gnb_bigrams = GaussianNB().fit(X_train_bigrams_resampled, y_train_bigrams_resampled)
y_pred_bigrams = gnb_bigrams.predict(X_test_bigrams)

gnb_tfidf = GaussianNB().fit(X_train_tfidf_resampled, y_train_tfidf_resampled)
y_pred_tfidf = gnb_tfidf.predict(X_test_tfidf)

In [None]:
def print_metrics(y_test, y_pred, model_name, vectorizer_name):
    """Print an evaluation summary for one model/vectorizer combination.

    Emits, in order: a header naming the model and vectorizer, the accuracy,
    the confusion matrix and the classification report, each computed from
    ``y_test`` (true labels) and ``y_pred`` (predicted labels). Returns None.
    """
    labelled_sections = (
        ("Confusion Matrix:", confusion_matrix),
        ("Classification Report:", classification_report),
    )

    print(f"\nModel: {model_name} | Vectorizer: {vectorizer_name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    for heading, metric_fn in labelled_sections:
        print(heading)
        print(metric_fn(y_test, y_pred))

In [None]:
# Report the Gaussian Naive Bayes scores for each text representation in turn.
gnb_predictions = (
    ("BoW", y_pred_bow),
    ("Bigrams", y_pred_bigrams),
    ("TF-IDF", y_pred_tfidf),
)
for vectorizer_label, predictions in gnb_predictions:
    print_metrics(y_test, predictions, "GaussianNB", vectorizer_label)


Model: GaussianNB | Vectorizer: BoW
Accuracy: 0.8669250645994832
Confusion Matrix:
[[1163  191]
 [  15  179]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.86      0.92      1354
           1       0.48      0.92      0.63       194

    accuracy                           0.87      1548
   macro avg       0.74      0.89      0.78      1548
weighted avg       0.92      0.87      0.88      1548


Model: GaussianNB | Vectorizer: Bigrams
Accuracy: 0.8682170542635659
Confusion Matrix:
[[1166  188]
 [  16  178]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.86      0.92      1354
           1       0.49      0.92      0.64       194

    accuracy                           0.87      1548
   macro avg       0.74      0.89      0.78      1548
weighted avg       0.92      0.87      0.88      1548


Model: GaussianNB | Vectorizer: TF-IDF
Accuracy: 0.8649870801033591
Confu

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes: one model per representation, fitted on the
# resampled training arrays and used to predict the test arrays.
# NOTE: this overwrites the y_pred_* variables produced by the previous model.
mnb_bow = MultinomialNB().fit(X_train_bow_resampled, y_train_bow_resampled)
y_pred_bow = mnb_bow.predict(X_test_bow)

mnb_bigrams = MultinomialNB().fit(X_train_bigrams_resampled, y_train_bigrams_resampled)
y_pred_bigrams = mnb_bigrams.predict(X_test_bigrams)

mnb_tfidf = MultinomialNB().fit(X_train_tfidf_resampled, y_train_tfidf_resampled)
y_pred_tfidf = mnb_tfidf.predict(X_test_tfidf)

In [None]:
# Report the Multinomial Naive Bayes scores for each text representation in turn.
mnb_predictions = (
    ("BoW", y_pred_bow),
    ("Bigrams", y_pred_bigrams),
    ("TF-IDF", y_pred_tfidf),
)
for vectorizer_label, predictions in mnb_predictions:
    print_metrics(y_test, predictions, "MultinomialNB", vectorizer_label)


Model: MultinomialNB | Vectorizer: BoW
Accuracy: 0.9612403100775194
Confusion Matrix:
[[1311   43]
 [  17  177]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1354
           1       0.80      0.91      0.86       194

    accuracy                           0.96      1548
   macro avg       0.90      0.94      0.92      1548
weighted avg       0.96      0.96      0.96      1548


Model: MultinomialNB | Vectorizer: Bigrams
Accuracy: 0.9664082687338501
Confusion Matrix:
[[1325   29]
 [  23  171]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1354
           1       0.85      0.88      0.87       194

    accuracy                           0.97      1548
   macro avg       0.92      0.93      0.92      1548
weighted avg       0.97      0.97      0.97      1548


Model: MultinomialNB | Vectorizer: TF-IDF
Accuracy: 0.9605943152454

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest: one identically configured, seeded model per representation,
# fitted on the resampled training arrays and used to predict the test arrays.
# NOTE: this overwrites the y_pred_* variables produced by the previous model.
rf_params = dict(n_estimators=100, random_state=42)

rf_bow = RandomForestClassifier(**rf_params).fit(X_train_bow_resampled, y_train_bow_resampled)
y_pred_bow = rf_bow.predict(X_test_bow)

rf_bigrams = RandomForestClassifier(**rf_params).fit(X_train_bigrams_resampled, y_train_bigrams_resampled)
y_pred_bigrams = rf_bigrams.predict(X_test_bigrams)

rf_tfidf = RandomForestClassifier(**rf_params).fit(X_train_tfidf_resampled, y_train_tfidf_resampled)
y_pred_tfidf = rf_tfidf.predict(X_test_tfidf)

In [None]:
# Report the Random Forest scores for each text representation in turn.
rf_predictions = (
    ("BoW", y_pred_bow),
    ("Bigrams", y_pred_bigrams),
    ("TF-IDF", y_pred_tfidf),
)
for vectorizer_label, predictions in rf_predictions:
    print_metrics(y_test, predictions, "RandomForest", vectorizer_label)


Model: RandomForest | Vectorizer: BoW
Accuracy: 0.8662790697674418
Confusion Matrix:
[[1171  183]
 [  24  170]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.86      0.92      1354
           1       0.48      0.88      0.62       194

    accuracy                           0.87      1548
   macro avg       0.73      0.87      0.77      1548
weighted avg       0.92      0.87      0.88      1548


Model: RandomForest | Vectorizer: Bigrams
Accuracy: 0.8682170542635659
Confusion Matrix:
[[1176  178]
 [  26  168]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.87      0.92      1354
           1       0.49      0.87      0.62       194

    accuracy                           0.87      1548
   macro avg       0.73      0.87      0.77      1548
weighted avg       0.92      0.87      0.88      1548


Model: RandomForest | Vectorizer: TF-IDF
Accuracy: 0.979328165374677


We can see that Random Forest with TF-IDF produced the highest metrics (accuracy, precision, recall, and F1 score).