In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings 
# Silence every warning for the rest of the session (keeps library
# deprecation chatter out of the cell outputs).
warnings.simplefilter(action="ignore")

In [4]:
# Load the labelled message dataset from CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer='mail_data.csv')
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.shape

(5572, 2)

In [6]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [7]:
df.duplicated().sum()

415

In [8]:
# Remove the fully duplicated rows counted above; rebinding `df` (rather than
# mutating in place) leaves the same de-duplicated frame under the same name.
df = df.drop_duplicates()

#### Data Exploration

In [10]:
df['Category'].value_counts()

# The dataset is heavily imbalanced (far more ham than spam), so it will
# need to be balanced, e.g. by oversampling the minority class.
# NOTE(review): no resampling step appears in the cells visible below;
# confirm whether it was intended.

Category
ham     4516
spam     641
Name: count, dtype: int64

In [11]:
# Work on an independent copy so the de-duplicated `df` stays untouched by
# the preprocessing steps that follow.
data = df.copy(deep=True)

#### Data Preprocessing

In [12]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5157 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5157 non-null   object
 1   Message   5157 non-null   object
dtypes: object(2)
memory usage: 120.9+ KB


In [14]:
# Map the Category column
from sklearn.preprocessing import LabelEncoder
# Encode the text labels in `Category` as integers and record which label
# received which code so the encoding can be read back later.
le = LabelEncoder()
data['Category'] = le.fit_transform(data['Category'])

class_codes = le.transform(le.classes_)
mappings = {'Category': dict(zip(le.classes_, class_codes))}

In [16]:
data.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
mappings

{'Category': {'ham': 0, 'spam': 1}}

In [19]:
# Features are the raw message text; the target is the encoded category.
X, y = data['Message'], data['Category']

#### Split Data into Training and Test Subsets

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [23]:
# Hold out 20% of the messages for testing; the fixed seed makes the split
# repeatable.
# NOTE(review): the split is not stratified even though the classes are
# imbalanced; consider whether `stratify=y` is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Applying TF-IDF to the training and test data

In [25]:
# TF-IDF vectorizer with library-default settings; it is fitted on the
# training messages in the next cell.
tfidf = TfidfVectorizer()

In [34]:
# Fit the vectorizer on the training messages only, then reuse that fitted
# vocabulary to transform the test messages.
train_matrix = tfidf.fit_transform(X_train)
test_matrix = tfidf.transform(X_test)

# Convert to dense arrays -- presumably because some of the estimators
# compared below (e.g. GaussianNB) do not accept sparse input; verify.
X_train_tfidf = train_matrix.toarray()
X_test_tfidf = test_matrix.toarray()

In [35]:
# Candidate models to compare, keyed by the display name used in the results
# table. Each is created with default hyper-parameters, with a fixed seed
# passed wherever a `random_state` is given.
classifiers = dict([
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('LogisticRegression', LogisticRegression(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Support Vector Classifier', SVC(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('XGboost', XGBClassifier(random_state=42)),
    ('Adaboost', AdaBoostClassifier(random_state=42)),
    ('LGBM', LGBMClassifier(random_state=42)),
])

In [36]:
# Column-oriented accumulator for the comparison table: one independent,
# initially empty list per column, filled by the evaluation loop.
metrics = {
    column: []
    for column in ("Model", "Accuracy", "Precision", "Recall", "F1_Score")
}

In [37]:
# Start from empty columns so that re-running this cell does not append a
# second copy of every row to `metrics`.
for column_values in metrics.values():
    column_values.clear()

# Fit every candidate on the TF-IDF training matrix and score it on the
# held-out test matrix, appending one row per model to `metrics`.
for model_name, classifier in classifiers.items():
    classifier.fit(X_train_tfidf, y_train)

    y_pred = classifier.predict(X_test_tfidf)

    # precision/recall/F1 are computed for scikit-learn's default positive
    # label (1), which the label mapping above assigns to 'spam'.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    metrics["Model"].append(model_name)
    metrics['Accuracy'].append(accuracy)
    metrics['Precision'].append(precision)
    metrics['Recall'].append(recall)
    metrics['F1_Score'].append(f1)

[LightGBM] [Info] Number of positive: 505, number of negative: 3620
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004978 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12376
[LightGBM] [Info] Number of data points in the train set: 4125, number of used features: 442
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122424 -> initscore=-1.969671
[LightGBM] [Info] Start training from score -1.969671


In [38]:
# Turn the column-oriented accumulator into a table, one row per model.
metrics_df = pd.DataFrame.from_dict(metrics)

In [39]:
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score
0,Random Forest,0.976744,0.982759,0.838235,0.904762
1,LogisticRegression,0.97093,0.981818,0.794118,0.878049
2,Gradient Boosting,0.967054,0.932203,0.808824,0.866142
3,Support Vector Classifier,0.985465,0.984,0.904412,0.942529
4,Decision Tree,0.96124,0.858209,0.845588,0.851852
5,KNN,0.907946,1.0,0.301471,0.463277
6,Naive Bayes,0.90407,0.59204,0.875,0.706231
7,XGboost,0.974806,0.923077,0.882353,0.902256
8,Adaboost,0.967054,0.918033,0.823529,0.868217
9,LGBM,0.979651,0.96748,0.875,0.918919


In [40]:
# Base random forest used as the template estimator for the hyper-parameter
# search below. NOTE(review): this object itself is never fitted in the
# visible cells -- the search is expected to work on clones of it; confirm.
rf = RandomForestClassifier(random_state=42)

In [43]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

In [42]:
# Search space for the random forest: integer hyper-parameters are sampled
# from scipy `randint` distributions (upper bound exclusive), while
# `bootstrap` is picked from the two listed options.
params_dist = dict(
    n_estimators=randint(100, 500),
    max_depth=randint(10, 50),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 5),
    bootstrap=[True, False],
)

In [44]:
# Randomised hyper-parameter search over the forest: 5 sampled candidates,
# each scored with 5-fold cross-validation, seeded for repeatability.
# NOTE(review): candidates are ranked by accuracy although the classes are
# imbalanced; consider whether F1 or recall would be a better selection metric.
random_search = RandomizedSearchCV(
    rf,
    params_dist,
    scoring='accuracy',
    n_iter=5,
    cv=5,
    random_state=42,
    verbose=2,
)

In [45]:
# Run the search on the TF-IDF training matrix (5 candidates x 5 folds =
# 25 fits, as reported in the log below); this is the slow cell.
random_search.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END bootstrap=True, max_depth=38, min_samples_leaf=3, min_samples_split=4, n_estimators=171; total time=  16.8s
[CV] END bootstrap=True, max_depth=38, min_samples_leaf=3, min_samples_split=4, n_estimators=171; total time=  18.1s
[CV] END bootstrap=True, max_depth=38, min_samples_leaf=3, min_samples_split=4, n_estimators=171; total time=  16.3s
[CV] END bootstrap=True, max_depth=38, min_samples_leaf=3, min_samples_split=4, n_estimators=171; total time=  18.0s
[CV] END bootstrap=True, max_depth=38, min_samples_leaf=3, min_samples_split=4, n_estimators=171; total time=  17.1s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=3, min_samples_split=3, n_estimators=314; total time=  28.4s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=3, min_samples_split=3, n_estimators=314; total time=  27.4s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=3, min_samples_split=3, n_estimators=314; total time=  27.2s
[CV]

In [46]:
random_search.best_params_

{'bootstrap': False,
 'max_depth': 33,
 'min_samples_leaf': 3,
 'min_samples_split': 7,
 'n_estimators': 408}

In [47]:
# Keep a handle on the estimator the random search selected as best.
best_rf_model = random_search.best_estimator_

In [48]:
# Predict on the held-out test matrix with the tuned forest (this rebinds
# the `y_pred` name used earlier in the model-comparison loop).
y_pred = best_rf_model.predict(X_test_tfidf)

In [49]:
# Score the tuned forest's test predictions with the same four metrics used
# in the model-comparison table.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

In [50]:
accuracy

0.9689922480620154

In [51]:
recall

0.7720588235294118

In [52]:
precision

0.9905660377358491

In [54]:
f1

0.8677685950413223

In [56]:
import pickle

In [None]:
# Persist the tuned forest chosen by the random search. `rf` is only the
# template handed to RandomizedSearchCV and is never fitted in this notebook,
# so pickling `rf` would save a model that cannot predict.
# The context manager closes the file handle even if dumping fails.
# NOTE(review): the fitted `tfidf` vectorizer is not saved here; it will be
# needed to transform new messages before this model can score them.
with open('rf_v1', 'wb') as model_file:
    pickle.dump(best_rf_model, model_file)