In [39]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
df = pd.read_csv("D:\\Python.vs\\Data Set\\mail_data.csv")

In [41]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [43]:
df.shape

(5572, 2)

In [44]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [45]:
df.Category

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Category, Length: 5572, dtype: object

In [52]:
df["Category"] = df.Category.apply(lambda x: 0 if x == "spam" else 1)

In [53]:
df['Category']

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: int64

In [54]:
df['Category'].value_counts()

1    4825
0     747
Name: Category, dtype: int64

In [29]:
from sklearn import metrics
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc, accuracy_score

In [84]:
def Model(data, models, thrsold):
    x = data['Message']
    y = data['Category']

    #  Converting the textual data to numerical data
    tfid = TfidfVectorizer()
    x = tfid.fit_transform(x)
    

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25,  stratify = y, random_state = 112)

    col_names = ["Algo", "Accuracy", "Precision", "Recall", "F1_score", "Auc"]
    results_train = pd.DataFrame(columns = col_names)
    results_test = pd.DataFrame(columns = col_names)

    i = 0
    for name, model in models:

        model.fit(x_train, y_train)
        x_train_pred = model.predict_proba(x_train)[:,1]
        accuracy, precision, recall, f1_score, roc_auc = eval_metrics_train(y_train, x_train_pred, thrsold)
        results_train.loc[i] = [name, accuracy, precision, recall, f1_score, roc_auc]

        model.fit(x_test, y_test)
        x_test_pred = model.predict_proba(x_test)[:,1]
        accuracy, precision, recall, f1_score, roc_auc = eval_metrics_train(y_test, x_test_pred, thrsold)
        results_test.loc[i] = [name, accuracy, precision, recall, f1_score, roc_auc]
        i += 1

    return(results_train, results_test)

    

In [85]:
def eval_metrics_train(target, prob, threshold):
        """ target = y_yest,prob = logistic.predict(X_test),Threshold = Cutoff Threshold """
        data=pd.DataFrame()
        data['pred_proba'] = pd.Series(prob)
        data['pred'] = data['pred_proba'].map(lambda x: 1.0 if x > threshold else 0.0)
        accuracy=metrics.accuracy_score(target,data['pred'])
        precision= metrics.precision_score(target,data['pred'])
        recall= metrics.recall_score(target,data['pred'])
        f1_score = metrics.f1_score(target,data['pred'])
        roc_auc = metrics.roc_auc_score(target,data['pred'])
        return(accuracy,precision,recall,f1_score,roc_auc)
    

In [86]:
def eval_metrics_test(target, prob, threshold):
        """ target = y_yest,prob = logistic.predict(X_test),Threshold = Cutoff Threshold """
        data_1=pd.DataFrame()
        data_1['pred_proba'] = pd.Series(prob)
        data_1['pred'] = data_1['pred_proba'].map(lambda x: 1.0 if x > threshold else 0.0)
        accuracy=metrics.accuracy_score(target,data_1['pred'])
        precision= metrics.precision_score(target,data_1['pred'])
        recall= metrics.recall_score(target,data_1['pred'])
        f1_score = metrics.f1_score(target,data_1['pred'])
        roc_auc = metrics.roc_auc_score(target,data_1['pred'])
        return(accuracy,precision,recall,f1_score,roc_auc)

In [87]:
models = []
models.append(("xgboost", XGBClassifier()))
models.append(("SVM", SVC(probability = True)))
models.append(("Random Forest", RandomForestClassifier()))
models.append(("Random Forest", DecisionTreeClassifier()))
models.append(("Logistic Regression", LogisticRegression()))
models.append(("Ada Boost", AdaBoostClassifier(base_estimator = RandomForestClassifier(n_estimators = 50))))

In [88]:
Model(df, models, thrsold = 0.5)









(                  Algo  Accuracy  Precision    Recall  F1_score       Auc
 0              xgboost  0.996889   0.996694  0.999724  0.998207  0.989148
 1                  SVM  0.999761   0.999724  1.000000  0.999862  0.999107
 2        Random Forest  1.000000   1.000000  1.000000  1.000000  1.000000
 3        Random Forest  1.000000   1.000000  1.000000  1.000000  1.000000
 4  Logistic Regression  0.974635   0.972051  0.999447  0.985559  0.906867
 5            Ada Boost  1.000000   1.000000  1.000000  1.000000  1.000000,
                   Algo  Accuracy  Precision  Recall  F1_score       Auc
 0              xgboost  0.999282   0.999171     1.0  0.999586  0.997326
 1                  SVM  1.000000   1.000000     1.0  1.000000  1.000000
 2        Random Forest  1.000000   1.000000     1.0  1.000000  1.000000
 3        Random Forest  1.000000   1.000000     1.0  1.000000  1.000000
 4  Logistic Regression  0.934673   0.929838     1.0  0.963644  0.756684
 5            Ada Boost  1.000000   