In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

import plotly.express as px
import plotly.figure_factory as ff

In [2]:
path = 'dataset-SpamSMS.csv'
dataset_spam = pd.read_csv(path)
dataset_spam.head()

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...


In [3]:
#print the shape (get the number of rows and columns of the dataset)
dataset_spam.shape

(15698, 2)

In [4]:
#get the column names
dataset_spam.columns

Index(['type', 'text'], dtype='object')

In [5]:
dataset_spam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15698 entries, 0 to 15697
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    15698 non-null  object
 1   text    15698 non-null  object
dtypes: object(2)
memory usage: 245.4+ KB


In [6]:
#check for duplicates rows and remove or drop them later
dataset_spam.drop_duplicates (inplace= True)

In [7]:
#show the new shape of the data, meaning the new number of rows and columns
dataset_spam.shape

(9882, 2)

In [8]:
#show number of missing data for each column (NAN, NaN, na)
dataset_spam.isnull().sum()

type    0
text    0
dtype: int64

###We are going to use stopwords within the function that we are going to create, and this function will be used to process the text.
###Stopwords in natural language processing are useless words or data

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfVectorizer = TfidfVectorizer()

X = dataset_spam['text']
X_tfidfvect = tfidfVectorizer.fit_transform(X)

In [10]:
#Split the data into 80% training and 20% testing
X_train_tfidfvect, X_test_tfidfvect, y_train_tfidfvect, y_test_tfidfvect = train_test_split(X_tfidfvect, 
                                                                                            dataset_spam['type'],
                                                                                            stratify = dataset_spam['type'],
                                                                                            test_size=0.20, 
                                                                                            random_state= 0)

# **Algorithms Comparison**

In [11]:
classification_models = []
classification_models.append(('LOGISTIC', LogisticRegression(solver='liblinear', multi_class='ovr')))
classification_models.append(('NEIGHBOR', KNeighborsClassifier()))
classification_models.append(('TREE', DecisionTreeClassifier()))
classification_models.append(('NAIVEBAYES', MultinomialNB()))
#classification_models.append(('NAIVEBAYES', GaussianNB()))
classification_models.append(('VECTOR', SVC(gamma='auto')))
classification_models.append(('FOREST',RandomForestClassifier(n_estimators=50, random_state=1, max_features = 'sqrt', n_jobs=-1, verbose = 1)))

In [12]:
results = []
algorithms = []
cv_results_mean = []
cv_results_std = []
for algorith, model in classification_models:
    kfold = StratifiedKFold(n_splits = 8, random_state = 1, shuffle = True)
    cv_results = (cross_val_score(model, X_train_tfidfvect, y_train_tfidfvect, cv=kfold, scoring='accuracy'))*100
    results.append(cv_results)
    algorithms.append(algorith)
    cv_results_mean.append(cv_results.mean())
    cv_results_std.append(cv_results.std())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   17.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent work

However, these code generate the plot for the algorithms comparison, the plot is saved and showed in the dashboard.

In [13]:
df_results_traspose = (pd.DataFrame(results, index =algorithms)).T


algorithms_comparison = px.box(df_results_traspose)
algorithms_comparison.update_layout(title_text="MODELS COMPARISON", title_x=0.5)
algorithms_comparison.update_layout(xaxis_title='Algorithms', yaxis_title='Accuracy (%)')
algorithms_comparison['layout']['title']['font'] = dict(size=20)

#algorithms_comparison.show()


Accuracy_chart_data = [cv_results_mean]
mean_accuracy = pd.DataFrame(Accuracy_chart_data, columns = algorithms, index = ['Mean_Accuracy(%)'])
full_chart = pd.concat([df_results_traspose, mean_accuracy])

full_chart

Unnamed: 0,LOGISTIC,NEIGHBOR,TREE,NAIVEBAYES,VECTOR,FOREST
0,94.944388,88.068756,95.652174,93.22548,81.395349,95.955511
1,95.1417,87.044534,95.34413,93.623482,81.477733,96.153846
2,94.02834,87.651822,94.736842,93.522267,81.477733,95.34413
3,93.825911,86.94332,96.052632,92.510121,81.477733,95.748988
4,93.522267,86.740891,95.1417,92.510121,81.376518,95.34413
5,95.1417,87.854251,96.356275,93.218623,81.376518,96.862348
6,94.02834,86.538462,94.230769,93.016194,81.376518,95.040486
7,93.522267,87.449393,95.748988,93.421053,81.376518,96.255061
Mean_Accuracy(%),94.269364,87.286429,95.407939,93.130918,81.416828,95.838062


In [14]:
def accuracyscore(X_train, y_train, model, X_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions

In [15]:
#Split the data into 80% training and 20% testing
dataset_spam = pd.read_csv(path)

X = dataset_spam['text'].values
y = dataset_spam['type'].values

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    stratify = y,
                                                    test_size=0.20, 
                                                    random_state= 0)

# Training the Logistic Regression algorithm

The Logistic Regression algorithm is build with the use of a pipeline which will pass the training data through the process of tfidf and vectorization at one with TfidfVectorizer and after will train the model with the LogisticRegression().

In [16]:
#The code below is eliminating stop words.
#tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')


#convert a collection of text to a matrix of words without eliminating stop words
model_LR = make_pipeline(TfidfVectorizer(), 
                      LogisticRegression(solver='liblinear', 
                                         multi_class='ovr'))

Prediction with real SMS

In [17]:
def predict_SMS(sms, model):
    pred = model.predict([sms])
    return pred[0]

In [18]:
#real_sms_prediction = predict_SMS('Your AIB online access is suspended due to unusual activity. To restore access please follow the steps via: https://securemobileaibapp.com', model_LR)
#print(real_sms_prediction)

# Training the Random Forest Classifier Algorith

The Random Forest Classifier was choosen because it has one of the two best performance in the "Comparative of the Algorithms" done previously. The process os training is done with a pipeline where the training data go through the tfidf and vectorizer transformation and then use the RandomForestClassifier() function with the hyperparameters set.

In [19]:
model_RFC = make_pipeline(TfidfVectorizer(), 
                      RandomForestClassifier(random_state = 1, 
                               max_features = 'sqrt',
                               n_jobs=-1, 
                               verbose = 1))

# Accuracy of the models
After the training is done, the resulting model will be use with the 20% testing set of the dataset (y_test), the resulting value tell that the Logistic Regression model has an accuracy of 96.97% and the Random Forest Classifier of 98.53% those compare with the 94.26% and 95.83% respectivly showed in the Comparative, tell and increase in the accuracy.

This is worrisome as too close to a 100% accuracy makes most of the models unreliable and in need of more research / training etcetera.

In [20]:
logictic_regression_predictions = accuracyscore(X_train, y_train, model_LR, X_test)

print("LOGISTIC REGRESSION accuracy score", accuracy_score(y_test, logictic_regression_predictions)*100, '%')

random_forest_predictions = accuracyscore(X_train, y_train, model_RFC, X_test)
print("RANDOM FOREST CLASSIFIER accuracy score", accuracy_score(y_test, random_forest_predictions)*100, '%')

LOGISTIC REGRESSION accuracy score 96.97452229299363 %


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s


RANDOM FOREST CLASSIFIER accuracy score 98.53503184713375 %


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [21]:
print('CLASSIFICATION REPORT FOR LOGISTIC REGRESSION:\n', classification_report(y_test, logictic_regression_predictions))

print('\n\nCLASSIFICATION REPORT FOR RANDOM FOREST:\n', classification_report(y_test, random_forest_predictions))

CLASSIFICATION REPORT FOR LOGISTIC REGRESSION:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98      2599
        spam       0.98      0.84      0.91       541

    accuracy                           0.97      3140
   macro avg       0.98      0.92      0.94      3140
weighted avg       0.97      0.97      0.97      3140



CLASSIFICATION REPORT FOR RANDOM FOREST:
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99      2599
        spam       0.98      0.94      0.96       541

    accuracy                           0.99      3140
   macro avg       0.98      0.97      0.97      3140
weighted avg       0.99      0.99      0.99      3140



### Sample of the predictions gotten against the real values in y_test

In [22]:
pred_val_comp = np.concatenate((logictic_regression_predictions.reshape(len(logictic_regression_predictions),1), y_test.reshape(len(y_test),1)),1)
pred_val_comp = pd.DataFrame(pred_val_comp, columns = ['Predictions', 'Validation_Set'])
pred_val_comp.head(3)

Unnamed: 0,Predictions,Validation_Set
0,ham,ham
1,ham,ham
2,ham,ham


In [23]:
pred_val_comp = np.concatenate((random_forest_predictions.reshape(len(random_forest_predictions),1), y_test.reshape(len(y_test),1)),1)
pred_val_comp = pd.DataFrame(pred_val_comp, columns = ['Predictions', 'Validation_Set'])
pred_val_comp.head(3)

Unnamed: 0,Predictions,Validation_Set
0,ham,ham
1,ham,ham
2,ham,ham


Plot to show visualy how close the predictictions are close or match the testing values.

In [24]:
"""
plt.figure(figsize=(5,4))
plt.plot(pred_val_comp.Predictions, 'rD')
plt.plot(pred_val_comp.Validation_Set, 'b*')
plt.xlabel('Observations')
plt.ylabel('SMS type')
plt.legend(['Predictions', 'Validations'], loc='best')
a = plt.title('\nPREDICTIONS vs VALIDATIONS\n'.upper(), loc='center', fontsize = 20)
"""

"\nplt.figure(figsize=(5,4))\nplt.plot(pred_val_comp.Predictions, 'rD')\nplt.plot(pred_val_comp.Validation_Set, 'b*')\nplt.xlabel('Observations')\nplt.ylabel('SMS type')\nplt.legend(['Predictions', 'Validations'], loc='best')\na = plt.title('\nPREDICTIONS vs VALIDATIONS\n'.upper(), loc='center', fontsize = 20)\n"

## **Confusion Matrices plots**

This function generate a headplot to illustrate the **Confusion Matrices**, it is not done with "plot_confusion_matrix" library because the plots for the Dasboard need to be in plotly. Therefore plotly.express was used instead.

In [25]:
def create_confusion_matrix_plot (model_name, y_test, predictions):
    labels = ["spam", "ham"]
    true_negative, false_positive, false_negative, true_positive = confusion_matrix(y_test, predictions, labels=labels).ravel()

    confusion_matrix_values = [[false_positive, true_negative],
                              [true_positive, false_negative]]

    # change each element of z to type string for annotations
    confusion_matrix_string = [[str(y) for y in x] for x in confusion_matrix_values]

    # set up figure 
    confusion_matrix_plot = ff.create_annotated_heatmap(confusion_matrix_values, x=labels, y=labels, annotation_text=confusion_matrix_string, colorscale='Blues')

    # add title
    confusion_matrix_plot.update_layout(title_text = model_name,
                      title_x=0.5,
                      #xaxis = dict(title='Predictions'),
                      yaxis = dict(title='Testing')
                     )

    # add custom xaxis title
    confusion_matrix_plot.add_annotation(dict(font=dict(color="black",size=14),
                            x=0.5,
                            y=-0.15,
                            showarrow=False,
                            text='Predictions',
                            xref="paper",
                            yref="paper"))

    confusion_matrix_plot['layout']['title']['font'] = dict(size=20)

    # add custom yaxis title
    confusion_matrix_plot.add_annotation(dict(font=dict(color="black",size=14),
                            x=-0.35,
                            y=0.5,
                            showarrow=False,
                            text="Testing",
                            textangle=-90,
                            xref="paper",
                            yref="paper"))

    # adjust margins to make room for yaxis title
    #confusion_matrix.update_layout(margin=dict(t=50, l=200))

    # add colorbar
    confusion_matrix_plot['data'][0]['showscale'] = True
    #confusion_matrix_plot.show()
    return confusion_matrix_plot

These code generate the plots for to illustrate de Confusion Models for the Models availables.
The actual vizualizations happend in the Dashboard.

In [26]:
conf_matrix_plot_log_reg= create_confusion_matrix_plot ("Logistic Regression", y_test, logictic_regression_predictions)
conf_matrix_plot_ran_for_clas= create_confusion_matrix_plot ("Random Forest Classifier", y_test, random_forest_predictions)

# Real SMS
The code below is just for testing the use of the Random Forest Classifier after the training is done with a real SMS, but it is comented because the real testing occurs in a interactive way with the use of a GUI.

In [28]:
#real_sms_prediction = predict_SMS('Your AIB online access is suspended due to unusual activity. To restore access please follow the steps via: https://securemobileaibapp.com', 
#                                  model_RFC)
#print(real_sms_prediction)

# Dashboard

The dashboard is done using JupyterDash library to start with, this along with Bootstrap will help to construct an enviroment where the plots, graphs and dataset are displayed in a more pleaseant way, rendering it in the same web browser where the code is performed(for example Edge)

After running the dashboard, one gets a link and as mentioned that will allows a web browser to be open onece is clicked and show all our graphs and vizuale descriptions.

In [29]:
#pip install dash
#pip install jupyter-dash
#pip install dash_bootstrap_components

These are all the liiibraries used in this particular Dashboard, "https://cdn.jsdelivr.net/npm/bootswatch@5.1.3/dist/flatly/bootstrap.min.css" is a boostrap pre-build theme which will help with a faster set up.

In [30]:
from dash import Dash, html, dcc, dash_table
from dash.dependencies import Output, Input
import dash_bootstrap_components as dbc
from jupyter_dash import JupyterDash

dbc.themes.FLATLY

'https://cdn.jsdelivr.net/npm/bootswatch@5.1.3/dist/flatly/bootstrap.min.css'

In [31]:
dashboard_sms_anlytics = JupyterDash(__name__, external_stylesheets=[dbc.themes.ZEPHYR])

dashboard_sms_anlytics.layout = dbc.Container(
    [
        #First Row with the title
        dbc.Row(dbc.Col(html.H2('SPAM IN SMS ANALYTICS', className='text-center text-primary, mb-3'))),
        
        #Second Row
        dbc.Row([  # start of forth row
            dbc.Col([  # first column on forth row
                html.H3('COMPARISON ALGORITHMS CHART', className='text-center'),
            ], width={'size': 4, 'offset': 0, 'order': 1}),  # width first column on second row
            dbc.Col([  # second column on third row
                html.H3('CONFUSION MATRIX', className='text-center'),
            ], width={'size': 8, 'offset': 0, 'order': 2}),  # width second column on second row
        ]),  # end of forth row
        
        #Third row
        dbc.Row([
            dbc.Col([  # first column on thir row
            dcc.Graph(id='chrt-portfolio-main',
                      figure = algorithms_comparison,
                      style={'height':550}),
            html.Hr(),
            ], width={'size': 4, 'offset': 0, 'order': 1}),  # width first column on third row
            dbc.Col([  # second column on third row
            dcc.Graph(id='indicators-ptf',
                      figure = conf_matrix_plot_log_reg,
                      style={'height':550}),
            html.Hr()
            ], width={'size': 4, 'offset': 0, 'order': 2}),  # width second column on third row
            dbc.Col([  # third column on third row
            #html.H5('S&P500', className='text-center'),
            dcc.Graph(id='indicators-sp',
                      figure = conf_matrix_plot_ran_for_clas,
                      style={'height':550}),
            html.Hr()
            ], width={'size': 4, 'offset': 0, 'order': 3}),  # width third column on third row
        ]),  # end of third row
        
        dbc.Row([  # start of forth row
            dbc.Col([  # first column on forth row
                html.H5('Monthly Return (%)', className='text-center'),
                dcc.Graph(id='chrt-portfolio-secondary',
                      figure = algorithms_comparison,
                      style={'height':380}),
            ], width={'size': 8, 'offset': 0, 'order': 1}),  # width first column on second row
            dbc.Col([  # second column on third row
                html.H5('Top 15 Holdings', className='text-center'),
                dcc.Graph(id='pie-top15',
                      figure = algorithms_comparison,
                      style={'height':380}),
            ], width={'size': 4, 'offset': 0, 'order': 2}),  # width second column on second row
        ])  # end of forth row
        
    ], fluid=True)


if __name__ == "__main__":
    dashboard_sms_anlytics.run_server(debug=True, port=8058)
    #dashboard_sms_anlytics.run_server(mode = "inline")

Dash app running on http://127.0.0.1:8058/


# **Graphical User Interface**

In [None]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

from PIL import Image
import requests
from io import BytesIO
### CREATE VIRTUAL DISPLAY ###
#!apt-get install -y xvfb # X Virtual Frame Buffer Installation
import os

In [None]:
os.system('Xvfb :1 -screen 0 1600x1200x16  &')   # virtual display [1600x1200] 16 bit color (can be 8, 16, 24)
os.environ['DISPLAY']=':1.0'    # request X clients to use virtual DISPLAY :1.0.

In [None]:
# VERSION 3.0
import sys
try:
    # python 2.x
    from Tkinter import * #import Tkinter as tk
except ImportError:
    # python 3.x
    from tkinter import * #import tkinter as tk

# Pillow library (Images)
from PIL import Image, ImageTk

def on_resize(event):
  pass
    # resize BG_IMG(Label_SIZE) & update Label IMG
    #BG_.image = ImageTk.PhotoImage(BG_IMG.resize((event.width, event.height), Image.ANTIALIAS))
    #BG_.config(image=BG_.image)

# Define Function for get Value
def detection():
    get_user_sms = msg.get("1.0",'end-1c')
    
    LogisticRegression_Prediction = predict_SMS(get_user_sms, model_LR)
    RandomForest_Prediction = predict_SMS(get_user_sms, model_RFC)
    
    output = "This SMS acording with...\n\n"
    output += "Logistic Regression Model is: \t" + LogisticRegression_Prediction
    output += "\n\nRandom Forest Classifier Model is: " + RandomForest_Prediction

    myTKlabel['text'] = output
    print(get_user_sms)
    
root = Tk()
root.wm_title('            SMS SPAM DETECTION            ')

HEIGTH = 550#700
WIDTH = 400
BG_color = 'black'

root.geometry('{}x{}'.format(WIDTH, HEIGTH))
BG_ = Label(root, 
            bg=BG_color,
           )
BG_.place(x=0, y=0, relwidth=1, relheight=1) # make label l to fit the parent window always

# TITLE
title = Label(
    root,
    text='SMS SPAM DETECTION',
    relief=SUNKEN,
    fg='gold',
    bg=BG_color,
    font=("Helvetica", 14, 'underline'),
    borderwidth=0
    )
title.pack(pady = 25, ipadx=0, ipady=0)

# INPUT MSG
msg = Text(root, height = 5, width = 45)#, width=40)
msg.pack()
msg.insert('1.0', 'Enter SMS Here...')

# LABEL: OUTPUT AREA (SPAM | HAM)
myTKlabel = Label(root, text='?', fg='green', bg='white', font=("Helvetica", 12),
                  borderwidth=4, relief="raised", height=15, width=39, justify=LEFT
                  )
myTKlabel.pack(pady = 10, ipadx=0, ipady=0)

# BUTTON: DETECT CLASSIFY|DETECT SPAM
BTN_ = Button(root, height=1, width=30, text="CLASSIFY", command = detection, bg="green", fg="white",
            font=("Helvetica", 14)
             )
BTN_.pack()

root.mainloop()