In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import ipywidgets as widgets
from IPython.display import display

### Data Collection

In [2]:
raw_mail_data = pd.read_csv('data/email.csv')

In [3]:
maiL_data = raw_mail_data.where((pd.notnull(raw_mail_data)), '')

In [4]:
maiL_data.shape

(5573, 2)

### Label Encoding

In [5]:
# label spam mail as 0, ham mail as 1
maiL_data.loc[maiL_data["Category"] == "spam", "Category"] = 0
maiL_data.loc[maiL_data["Category"] == "ham", "Category"] = 1

spam = 0

ham = 1

In [6]:
maiL_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


### Separating data as texts and labels

In [7]:
x = maiL_data['Message']
y = maiL_data['Category']

### Splitting the data into training data & test data

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=3)

In [9]:
print(x.shape[0])
print(x_train.shape[0])
print(x_test.shape[0])

5573
4458
1115


### Feature Extraction

In [10]:
last_index_y_train = y_train.index[-1]
last_index_y_test = y_test.index[-1]

y_train = y_train.drop(last_index_y_train)
y_test = y_test.drop(last_index_y_test)

In [11]:
# Check for non-numeric values and display their indexes
non_numeric = y_train.apply(lambda x: isinstance(x, str) and not x.isnumeric())
print(y_train[non_numeric])


5572    {"mode":"full"
Name: Category, dtype: object


In [12]:
y_train_str = y_train.astype(str)
non_numeric_indexes = y_train_str.apply(lambda x: not x.isnumeric())
y_train_clean = pd.to_numeric(y_train_str, errors='coerce').fillna(0).astype(int)



y_test_str = y_test.astype(str)
non_numeric_indexes = y_test_str.apply(lambda x: not x.isnumeric())
y_test_clean = pd.to_numeric(y_test_str, errors='coerce').fillna(0).astype(int)

In [13]:
#transform the test data to feature vectors that can be used as input to the logistic model
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

#covert y_train and y_test values as integer
y_train = y_train_clean.astype("int")
y_test = y_test_clean.astype("int")

In [14]:
print(x_train_features.shape)
print(y_train.shape)

(4458, 7433)
(4457,)


## Training the Model

## Logistic Regression

In [15]:
model = LogisticRegression()

In [16]:
y_train = pd.concat([y_train, pd.Series([0])], ignore_index=True)

In [17]:
print(x_train.shape)

print(y_train.shape)


(4458,)
(4458,)


In [18]:
#training logistic regression model with the x_train
model.fit(x_train_features, y_train)

### Evaluating the trained model

In [19]:
prediction_on_training_data = model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [20]:
print(f"Accuracy score is {accuracy_on_training_data}")

Accuracy score is 0.9679228353521758


In [21]:
x_test_features = x_test_features[:-1, :]
print(x_test_features.shape)

print(y_test.shape)

(1114, 7433)
(1114,)


In [22]:
prediction_on_test_data = model.predict(x_test_features)
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)

In [23]:
print(f"Accuracy score on test data is {accuracy_on_test_data}")

Accuracy score on test data is 0.966786355475763


## Building a Predicting System

In [24]:
input_mail = [""]
#convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

#making prediction
prediction = model.predict(input_data_features)

result = 'Ham mail' if prediction[0] == 1 else 'Spam mail'
print(f"The email is classified as: {result}")

The email is classified as: Ham mail


In [25]:
# function for handling the prediction
def classify_email(btn):
    input_mail = text.value.strip()  # remove leading and trailing spaces
    if not input_mail:  # check if the input is empty
        with output:
            output.clear_output()
            print("Enter the text, please")
    else:
        # transform the input text to feature vectors
        input_data_features = feature_extraction.transform([input_mail])
        # make a prediction
        prediction = model.predict(input_data_features)
        # determine the result based on the prediction
        result = 'Ham mail' if prediction[0] == 1 else 'Spam mail'
        with output:
            output.clear_output()
            # display the classification result
            print(f"The email is classified as: {result}")

# interface elements
text = widgets.Textarea(description="Email's text:")
btn = widgets.Button(description="Classify")
output = widgets.Output()

# assign the button click event handler
btn.on_click(classify_email)

# display the interface
display(text, btn, output)


Textarea(value='', description="Email's text:")

Button(description='Classify', style=ButtonStyle())

Output()