<a href="https://colab.research.google.com/github/tachoflash/project/blob/main/fourth_year_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CIT-223-022/2021
# EDWIN KIPNGETICH
# FOURTH YEAR PROJECT
# BACHELOR OF SCIENCE IN COMPUTER SCIENCE
In this project, my aim is to build a spam detector by optimizing an existing algorithm (multinomial Naive Bayes) and making it more robust against attempts to evade it. Accuracy is key when dealing with spammers.





# THE DATA SOURCE

In [None]:
# Attach Google Drive to the Colab runtime so its files can be read from disk
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
# Show the top level of the mounted Drive
import os

drive_root = '/content/drive/MyDrive/'
os.listdir(drive_root)

['ccsetup604.exe',
 'IMPORTANCE OF A BALANCED DIET.docx',
 'spam.csv.txt',
 'AnyScanner_04_30_2024(5).pdf',
 'Colab Notebooks',
 'requirements.txt',
 'Extra spam.txt']

# IMPORTING THE NEEDED LIBRARIES


In [None]:
import io
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold, cross_validate, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# OBJECTIVE 1: ANALYZING THE NATURE OF SPAM AND HAM MESSAGES
This objective studies sample messages that are labelled as either spam or ham.
It identifies the characteristics that distinguish the two kinds of text.

In [None]:
# Load the raw dataset into `data`, a {filename: bytes} mapping (the same shape
# files.upload() returns). The Drive listing earlier in this notebook shows
# 'spam.csv.txt' in MyDrive, so read it from there when it is available; this
# lets the notebook run top to bottom without a manual upload. The upload
# prompt is kept as a fallback for runtimes where Drive is not mounted.
drive_copy = '/content/drive/MyDrive/spam.csv.txt'
if os.path.isfile(drive_copy):
    with open(drive_copy, 'rb') as handle:
        data = {os.path.basename(drive_copy): handle.read()}
else:
    data = files.upload()

Saving spam.csv.txt to spam.csv.txt


In [None]:
# Parse the uploaded bytes into a DataFrame.
# `data` maps each uploaded filename to its raw bytes.
filename = 'spam.csv.txt'

# The file may have been stored under a different key (for example when the
# same file is uploaded twice in one session and the copy gets a suffixed name
# -- TODO confirm exact naming). If exactly one file was provided, use it
# instead of failing with a bare KeyError.
if filename not in data:
    if len(data) != 1:
        raise KeyError(
            f"Expected an uploaded file named {filename!r}; got {sorted(data)}"
        )
    filename = next(iter(data))

df = pd.read_csv(io.StringIO(data[filename].decode('utf-8')))

# The CSV carries a saved row index that pandas reads as a column labelled
# "Unnamed: 0"; it holds no information, so drop any such column.
df = df.loc[:, ~df.columns.astype(str).str.startswith('Unnamed:')]

In [None]:
# Preview the first five rows of the loaded dataset
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# When the label/text columns arrive named "v1"/"v2", give them the names the
# rest of the notebook uses. Nothing happens unless both are present.
legacy_headers = {"v1": "Category", "v2": "Message"}
if set(legacy_headers).issubset(df.columns):
    df = df.rename(columns=legacy_headers)

In [None]:
 #Convert labels to numeric values
# 1 for spam, 0 for anything else. Vectorized string operations replace the
# per-row lambda: surrounding whitespace and letter case are ignored, and a
# missing label becomes 0 instead of raising AttributeError on a float NaN.
df['spam'] = (
    df['Category']
    .str.strip()
    .str.lower()
    .eq('spam')
    .astype(int)
)

In [None]:
 #Split dataset
# Hold out 20% of the messages for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    df['Message'],
    df['spam'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Build a two-step pipeline: TF-IDF features feeding a multinomial Naive Bayes
# classifier. The pipeline is only constructed here; it is not fitted in this cell.
tfidf_step = TfidfVectorizer()
nb_step = MultinomialNB()
pipeline = make_pipeline(tfidf_step, nb_step)

# OBJECTIVE 2: TO IMPROVE THE ROBUSTNESS OF THE MODEL
This is done by introducing adversarial examples in which two adjacent letters are swapped in every word longer than three letters.
Training on these examples makes it harder for spammers to trick the detector with deliberately misspelled words.

In [None]:

# Function to generate adversarial examples by swapping adjacent letters
def generate_adversarial_text(text, rng=None):
    """Return a copy of ``text`` with one adjacent-letter swap in each long word.

    The text is split on whitespace; every word longer than three characters
    has two neighbouring characters swapped at a randomly chosen position.
    Only positions whose two characters differ are eligible, so a perturbed
    word always differs from the original (swapping the "ee" in "free" would
    otherwise return the word unchanged). Words of three characters or fewer,
    and words made of a single repeated character, are kept as they are.

    Args:
        text: The message to perturb.
        rng: Optional ``numpy.random.Generator`` used to pick swap positions,
            for reproducible output. When omitted, NumPy's global random state
            is used, so ``np.random.seed`` still controls the result.

    Returns:
        The perturbed words joined by single spaces (runs of whitespace in the
        input are therefore collapsed).
    """
    # Both callables draw an integer from [0, high): high is exclusive.
    draw = np.random.randint if rng is None else rng.integers

    perturbed = []
    for word in text.split():
        if len(word) > 3:
            # Positions where swapping actually changes the word.
            swap_positions = [
                i for i in range(len(word) - 1) if word[i] != word[i + 1]
            ]
        else:
            swap_positions = []

        if swap_positions:
            idx = swap_positions[int(draw(0, len(swap_positions)))]
            perturbed.append(
                word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]
            )
        else:
            perturbed.append(word)
    return " ".join(perturbed)

# Build one perturbed copy of every message in the dataset.
# generate_adversarial_text draws from NumPy's global random state, so seed it
# here; otherwise every run produces different perturbations and different
# downstream scores.
np.random.seed(42)
x_train_adv = df['Message'].apply(generate_adversarial_text)

# Combine original and adversarial data: rows [0, len(df)) are the originals and
# rows [len(df), 2 * len(df)) are their perturbed copies, in the same order.
# NOTE: this set covers EVERY message in df, so a held-out test set must be
# taken from the original messages before augmentation, not from x_train_aug.
x_train_aug = pd.concat([df['Message'], x_train_adv], ignore_index=True)
y_train_aug = pd.concat([df['spam'], df['spam']], ignore_index=True)

print("Original training examples:", len(df['Message']))
print("Augmented training examples:", len(x_train_aug))


Original training examples: 5572
Augmented training examples: 11144


In [None]:

# Split the ORIGINAL messages before augmenting. Splitting an already-augmented
# set puts a message in the test fold while its perturbed twin sits in the
# training fold (or the reverse), which leaks test content into training and
# inflates the reported accuracy.
x_train, x_test, y_train, y_test = train_test_split(
    df['Message'], df['spam'], test_size=0.2, random_state=42
)

# Augment only the training portion with one perturbed copy per message.
# Seeding NumPy's global random state keeps the perturbations repeatable.
np.random.seed(42)
x_train = pd.concat(
    [x_train, x_train.apply(generate_adversarial_text)], ignore_index=True
)
y_train = pd.concat([y_train, y_train], ignore_index=True)

# Convert text to numerical data using CountVectorizer
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)

# Train the model
model = MultinomialNB(alpha=1)
model.fit(x_train_count, y_train)

# Test model accuracy on untouched, never-seen messages
x_test_count = cv.transform(x_test)
print("Model accuracy:", model.score(x_test_count, y_test))

# Robustness check: the same held-out messages after adjacent-letter swaps
x_test_adv_count = cv.transform(x_test.apply(generate_adversarial_text))
print("Model accuracy on perturbed test messages:", model.score(x_test_adv_count, y_test))


Model accuracy: 0.9816061013907582


# OBJECTIVE 3: IMPROVING THE ACCURACY OF THE MODEL
A TF-IDF and Naive Bayes pipeline is tuned with a grid search to increase the accuracy of the model.

In [None]:

# Create a pipeline with TF-IDF and Naive Bayes
pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Define parameter grid for tuning
param_grid = {
    'tfidfvectorizer__max_df': [0.9, 0.95, 1.0],
    'tfidfvectorizer__min_df': [1, 2, 5],
    'multinomialnb__alpha': [0.1, 0.5, 1, 5, 10]
}

# x_train_aug is the original messages followed by their perturbed copies in
# the same order, so row i and row i + n_originals come from the same message.
# Give both rows the same group id and use a group-aware splitter so a message
# and its twin never land on opposite sides of a fold; plain k-fold would score
# the model on near-duplicates of its training data and inflate best_score_.
n_originals, leftover = divmod(len(x_train_aug), 2)
assert leftover == 0, "x_train_aug must hold originals followed by their copies"
pair_ids = np.tile(np.arange(n_originals), 2)

# Grid search (verbose=1 prints a summary instead of one line per fit)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=StratifiedGroupKFold(n_splits=5),
    verbose=1,
)
grid_search.fit(x_train_aug, y_train_aug, groups=pair_ids)

print("Best parameters:", grid_search.best_params_)
print("Best accuracy score:", grid_search.best_score_)


Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_df=0.9, tfidfvectorizer__min_df=1; total time=   0.4s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_df=0.9, tfidfvectorizer__min_df=1; total time=   0.4s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_df=0.9, tfidfvectorizer__min_df=1; total time=   0.4s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_df=0.9, tfidfvectorizer__min_df=1; total time=   0.4s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_df=0.9, tfidfvectorizer__min_df=1; total time=   0.4s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_df=0.9, tfidfvectorizer__min_df=2; total time=   0.3s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_df=0.9, tfidfvectorizer__min_df=2; total time=   0.3s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_df=0.9, tfidfvectorizer__min_df=2; total time=   0.2s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_df

# OBJECTIVE 4: COMING UP WITH A DESIRED SPAM DETECTOR
This is the implementation of the more accurate and robust spam detector, to be tested by the user.

In [None]:
def detect_spam_message(sentence):
    """Classify one message and print the verdict.

    The lower-cased sentence is vectorized with the notebook-level ``cv``
    vectorizer and passed to the notebook-level ``model``; a prediction of 1
    is reported as spam, anything else as ham. Nothing is returned.
    """
    features = cv.transform([sentence.lower()])
    is_spam = model.predict(features)[0] == 1
    verdict = (
        "⚠️ Warning: This message is classified as SPAM!"
        if is_spam
        else "✅ This message is classified as HAM (not spam)."
    )
    print(verdict)

# Test user input: keep prompting until the user types 'exit'
while True:
    try:
        word = input("Enter a sentence to check for spam (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or a closed input stream ends the session
        # cleanly instead of leaving a traceback in the notebook.
        break
    if word.lower() == "exit":
        break
    if not word:
        # Nothing was typed; prompt again rather than reporting an empty
        # message as ham.
        continue
    detect_spam_message(word)


Enter a sentence to check for spam (or type 'exit' to quit): exit


In [None]:
!pip install ipywidgets




In [None]:
# Import the necessary modules from ipywidgets
import ipywidgets as widgets
from IPython.display import display

# Area that will hold the classification result
output_area = widgets.Output()

# Text field for the message to check, taking 70% of the available width
box_layout = widgets.Layout(width="70%")
input_box = widgets.Text(
    placeholder="Type your message here...",
    description="Message:",
    layout=box_layout,
)

# Function to classify message and display result
def classify_message(change):
    """Classify the text box's new value and show the verdict in ``output_area``.

    Args:
        change: The change notification passed by ``observe``; only
            ``change['new']`` (the current text) is read.

    The previous verdict is always cleared first. If the new text is empty or
    only whitespace, nothing further is shown: there is no message to judge,
    and reporting an empty box as "safe" would be misleading.
    """
    output_area.clear_output()

    text = change['new']
    if not text.strip():
        return

    message_count = cv.transform([text.lower()])
    result = model.predict(message_count)[0]

    with output_area:
        if result == 1:
            display(widgets.HTML("<b style='color: red;'>⚠️ SPAM DETECTED! Please avoid sending spam messages.</b>"))
        else:
            display(widgets.HTML("<b style='color: green;'>✅ This message is safe (ham).</b>"))

# Re-run the classifier whenever the text field's value changes
input_box.observe(classify_message, "value")

# Render the text field followed by the result area
for component in (input_box, output_area):
    display(component)

Text(value='', description='Message:', layout=Layout(width='70%'), placeholder='Type your message here...')

Output()