In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the dataset folders used below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Step 2: Load the Dataset
# Assumes the dataset is stored as text files under two directories in Google Drive,
# one holding the spam emails and one holding the ham emails.
spam_folder = '/content/drive/MyDrive/Dataset/Spam/Spam/Phising'  # Update this path
ham_folder = '/content/drive/MyDrive/Dataset/Ham/Ham'      # Update this path


def _list_files(folder):
    """Return the full paths of the regular files directly inside `folder`, sorted by name.

    os.listdir() yields entries in arbitrary order and also returns sub-directories;
    sorting makes the file order reproducible across runs, and the isfile() filter
    keeps directories from reaching the later open() call.
    """
    paths = (os.path.join(folder, name) for name in os.listdir(folder))
    return sorted(path for path in paths if os.path.isfile(path))


spam_files = _list_files(spam_folder)
ham_files = _list_files(ham_folder)

In [None]:
# Read every file and pair its contents with its class label (spam files first, then ham).
# The encoding is passed explicitly so decoding does not depend on the platform default,
# and undecodable bytes are replaced instead of aborting the whole load on one malformed email.
data = []
for label, paths in (('spam', spam_files), ('ham', ham_files)):
    for path in paths:
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            data.append((f.read(), label))

# Create a DataFrame with one row per file
df = pd.DataFrame(data, columns=['text', 'label'])

In [None]:
# Count the total number of files (spam + ham)
num_files = len(spam_files) + len(ham_files)
print(f'Number of files: {num_files}')

Number of files: 2393


In [None]:
# Shuffle the rows so spam and ham are interleaved, then renumber the index from 0.
# The seed is fixed (same value as the train/test split) so that a fresh run
# reproduces the same row order and therefore the same displayed rows and split.
df = df.sample(frac=1, random_state=2022).reset_index(drop=True)

In [None]:
# Encode the string labels as integers: spam -> 0, ham -> 1.
label_to_num = {'spam': 0, 'ham': 1}
df['label_num'] = df['label'].map(label_to_num)
df.head(10)

Unnamed: 0,text,label,label_num
0,Doctor ibed and Medically Supe\nTestosterone &...,spam,0
1,first announcement and call for papers for the...,ham,1
2,Acquisition news sends XTPT flying!!!\n\nTrade...,spam,0
3,"french-american colloquium "" the syntax - sema...",ham,1
4,preliminary call for papers second internation...,ham,1
5,@ Viagra $3.33\n@ valium = $1.21\n& Cialis $3....,spam,0
6,does anyone have information about the next in...,ham,1
7,"“AIR CLEAN LINE"" presents\n\nDo you want to bu...",spam,0
8,WATCH OUT\nHERE COMES THE BIG ONE!\nWEDNESDAY ...,spam,0
9,translation into the second language stuart ca...,ham,1


In [None]:

# Check the distribution of labels (number of rows per class).
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,1469
spam,924


In [None]:
import spacy
import re

# Load spaCy's small English model once and keep the resulting pipeline in `nlp`;
# preprocess() below calls it on every document.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Normalize raw email text into a space-separated string of lemmas.

    The text is lowercased, every character other than ASCII letters, digits
    and whitespace is removed, and runs of whitespace (newlines included) are
    collapsed to a single space. The cleaned text is then run through the
    module-level spaCy pipeline `nlp`; stop words, punctuation and whitespace
    tokens are dropped and each remaining token is replaced by its lemma.

    Args:
        text: Raw text of one document.

    Returns:
        The kept lemmas joined by single spaces ("" when nothing is kept).
    """
    text = text.lower()
    # Strip special characters BEFORE collapsing whitespace: deleting a symbol that
    # stood between two spaces ("a & b") would otherwise leave a double space, which
    # spaCy turns into a whitespace token that then leaks into the output.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # \s+ also matches newlines, so no separate newline substitution is needed.
    text = re.sub(r'\s+', ' ', text).strip()

    # Process text with spaCy
    doc = nlp(text)

    # Keep the lemma of every token that carries content.
    filtered_tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    return " ".join(filtered_tokens)

In [None]:
# Run every raw document through preprocess() and store the cleaned text in a new column.
df['preprocessed_txt'] = [preprocess(raw_text) for raw_text in df['text']]

In [None]:
# Preview the first rows of the frame, including the preprocessed text column.
df.head()

In [None]:
# Raw text of the row with index label 0, for comparison with its preprocessed form.
df.loc[0, 'text']

In [None]:
# Preprocessed text of the row with index label 0.
df.loc[0, 'preprocessed_txt']

In [None]:
import seaborn as sns # import the library and alias it as sns
import matplotlib.pyplot as plt # import for using plt.title and plt.show


In [None]:
# Bar chart of the number of rows per label.
ax = sns.countplot(data=df, x='label')
ax.set_title('Email Type Distribution')
plt.show()

In [None]:
# Step 3: Prepare the Data for Training
# Hold out 20% of the rows for testing. Stratifying on the numeric label keeps the
# spam/ham ratio the same in both parts; the fixed seed makes the split repeatable
# for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_txt'],
    df['label_num'],
    test_size=0.2,
    stratify=df['label_num'],
    random_state=2022,
)

In [None]:
# Report how many samples ended up in the training and test splits.
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

In [None]:
# Class counts in the training split (0 = spam, 1 = ham).
y_train.value_counts()

In [None]:
# Class counts in the test split (0 = spam, 1 = ham).
y_test.value_counts()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Build the pipeline: TF-IDF features (default settings) feeding a logistic regression
pipeline_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('LogisticRegression', LogisticRegression()),
]
clf = Pipeline(pipeline_steps)

# 2-3. Fit on the training split, then predict labels for the test split
#      (fit() returns the fitted pipeline, so the two calls can be chained)
y_pred = clf.fit(X_train, y_train).predict(X_test)

# 4. Print the classification report
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix on the test split. By scikit-learn's convention rows are the true
# labels and columns the predicted labels, in sorted label order (0 = spam, 1 = ham).
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
from matplotlib import pyplot as plt
import seaborn as sn
# Draw the confusion matrix as an annotated heatmap with integer cell counts.
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Prediction')
ax.set_ylabel('Truth')