In [1]:
import csv
import os  
import pandas as pd
import io
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC                                       
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score,confusion_matrix 

In [2]:
# Define the directory containing the Dataset.
# The location can be overridden with the ENRON_DATASET_DIR environment variable so the
# notebook is not tied to one machine; when the variable is unset the original path is used.
dataset_dir = os.environ.get('ENRON_DATASET_DIR', '\\Users\\zaine\\Desktop\\PFE\\Dataset1')

In [3]:
# Create a list to hold one [text, label] row per e-mail file
data = []

# Loop through each enron sub-dataset folder (enron1 ... enron6)
for i in range(1, 7):
    # Define the enron file directory
    enron_dir = os.path.join(dataset_dir, f'enron{i}')
    print(enron_dir)

    # Each sub-dataset has one folder per class; the folder name is used as the label.
    # "ham" is processed before "spam" to keep the original row grouping.
    for label in ("ham", "spam"):
        label_dir = os.path.join(enron_dir, label)

        # os.listdir returns entries in arbitrary order; sorting makes the row order
        # (and therefore the seeded train/test split downstream) reproducible.
        for filename in sorted(os.listdir(label_dir)):
            # Decode both classes as ISO-8859-1: it maps every byte to a character, so
            # reading never raises UnicodeDecodeError and does not depend on the
            # platform's default encoding (ham files previously used that default).
            with open(os.path.join(label_dir, filename), "r", encoding='ISO-8859-1') as f:
                content = f.read()

            # Add the file content to the data list along with its label
            data.append([content, label])

# Write the data to a CSV file (header row first, then one row per e-mail).
# NOTE(review): this path is relative to the current working directory — confirm it is
# the same location the notebook later reads merged_dataset.csv from.
with open('merged_dataset.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['text', 'label'])
    writer.writerows(data)
    

\Users\zaine\Desktop\PFE\Dataset1\enron1
\Users\zaine\Desktop\PFE\Dataset1\enron2
\Users\zaine\Desktop\PFE\Dataset1\enron3
\Users\zaine\Desktop\PFE\Dataset1\enron4
\Users\zaine\Desktop\PFE\Dataset1\enron5
\Users\zaine\Desktop\PFE\Dataset1\enron6


In [4]:
# Load the merged e-mail dataset into a DataFrame.
# NOTE(review): this is an absolute path, while the cell that writes the CSV uses a path
# relative to the working directory — confirm both point at the same file.
merged_csv_path = '\\Users\\zaine\\Desktop\\PFE\\merged_dataset.csv'
datas = pd.read_csv(merged_csv_path)

# Preview the first five rows (head() defaults to n=5)
datas.head()

Unnamed: 0,text,label
0,Subject: christmas tree farm pictures\n,ham
1,"Subject: vastar resources , inc .\ngary , prod...",ham
2,Subject: calpine daily gas nomination\n- calpi...,ham
3,Subject: re : issue\nfyi - see note below - al...,ham
4,Subject: meter 7268 nov allocation\nfyi .\n- -...,ham


In [5]:
# Print a summary of the loaded frame: index range, column names,
# non-null counts per column, dtypes and memory usage.
datas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33715 entries, 0 to 33714
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    33715 non-null  object
 1   label   33715 non-null  object
dtypes: object(2)
memory usage: 526.9+ KB


In [6]:
# Download NLTK datasets
# 'stopwords' supplies the English stop-word list used by the cleaning step.
nltk.download('stopwords')
# NOTE(review): no visible cell calls an NLTK tokenizer (the cleaning step splits on
# whitespace with str.split), so the 'punkt' download may be unnecessary — confirm.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zaine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zaine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Clean and preprocess data
# Lower-case first so stop-word matching below is case-insensitive
datas['text'] = datas['text'].str.lower()

# regex=True must be explicit: without it pandas emits a FutureWarning on older versions
# and, from pandas 2.0 on, treats the pattern as a literal string, so nothing is removed.
# Strip every run of digits
datas['text'] = datas['text'].str.replace(r'\d+', '', regex=True)
# Strip everything that is neither a word character nor whitespace (i.e. punctuation)
datas['text'] = datas['text'].str.replace(r'[^\w\s]', '', regex=True)

# Drop English stop words. The list is kept under its original name; a set copy is used
# for membership tests so each lookup is O(1) instead of scanning the whole list.
# NOTE(review): punctuation is removed before this step, so stop words that contain an
# apostrophe will presumably no longer match their stripped form — confirm if that matters.
stopwords = nltk.corpus.stopwords.words('english')
stopword_set = set(stopwords)
datas['text'] = datas['text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stopword_set])
)


  datas['text'] = datas['text'].str.replace(r'\d+', '')
  datas['text'] = datas['text'].str.replace(r'[^\w\s]', '')


In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffle repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    datas['text'],
    datas['label'],
    test_size=0.2,
    random_state=42,
)

In [9]:
# Vectorize text data using a count vectorizer
vectorizer = CountVectorizer()
# Fit on the training split only, then reuse the fitted vectorizer on the test split,
# so the test texts do not influence what the vectorizer learns.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [10]:
# Train a Naive Bayes classifier
# MultinomialNB is constructed with its default settings and fitted on the
# vectorized training texts and their ham/spam labels.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vec, y_train)

In [11]:
# Predict on test data
# y_pred holds the Naive Bayes predictions for the vectorized test split;
# it is compared against y_test in the evaluation cell.
y_pred = nb_classifier.predict(X_test_vec)

In [12]:
# Evaluate the Naive Bayes model on the held-out test split
print('Accuracy:', accuracy_score(y_test, y_pred))
# Print the label on its own line: the report is a multi-line table, and putting the
# label in front of it shifts the header row out of alignment with the rows below.
print('Classification Report:')
print(classification_report(y_test, y_pred))
# F1 is reported for the 'spam' class, i.e. spam is treated as the positive label
print('f1-score:', f1_score(y_test, y_pred, pos_label='spam'))

Accuracy: 0.9862079193237432
Classification Report:               precision    recall  f1-score   support

         ham       0.98      0.99      0.99      3282
        spam       0.99      0.98      0.99      3461

    accuracy                           0.99      6743
   macro avg       0.99      0.99      0.99      6743
weighted avg       0.99      0.99      0.99      6743

f1-score: 0.9865236922185191


In [13]:
# Train an SVM classifier
# SVC is constructed with the library defaults (no kernel or C is specified here)
# and fitted on the same vectorized training data as the Naive Bayes model.
svm_classifier = SVC()
svm_classifier.fit(X_train_vec, y_train)

In [14]:
# Predict on test data using the SVM classifier
# Stored under a separate name so the Naive Bayes predictions in y_pred are not overwritten.
y_pred_svm = svm_classifier.predict(X_test_vec)

In [15]:
# Evaluate the SVM classifier on the held-out test split
print('SVM Classifier:')
print('Accuracy:', accuracy_score(y_test, y_pred_svm))
# Print the label on its own line: the report is a multi-line table, and putting the
# label in front of it shifts the header row out of alignment with the rows below.
print('Classification Report:')
print(classification_report(y_test, y_pred_svm))
# F1 is reported for the 'spam' class, i.e. spam is treated as the positive label
print('f1-score:', f1_score(y_test, y_pred_svm, pos_label='spam'))

SVM Classifier:
Accuracy: 0.9663354589945128
Classification Report:               precision    recall  f1-score   support

         ham       0.99      0.94      0.96      3282
        spam       0.95      0.99      0.97      3461

    accuracy                           0.97      6743
   macro avg       0.97      0.97      0.97      6743
weighted avg       0.97      0.97      0.97      6743

f1-score: 0.9679875899026935
