In [2]:
# Import the necessary libraries for data manipulation, visualization, and machine learning

# NumPy is used for numerical operations on large, multi-dimensional arrays and matrices.
import numpy as np
# Pandas is used for data manipulation and analysis, providing data structures like DataFrames.
import pandas as pd

# Matplotlib is a plotting library used for creating static, animated, and interactive visualizations.
import matplotlib.pyplot as plt
# Matplotlib style is used to customize the look of the plots.
plt.style.use('fivethirtyeight')

# Seaborn is a statistical data visualization library based on Matplotlib, making it easier to create informative and attractive visualizations.
import seaborn as sns
# NLTK (Natural Language Toolkit) is a library for working with human language data (text processing).
import nltk 
# To ignore warnings that might occur during the execution of the code, making the output cleaner.
import warnings
warnings.filterwarnings('ignore')
# Importing performance metrics from sklearn to evaluate machine learning models:
# accuracy_score - measures the ratio of correctly predicted instances.
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# precision_score - measures the ratio of correctly predicted positive observations to the total predicted positives.
# recall_score - measures the ratio of correctly predicted positive observations to the all observations in actual class.
# f1_score - the weighted average of Precision and Recall.


In [3]:
# Load the dataset from a CSV file and print sample data
file_path = r'C:\Users\khawl\Desktop\5SAE2\PFE\CRM\backend\IA\DATASET\spam.csv'
# Use pandas to read a CSV file from the specified path.
df = pd.read_csv(file_path, encoding='latin-1')  
# The encoding parameter is set to 'latin-1' to properly read characters that are encoded in this format.

# Print the first five rows of the dataset to get an overview of the data.
print(df.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [4]:
# Drop the non-relevant unnamed columns from the dataset

# Use the drop() method to remove columns 'Unnamed: 2', 'Unnamed: 3', and 'Unnamed: 4'.
# The axis=1 parameter indicates that we are dropping columns, not rows.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
# Rename the columns 'v1' and 'v2' to 'Category' and 'Text' respectively for better understanding

# Use the rename() method to change the column names:
# 'v1' is renamed to 'Category' (indicating whether the email is spam or not),
# 'v2' is renamed to 'Text' (containing the email content).
df = df.rename(columns={'v1': 'Category', 'v2': 'Text'})
# Print the first five rows of the modified dataset to verify the changes.
print(df.head())

  Category                                               Text
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


###  Exploratory Data Analysis


In [10]:
# Check the total number of rows in the dataset

# Use the len() function to get the number of rows in the DataFrame.
# Print the total number of rows in the dataset.
total_rows = len(df)  

print(f'the total number of rows in the dataset: {total_rows}')
# Get a descriptive summary of the dataset
# Use the describe() method to generate descriptive statistics of the dataset.
# This includes metrics like count, mean, std deviation, min, max, and quartile values for numeric columns.
descriptive = df.describe()
print(descriptive)
# Create a new column 'Length' to store the length of each email text and plot a histogram to check the distribution

# Use the apply() method to apply the len() function to the 'Text' column.
# This calculates the length of each email text and stores it in a new column 'Length'.
df['Length'] = df['Text'].apply(len)
# Display the first five rows of the modified dataset to verify the changes.
print(df.head())
# Plot the distribution of the data using Plotly Express

# Import Plotly Express for interactive plotting.
import plotly.express as px
# Create a histogram to visualize the distribution of text lengths.
# The 'marginal' parameter adds a rug plot to the histogram for better distribution understanding.
fig = px.histogram(df, x='Length',title='Distribution of  Text Lengths', marginal='rug')

# Update the layout of the plot to add titles to the axes and show the legend.
fig.update_layout(xaxis_title='Length of Email Text', yaxis_title='Frequency')

# Show the plot.

fig.show()  



the total number of rows in the dataset: 5572
            Length
count  5572.000000
mean     80.118808
std      59.690841
min       2.000000
25%      36.000000
50%      61.000000
75%     121.000000
max     910.000000
  Category                                               Text  Length
0      ham  Go until jurong point, crazy.. Available only ...     111
1      ham                      Ok lar... Joking wif u oni...      29
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...     155
3      ham  U dun say so early hor... U c already then say...      49
4      ham  Nah I don't think he goes to usf, he lives aro...      61


In [8]:
# Import necessary modules from scikit-learn

# CountVectorizer is used to convert a collection of text documents to a matrix of token counts (bag-of-words model).
from sklearn.feature_extraction.text import CountVectorizer

# train_test_split is used to split arrays or matrices into random train and test subsets.
from sklearn.model_selection import train_test_split
# Convert the text data into a bag-of-words model using CountVectorizer
vectorizer = CountVectorizer()
# Initialize CountVectorizer, which will tokenize the text and build a vocabulary of known words.

# Fit the CountVectorizer to the text data and transform it into a matrix of token counts.
X = vectorizer.fit_transform(df['Text'])
# Split the data into training and testing sets

# Use train_test_split to split the data into training and testing sets.
# 'text' contains the features (token counts) and 'df['Category']' contains the labels (spam or not spam).
# 'test_size=0.30' specifies that 30% of the data should be used for testing and 70% for training.
# 'random_state=100' ensures the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(X, df['Category'], test_size=0.30, random_state=100)
# Print the dimensions of the training and testing datasets

# Display the shape of the training feature set.

# Display the shape of the testing feature set.

# Display the shape of the training labels.

# Display the shape of the testing labelds.
print(f'Display the shape of the training feature set.: {X_train.shape}')
print(f'Display the shape of the testing feature set: {X_test.shape}')
print(f'Display the shape of the training labels.: {y_train.shape}')
print(f'Display the shape of the testing labelds: {y_test.shape}')



Display the shape of the training feature set.: (3900, 8672)
Display the shape of the testing feature set: (1672, 8672)
Display the shape of the training labels.: (3900,)
Display the shape of the testing labelds: (1672,)
