# 0. Packages Import


In [92]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes classifier
from sklearn.feature_extraction.text import CountVectorizer  # Convert text data to bag-of-words
from sklearn.model_selection import train_test_split  # Split data into training and testing sets
from win32com.client import Dispatch  # Windows-specific tasks (consider removing if not needed)

# 1. Business and Data Understanding

In [93]:

# Load the spam/ham message dataset into a DataFrame.
# The file is decoded as Latin-1 (ISO-8859-1), as requested via the encoding argument.
# NOTE(review): SPAM_CSV_PATH is an absolute, machine-specific location; point it at a
# local copy of spam.csv before running this notebook on another machine.
SPAM_CSV_PATH = 'C:/Users/ASUS/Downloads/project/spam.csv'
CSV_ENCODING = "latin-1"
data = pd.read_csv(SPAM_CSV_PATH, encoding=CSV_ENCODING)


In [94]:
# Preview the first five rows (the default for head()) to check how the CSV was parsed.
data.head()

Unnamed: 0,class,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [95]:
# Summarise column names, non-null counts and dtypes of the loaded DataFrame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   class       5572 non-null   object
 1   message     5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [96]:
# List the column labels of the loaded DataFrame.
data.columns

Index(['class', 'message', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

#  2. Data Preparation

This stage covers the following data tasks:

* cleaning;
* visualization;
* normalisation;
* partitioning.

To address the problem of missing values, we apply data imputation and data encoding.

## 2.1. Data imputation 

We use drop() to drop specified columns from the DataFrame

In [97]:
# Remove the three 'Unnamed: N' columns so that only 'class' and 'message' remain.
# The result is assigned back instead of using inplace=True, and errors="ignore"
# makes the cell safe to re-run: columns that are already gone are skipped rather
# than raising KeyError.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors="ignore")


In [98]:
# Preview the first five rows to confirm that only 'class' and 'message' are left.
data.head()

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [99]:
# Encode the text labels in the existing 'class' column as integers, overwriting
# the column: 'ham' -> 0 and 'spam' -> 1.
LABEL_TO_INT = {'ham': 0, 'spam': 1}

# Only encode while the column still holds text labels. Mapping a second time would
# look up 0/1 in the dictionary, find nothing, and silently turn every label into NaN.
already_encoded = data['class'].isin(list(LABEL_TO_INT.values())).all()
if not already_encoded:
    encoded = data['class'].map(LABEL_TO_INT)
    # map() yields NaN for any value missing from the dictionary; fail loudly here
    # instead of handing NaN labels to the classifier later on.
    if encoded.isna().any():
        unknown = sorted(set(data.loc[encoded.isna(), 'class'].astype(str)))
        raise ValueError(f"Unexpected labels in 'class': {unknown}")
    data['class'] = encoded


In [100]:
# Preview the first five rows to confirm that 'class' now holds 0/1 instead of 'ham'/'spam'.
data.head()

Unnamed: 0,class,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [101]:
# Split the DataFrame into model inputs and labels:
#   X - the raw message text (feature variable)
#   y - the encoded class label (target variable)
X, y = data['message'], data['class']


In [102]:
# Shape of the feature Series X (the 'message' column): one entry per row of data.
X.shape


(5572,)

In [103]:
# Shape of the target Series y (the 'class' column); it should match X.shape.
y.shape

(5572,)

In [104]:
# Count the missing (NaN) values in each column
data.isna().sum()

class      0
message    0
dtype: int64


## 2.2. Data Normalisation

In [105]:
# Build the bag-of-words features with a CountVectorizer.
# The vectorizer is fitted on data['message'] rather than on X itself: X is rebound
# to the resulting sparse matrix below, so fitting on X would fail if this cell
# were executed a second time.
# NOTE(review): the vocabulary is learned from every message before the train/test
# split, so words that only occur in test messages still shape the feature space.
# Fitting on the training messages only would avoid that leakage.
cv = CountVectorizer()
X = cv.fit_transform(data['message'])

# X is now a sparse matrix holding the bag-of-words representation of the messages

## 2.3. Data Partitioning

In [106]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# X_train / y_train form the training partition, X_test / y_test the held-out partition.

In [107]:
# Dimensions of the training feature matrix, checked before the model is trained.
X_train.shape

(4457, 8672)

In [108]:
 # Dimensions of the testing feature matrix; the column count should match X_train.
X_test.shape

(1115, 8672)

# 3. Machine Learning

## 3.1. Multinomial Naive Bayes

In [109]:
# Create a Multinomial Naive Bayes classifier with scikit-learn's default settings.
model = MultinomialNB()


In [110]:
# Train the classifier on the training features (X_train) and labels (y_train).
model.fit(X_train, y_train)

In [111]:
# Evaluate accuracy on the held-out test partition (X_test, y_test).
model.score(X_test, y_test)

0.97847533632287

In [112]:
# Classify a single new message with the fitted vectorizer (cv) and trained model.
msg = "You Won 500$"
# Use a dedicated name for the one-element batch: rebinding `data` here would replace
# the DataFrame loaded earlier and break the earlier cells if they are run again.
new_messages = [msg]

# Encode the message with the vocabulary cv learned during fit_transform;
# toarray() converts the sparse result to a dense array so it can be inspected.
vect = cv.transform(new_messages).toarray()

# Predict the class of the encoded message (0 = ham, 1 = spam per the label mapping)
my_prediction = model.predict(vect)

# Print the predicted class of the single message
print(f"Prediction: {my_prediction[0]}")


In [113]:
# Inspect the dense count vector that was built for the new message.
vect

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)