#### Load Dataset


In [1]:
import pandas as pd

# Load the spam dataset; later cells read its 'Message' and 'Label' columns.
df = pd.read_csv("./dataset/spam.csv")

#### Preprocess Data

In [2]:
import re

def clean_text(text):
    """Normalise a raw message for bag-of-words vectorisation.

    Steps, in order: every character that is neither a word character nor
    whitespace becomes a space, newlines become spaces, runs of digits are
    removed, and the result is lower-cased. Runs of spaces are not collapsed.
    URLs are not stripped as a unit; only their punctuation is replaced, so
    the remaining word fragments stay in the text.

    Args:
        text: The raw message. Non-string values (for example ``None`` or the
            float NaN pandas uses for an empty CSV cell) are treated as an
            empty message.

    Returns:
        The cleaned, lower-cased string.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings; a missing message has no tokens.
        return ''
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\d+', '', text)
    return text.lower()

In [3]:
# Overwrite the raw text with its cleaned form (punctuation and digits removed,
# lower-cased by clean_text). The original messages are not kept.
df['Message'] = df['Message'].apply(clean_text)

In [4]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus; the recorded output shows it reports
# "already up-to-date" when the package is present locally.
nltk.download('stopwords')
# Stored as a set for fast membership checks in remove_stop_words.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def remove_stop_words(text):
    """Drop stop words from a whitespace-separated message.

    Each token is compared case-insensitively against the module-level
    ``stop_words`` collection; surviving tokens keep their original casing
    and are re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [6]:
# Strip English stop words from the already-cleaned messages, again in place.
df['Message'] = df['Message'].apply(remove_stop_words)

#### Feature Extraction (Vectorization)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer created with scikit-learn's default settings.
vectorizer = CountVectorizer() 

In [8]:

# Learn the vocabulary from the message column and build the token-count matrix.
# NOTE(review): the vectorizer is fitted on every message before the train/test
# split, so the vocabulary also reflects messages that later land in the test set.
X_Message = vectorizer.fit_transform(df['Message'])
# Wrap the sparse matrix in a sparse DataFrame with one column per vocabulary term.
Message_df = pd.DataFrame.sparse.from_spmatrix(X_Message, columns=vectorizer.get_feature_names_out())

#### Encode Labels

In [9]:
from sklearn.preprocessing import LabelEncoder

# Encoder that maps the class labels to integer codes.
label_encoder = LabelEncoder()

In [10]:
# Replace the 'Label' column with integer codes; the evaluation report further
# down shows the two resulting classes as 0 and 1.
# NOTE(review): label_encoder is not saved alongside the vectorizer and model in
# the cells shown here, so the code-to-label mapping is not persisted — confirm.
df['Label'] = label_encoder.fit_transform(df['Label'])

#### Save Vectorizer

In [11]:
import joblib

In [12]:
import os

# Create the output directory first: joblib.dump does not create missing parent
# directories, so the dump would fail on a checkout without ./pickle.
os.makedirs("./pickle", exist_ok=True)

# Persist the fitted vectorizer so the same vocabulary can be reloaded later.
joblib.dump(vectorizer, "./pickle/vectorizer.pkl")

['./pickle/vectorizer.pkl']

#### Split Data

In [13]:
from sklearn.model_selection import train_test_split

# Feature matrix (token counts per message) and the integer-encoded target.
X = Message_df
Y = df['Label']

In [14]:
# Hold out 20% of the rows for evaluation. stratify=Y keeps the class ratio the
# same in both splits; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

#### Train Model

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Random forest with default hyperparameters; random_state makes training repeatable.
model = RandomForestClassifier(random_state=42)
# Fit on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

#### Save Model

In [17]:
import os

# Create the output directory first: joblib.dump does not create missing parent
# directories, so the dump would fail on a checkout without ./pickle.
os.makedirs("./pickle", exist_ok=True)

# Persist the trained classifier for later reuse.
joblib.dump(model, "./pickle/model.pkl")

['./pickle/model.pkl']

#### Evaluate Model

In [18]:
from sklearn.metrics import accuracy_score, classification_report

In [19]:
# Score the held-out split and print the headline metrics.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# Print the heading on its own line. Passing the report as a second print()
# argument inserts the separator space right after "\n", which shifts the
# report's header row one column out of alignment with the rest of the table.
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9695067264573991
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

