# Main Notebook (i.e: 1)

In [1]:
# import libraries that are going to be used
import os
import gdown
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [2]:
# Define the folder path at the root level
repo_root = os.path.dirname(os.getcwd())  # Moves one level up from `wine_model_notebooks`
data_folder = os.path.join(repo_root, 'DataSets')

# Ensure the DataSets folder exists at the root level
if not os.path.exists(data_folder):
    os.makedirs(data_folder)

# Define file download details
file_id = '1QuR2MJhxOtqdAZz6WJ_9LaK2-zWs3vLS'
url = f'https://drive.google.com/uc?id={file_id}'
output = os.path.join(data_folder, 'spam.csv')

# Download and save file
gdown.download(url, output, quiet=False)

# Load the CSV into a pandas dataframe with a specified encoding
spam = pd.read_csv(output, encoding='latin-1')  # Use encoding='latin-1' or other encoding if needed
df = spam.copy()
df.info()
df

Downloading...
From: https://drive.google.com/uc?id=1QuR2MJhxOtqdAZz6WJ_9LaK2-zWs3vLS
To: /Users/yanellyhernandez/Desktop/Streamlit_Projects/DataSets/spam.csv
100%|██████████| 504k/504k [00:00<00:00, 4.64MB/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB





Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
# Rename labels to label and message for clarity:
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Downloading the the 'punkt' and 'stopwords'
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yanellyhernandez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yanellyhernandez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Tokenize the messages
df['message'] = df['message'].apply(nltk.wordpunct_tokenize)
df['message'] = df['message'].map(lambda x: ' '.join(x))  # Convert list of tokens back to string

In [6]:
# initializing the TfidfVectorizer w it's stop words
tfidf = TfidfVectorizer(stop_words=stopwords.words('english'))
# calling fit_transform on message label:
X = tfidf.fit_transform(df['message']) 

In [7]:
# train test split: 
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)

In [8]:
# running the model:
model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

In [9]:
# predicting on X_test
y_pred = model.predict(X_test)
# Eval Metrics:
class_report = classification_report(y_test, y_pred)
accuracy= accuracy_score(y_test, y_pred)
# Printing classification Report:
print(class_report)
# Printing the accuracy:
print(accuracy)

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.99      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

0.9775784753363229


In [10]:
# If running in a Jupyter Notebook, use the current working directory or specify your base path
base_dir = os.getcwd()  # Get the current working directory

# Define the folder to save the pickled model
pickled_data_folder = os.path.join(base_dir, 'pickled_data')

# Make sure the directory exists, create it if it doesn't
os.makedirs(pickled_data_folder, exist_ok=True)

In [11]:
# Define the path where the model will be saved
tfidf_save_path = os.path.join(pickled_data_folder, 'tfidf_Vectorizer_Spam.pkl')

# Print the path to verify
print(f"Saving model at: {tfidf_save_path}")
# Save the tfidf to use for streamlit:
joblib.dump(tfidf, tfidf_save_path)

Saving model at: /Users/yanellyhernandez/Desktop/Streamlit_Projects/Spam Model Notebooks/pickled_data/tfidf_Vectorizer_Spam.pkl


['/Users/yanellyhernandez/Desktop/Streamlit_Projects/Spam Model Notebooks/pickled_data/tfidf_Vectorizer_Spam.pkl']

In [12]:
# Define the path where the model will be saved
model_save_path = os.path.join(pickled_data_folder, 'Spam_model.pkl')

# Print the path to verify
print(f"Saving model at: {model_save_path}")

# Save the pipeline to the specified path
joblib.dump(model, model_save_path)

Saving model at: /Users/yanellyhernandez/Desktop/Streamlit_Projects/Spam Model Notebooks/pickled_data/Spam_model.pkl


['/Users/yanellyhernandez/Desktop/Streamlit_Projects/Spam Model Notebooks/pickled_data/Spam_model.pkl']

In [13]:
# Define the path where the model will be saved
eval_save_path = os.path.join(pickled_data_folder, 'Evaluation_Metrics_Spam.pkl')

# Print the path to verify
print(f"Saving model at: {eval_save_path}")

# Save the pipeline to the specified path
joblib.dump((accuracy,class_report), eval_save_path)

Saving model at: /Users/yanellyhernandez/Desktop/Streamlit_Projects/Spam Model Notebooks/pickled_data/Evaluation_Metrics_Spam.pkl


['/Users/yanellyhernandez/Desktop/Streamlit_Projects/Spam Model Notebooks/pickled_data/Evaluation_Metrics_Spam.pkl']

So I tried to use logistic regression and this is the output scores:

Accuracy: 0.9443946188340807

              precision    recall  f1-score   support

           0       0.94      0.99      0.97       965
           1       0.95      0.62      0.75       150

    accuracy                           0.94      1115
   macro avg       0.95      0.81      0.86      1115
weighted avg       0.94      0.94      0.94      1115


We can see that random forest classifier here is way better than Logistic regression.



## findings

For the Streamlit app, I selected the second-best model (Model 1) despite the first-best model (Model 4) achieving slightly better performance metrics. This decision was based on several practical considerations:

1. Ease of Deployment: Model 1 required fewer preprocessing steps, relying on a RandomForestClassifier with a TF-IDF vectorizer. In contrast, Model 4 used Logistic Regression with a CountVectorizer and additional steps to convert data into dense DataFrame structures, which increased complexity.

2. Efficiency: The performance difference between the two models was marginal. Given this, the simplicity of Model 1 made it faster to implement and easier to maintain, which is essential for a real-time application like a Streamlit app.

3. Balance of Accuracy and Usability: While Model 4 offered slightly higher metrics, Model 1 provided a comparable level of prediction accuracy while remaining lightweight and user-friendly.

By choosing the second-best model, the Streamlit app achieves a balance between efficiency and performance, ensuring high usability without compromising prediction quality.