# Notebook 3

In [1]:
# import libraries:
import os
import gdown
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Define the folder path at the root level
repo_root = os.path.dirname(os.getcwd())  # Moves one level up from `wine_model_notebooks`
data_folder = os.path.join(repo_root, 'DataSets')

# Ensure the DataSets folder exists at the root level
if not os.path.exists(data_folder):
    os.makedirs(data_folder)

# Define file download details
file_id = '1QuR2MJhxOtqdAZz6WJ_9LaK2-zWs3vLS'
url = f'https://drive.google.com/uc?id={file_id}'
output = os.path.join(data_folder, 'spam.csv')

# Download and save file
gdown.download(url, output, quiet=False)

# Load the CSV into a pandas dataframe with a specified encoding
spam = pd.read_csv(output, encoding='latin-1')  # Use encoding='latin-1' or other encoding if needed
df = spam.copy()
df.info()
df

Downloading...
From: https://drive.google.com/uc?id=1QuR2MJhxOtqdAZz6WJ_9LaK2-zWs3vLS
To: /Users/yanellyhernandez/Library/Mobile Documents/com~apple~CloudDocs/Desktop/Learning Fuze/Streamlit_Projects/DataSets/spam.csv
100%|██████████| 504k/504k [00:00<00:00, 3.84MB/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB





Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [2]:
# Loading the csv:
spam = pd.read_csv('spam.csv', encoding='latin-1') 
df = spam.copy()
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
# Rename labels to label and message for clarity:
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Downloading the 'punkt' and 'stopwords':
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yanellyhernandez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yanellyhernandez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Tokenize the messages:
df['message'] = df['message'].apply(nltk.wordpunct_tokenize)
df['message']

0       [Go, until, jurong, point, ,, crazy, .., Avail...
1                [Ok, lar, ..., Joking, wif, u, oni, ...]
2       [Free, entry, in, 2, a, wkly, comp, to, win, F...
3       [U, dun, say, so, early, hor, ..., U, c, alrea...
4       [Nah, I, don, ', t, think, he, goes, to, usf, ...
                              ...                        
5567    [This, is, the, 2nd, time, we, have, tried, 2,...
5568     [Will, Ì_, b, going, to, esplanade, fr, home, ?]
5569    [Pity, ,, *, was, in, mood, for, that, ., So, ...
5570    [The, guy, did, some, bitching, but, I, acted,...
5571                  [Rofl, ., Its, true, to, its, name]
Name: message, Length: 5572, dtype: object

In [6]:
# Assigning binary to detect spam:
df['label'] = df['label'].apply(lambda x: 1 if x == 'spam' else 0)
df

Unnamed: 0,label,message
0,0,"[Go, until, jurong, point, ,, crazy, .., Avail..."
1,0,"[Ok, lar, ..., Joking, wif, u, oni, ...]"
2,1,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,0,"[U, dun, say, so, early, hor, ..., U, c, alrea..."
4,0,"[Nah, I, don, ', t, think, he, goes, to, usf, ..."
...,...,...
5567,1,"[This, is, the, 2nd, time, we, have, tried, 2,..."
5568,0,"[Will, Ì_, b, going, to, esplanade, fr, home, ?]"
5569,0,"[Pity, ,, *, was, in, mood, for, that, ., So, ..."
5570,0,"[The, guy, did, some, bitching, but, I, acted,..."


In [7]:
# checking the length of the stopwords:
len(stopwords.words("english"))

179

In [8]:
# a list if the stopwords:
stop_words = stopwords.words("english")
stop_words 

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [9]:
# Including a , and . in the stop words y appending it to the list:
stop_words.append(',')
stop_words.append('.')

In [10]:
# Checking the list again to see if adding , and . was successful:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [11]:
# Function to remove stopwords
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

# Apply stopwords removal to the description column
df['message_cleaned'] = df['message'].apply(remove_stopwords)

# Display the cleaned description
df[['message', 'message_cleaned']].head()

Unnamed: 0,message,message_cleaned
0,"[Go, until, jurong, point, ,, crazy, .., Avail...","[Go, jurong, point, crazy, .., Available, bugi..."
1,"[Ok, lar, ..., Joking, wif, u, oni, ...]","[Ok, lar, ..., Joking, wif, u, oni, ...]"
2,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[Free, entry, 2, wkly, comp, win, FA, Cup, fin..."
3,"[U, dun, say, so, early, hor, ..., U, c, alrea...","[U, dun, say, early, hor, ..., U, c, already, ..."
4,"[Nah, I, don, ', t, think, he, goes, to, usf, ...","[Nah, I, ', think, goes, usf, lives, around, t..."


In [12]:
# Initialize the ps:
ps = PorterStemmer()

In [13]:
# function for the tokens:
def stem_token(tokens):
    return [ps.stem(word) for word in tokens]

In [14]:
# Apply the stem_tokens to the message_cleaned:
df['Cleaned_Stem_message'] = df['message_cleaned'].apply(stem_token)
df[['message', 'Cleaned_Stem_message']].head()

Unnamed: 0,message,Cleaned_Stem_message
0,"[Go, until, jurong, point, ,, crazy, .., Avail...","[go, jurong, point, crazi, .., avail, bugi, n,..."
1,"[Ok, lar, ..., Joking, wif, u, oni, ...]","[ok, lar, ..., joke, wif, u, oni, ...]"
2,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,"[U, dun, say, so, early, hor, ..., U, c, alrea...","[u, dun, say, earli, hor, ..., u, c, alreadi, ..."
4,"[Nah, I, don, ', t, think, he, goes, to, usf, ...","[nah, i, ', think, goe, usf, live, around, tho..."


In [15]:
df

Unnamed: 0,label,message,message_cleaned,Cleaned_Stem_message
0,0,"[Go, until, jurong, point, ,, crazy, .., Avail...","[Go, jurong, point, crazy, .., Available, bugi...","[go, jurong, point, crazi, .., avail, bugi, n,..."
1,0,"[Ok, lar, ..., Joking, wif, u, oni, ...]","[Ok, lar, ..., Joking, wif, u, oni, ...]","[ok, lar, ..., joke, wif, u, oni, ...]"
2,1,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[Free, entry, 2, wkly, comp, win, FA, Cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,0,"[U, dun, say, so, early, hor, ..., U, c, alrea...","[U, dun, say, early, hor, ..., U, c, already, ...","[u, dun, say, earli, hor, ..., u, c, alreadi, ..."
4,0,"[Nah, I, don, ', t, think, he, goes, to, usf, ...","[Nah, I, ', think, goes, usf, lives, around, t...","[nah, i, ', think, goe, usf, live, around, tho..."
...,...,...,...,...
5567,1,"[This, is, the, 2nd, time, we, have, tried, 2,...","[This, 2nd, time, tried, 2, contact, u, U, å, ...","[thi, 2nd, time, tri, 2, contact, u, u, å, £, ..."
5568,0,"[Will, Ì_, b, going, to, esplanade, fr, home, ?]","[Will, Ì_, b, going, esplanade, fr, home, ?]","[will, ì_, b, go, esplanad, fr, home, ?]"
5569,0,"[Pity, ,, *, was, in, mood, for, that, ., So, ...","[Pity, *, mood, So, ..., suggestions, ?]","[piti, *, mood, so, ..., suggest, ?]"
5570,0,"[The, guy, did, some, bitching, but, I, acted,...","[The, guy, bitching, I, acted, like, ', intere...","[the, guy, bitch, i, act, like, ', interest, b..."


In [16]:
# loading spacy:
glove_model = spacy.load('en_core_web_md')

In [17]:
# Function to get vector for a document using SpaCy
def get_spacy_embedding(doc):
    return glove_model(doc).vector

# Apply the function to your cleaned messages
df['spacy_embedding'] = df['Cleaned_Stem_message'].apply(lambda x: get_spacy_embedding(' '.join(x)))

# Check the resulting dataframe with embeddings
df[['message', 'spacy_embedding']].head()


Unnamed: 0,message,spacy_embedding
0,"[Go, until, jurong, point, ,, crazy, .., Avail...","[1.534947, 1.2171406, -0.20959924, -0.9020937,..."
1,"[Ok, lar, ..., Joking, wif, u, oni, ...]","[0.6256751, 1.2977538, -0.169725, -1.5171912, ..."
2,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[-0.25552154, -1.6689239, 1.870893, -0.0309503..."
3,"[U, dun, say, so, early, hor, ..., U, c, alrea...","[-0.35308, 1.7301891, 0.024863632, -1.11274, -..."
4,"[Nah, I, don, ', t, think, he, goes, to, usf, ...","[1.4641945, 1.2061578, 0.045541123, -1.4963901..."


In [18]:
# Convert the embeddings to a 2D NumPy array
X = np.vstack(df['spacy_embedding'].values) 
# the v2stack stacks the vectors so that the form a 2d array

# Assigning the labels to y: 
y = df['label'].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [19]:
# running the model:
model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)
# predicting on X_test
y_pred = model.predict(X_test)
# Eval Metrics:
class_report = classification_report(y_test, y_pred)
accuracy= accuracy_score(y_test, y_pred)
# Printing classification Report:
print(class_report)
# Printing the accuracy:
print(accuracy)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1202
           1       0.97      0.76      0.86       191

    accuracy                           0.96      1393
   macro avg       0.97      0.88      0.92      1393
weighted avg       0.97      0.96      0.96      1393

0.964824120603015
