# Notebook 2

In [1]:
# import libraries that are going to be used
import os
import gdown
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [2]:
# Define the folder path at the root level
repo_root = os.path.dirname(os.getcwd())  # Moves one level up from `wine_model_notebooks`
data_folder = os.path.join(repo_root, 'DataSets')

# Ensure the DataSets folder exists at the root level
if not os.path.exists(data_folder):
    os.makedirs(data_folder)

# Define file download details
file_id = '1QuR2MJhxOtqdAZz6WJ_9LaK2-zWs3vLS'
url = f'https://drive.google.com/uc?id={file_id}'
output = os.path.join(data_folder, 'spam.csv')

# Download and save file
gdown.download(url, output, quiet=False)

# Load the CSV into a pandas dataframe with a specified encoding
spam = pd.read_csv(output, encoding='latin-1')  # Use encoding='latin-1' or other encoding if needed
df = spam.copy()
df.info()
df

Downloading...
From: https://drive.google.com/uc?id=1QuR2MJhxOtqdAZz6WJ_9LaK2-zWs3vLS
To: /Users/yanellyhernandez/Library/Mobile Documents/com~apple~CloudDocs/Desktop/Learning Fuze/Streamlit_Projects/DataSets/spam.csv
100%|██████████| 504k/504k [00:00<00:00, 3.17MB/s]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [2]:
# Loading the csv:
spam = pd.read_csv('spam.csv', encoding='latin-1') 
df = spam.copy() # making a copy of spam:
df.info()
# Rename labels to label and message for clarity:
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Downloading the the 'punkt' and 'stopwords'
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yanellyhernandez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yanellyhernandez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Tokenize the messages:
df['message'] = df['message'].apply(nltk.wordpunct_tokenize)
df['message'] = df['message'].map(lambda x: ' '.join(x)) # Convert list of tokens back to string

In [5]:
# Assigning binary to detect spam:
df['label'] = df['label'].apply(lambda x: 1 if x == 'spam' else 0)

In [6]:
# initializing the CountVectorizer w it's stop words:
vect = CountVectorizer(stop_words='english', lowercase=False)

In [7]:
# Assigning X and y:
X = df['message'].values
y = df['label']

In [8]:
# Splitting into train and test:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

In [9]:
x_train

array(['This message is from a great Doctor in India :-): 1 ) Do not drink APPY FIZZ . It contains Cancer causing age',
       'Fighting with the world is easy , u either win or lose bt fightng with some1 who is close to u is dificult if u lose - u lose if u win - u still lose .',
       'Congratulations - Thanks to a good friend U have WON the å £ 2 , 000 Xmas prize . 2 claim is easy , just call 08712103738 NOW ! Only 10p per minute . BT - national - rate',
       ...,
       'Pls clarify back if an open return ticket that i have can be preponed for me to go back to kerala .',
       'Please leave this topic .. sorry for telling that ..',
       'Jesus armand really is trying to tell everybody he can find'],
      dtype=object)

In [10]:
# only fitting the x_train:
X = vect.fit(x_train)


In [11]:
vect.vocabulary_ # grabbing the vocab of the vector

{'This': 3289,
 'message': 6504,
 'great': 5618,
 'Doctor': 1406,
 'India': 1977,
 'Do': 1404,
 'drink': 5011,
 'APPY': 772,
 'FIZZ': 1555,
 'It': 1991,
 'contains': 4651,
 'Cancer': 1185,
 'causing': 4390,
 'age': 3774,
 'Fighting': 1599,
 'world': 8819,
 'easy': 5065,
 'win': 8760,
 'lose': 6327,
 'bt': 4264,
 'fightng': 5311,
 'some1': 7870,
 'close': 4527,
 'dificult': 4907,
 'Congratulations': 1252,
 'Thanks': 3268,
 'good': 5585,
 'friend': 5446,
 'WON': 3515,
 '000': 1,
 'Xmas': 3632,
 'prize': 7185,
 'claim': 4502,
 'just': 6069,
 '08712103738': 92,
 'NOW': 2439,
 'Only': 2556,
 '10p': 262,
 'minute': 6535,
 'BT': 980,
 'national': 6675,
 'rate': 7295,
 'That': 3272,
 'means': 6469,
 'got': 5593,
 'epi': 5143,
 'fine': 5328,
 'She': 3027,
 'How': 1900,
 'noe': 6740,
 'da': 4771,
 'car': 4356,
 'Later': 2152,
 'lar': 6179,
 'wearing': 8699,
 'shorts': 7733,
 'Buy': 1093,
 'Space': 3088,
 'Invaders': 1987,
 'chance': 4411,
 'orig': 6859,
 'Arcade': 881,
 'Game': 1693,
 'console':

In [12]:
# Sort the vectors vocab to store later
sorted(vect.vocabulary_)

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '07781482378',
 '07786200117',
 '077xxx',
 '07801543489',
 '07808',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '07946746291',
 '07973788240',
 '07XXXXXXXXX',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08006344447',
 '0808',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '0845',
 '08450542832',
 '08452810071',
 '08452810073',
 '08452810075over18',
 '0870',
 '08700435505150p',
 '08700469649',
 '08700621170150p',
 '08701213186',
 '08701417012',
 '08701417012150p',
 

In [13]:
# transforming x_train and assigning it to a vector:
X_train_vect = vect.transform(x_train)

In [14]:
print(X_train_vect)

  (0, 772)	1
  (0, 1185)	1
  (0, 1404)	1
  (0, 1406)	1
  (0, 1555)	1
  (0, 1977)	1
  (0, 1991)	1
  (0, 3289)	1
  (0, 3774)	1
  (0, 4390)	1
  (0, 4651)	1
  (0, 5011)	1
  (0, 5618)	1
  (0, 6504)	1
  (1, 1599)	1
  (1, 4264)	1
  (1, 4527)	1
  (1, 4907)	1
  (1, 5065)	1
  (1, 5311)	1
  (1, 6327)	4
  (1, 7870)	1
  (1, 8760)	2
  (1, 8819)	1
  (2, 1)	1
  :	:
  (4174, 2927)	1
  (4174, 2945)	1
  (4174, 3488)	1
  (4175, 1900)	1
  (4175, 6951)	1
  (4175, 8758)	1
  (4175, 8923)	1
  (4176, 2686)	1
  (4176, 4505)	1
  (4176, 6094)	1
  (4176, 6833)	1
  (4176, 7155)	1
  (4176, 7465)	1
  (4176, 8314)	1
  (4177, 2684)	1
  (4177, 6211)	1
  (4177, 7895)	1
  (4177, 8228)	1
  (4177, 8383)	1
  (4178, 2032)	1
  (4178, 3914)	1
  (4178, 5179)	1
  (4178, 7324)	1
  (4178, 8227)	1
  (4178, 8449)	1


In [15]:
# transforming x_test and assigning it to a vector:
X_test_vect = vect.transform(x_test)

In [16]:
print(X_test_vect)

  (0, 3937)	1
  (0, 4578)	1
  (0, 4984)	2
  (0, 6134)	1
  (0, 6255)	1
  (0, 6551)	1
  (0, 7262)	1
  (1, 3569)	1
  (1, 7776)	1
  (1, 8061)	1
  (1, 8475)	1
  (2, 3084)	1
  (2, 6188)	1
  (2, 6294)	1
  (2, 6480)	1
  (3, 3272)	1
  (3, 3564)	1
  (3, 4823)	1
  (3, 4890)	1
  (3, 5618)	1
  (4, 1182)	1
  (4, 4132)	1
  (4, 5733)	1
  (4, 6480)	1
  (4, 7013)	1
  :	:
  (1389, 3208)	2
  (1389, 3573)	1
  (1389, 4073)	1
  (1389, 4580)	2
  (1389, 4981)	1
  (1389, 8368)	2
  (1389, 8678)	1
  (1389, 8804)	1
  (1390, 817)	1
  (1390, 3699)	1
  (1390, 5025)	1
  (1390, 5066)	1
  (1391, 3278)	1
  (1391, 6633)	1
  (1392, 838)	1
  (1392, 893)	1
  (1392, 1174)	1
  (1392, 1175)	1
  (1392, 2295)	1
  (1392, 2721)	1
  (1392, 4329)	1
  (1392, 4673)	1
  (1392, 5447)	1
  (1392, 7423)	1
  (1392, 7680)	1


In [17]:
# turing the x_train vector to a datafram to use when running the model:
# using the columns of the sorted vector vocab: 
x_train_df = pd.DataFrame(X_train_vect.todense(), columns=sorted(vect.vocabulary_))

In [18]:
x_train_df

Unnamed: 0,00,000,000pes,008704050406,0089,01223585236,01223585334,0125698789,02,0207,...,Ûªve,ÛÏHarry,ÛÒ,ÛÓwell,åÈ10,åÐ,åÒHarry,åÒIt,åÔMORROW,åÔrents
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4175,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4177,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# turing the x_test vector to a datafram to use when running the model:
# using the same columns in x_train_df to x_test_df: 
X_test_df = pd.DataFrame(X_test_vect.todense(), columns=vect.get_feature_names_out())
X_test_df

Unnamed: 0,00,000,000pes,008704050406,0089,01223585236,01223585334,0125698789,02,0207,...,Ûªve,ÛÏHarry,ÛÒ,ÛÓwell,åÈ10,åÐ,åÒHarry,åÒIt,åÔMORROW,åÔrents
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1388,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1389,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Initialize the classifier
model = LogisticRegression()

# Train the model
model.fit(x_train_df, y_train)

# Predict on the test set
y_pred = model.predict(X_test_df)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9856424982053122
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1224
           1       0.99      0.89      0.94       169

    accuracy                           0.99      1393
   macro avg       0.99      0.94      0.96      1393
weighted avg       0.99      0.99      0.99      1393



When running RandomForestClassifier the output was:

Accuracy: 0.9741564967695621
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1213
           1       0.99      0.81      0.89       180

    accuracy                           0.97      1393
   macro avg       0.98      0.90      0.94      1393
weighted avg       0.97      0.97      0.97      1393

We can see that Logistic regression here is slightly better than random forest classifier.
