# 1. Data

Dataset: SMS Spam Collection Dataset
[Dataset Link](https://kaggle.com/datasets/uciml/sms-spam-collection-dataset)

In [None]:
# Print the MIME type and character set that `file` detects for the raw CSV.
!file -bi '/content/drive/MyDrive/regression/spam_data.csv'
# Re-encode the raw CSV from Latin-1 to UTF-8, writing the result to spam_data_utf8.csv.
# NOTE(review): `file` reports charset=unknown-8bit, so Latin-1 is an assumption — confirm.
!iconv -f latin1 -t utf8 '/content/drive/MyDrive/regression/spam_data.csv' -o '/content/drive/MyDrive/regression/spam_data_utf8.csv'

text/csv; charset=unknown-8bit


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Path of the UTF-8 copy of the dataset written by the iconv step.
csv_file_path = '/content/drive/MyDrive/regression/spam_data_utf8.csv'

# Read the CSV with pandas defaults, then preview the first ten rows.
df = pd.read_csv(filepath_or_buffer=csv_file_path)
df.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [None]:
# Count how many messages carry each label in v1 to reveal any class imbalance.
label_counts = df['v1'].value_counts()
print(label_counts)


v1
ham     4825
spam     747
Name: count, dtype: int64


In [None]:
# Remove the three unnamed columns so that only v1 and v2 remain.
# errors='ignore' keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the columns are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.head(10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


# 2. Model

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert labels to binary (0 for ham, 1 for spam).
# The guard makes this cell idempotent: mapping already-numeric labels a
# second time would find no matching keys and turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['v1']):
    df['v1'] = df['v1'].map({'ham': 0, 'spam': 1})

# Apply TF-IDF vectorization to the SMS messages.
# NOTE(review): the vectorizer is fitted on every message before the
# train/test split below, so its vocabulary and IDF weights are also learned
# from the held-out messages. Consider fitting on the training messages only.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['v2'])
y = df['v1']


### Train/test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Naive Bayes Model

In [None]:
from sklearn.naive_bayes import MultinomialNB
# confusion_matrix is imported here because this cell calls it; it was
# previously imported only in a later cell, so a fresh-kernel run raised NameError.
from sklearn.metrics import accuracy_score, confusion_matrix

# Train Naive Bayes model on the TF-IDF training features
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict on the held-out set, then report accuracy and the confusion matrix
nb_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)
conf_matrix_nb = confusion_matrix(y_test, nb_pred)
print("Naive Bayes Accuracy:", nb_accuracy)
print('Confusion Matrix:')
print(conf_matrix_nb)

Naive Bayes Accuracy: 0.9623318385650225
Confusion Matrix:
[[965   0]
 [ 42 108]]


### Logistic Regression Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Instantiate a logistic regression classifier with scikit-learn's defaults.
model = LogisticRegression()

# Fit the classifier on the training features and labels.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out test features with the fitted model.
y_pred = model.predict(X=X_test)

In [None]:
# Overall fraction of test messages classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Counts of true labels (rows) against predicted labels (columns).
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Per-class precision, recall, F1 and support as a text table.
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.9623318385650225
Confusion Matrix:
[[965   0]
 [ 42 108]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



### Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier; a linear kernel is used for the text features.
svm_model = SVC(kernel='linear')
svm_model.fit(X=X_train, y=y_train)

# Score the held-out set and report accuracy.
svm_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
print("SVM Accuracy:", svm_accuracy)

# Report the confusion matrix for the same predictions.
conf_matrix_svm = confusion_matrix(y_test, svm_pred)
print('Confusion Matrix:')
print(conf_matrix_svm)

SVM Accuracy: 0.979372197309417
Confusion Matrix:
[[963   2]
 [ 21 129]]
