<a href="https://colab.research.google.com/github/tarakantaacharya/spamdetection/blob/main/smsspamdetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SMS Spam Detector

In [None]:
# Pin scikit-learn for this notebook.
# NOTE(review): the recorded install log reports that this pin conflicts with
# mlxtend 0.23.3 (requires scikit-learn>=1.3.1) and imbalanced-learn 0.13.0
# (requires scikit-learn>=1.3.2). Neither package is imported in the visible
# cells; confirm whether 1.3.0 is required before changing the pin.
!pip install scikit-learn==1.3.0  # Install a compatible scikit-learn version

Collecting scikit-learn==1.3.0
  Downloading scikit_learn-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlxtend 0.23.3 requires scikit-learn>=1.3.1, but you have scikit-learn 1.3.0 which is incompatible.
imbalanced-learn 0.13.0 requires scikit-learn<2,>=1.3.2, but you have scikit-learn 1.3.0 which is incompatible.[0m[31m
[0mSuccessful

---

## Step : 1

### Downloading the Dataset

In [None]:
! pip install kaggle # Install the Kaggle command-line client (used to download the dataset)



Upload your `kaggle.json` API token (downloaded from your Kaggle account) to `/content` so that the next cell can copy it into place.

In [None]:
import os
import shutil

# Create the .kaggle directory that will hold the API token
os.makedirs('/root/.kaggle', exist_ok=True)

# Copy the uploaded kaggle.json file into the .kaggle folder
kaggle_token_path = shutil.copy('/content/kaggle.json', '/root/.kaggle/kaggle.json')

# The token is a credential: restrict it to owner read/write only.
# (The Kaggle client documentation recommends `chmod 600` on this file.)
os.chmod(kaggle_token_path, 0o600)

# Keep the destination path as the cell's displayed value
kaggle_token_path

'/root/.kaggle/kaggle.json'

In [None]:
! kaggle datasets download -d uciml/sms-spam-collection-dataset  # Download the dataset archive

Dataset URL: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
License(s): unknown
Downloading sms-spam-collection-dataset.zip to /content
  0% 0.00/211k [00:00<?, ?B/s]
100% 211k/211k [00:00<00:00, 69.2MB/s]


Unzip the Dataset for further processing

In [None]:
import zipfile

# Extract the downloaded archive. The destination is an absolute path so that
# it matches the location the CSV is read from ('/content/extracted_files/...'),
# independent of the kernel's current working directory.
with zipfile.ZipFile('/content/sms-spam-collection-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/extracted_files')

---

## Step : 2

### Cleaning the Dataset

In [None]:
import pandas as pd

In [None]:
# Load the extracted CSV into a DataFrame.
# NOTE(review): encoding='latin-1' is presumably needed because the file is not valid UTF-8 — confirm.
df = pd.read_csv('/content/extracted_files/spam.csv',encoding='latin-1')

In [None]:
df.shape

(5572, 5)

In [None]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [None]:
# Discard the three unnamed columns, keeping only v1 and v2
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [None]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Copy v1 into a new trailing 'label' column, then remove v1
df = df.assign(label=df['v1']).drop(columns='v1')

In [None]:
# Copy v2 into a new trailing 'text' column, then remove v2
df = df.assign(text=df['v2']).drop(columns='v2')

In [None]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
df.isna().sum()  #Counting the NaN values in columns

Unnamed: 0,0
label,0
text,0


In [None]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


---

## Step : 3

### Preprocessing

In [None]:
import nltk
nltk.download('stopwords')  # Download the NLTK stopwords package

from nltk.corpus import stopwords  # Importing stopwords
import re  # Regular expressions, used by the text-cleaning function
stop = stopwords.words('english') # English stop words, kept in the notebook-level name `stop`

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def clean_text(text, stop_words=None):
    """Normalize a raw message into a space-separated string of kept tokens.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, the result is split on whitespace, and tokens that
    appear in the stop-word collection are discarded.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : iterable of str, optional
        Tokens to remove. When omitted, the notebook-level ``stop``
        collection is used, so existing one-argument calls are unchanged.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if none remain).
    """
    if stop_words is None:
        stop_words = stop
    # Membership tests on a list are a linear scan per token; hash the
    # collection once per call instead.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    text = text.lower()  # Converting into lower case
    text = re.sub('[^a-zA-Z]', ' ', text)  # Non-letters become separators
    kept = [word for word in text.split() if word not in stop_words]  # Drop stop words
    return ' '.join(kept)  # Re-join the remaining (non-stop) words

In [None]:
# Run every message through clean_text, replacing the raw text column
df['text'] = df['text'].map(clean_text)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the 'label' column of df.
# sparse_output=False makes the encoder return a dense array.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_labels = encoder.fit_transform(df[['label']])

# Build the encoded frame on df's own index so the column-wise concat below
# aligns row-for-row even if df's index is not a default 0..n-1 range.
encoded_df = pd.DataFrame(
    encoded_labels,
    columns=encoder.get_feature_names_out(['label']),
    index=df.index,
)

# Drop any encoded columns left over from a previous run of this cell before
# concatenating, so re-running does not create duplicate columns.
df = pd.concat(
    [df.drop(columns=encoded_df.columns, errors='ignore'), encoded_df],
    axis=1,
)

In [None]:
# Remove every occurrence of the literal substring 'subject' from the text
# (plain substring match, so it is also removed from inside longer words).
df['text'] = df['text'].str.replace('subject', '', regex=False)

In [None]:
# Numeric target: 'spam' -> 0, every other label -> 1 (so ham is class 1)
df['label_num'] = (df['label'] != 'spam').astype('int64')

In [None]:
df.head()

Unnamed: 0,label,text,label_ham,label_spam,label_num
0,ham,go jurong point crazy available bugis n great ...,1.0,0.0,1
1,ham,ok lar joking wif u oni,1.0,0.0,1
2,spam,free entry wkly comp win fa cup final tkts st ...,0.0,1.0,0
3,ham,u dun say early hor u c already say,1.0,0.0,1
4,ham,nah think goes usf lives around though,1.0,0.0,1


---

##  Step : 4

### Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Fraction of the rows held out for testing
test_size = 0.23

# Seeded, shuffled split that keeps the class ratio of label_num in both parts
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label_num'],
    test_size=test_size,
    stratify=df['label_num'],
    shuffle=True,
    random_state=0,
)

# TF-IDF vectorization: the vectorizer is fitted on the training text only
# and then applied unchanged to the test text.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Saving the vectorizer for use in the evaluation
# Persist the fitted vectorizer so it can be reloaded later (the evaluation step loads this file)
import joblib
joblib.dump(vectorizer, 'vectorizer.pkl')  # Save the vectorizer (path is relative to the working directory)

['vectorizer.pkl']

## Step : 5

### Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# (name, estimator) pairs, in the order they are handed to the ensemble.
models = []

# Random forest: 200 depth-limited trees with class re-weighting.
models.append((
    'Random Forest',
    RandomForestClassifier(
        n_estimators=200,         # number of trees
        max_depth=20,             # cap on tree depth
        min_samples_split=10,     # samples required to split a node
        min_samples_leaf=5,       # samples required in a leaf
        oob_score=True,           # also compute the out-of-bag score
        random_state=42,          # fixed seed
        class_weight='balanced',  # re-weight classes for the imbalance
        n_jobs=-1,                # use all cores
    ),
))

# Multinomial naive Bayes with smoothing parameter alpha=0.5.
models.append(('Naive Bayes', MultinomialNB(alpha=0.5)))

# L2-regularised logistic regression (liblinear solver) with class re-weighting.
models.append((
    'Logistic Regression',
    LogisticRegression(
        solver='liblinear',
        penalty='l2',
        C=1.0,                    # smaller values = stronger regularization
        class_weight='balanced',
        max_iter=200,
        random_state=42,
    ),
))

# Linear-kernel SVM with probability estimates enabled and class re-weighting.
models.append((
    'SVM',
    SVC(
        kernel='linear',
        C=1.0,
        probability=True,
        class_weight='balanced',
        random_state=42,
    ),
))

# Gradient-boosted trees with row and column subsampling.
models.append((
    'XGBoost',
    XGBClassifier(
        n_estimators=200,         # boosting rounds
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,            # row subsample ratio per tree
        colsample_bytree=0.8,     # column subsample ratio per tree
        # NOTE(review): a weight of 1 presumably applies no extra weighting to
        # the positive class, unlike the class_weight='balanced' models above — confirm.
        scale_pos_weight=1,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1,
    ),
))

# 5-nearest-neighbours with distance-weighted votes.
models.append(('KNN', KNeighborsClassifier(n_neighbors=5, weights='distance')))

In [None]:
# Create the Voting Classifier (Max Voting - hard voting)
from sklearn.ensemble import VotingClassifier

# Max ("hard") voting: the ensemble predicts by majority over the class labels
# predicted by the estimators in `models`.
# NOTE(review): `models` holds six estimators, so 3-3 ties are possible; check
# scikit-learn's tie-breaking rule and confirm it is acceptable here.
voting_clf = VotingClassifier(models, voting='hard')

In [None]:
# Fit the voting ensemble on the TF-IDF training matrix
voting_clf.fit(X_train_tfidf, y_train)
# Persist the fitted ensemble (the vectorizer is saved separately as 'vectorizer.pkl')
joblib.dump(voting_clf, 'voting_classifier.pkl')  # Save the VotingClassifier model

['voting_classifier.pkl']

---

## Step : 6

### Evaluation Metrics

In [None]:
# Load the saved VotingClassifier model and vectorizer
voting_clf = joblib.load('voting_classifier.pkl')
vectorizer = joblib.load('vectorizer.pkl')

In [None]:
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    f1_score, precision_score, recall_score, roc_auc_score
)
from tabulate import tabulate

# Hard class predictions of the ensemble on the TF-IDF test matrix
predictions = voting_clf.predict(X_test_tfidf)

# Overall accuracy, then precision / recall / F1 with average='weighted'
accuracy = accuracy_score(y_test, predictions)
precision, recall, f1 = (
    metric(y_test, predictions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# NOTE(review): this AUC is computed from hard 0/1 predictions rather than
# scores. In the recorded run it equals the mean of the two per-class recalls,
# (164/172 + 1108/1110) / 2 = 0.975843, so it is not a threshold-free ranking metric.
try:
    roc_auc = roc_auc_score(y_test, predictions)
except ValueError:
    # roc_auc_score rejected the inputs (e.g., only one class in y_test)
    roc_auc = "N/A"

classification_rep = classification_report(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

# One-row summary table followed by the per-class report
print("\nVoting Classifier Performance:")
results = [[accuracy, precision, recall, f1, roc_auc, conf_matrix]]
table_headers = ["Accuracy", "Precision", "Recall", "F1", "ROC AUC", "Confusion Matrix"]
print(tabulate(results, headers=table_headers, tablefmt="fancy_grid"))
print("\nClassification Report:\n", classification_rep)


Voting Classifier Performance:
╒════════════╤═════════════╤══════════╤══════════╤═══════════╤════════════════════╕
│   Accuracy │   Precision │   Recall │       F1 │   ROC AUC │ Confusion Matrix   │
╞════════════╪═════════════╪══════════╪══════════╪═══════════╪════════════════════╡
│     0.9922 │    0.992177 │   0.9922 │ 0.992141 │  0.975843 │ [[ 164    8]       │
│            │             │          │          │           │  [   2 1108]]      │
╘════════════╧═════════════╧══════════╧══════════╧═══════════╧════════════════════╛

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97       172
           1       0.99      1.00      1.00      1110

    accuracy                           0.99      1282
   macro avg       0.99      0.98      0.98      1282
weighted avg       0.99      0.99      0.99      1282



---