In [None]:
pip install pandas numpy scikit-learn xgboost nltk




In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import nltk
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

df = pd.read_csv('spam.csv', encoding='latin-1')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def clean_text(text):
    text = text.lower()

    text = re.sub(r'[^a-z\s]', '', text)

    text = ' '.join([word for word in text.split() if word not in stop_words])

    return text

In [None]:
df['cleaned_text'] = df['v2'].apply(clean_text)


In [None]:
def create_features(df, vectorizer_type="bow"):
    if vectorizer_type == "bow":
        vectorizer = CountVectorizer()
    else:
        vectorizer = TfidfVectorizer()

    X = vectorizer.fit_transform(df['cleaned_text'])

    return X, vectorizer

In [None]:
X_clean, _ = create_features(df, vectorizer_type="bow")
X_raw, _ = create_features(df, vectorizer_type="bow")
X_clean_tfidf, _ = create_features(df, vectorizer_type="tfidf")
X_raw_tfidf, _ = create_features(df, vectorizer_type="tfidf")

y = df['v1'].apply(lambda x: 1 if x == 'spam' else 0)


In [None]:
X_train_clean, X_test_clean, y_train, y_test = train_test_split(X_clean, y, test_size=0.2, random_state=42)
X_train_raw, X_test_raw = train_test_split(X_raw, test_size=0.2, random_state=42)
X_train_clean_tfidf, X_test_clean_tfidf = train_test_split(X_clean_tfidf, test_size=0.2, random_state=42)
X_train_raw_tfidf, X_test_raw_tfidf = train_test_split(X_raw_tfidf, test_size=0.2, random_state=42)

In [None]:
def train_models(X_train, X_test, y_train, y_test, vectorizer_type="bow"):
    print(f"Training with {vectorizer_type.upper()} features...\n")

    nb_model = MultinomialNB()
    nb_model.fit(X_train, y_train)
    nb_pred = nb_model.predict(X_test)
    nb_accuracy = accuracy_score(y_test, nb_pred)
    print(f"Naive Bayes accuracy: {nb_accuracy:.4f}")

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    rf_pred = rf_model.predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_pred)
    print(f"Random Forest accuracy: {rf_accuracy:.4f}")

    xgb_model = XGBClassifier(eval_metric="mlogloss", use_label_encoder=False)
    xgb_model.fit(X_train, y_train)
    xgb_pred = xgb_model.predict(X_test)
    xgb_accuracy = accuracy_score(y_test, xgb_pred)
    print(f"XGBoost accuracy: {xgb_accuracy:.4f}")

    ensemble_model = VotingClassifier(estimators=[('nb', nb_model), ('rf', rf_model), ('xgb', xgb_model)], voting='hard')
    ensemble_model.fit(X_train, y_train)
    ensemble_pred = ensemble_model.predict(X_test)
    ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
    print(f"Ensemble model (Voting Classifier) accuracy: {ensemble_accuracy:.4f}\n")


In [None]:
train_models(X_train_clean, X_test_clean, y_train, y_test, vectorizer_type="bow")
train_models(X_train_raw, X_test_raw, y_train, y_test, vectorizer_type="bow")
train_models(X_train_clean_tfidf, X_test_clean_tfidf, y_train, y_test, vectorizer_type="tfidf")
train_models(X_train_raw_tfidf, X_test_raw_tfidf, y_train, y_test, vectorizer_type="tfidf")

Training with BOW features...

Naive Bayes accuracy: 0.9677
Random Forest accuracy: 0.9722


Parameters: { "use_label_encoder" } are not used.



XGBoost accuracy: 0.9722


Parameters: { "use_label_encoder" } are not used.



Ensemble model (Voting Classifier) accuracy: 0.9785

Training with BOW features...

Naive Bayes accuracy: 0.9677
Random Forest accuracy: 0.9722


Parameters: { "use_label_encoder" } are not used.



XGBoost accuracy: 0.9722


Parameters: { "use_label_encoder" } are not used.



Ensemble model (Voting Classifier) accuracy: 0.9785

Training with TFIDF features...

Naive Bayes accuracy: 0.9677
Random Forest accuracy: 0.9722


Parameters: { "use_label_encoder" } are not used.



XGBoost accuracy: 0.9776


Parameters: { "use_label_encoder" } are not used.



Ensemble model (Voting Classifier) accuracy: 0.9740

Training with TFIDF features...

Naive Bayes accuracy: 0.9677
Random Forest accuracy: 0.9722


Parameters: { "use_label_encoder" } are not used.



XGBoost accuracy: 0.9776


Parameters: { "use_label_encoder" } are not used.



Ensemble model (Voting Classifier) accuracy: 0.9740

