In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

# Load prepared data

In [2]:
def load_data(train_path='train.csv', validation_path='validation.csv', test_path='test.csv'):
    """
    Read the prepared train, validation and test splits from CSV files.

    Parameters:
    - train_path: Path of the training CSV file
    - validation_path: Path of the validation CSV file
    - test_path: Path of the test CSV file

    Returns:
    - A tuple (train_data, validation_data, test_data) of pandas DataFrames,
      in that order.
    """
    # Files are read in train -> validation -> test order.
    split_paths = (train_path, validation_path, test_path)
    return tuple(pd.read_csv(csv_path) for csv_path in split_paths)

# Training and Evaluating a model

In [3]:
def train_and_evaluate(X_train, y_train, X_val, y_val, pipeline):
    """
    Train a model, report its accuracy on the training and validation
    datasets, and return the computed metrics.

    Parameters:
    - X_train: Training features
    - y_train: Training labels
    - X_val: Validation features
    - y_val: Validation labels
    - pipeline: A classification pipeline exposing fit() and predict()
      (e.g. TF-IDF vectorization followed by a classifier). It is fitted
      in place on the training data.

    Returns:
    - dict with the keys 'train_accuracy', 'validation_accuracy' and
      'classification_report' (the text report for the validation set).
      The same values are also printed.
    """
    pipeline.fit(X_train, y_train)

    train_accuracy = accuracy_score(y_train, pipeline.predict(X_train))
    print("Training accuracy: ", train_accuracy)

    # Predict the validation set once and reuse the predictions for both the
    # accuracy and the report; calling pipeline.score() and then
    # pipeline.predict() would vectorize and classify X_val twice.
    y_pred = pipeline.predict(X_val)
    validation_accuracy = accuracy_score(y_val, y_pred)
    print("Validation accuracy: ", validation_accuracy)

    report = classification_report(y_val, y_pred)
    print("Classification report:\n", report)

    return {
        'train_accuracy': train_accuracy,
        'validation_accuracy': validation_accuracy,
        'classification_report': report,
    }

# Benchmarking different models

In [4]:
def benchmark_models(train_data, validation_data, test_data, random_state=42):
    """
    Fit several TF-IDF + classifier pipelines and print their accuracy.

    Each model is wrapped in a pipeline with an English stop-word TF-IDF
    vectorizer, trained and evaluated via train_and_evaluate(), and then
    scored on the test split.

    Parameters:
    - train_data: DataFrame with a 'text' column and a 'spam' label column
    - validation_data: DataFrame with the same two columns
    - test_data: DataFrame with the same two columns
    - random_state: Seed passed to the stochastic estimators (Random Forest
      and SGD) so repeated runs give the same numbers. Pass None to get
      unseeded, run-to-run varying behaviour.

    Returns:
    - dict mapping each model name to its accuracy on the test split.
    """
    X_train, y_train = train_data['text'], train_data['spam']
    X_val, y_val = validation_data['text'], validation_data['spam']
    X_test, y_test = test_data['text'], test_data['spam']

    models = [
        ("Logistic Regression", LogisticRegression(max_iter=1000)),
        ("Multinomial NB", MultinomialNB()),
        # Both estimators below are randomized; seed them for reproducibility.
        ("Random Forest", RandomForestClassifier(random_state=random_state)),
        ("SGD Classifier", SGDClassifier(random_state=random_state)),
    ]

    test_accuracies = {}
    for name, model in models:
        print(f"\n{name}:")
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english')),
            ('classifier', model),
        ])
        train_and_evaluate(X_train, y_train, X_val, y_val, pipeline)
        # NOTE(review): test accuracy is reported for every candidate; choose
        # the model on validation metrics and treat this number as a final
        # check only, otherwise the test estimate becomes optimistic.
        test_accuracy = pipeline.score(X_test, y_test)
        print(f"Test accuracy for {name}: {test_accuracy}")
        test_accuracies[name] = test_accuracy

    return test_accuracies

# Main function to execute training and evaluation

In [5]:
def main():
    """Load the three CSV splits from their default paths and benchmark the models on them."""
    # load_data() yields (train, validation, test) in the order benchmark_models expects.
    splits = load_data()
    benchmark_models(*splits)

In [6]:
# Run the benchmark only when this code executes as the top-level script or
# notebook cell, not when it is imported as a module.
if __name__ == '__main__':
    main()


Logistic Regression:
Training accuracy:  0.9951986032300305
Validation accuracy:  0.9733163913595934
Classification report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       589
           1       0.99      0.90      0.94       198

    accuracy                           0.97       787
   macro avg       0.98      0.95      0.96       787
weighted avg       0.97      0.97      0.97       787

Test accuracy for Logistic Regression: 0.9777158774373259

Multinomial NB:
Training accuracy:  0.9484941073766914
Validation accuracy:  0.8856416772554002
Classification report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       589
           1       1.00      0.55      0.71       198

    accuracy                           0.89       787
   macro avg       0.93      0.77      0.82       787
weighted avg       0.90      0.89      0.87       787

Test accuracy for Multinomial NB: 0.91364

### Based on the validation accuracy and the test accuracy of the four models above, the SGD Classifier is the best-performing model.

#### Since the best test accuracy achieved is already greater than 99%, I am not fine-tuning the model any further.