In [1]:
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer # more refined than the bag-of-words model; results in higher accuracy
from sklearn.metrics import accuracy_score, classification_report
# randomly splits data into training and testing datasets
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [2]:
# Load the data.
# The CSV location defaults to the original author's local path, but it can be
# overridden with the SPAM_DATA_PATH environment variable so the notebook runs
# on other machines without editing this cell.
DATA_PATH = os.environ.get(
    "SPAM_DATA_PATH",
    "C:/Users/Joshua/Desktop/spam detection/datasets/trec07p_data.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Model input is the 'text' column; the prediction target is the 'label' column.
x, y = df['text'], df['label']

# Text-classification pipeline: TF-IDF features feeding a linear SVM.
# The step names ('tfidf', 'clf') are the prefixes used by the grid-search
# parameter names (e.g. 'tfidf__max_df', 'clf__C'), so keep them in sync.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # TF-IDF settings are tuned by the grid search
    # Pin the seed: LinearSVC's dual solver uses pseudo-random shuffling, so
    # without a fixed random_state repeated runs are not exactly reproducible.
    # The value matches the seed used for the train/test split.
    ('clf', LinearSVC(random_state=2)),
])

# Hold out 20% of the rows for testing (80% for training); the fixed
# random_state keeps the split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [4]:
# Candidate values for each tunable pipeline parameter.
# Keys follow the '<step name>__<parameter>' convention expected by GridSearchCV.
max_df_options = [0.5, 0.75, 0.85, 1.0]
ngram_range_options = [(1, 1), (1, 2)]  # unigrams only, or unigrams + bigrams
sublinear_tf_options = [True, False]
c_options = [0.01, 0.1, 1, 10, 100]  # regularization parameter of the classifier

param_grid = {
    'tfidf__max_df': max_df_options,
    'tfidf__ngram_range': ngram_range_options,
    'tfidf__sublinear_tf': sublinear_tf_options,
    'clf__C': c_options,
}

In [5]:
# Exhaustive search over param_grid: every candidate is scored by accuracy
# using 10-fold cross-validation, with fits run in parallel (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

# Run the search on the training split only; the test split stays unseen.
grid_search.fit(x_train, y_train)

# Label the held-out test texts with the fitted search object.
y_pred = grid_search.predict(x_test)

In [None]:
# Winning hyper-parameter combination found by the grid search.
# NOTE(review): the stored output under this cell lists best parameters without
# 'tfidf__sublinear_tf', which the current grid searches over, so that output
# appears to predate the grid — TODO re-run to refresh.
print("Best Parameters:", grid_search.best_params_)

# Per-class precision / recall / F1 on the test split, keyed by raw label value.
raw_label_report = classification_report(y_test, y_pred)
print(raw_label_report)

# Same report with display names: label 0 is shown as 'Spam', label 1 as 'Ham'.
# NOTE(review): confirm this matches the dataset's label encoding.
named_report = classification_report(y_test, y_pred, target_names=['Spam', 'Ham'])
print(named_report)

# Overall test accuracy, formatted as a percentage with three decimals.
test_accuracy = accuracy_score(y_test, y_pred)
print('Classification accuracy {:.3%}'.format(test_accuracy))

Best Parameters: {'clf__C': 10, 'tfidf__max_df': 0.75, 'tfidf__ngram_range': (1, 2)}
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      4760
           1       1.00      1.00      1.00      5974

    accuracy                           1.00     10734
   macro avg       1.00      1.00      1.00     10734
weighted avg       1.00      1.00      1.00     10734

              precision    recall  f1-score   support

        Spam       1.00      0.99      1.00      4760
         Ham       1.00      1.00      1.00      5974

    accuracy                           1.00     10734
   macro avg       1.00      1.00      1.00     10734
weighted avg       1.00      1.00      1.00     10734

Classification accuracy 99.590%


In [None]:
# Persist the best pipeline found by the grid search. The pipeline bundles the
# TF-IDF vectorizer and the classifier, so this single file holds both.
# The file is written relative to the current working directory.
joblib.dump(grid_search.best_estimator_, 'spam_classifier.pkl')

['spam_classifier.pkl']

In [None]:
# Extra diagnostic: 10-fold cross-validation of the selected pipeline on the
# training split.
# NOTE(review): these folds come from the same training data the grid search
# used to choose the hyper-parameters, so treat the scores as a sanity check
# rather than an independent performance estimate.
cv_scores = cross_val_score(
    estimator=grid_search.best_estimator_,
    X=x_train,
    y=y_train,
    cv=10,
)
print("Cross-validation scores:", cv_scores)

mean_cv_score = cv_scores.mean()
print("Mean cross-validation score:", mean_cv_score)