## Preprocessing and Training

In [None]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from textblob import TextBlob
import joblib

# Load the dataset, keeping only the label (v1) and message (v2) columns
# and giving them descriptive names
df = (
    pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# Preprocessing
# Translation table built once; str.translate then drops every ASCII
# punctuation character in a single pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_and_correct(text):
    """Lowercase *text* and strip ASCII punctuation.

    Despite the name, no spelling correction is performed: the function only
    lowercases the input and removes the characters in ``string.punctuation``.

    Args:
        text: Raw message. ``None`` and float NaN (e.g. a missing value in a
            pandas column) are treated as an empty message; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The normalized string (``''`` for missing input).
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    return text.lower().translate(_PUNCTUATION_TABLE)
# Normalize every message with the cleaning helper
df['text'] = df['text'].map(clean_and_correct)

# Numeric target column: ham -> 0, spam -> 1
label_to_num = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_to_num)

# Stratified 80/20 train/test split with a fixed seed for reproducibility
messages, targets = df['text'], df['label_num']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

# Random oversampling of the smaller class, applied to the training set only
# (the train/test split happens before this, so duplicated rows never reach
# the test set)
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
ham_df = train_df[train_df['label'] == 0]
spam_df = train_df[train_df['label'] == 1]
count_ham = len(ham_df)
count_spam = len(spam_df)

if count_spam == count_ham:
    # Already balanced: keep every row rather than bootstrapping one class,
    # which would drop some unique rows and duplicate others for no benefit
    train_balanced = pd.concat([ham_df, spam_df])
else:
    # The larger class is kept intact; the smaller one is sampled with
    # replacement until both classes have the same number of rows
    if count_spam < count_ham:
        majority_df, minority_df = ham_df, spam_df
    else:
        majority_df, minority_df = spam_df, ham_df
    if minority_df.empty:
        # Sampling from an empty frame fails with an obscure error; say why
        raise ValueError("Cannot upsample: the training set contains only one class")
    minority_upsampled = minority_df.sample(
        len(majority_df), replace=True, random_state=42
    )
    train_balanced = pd.concat([majority_df, minority_upsampled])

# Shuffle so the two classes are interleaved, then renumber the rows
train_balanced = train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

X_train_bal = train_balanced['text']
y_train_bal = train_balanced['label']

# Turn the messages into TF-IDF features. The vocabulary and IDF weights are
# learned from the balanced training messages only; the test messages are
# merely transformed with the fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train_bal)
X_test_vec = vectorizer.transform(X_test)

# Fit both classifiers on the same training features
# (fit() returns the estimator itself, so construction and fitting are chained)
# 1. Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_vec, y_train_bal)

# 2. Naive Bayes
nb = MultinomialNB().fit(X_train_vec, y_train_bal)

# Report per-class precision/recall/F1 for each classifier on the held-out
# test split
for heading, fitted_model in (
    ("Logistic Regression Results:", lr),
    ("Naive Bayes Results:", nb),
):
    print(heading)
    print(classification_report(y_test, fitted_model.predict(X_test_vec)))

# Persist the fitted vectorizer and both classifiers so they can be reloaded
# for inference later
artifacts = (
    (vectorizer, 'tfidf_vectorizer.joblib'),
    (lr, 'logistic_regression_model.joblib'),
    (nb, 'naive_bayes_model.joblib'),
)
for fitted_obj, out_path in artifacts:
    joblib.dump(fitted_obj, out_path)

# Load saved artifacts and classify a single message
def predict_message(model_path, vectorizer_path, message):
    """Classify *message* as ``'spam'`` or ``'ham'`` using saved artifacts.

    Args:
        model_path: Path to a classifier saved with ``joblib.dump``.
        vectorizer_path: Path to the fitted vectorizer saved with
            ``joblib.dump``.
        message: Raw message text; it is cleaned with ``clean_and_correct``
            before being vectorized.

    Returns:
        ``'spam'`` if the model predicts label 1, otherwise ``'ham'``.

    NOTE(review): joblib files are pickle-based, so only load artifacts from
    a trusted location.
    """
    # Both artifacts are reloaded from disk on every call
    tfidf = joblib.load(vectorizer_path)
    classifier = joblib.load(model_path)

    # The vectorizer expects an iterable of documents, hence the one-item list
    features = tfidf.transform([clean_and_correct(message)])
    predicted_label = classifier.predict(features)[0]

    if predicted_label == 1:
        return 'spam'
    return 'ham'




Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       965
           1       0.94      0.84      0.89       180

    accuracy                           0.97      1145
   macro avg       0.96      0.92      0.94      1145
weighted avg       0.97      0.97      0.97      1145

Naive Bayes Results:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       965
           1       0.87      0.91      0.89       180

    accuracy                           0.96      1145
   macro avg       0.93      0.94      0.93      1145
weighted avg       0.96      0.96      0.96      1145



## Example

In [None]:
# Classify a sample message with the saved logistic-regression model
sample_message = "win iphone, click this link"
print(
    predict_message(
        model_path='logistic_regression_model.joblib',
        vectorizer_path='tfidf_vectorizer.joblib',
        message=sample_message,
    )
)

spam
