In [2]:
pip install pandas numpy scikit-learn

Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp310-cp310-macosx_14_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV



In [2]:
import os
# Function to load data from the folder and assign labels
def load_data_from_folder(folder_path):
    """Read every ``.txt`` file in a folder and label it as spam or not spam.

    Files are visited in sorted filename order so that the returned lists are
    reproducible from run to run (``os.listdir`` returns entries in arbitrary
    order).

    Args:
        folder_path: Directory containing the e-mail text files.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length lists. ``texts[i]`` is
        the full content of the i-th ``.txt`` file; ``labels[i]`` is 1 when
        that file's name starts with "spmsg" (spam) and 0 otherwise.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    labels = []

    for filename in sorted(os.listdir(folder_path)):
        # Only plain-text mails are part of the corpus; skip anything else.
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            texts.append(file.read())
        # The label is derived purely from the filename prefix "spmsg".
        labels.append(1 if filename.startswith("spmsg") else 0)
    return texts, labels

# Root folder of the mail corpus. Defaults to the original location; set the
# SPAM_DATA_DIR environment variable to run the notebook on another machine
# without editing the code.
DATA_DIR = os.environ.get(
    'SPAM_DATA_DIR',
    '/Users/praveshjain/Desktop/Portfolio/train_test_mails',
)

# Load training and test data
train_texts, train_labels = load_data_from_folder(os.path.join(DATA_DIR, 'train-mails'))
test_texts, test_labels = load_data_from_folder(os.path.join(DATA_DIR, 'test-mails'))

# Convert labels to pandas Series for compatibility with sklearn
train_labels = pd.Series(train_labels)
test_labels = pd.Series(test_labels)

# Vectorization (TF-IDF): drop English stop words and any term that appears
# in more than 70% of the training documents
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit the vocabulary and IDF weights on the training mails only, then reuse
# that fitted vectorizer to transform the test mails
X_train = tfidf.fit_transform(train_texts)
X_test = tfidf.transform(test_texts)

# Candidate classifiers, keyed by the display name used in the report
models = dict([
    ("Naive Bayes", MultinomialNB()),
    ("SVM", SVC(kernel='linear')),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("KNN", KNeighborsClassifier()),
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
])

# Fit every candidate on the training matrix and report its test-set metrics
for model_name in models:
    model = models[model_name]

    # fit() returns the estimator itself, so training and prediction chain
    y_pred = model.fit(X_train, train_labels).predict(X_test)

    # One print call; sep="\n" reproduces the line-per-item layout
    print(
        f"Results for {model_name}:",
        f"Accuracy: {accuracy_score(test_labels, y_pred)}",
        "Classification Report:",
        classification_report(test_labels, y_pred),
        "\n" + "="*60 + "\n",
        sep="\n",
    )


Results for Naive Bayes:
Accuracy: 0.9846153846153847
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       130
           1       0.98      0.99      0.98       130

    accuracy                           0.98       260
   macro avg       0.98      0.98      0.98       260
weighted avg       0.98      0.98      0.98       260



Results for SVM:
Accuracy: 0.9807692307692307
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       130
           1       0.98      0.98      0.98       130

    accuracy                           0.98       260
   macro avg       0.98      0.98      0.98       260
weighted avg       0.98      0.98      0.98       260



Results for Random Forest:
Accuracy: 0.9807692307692307
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       130
        

In [3]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib  # for saving models

# Root folder of the mail corpus. Defaults to the original location; set the
# SPAM_DATA_DIR environment variable to run the notebook on another machine
# without editing the code.
DATA_DIR = os.environ.get(
    'SPAM_DATA_DIR',
    '/Users/praveshjain/Desktop/Portfolio/train_test_mails',
)

# Load data from folder (assuming the function load_data_from_folder is already defined)
train_texts, train_labels = load_data_from_folder(os.path.join(DATA_DIR, 'train-mails'))
test_texts, test_labels = load_data_from_folder(os.path.join(DATA_DIR, 'test-mails'))

# Convert labels to pandas Series for compatibility with sklearn
train_labels = pd.Series(train_labels)
test_labels = pd.Series(test_labels)

# Vectorization (TF-IDF): drop English stop words and any term that appears
# in more than 70% of the training documents
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit the vocabulary and IDF weights on the training mails only, then reuse
# that fitted vectorizer to transform the test mails
X_train = tfidf.fit_transform(train_texts)
X_test = tfidf.transform(test_texts)

# Build and fit the multinomial Naive Bayes classifier in one step
# (fit() returns the fitted estimator)
naive_bayes_model = MultinomialNB().fit(X_train, train_labels)

# Score the held-out test mails
y_pred = naive_bayes_model.predict(X_test)

# Report accuracy and the per-class metrics
print(
    "Results for Naive Bayes:",
    f"Accuracy: {accuracy_score(test_labels, y_pred)}",
    "Classification Report:",
    classification_report(test_labels, y_pred),
    sep="\n",
)

# Persist the classifier together with the TF-IDF vectorizer: inference has to
# transform new text with the same fitted vectorizer the model was trained on
model_filename = 'naive_bayes_model.pkl'
tfidf_filename = 'tfidf_vectorizer.pkl'
joblib.dump(naive_bayes_model, model_filename)
joblib.dump(tfidf, tfidf_filename)

print(f"Naive Bayes model and TF-IDF vectorizer saved as {model_filename} and {tfidf_filename}.")


Results for Naive Bayes:
Accuracy: 0.9846153846153847
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       130
           1       0.98      0.99      0.98       130

    accuracy                           0.98       260
   macro avg       0.98      0.98      0.98       260
weighted avg       0.98      0.98      0.98       260

Naive Bayes model and TF-IDF vectorizer saved as naive_bayes_model.pkl and tfidf_vectorizer.pkl.


In [4]:
import joblib
import os

# File names of the persisted classifier and TF-IDF vectorizer
model_filename, tfidf_filename = 'naive_bayes_model.pkl', 'tfidf_vectorizer.pkl'

# Restore both fitted objects from disk
tfidf_vectorizer = joblib.load(tfidf_filename)
naive_bayes_model = joblib.load(model_filename)

# Function to predict whether a given text is spam or not
def predict_spam(text):
    """Classify a single piece of text as spam or not spam.

    Relies on the module-level ``tfidf_vectorizer`` and ``naive_bayes_model``
    objects.

    Args:
        text: The message to classify, as one string.

    Returns:
        "Spam" if the model predicts label 1 for the text, otherwise
        "Not Spam".
    """
    # transform() takes a collection of documents, so wrap the single text in a list
    text_transformed = tfidf_vectorizer.transform([text])

    # predict() yields one label per input row; exactly one row was passed in
    prediction = naive_bayes_model.predict(text_transformed)

    # Compare the single label itself instead of relying on the truthiness of
    # an element-wise comparison over a one-element container
    return "Spam" if prediction[0] == 1 else "Not Spam"

# Example usage: classify one hand-written message and show the verdict
new_text = "Congratulations! You've won a free iPhone. Click here to claim your prize."
result = predict_spam(new_text)
print("The prediction for the given text is: " + result)


The prediction for the given text is: Spam
