# Import Necessary Libraries

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus that the text-cleaning step reads later on.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ujandasgupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load the Data

In [5]:
# Read the labelled email corpus. The preview below shows an 'Unnamed: 0' index
# column, the raw email 'text', and the binary 'spam' label.
emails_path = 'emails.csv'
data = pd.read_csv(emails_path)
data.head()


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


# Preprocess the Data

In [6]:
import re
from nltk.corpus import stopwords

# Compiled once: matches every character that is not an ASCII letter.
_NON_ALPHA_RE = re.compile('[^a-zA-Z]')

# Lazily filled cache so the NLTK corpus is read once, not once per email.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stopword set, loading it on first use only."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Normalise one email body into a lowercase, stopword-free string.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    split on whitespace, drop stopwords, and re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw email text. ``None`` and float NaN (a missing value in a pandas
        column) are treated as empty text; any other non-string is converted
        with ``str()`` first.
    stop_words : iterable of str, optional
        Lowercase words to remove. Defaults to the NLTK English stopword list,
        which is loaded once and cached.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing survives the filtering.
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    if stop_words is None:
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # Guarantee O(1) membership tests for caller-supplied lists/tuples.
        stop_words = frozenset(stop_words)

    # Remove non-alphabetic characters, then convert to lowercase
    tokens = _NON_ALPHA_RE.sub(' ', text).lower().split()
    # Remove stopwords and join the remaining words back into a string
    return ' '.join(word for word in tokens if word not in stop_words)

# Clean every email body and store the result in a new 'processed_text' column;
# the original 'text' column is left as loaded.
data['processed_text'] = data['text'].apply(preprocess_text)


# Splitting the Data

In [7]:
def split_data(data, test_size=0.2, validation_size=0.25):
    """Split ``data`` into train, validation and test partitions.

    Both sizes are fractions of the *whole* dataset. With the defaults the
    result is 55% train, 25% validation and 20% test.

    Parameters
    ----------
    data : pandas.DataFrame (or any input accepted by ``train_test_split``)
        The rows to partition.
    test_size : float
        Fraction of all rows held out for testing; must be in (0, 1).
    validation_size : float
        Fraction of all rows held out for validation; must be in (0, 1).

    Returns
    -------
    tuple
        ``(train_data, validation_data, test_data)``.

    Raises
    ------
    ValueError
        If either fraction is outside (0, 1) or they leave no training data.
    """
    if not (0 < test_size < 1 and 0 < validation_size < 1):
        raise ValueError("test_size and validation_size must be fractions in (0, 1)")
    if test_size + validation_size >= 1:
        raise ValueError("test_size + validation_size must be < 1 to leave training data")

    # Hold out the test set first; the remainder is shared by train and validation.
    train_val_data, test_data = train_test_split(data, test_size=test_size, random_state=42)
    # Re-express validation_size relative to the remainder, which holds
    # (1 - test_size) of the rows, so validation ends up at the requested
    # fraction of the full dataset.
    validation_size_adjusted = validation_size / (1 - test_size)
    train_data, validation_data = train_test_split(
        train_val_data, test_size=validation_size_adjusted, random_state=42)

    return train_data, validation_data, test_data

In [8]:
def store_splits(train_data, validation_data, test_data, train_path='train.csv', validation_path='validation.csv', test_path='test.csv'):
    """Write the three partitions to CSV files without the DataFrame index.

    Each frame is saved to its matching path: train, then validation, then test.
    """
    targets = (
        (train_data, train_path),
        (validation_data, validation_path),
        (test_data, test_path),
    )
    for frame, destination in targets:
        frame.to_csv(destination, index=False)

In [9]:
# Snapshot the full, unsplit frame to disk before partitioning.
# NOTE(review): when cells run in order, `data` already carries the derived
# 'processed_text' column here, so this file is not strictly the raw input.
data.to_csv('raw_data.csv', index=False)

In [11]:
# Partition the frame with the default sizes and write each partition to its
# default CSV path (train.csv, validation.csv, test.csv).
train_data, validation_data, test_data = split_data(data)
store_splits(train_data, validation_data, test_data)

In [12]:
# Model inputs are the cleaned email texts; targets are the 'spam' labels.
# This is an independent 80/20 split used only for fitting and scoring below.
features = data['processed_text']
labels = data['spam']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)


# Create a Pipeline and Train the Model

#### In the previous assignments, SGDClassifier was the best-performing model, so it is used here as the final model.

In [13]:
# Two-stage pipeline: TF-IDF turns each cleaned email into a sparse feature
# vector, then an SGD-trained linear classifier (loss='log_loss', i.e. logistic
# regression per the scikit-learn docs) predicts the spam label.
tfidf_step = ('tfidf', TfidfVectorizer())
classifier_step = ('sgd', SGDClassifier(loss='log_loss', random_state=42))
pipeline = Pipeline(steps=[tfidf_step, classifier_step])

pipeline.fit(X_train, y_train)

# Evaluate the Model

In [14]:
# Score the fitted pipeline on the held-out 20% split.
predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
print("Accuracy:", accuracy)
print("Classification Report:", report)


Accuracy: 0.987783595113438
Classification Report:               precision    recall  f1-score   support

           0       0.99      1.00      0.99       856
           1       0.99      0.96      0.98       290

    accuracy                           0.99      1146
   macro avg       0.99      0.98      0.98      1146
weighted avg       0.99      0.99      0.99      1146



# Save the Model

In [15]:
import joblib
# Persist the fitted pipeline (TF-IDF vectorizer + SGD classifier) as one artifact.
# It was trained on the 'processed_text' column, so new emails should be passed
# through preprocess_text before being given to the reloaded pipeline.
joblib.dump(pipeline, 'bestModelSGD.joblib')


['bestModelSGD.joblib']