# Import Necessary Libraries

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import joblib

# Define the Preprocessing Functions

In [2]:
# Filled on first use so the stop-word corpus is read once, not once per row.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalize a single piece of text into a space-joined token string.

    Steps: lowercase, delete every character that is not ``a-z`` or
    whitespace, tokenize with NLTK's ``word_tokenize``, drop English stop
    words, and join the remaining tokens with single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas uses for an
            empty CSV cell) are treated as an empty document; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text, or ``''`` when the input is missing.
    """
    # Missing values have no .lower(); NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # Digits and punctuation (including apostrophes) are removed outright,
    # so a contraction such as "don't" becomes the single token "dont".
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = _english_stop_words()
    return ' '.join(word for word in tokens if word not in stop_words)

In [3]:
def preprocess_data(data):
    """Return a new DataFrame whose 'text' column has been cleaned.

    Every value in ``data['text']`` is passed through ``preprocess_text``.
    The input frame is left unmodified, so the raw text stays available and
    re-running the calling cell always starts from the same input.

    Args:
        data: A pandas DataFrame with a 'text' column.

    Returns:
        pandas.DataFrame: A copy of ``data`` with the same columns in the
        same order, where 'text' holds the cleaned strings.

    Raises:
        KeyError: If ``data`` has no 'text' column.
    """
    # assign() builds a new frame instead of writing into the caller's one.
    return data.assign(text=data['text'].apply(preprocess_text))

# Load the data

In [4]:
# Read the raw dataset from emails.csv (path is relative to the working
# directory); later cells use its 'text' and 'spam' columns.
data = pd.read_csv('emails.csv')

In [5]:
# Clean the 'text' column (lowercase, strip non-letters, drop English stop
# words) and keep the result under its own name for the split below.
preprocessed_data = preprocess_data(data)

# Split the data into training and test sets

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split identical on every run.
features = preprocessed_data['text']
labels = preprocessed_data['spam']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)


# Save the train and test sets to CSV files

In [7]:
# Write each split as a CSV holding the text column joined with its label
# column; the row index is not written.
for split_text, split_labels, csv_path in (
    (X_train, y_train, 'train_set.csv'),
    (X_test, y_test, 'test_set.csv'),
):
    split_text.to_frame().join(split_labels).to_csv(csv_path, index=False)

# Vectorize the text and train the model

In [8]:
# Fit the TF-IDF vectorizer on the training text only and encode that same
# text in one step. The test split is not touched in this cell.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

In [9]:
# Hyper-parameters are collected in one place so they are easy to review.
# NOTE(review): use_label_encoder appears to be deprecated in newer XGBoost
# releases - confirm against the installed version.
xgb_params = {
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
model = xgb.XGBClassifier(**xgb_params)
# Train on the TF-IDF features of the training split.
model.fit(X_train_tfidf, y_train)

# Save the model and vectorizer to disk

In [10]:
# Persist the trained classifier and the fitted vectorizer as separate files.
# Both are needed to score new text. These are pickle-based files, so only
# load them back from a trusted location.
joblib.dump(value=model, filename='xgboost_model.pkl')
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [11]:
# Confirmation message printed once the cells above have run.
print("Model training and saving completed.")

Model training and saving completed.
