In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [2]:
df=pd.read_csv('spam.csv',encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df=df[['v1','v2']]
df.columns=["label","message"]


In [4]:
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
import string
import nltk
from nltk.corpus import stopwords 
import re
import string

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text
df['cleaned_message'] = df['message'].apply(clean_text)
le=LabelEncoder()
df['label'] = le.fit_transform(df['label'])

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
X=tfidf.fit_transform(df['cleaned_message'])
y=df["label"]


In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.svm import LinearSVC
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [8]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[963   2]
 [ 22 128]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.85      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy: 0.97847533632287


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
param_grid = {
    'C': [0.01,0.1, 1, 10, 100]
}

In [10]:
grid=GridSearchCV(LinearSVC(max_iter=5000), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train,y_train)

In [11]:
print("Best Paameters:", grid.best_params_)
print("Best Cross Validated Accuracy:", grid.best_score_)

Best Paameters: {'C': 10}
Best Cross Validated Accuracy: 0.9822741616463591


In [12]:
best_svm=grid.best_estimator_
y_pred=best_svm.predict(X_test)

In [13]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[963   2]
 [ 21 129]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.86      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy: 0.979372197309417


In [14]:
import joblib
joblib.dump(best_svm, 'EmailSpamDetection.joblib')
joblib.dump(tfidf,'tfidf_vectorizer.joblib')


['tfidf_vectorizer.joblib']

In [15]:
svm_model = joblib.load('EmailSpamDetection.joblib')
tfidf = joblib.load('tfidf_vectorizer.joblib')