In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

##Load datset

In [12]:
df=pd.read_csv("../data/spam.csv",sep="\t",names=["label","message"])


In [13]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


##counting how many  ham messagaes-spam messages (help detect class imbalance)

In [16]:
df["label"].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

#encoding - labels to numbers 

In [18]:
df["label"]=df["label"].map({"ham":0,"spam":1})

##separate features and target

In [20]:
X=df["message"]
y=df["label"]

##train-test split

In [21]:
from sklearn.model_selection import train_test_split

In [23]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.8,random_state=42,stratify=y)

##convert text to numbers 

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
tfidf=TfidfVectorizer(stop_words='english',max_features=3000)

In [28]:
X_train_tfidf=tfidf.fit_transform(X_train)

In [29]:
X_test_tfidf=tfidf.transform(X_test)

##Linear svm (baseline model)

In [31]:
from sklearn.svm import SVC

In [32]:
linear_svm=SVC(kernel="linear")

In [33]:
linear_svm.fit(X_train_tfidf,y_train)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


## Non Linear SVM

In [35]:
rbf_svm=SVC(kernel="rbf")

In [36]:
rbf_svm.fit(X_train_tfidf,y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


hyperparameter tuning

In [40]:
from sklearn.model_selection import GridSearchCV

In [37]:
param_grid={"C":[0.1,1,10],"gamma":[0.01,0.1,1]}

In [45]:
grid=GridSearchCV(SVC(kernel="rbf"),param_grid,cv=5,scoring="f1",n_jobs=-1)

In [47]:
grid.fit(X_train_tfidf,y_train)

0,1,2
,estimator,SVC()
,param_grid,"{'C': [0.1, 1, ...], 'gamma': [0.01, 0.1, ...]}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,10
,kernel,'rbf'
,degree,3
,gamma,0.1
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


best model

In [49]:
best_svm=grid.best_estimator_

final evaluation

In [51]:
from sklearn.metrics import classification_report

In [52]:
y_pred=best_svm.predict(X_test_tfidf)

In [53]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3860
           1       0.97      0.84      0.90       598

    accuracy                           0.98      4458
   macro avg       0.97      0.92      0.94      4458
weighted avg       0.98      0.98      0.98      4458



save model and vectorizer

In [54]:
import joblib

In [56]:
joblib.dump(best_svm,"spam_model.joblib")

['spam_model.joblib']

In [57]:
joblib.dump(tfidf,"tfidf_vectorizer.joblib")

['tfidf_vectorizer.joblib']