In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import pickle

In [3]:
df = pd.read_csv(r"D:\python_project\data_science\spam_filtering\mail_data.csv")
df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
df.isnull().sum()
df.shape

(5572, 2)

In [5]:
df = df.where((pd.notnull(df)),'')

In [6]:
x = df['Message']
y = df['Category']
label = LabelEncoder()
y = label.fit_transform(y)

In [7]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [8]:
feature_extraction = TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)
x_train_extraction = feature_extraction.fit_transform(x_train)
x_test_extraction = feature_extraction.transform(x_test)

In [9]:
print(x_train_extraction)

  (0, 5818)	0.22682143517864364
  (0, 2497)	0.2442158912653505
  (0, 694)	0.3171299579602537
  (0, 6264)	0.1898892037332199
  (0, 5800)	0.17558937755823417
  (0, 3262)	0.33791755486732394
  (0, 2049)	0.3034375179183143
  (0, 7300)	0.24288153842988894
  (0, 2724)	0.3544175987866074
  (0, 354)	0.3544175987866074
  (0, 7162)	0.2550284465664535
  (0, 258)	0.2379428657041507
  (0, 7222)	0.2173884735352799
  (0, 5512)	0.1898892037332199
  (1, 2555)	0.3840709491751004
  (1, 3804)	0.1902902346515268
  (1, 3932)	0.24325511357721427
  (1, 4509)	0.4028245991060671
  (1, 2440)	0.33870544648398715
  (1, 3333)	0.20665394084233096
  (1, 5650)	0.360444144470318
  (1, 2335)	0.2162321275166079
  (1, 6738)	0.28986069568918
  (1, 6109)	0.3239762634465801
  (1, 3267)	0.2678713077029217
  :	:
  (4452, 2438)	0.4574160733416501
  (4452, 7280)	0.3968991650168732
  (4452, 3978)	0.4574160733416501
  (4452, 3290)	0.26370969643076225
  (4452, 3084)	0.22948428918295163
  (4452, 2236)	0.2676662072392096
  (4453, 387

In [10]:
model = LogisticRegression()
model.fit(x_train_extraction,y_train)

In [11]:
y_predict = model.predict(x_train_extraction)
accuracy_score(y_train,y_predict)*100

96.61207089970833

In [12]:
y_predict = model.predict(x_test_extraction)
accuracy_score(y_test,y_predict)*100

96.7713004484305

In [13]:
input_mail = ['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']
input_mail_extraction = feature_extraction.transform(input_mail)
y_predict = model.predict(input_mail_extraction) 

y_predict

 

if y_predict == 0:
    print('Not a spam')
else: 
    print("Spam")

Not a spam


In [14]:
try:
    with open('model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
    print("Model saved successfully as 'model.pkl'")

    with open('feature_extractor.pkl', 'wb') as fe_file:
        pickle.dump(feature_extraction, fe_file)
    print("Feature extractor saved successfully as 'feature_extractor.pkl'")
except Exception as e:
    print(f"Error saving files: {e}")

Model saved successfully as 'model.pkl'
Feature extractor saved successfully as 'feature_extractor.pkl'
