In [1]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the mail dataset from the local CSV file into a DataFrame,
# then preview the first rows as the cell output.
df = pd.read_csv('mail_data.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Show every row that has an exact duplicate elsewhere in the DataFrame.
# keep=False flags all members of a duplicate group, not only the repeats.
is_duplicated = df.duplicated(keep=False)
duplicates = df.loc[is_duplicated]
print(duplicates)

     Category                                            Message
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
7         ham  As per your request 'Melle Melle (Oru Minnamin...
8        spam  WINNER!! As a valued network customer you have...
9        spam  Had your mobile 11 months or more? U R entitle...
11       spam  SIX chances to win CASH! From 100 to 20,000 po...
...       ...                                                ...
5524     spam  You are awarded a SiPix Digital Camera! call 0...
5535      ham  I know you are thinkin malaria. But relax, chi...
5539      ham                         Just sleeping..and surfing
5553      ham                        Hahaha..use your brain dear
5558      ham                             Sorry, I'll call later

[704 rows x 2 columns]


In [4]:
# Show every row that contains at least one null value in any column.
has_null = df.isna().any(axis="columns")
null_rows = df.loc[has_null]
print(null_rows)

Empty DataFrame
Columns: [Category, Message]
Index: []


In [5]:
# (rows, columns) of the loaded dataset
df.shape

(5572, 2)

In [6]:
# Summarize column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
# Binary encoding: spam -> 0, ham -> 1.
# NOTE: ham is the positive class (1) under this mapping; anything that
# interprets the model's predictions must use the same convention.
label_codes = {'spam': 0, 'ham': 1}

# Encode only while the column still holds the raw string labels. This cell
# overwrites its own input, so re-running it unguarded would push the already
# encoded 0/1 values through the mapping and turn every label into NaN.
if not df['Category'].isin(list(label_codes.values())).all():
    encoded = df['Category'].map(label_codes)
    # Series.map yields NaN for values missing from the mapping; fail loudly
    # instead of letting NaN labels reach the train/test split and the model.
    if encoded.isna().any():
        unknown = sorted(df.loc[encoded.isna(), 'Category'].astype(str).unique())
        raise ValueError(f"Unexpected Category labels: {unknown}")
    df['Category'] = encoded
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Separate the message text (features) from the encoded category (target)
X, y = df['Message'], df['Category']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Sizes of the full feature column and of its train / test partitions, one per line
print(X.shape, X_train.shape, X_test.shape, sep="\n")

(5572,)
(4457,)
(1115,)


In [10]:
# Turn the raw message text into TF-IDF feature vectors
vectorizer = TfidfVectorizer(
    min_df=1,              # keep any term seen in at least one document (no rare-term filtering)
    stop_words='english',  # drop common English words
    lowercase=True,        # lowercase the text before tokenizing
)

# Learn the vocabulary and weights from the training messages only, then
# vectorize the test messages with that same fitted vocabulary.
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [11]:
# Inspect the vectorized training data; printing the sparse matrix lists its
# stored (row, column) coordinates together with their values.
print(X_train_vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34840 stored elements and shape (4457, 7440)>
  Coords	Values
  (0, 5512)	0.1898892037332199
  (0, 7222)	0.2173884735352799
  (0, 258)	0.2379428657041507
  (0, 7162)	0.2550284465664535
  (0, 354)	0.3544175987866074
  (0, 2724)	0.3544175987866074
  (0, 7300)	0.24288153842988894
  (0, 2049)	0.3034375179183143
  (0, 3262)	0.33791755486732394
  (0, 5800)	0.17558937755823417
  (0, 6264)	0.1898892037332199
  (0, 694)	0.3171299579602537
  (0, 2497)	0.2442158912653505
  (0, 5818)	0.22682143517864364
  (1, 3267)	0.26787130770292167
  (1, 6109)	0.32397626344658004
  (1, 6738)	0.28986069568917994
  (1, 2335)	0.21623212751660786
  (1, 5650)	0.3604441444703179
  (1, 3333)	0.20665394084233094
  (1, 2440)	0.3387054464839871
  (1, 4509)	0.40282459910606705
  (1, 3932)	0.24325511357721422
  (1, 3804)	0.19029023465152678
  (1, 2555)	0.3840709491751003
  :	:
  (4452, 3084)	0.22948428918295163
  (4452, 3290)	0.26370969643076225
  (4452, 3978)	0

In [12]:
# Training the model
# Logistic regression with the library's default settings, fitted on the
# vectorized training messages and their encoded labels.
model = LogisticRegression()
model.fit(X_train_vectors, y_train)

In [13]:
# Prediction
# Predict labels for the held-out test vectors with the fitted model.
y_pred = model.predict(X_test_vectors)

In [14]:
# Score the predictions against the true test labels and report the
# accuracy rounded to three decimals.
accuracy_model = accuracy_score(y_test, y_pred)
print(f"accuracy_model {accuracy_model:.3f}")

accuracy_model 0.968


In [15]:
# Remove exact duplicate rows (keeping the first occurrence of each) so a
# second model can be trained on de-duplicated data for comparison.
is_repeat = df.duplicated()
new_df = df.loc[~is_repeat]
new_df.shape

(5157, 2)

In [16]:
# Separate features and target of the de-duplicated data
X1, y1 = new_df['Message'], new_df['Category']

# Same 80/20 split and seed as used for the full dataset
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Sizes of the de-duplicated feature column and of its train / test partitions, one per line
print(X1.shape, X_train1.shape, X_test1.shape, sep="\n")

(5157,)
(4125,)
(1032,)


In [18]:
# Turn the de-duplicated message text into TF-IDF feature vectors
vectorizer1 = TfidfVectorizer(
    min_df=1,              # keep any term seen in at least one document (no rare-term filtering)
    stop_words='english',  # drop common English words
    lowercase=True,        # lowercase the text before tokenizing
)

# Learn the vocabulary and weights from the training messages only, then
# vectorize the test messages with that same fitted vocabulary.
X_train_vectors1 = vectorizer1.fit_transform(X_train1)
X_test_vectors1 = vectorizer1.transform(X_test1)

In [19]:
# Training the model
# Second logistic regression (default settings), fitted on the vectors built
# from the de-duplicated training messages.
model1 = LogisticRegression()
model1.fit(X_train_vectors1, y_train1)

In [20]:
# Prediction
# Predict labels for the de-duplicated test vectors with the second model.
y_pred1 = model1.predict(X_test_vectors1)

In [21]:
# Score the second model (trained on de-duplicated data) on its own test split.
accuracy_model1 = accuracy_score(y_test1, y_pred1)
# Label the line with this model's own name so it cannot be mistaken for the
# first model's result, which is printed as "accuracy_model".
print(f"accuracy_model1 {accuracy_model1:.3f}")

accuracy_model 0.953


In [22]:
# Persist the first classifier together with the vectorizer it was fitted
# with (the pair trained on the data that still contains duplicate rows).
# Each object is pickled to its own file in the working directory.
for filename, artifact in (("log_model.pkl", model), ("vector.pkl", vectorizer)):
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)