In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection and Pre-Processing

In [25]:
# Read the labelled messages from the CSV file that sits next to this notebook.
csv_path = './mail_data.csv'
mail_data = pd.read_csv(csv_path)
# Preview the first rows to check the columns were parsed as expected.
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [32]:
# Count the missing entries in every column (isnull is an alias of isna).
mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [33]:
# Dimensions of the dataframe as a (rows, columns) tuple.
n_rows, n_cols = mail_data.shape
(n_rows, n_cols)

(5572, 2)

In [34]:
# Label encode the category column. spam = 1 ham = 0
label_codes = {'ham': 0, 'spam': 1}
# Values that are not keys of the mapping are passed through unchanged, so running
# this cell a second time (when the column already holds 0/1) is a no-op instead of
# turning every label into NaN.
encoded = mail_data['Category'].map(lambda label: label_codes.get(label, label))
# Fail loudly on anything that is neither a known label nor an already-encoded
# value, rather than letting NaN / stray strings flow into model training.
unexpected = set(encoded.unique()) - set(label_codes.values())
if unexpected:
    raise ValueError(f"Unexpected Category values: {unexpected}")
mail_data['Category'] = encoded.astype('int64')

In [47]:
# Confirm the encoded label column is stored with an integer dtype.
mail_data.dtypes['Category']

dtype('int64')

In [36]:
# Preview the first five rows again, now with numeric labels in Category.
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Split the data into features (X) and labels (y) for prediction

In [40]:
# X holds the raw message text (model input); y holds the 0/1 label (target).
X, y = mail_data['Message'], mail_data['Category']

In [41]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)
# Size of the training portion.
X_train.shape

(4457,)

Feature Extraction

In [50]:
# Transform the message text into TF-IDF feature vectors.
# `lowercase` must be a real boolean: the string 'True' only worked by being truthy
# and is rejected by scikit-learn's parameter validation in recent releases.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
# Learn the vocabulary and IDF weights from the training messages only ...
X_train_features = feature_extraction.fit_transform(X_train)
# ... and reuse that fitted vocabulary for the test messages (transform, not fit),
# so the test set does not influence the features.
X_test_features = feature_extraction.transform(X_test)

In [53]:
# Print the sparse training feature matrix; each output line appears as
# "(row, column)  value" for one stored (non-zero) TF-IDF weight.
print(X_train_features)

  (0, 4076)	0.21814163878169243
  (0, 4189)	0.40714919169918795
  (0, 5531)	0.3423416769137198
  (0, 1705)	0.3274543662048457
  (0, 4787)	0.29950623963635054
  (0, 7166)	0.23411798769212422
  (0, 6220)	0.40714919169918795
  (0, 3094)	0.1960160348955552
  (0, 3189)	0.2695003791316419
  (0, 3068)	0.20722043882108684
  (0, 6023)	0.31295501407186926
  (1, 5240)	0.49842861309340514
  (1, 5773)	0.7151217422337083
  (1, 4061)	0.4900712309801611
  (2, 4744)	0.18868130288704416
  (2, 3339)	0.25960869981277335
  (2, 5823)	0.25061469947311094
  (2, 312)	0.1771020551801762
  (2, 299)	0.19209527463945028
  (2, 407)	0.2722850313233416
  (2, 7114)	0.2722850313233416
  (2, 1408)	0.18464166862372272
  (2, 5092)	0.18560370392241915
  (2, 2034)	0.18192137275151332
  (2, 5687)	0.20537868697819087
  :	:
  (4454, 2350)	0.3603401295867597
  (4455, 1263)	0.36319212343381085
  (4455, 1810)	0.36319212343381085
  (4455, 6428)	0.36319212343381085
  (4455, 6158)	0.34628357823652833
  (4455, 3285)	0.317378229301447

Build and Train the Model

In [55]:
# Create the classifier with scikit-learn's default settings (no arguments passed).
model = LogisticRegression()


In [56]:
# Train the logistic-regression model on the TF-IDF training features and their
# 0/1 labels; as the cell's last expression, the fitted estimator is displayed.
model.fit(X_train_features, y_train)

LogisticRegression()

Evaluate the trained model

In [65]:
# Predict labels for the same messages the model was trained on.
prediction_on_training_data = model.predict(X_train_features)
# accuracy_score's documented argument order is (y_true, y_pred).
acc_on_training_data = accuracy_score(y_train, prediction_on_training_data)
# display accuracy score on training data
# (uses the variable assigned above; the previous name `accuracy_on_training_data`
# was never defined and raised NameError on a fresh kernel)
print('Accuracy of model on training data', acc_on_training_data)

Accuracy of model on training data 0.9685887368184878


In [64]:
# Predict labels for the held-out test messages.
prediction_on_test_data = model.predict(X_test_features)
# accuracy_score's documented argument order is (y_true, y_pred).
acc_on_test_data = accuracy_score(y_test, prediction_on_test_data)
# display accuracy score on test data (scaled to a percentage)
print('Accuracy of model on test data: ', acc_on_test_data * 100)

Accuracy of model on test data:  95.06726457399103


Build a predictive system: classify a new message with the trained model

In [74]:
# One raw message to classify, wrapped in a list because the vectorizer
# expects an iterable of documents.
input_list = [
    "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
]
# Vectorise with the already-fitted TF-IDF extractor (transform only, no refit).
input_data_features = feature_extraction.transform(input_list)

# Predict the label for the message: 1 = spam, 0 = ham.
prediction = model.predict(input_data_features)

# Report the verdict for the single input message.
is_spam = prediction[0] == 1
print('Email is  spam' if is_spam else 'Email is not spam')

Email is not spam
