In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Data collection and preprocessing: read the mail dataset from a CSV file
# located in the notebook's working directory.
raw_mail_data = pd.read_csv('mail_data.csv')

In [3]:
# Display the loaded frame to inspect its columns and a sample of rows.
raw_mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Replace every null cell with an empty string; cells that are not null are kept as they are.
# The result is a new frame, so raw_mail_data itself is left untouched.
mail_data = raw_mail_data.where(raw_mail_data.notna(), other="")

In [5]:
# Display the frame after null values were replaced with empty strings.
mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Size of the frame as (rows, columns).
mail_data.shape

(5572, 2)

In [7]:
# Encode the labels numerically: spam -> 0, ham -> 1.
# The assignment mutates mail_data in place; rows whose Category matches neither label are left unchanged.
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code

In [8]:
# Display the frame again to confirm Category now holds 0 (spam) / 1 (ham).
mail_data

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [9]:
## Separate the message text (features, x) from the encoded category (labels, y)
x, y = mail_data['Message'], mail_data['Category']

In [10]:
# Peek at the first few messages.
x.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [11]:
# Peek at the first few labels (0 = spam, 1 = ham).
y.head()

0    1
1    1
2    0
3    1
4    1
Name: Category, dtype: object

In [12]:
# Hold out 20% of the messages for testing; the fixed random_state makes the split reproducible.
# No `stratify` argument is passed, so the spam/ham proportions in each part are not controlled.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Report the size of each part of the split, in the order:
# train features, test features, train labels, test labels.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

(4457,)
(1115,)
(4457,)
(1115,)


**Feature Extraction**

Features are extracted with TF-IDF, which has two parts: Term Frequency (TF) and Inverse Document Frequency (IDF). The term frequency measures how often each word occurs in a document. The inverse document frequency measures how rare a word is across all documents in the dataset: words that appear in many documents are down-weighted, so rarer, more distinctive words contribute more to a document's feature vector.

In [22]:
## Feature extraction: convert the message text into meaningful numerical values (TF-IDF weights)

# `lowercase` must be the boolean True. The string 'True' only worked by being truthy and
# is rejected by scikit-learn releases that validate constructor parameters.
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then reuse the
# fitted vectorizer on the test messages so both share the same feature columns.
x_train_features = vectorizer.fit_transform(x_train)
x_test_features = vectorizer.transform(x_test)

# The labels still have object dtype after the encoding step; cast them to integers
# so the classifier receives a numeric target.
y_train = y_train.astype('int32')
y_test = y_test.astype('int32')

In [32]:
# Print the TF-IDF training matrix; each output line is a (row, column) position and its weight.
print(x_train_features)

  (0, 5818)	0.22682143517864364
  (0, 2497)	0.2442158912653505
  (0, 694)	0.3171299579602537
  (0, 6264)	0.1898892037332199
  (0, 5800)	0.17558937755823417
  (0, 3262)	0.33791755486732394
  (0, 2049)	0.3034375179183143
  (0, 7300)	0.24288153842988894
  (0, 2724)	0.3544175987866074
  (0, 354)	0.3544175987866074
  (0, 7162)	0.2550284465664535
  (0, 258)	0.2379428657041507
  (0, 7222)	0.2173884735352799
  (0, 5512)	0.1898892037332199
  (1, 2555)	0.3840709491751004
  (1, 3804)	0.1902902346515268
  (1, 3932)	0.24325511357721427
  (1, 4509)	0.4028245991060671
  (1, 2440)	0.33870544648398715
  (1, 3333)	0.20665394084233096
  (1, 5650)	0.360444144470318
  (1, 2335)	0.2162321275166079
  (1, 6738)	0.28986069568918
  (1, 6109)	0.3239762634465801
  (1, 3267)	0.2678713077029217
  :	:
  (4452, 2438)	0.4574160733416501
  (4452, 7280)	0.3968991650168732
  (4452, 3978)	0.4574160733416501
  (4452, 3290)	0.26370969643076225
  (4452, 3084)	0.22948428918295163
  (4452, 2236)	0.2676662072392096
  (4453, 387

**Training our logistic regression**

In [27]:
# Fit a logistic-regression classifier with default settings on the TF-IDF training features.
model = LogisticRegression()
model.fit(X=x_train_features, y=y_train)

LogisticRegression()

Evaluating The Trained Model 

In [37]:
# Take the first five rows of the training features as a small sample to try the model on.
check = x_train_features[:5]
print(check)

  (0, 5818)	0.22682143517864364
  (0, 2497)	0.2442158912653505
  (0, 694)	0.3171299579602537
  (0, 6264)	0.1898892037332199
  (0, 5800)	0.17558937755823417
  (0, 3262)	0.33791755486732394
  (0, 2049)	0.3034375179183143
  (0, 7300)	0.24288153842988894
  (0, 2724)	0.3544175987866074
  (0, 354)	0.3544175987866074
  (0, 7162)	0.2550284465664535
  (0, 258)	0.2379428657041507
  (0, 7222)	0.2173884735352799
  (0, 5512)	0.1898892037332199
  (1, 2555)	0.3840709491751004
  (1, 3804)	0.1902902346515268
  (1, 3932)	0.24325511357721427
  (1, 4509)	0.4028245991060671
  (1, 2440)	0.33870544648398715
  (1, 3333)	0.20665394084233096
  (1, 5650)	0.360444144470318
  (1, 2335)	0.2162321275166079
  (1, 6738)	0.28986069568918
  (1, 6109)	0.3239762634465801
  (1, 3267)	0.2678713077029217
  :	:
  (2, 1847)	0.3494431183302565
  (3, 5977)	0.3038019515813577
  (3, 5718)	0.2622294439163908
  (3, 6641)	0.17031540072986187
  (3, 5960)	0.29764860146805006
  (3, 4261)	0.2093116750833727
  (3, 2265)	0.3038019515813577

In [38]:
# Predict labels for the five-row sample (0 = spam, 1 = ham) and display them.
prediction = model.predict(X=check)
prediction

array([0, 1, 1, 1, 0], dtype=int32)

In [39]:
## Accuracy on the training data, i.e. the same rows the model was fitted on
prediction_train = model.predict(X=x_train_features)
accuracy_train = accuracy_score(y_true=y_train, y_pred=prediction_train)

In [44]:
# Report the fraction of training messages classified correctly.
print("Accuracy of the Train data: ",accuracy_train)

Accuracy of the Train data:  0.9661207089970832


In [41]:
## Accuracy on the held-out test data, which the model did not see during fitting
prediction_test = model.predict(X=x_test_features)
accuracy_test = accuracy_score(y_true=y_test, y_pred=prediction_test)


In [47]:
# Report the fraction of held-out test messages classified correctly.
print("Accuracy on the Test data: ",accuracy_test)

Accuracy on the Test data:  0.967713004484305


Building a predictive system

In [48]:
# A single example message to classify, wrapped in a list because the vectorizer
# is called on a collection of documents.
input_mail = ["WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."]

In [50]:
# Convert the example message to TF-IDF features with the already-fitted vectorizer
# (transform only, no refitting), then print the resulting sparse entries.
input_mail_features  = vectorizer.transform(input_mail)
print(input_mail_features)

  (0, 7232)	0.23349031480215404
  (0, 6973)	0.2418158640349298
  (0, 6969)	0.21949259159360465
  (0, 5787)	0.2108165557645179
  (0, 5568)	0.26903548829216267
  (0, 5426)	0.3006682571466212
  (0, 5222)	0.18383019342996207
  (0, 4576)	0.21211945126486964
  (0, 3797)	0.3006682571466212
  (0, 3375)	0.2229477361242192
  (0, 2061)	0.19318151011860882
  (0, 1820)	0.21635200626172546
  (0, 1762)	0.34326811967429915
  (0, 721)	0.24874354485129482
  (0, 280)	0.2418158640349298
  (0, 200)	0.3006682571466212


In [51]:
# Classify the example message. With the encoding used in this notebook,
# 0 means spam and 1 means ham.
input_mail_prediction = model.predict(input_mail_features)
input_mail_prediction

array([0], dtype=int32)