In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
raw_df = pd.read_csv('C://Users//mahesh//Downloads//mail_data.csv',na_values = 'Null')

In [3]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
raw_df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
raw_df.shape

(5572, 2)

In [6]:
raw_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Data Cleaning**

In [7]:
raw_df.isnull().sum()

Category    0
Message     0
dtype: int64

In [8]:
df = raw_df.where((pd.notnull(raw_df)),'')   # where is helpful when we use pd.notnull then it removes null values with ''

In [9]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**LabelEncoder**

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
le = LabelEncoder()

df['Category'] = le.fit_transform(df['Category'])

In [12]:
df['Category']

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int32

In [13]:
x = df['Message']

y = df['Category']

In [14]:
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [15]:
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int32

**train_test_split**

In [16]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.3)

In [17]:
X_train

4440                 i want to grasp your pretty booty :)
41      Did I forget to tell you ? I want you , I need...
3851    I to am looking forward to all the sex cuddlin...
2106                              I fetch yun or u fetch?
2570             Ultimately tor motive tui achieve korli.
                              ...                        
4290                   Okay, good, no problem, and thanx!
3156                                                Ok...
3836    I'm thinking that chennai forgot to come for a...
591     For ur chance to win a £250 wkly shopping spre...
2085               How are you. Wish you a great semester
Name: Message, Length: 3900, dtype: object

In [18]:
features = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase = True)


X_train_features = features.fit_transform(X_train)
X_test_features = features.transform(X_test)


In [19]:
print(X_train_features)

  (0, 1281)	0.5885938168820166
  (0, 4865)	0.46561829280598716
  (0, 2910)	0.5885938168820166
  (0, 6619)	0.3005232817286475
  (1, 6926)	0.35857315930617895
  (1, 4094)	0.35857315930617895
  (1, 5818)	0.3763678001172525
  (1, 949)	0.3763678001172525
  (1, 5987)	0.2655211868996353
  (1, 3813)	0.19631515539673947
  (1, 1855)	0.3213878114573099
  (1, 4269)	0.20089882891836458
  (1, 6077)	0.20786581696851364
  (1, 2657)	0.28793978300133355
  (1, 2076)	0.2093048634061129
  (1, 6619)	0.1921652643709315
  (2, 5605)	0.4969872567450317
  (2, 1896)	0.4969872567450317
  (2, 5444)	0.40371708880439355
  (2, 2675)	0.45681798184543027
  (2, 3788)	0.3665203636035377
  (3, 6927)	0.4244258293803631
  (3, 2538)	0.9054627078763602
  (4, 3576)	0.408248290463863
  (4, 734)	0.408248290463863
  :	:
  (3896, 4438)	1.0
  (3897, 1585)	0.47982236300233294
  (3897, 2662)	0.4330194506467287
  (3897, 6151)	0.47451829733373047
  (3897, 1026)	0.5145711931576671
  (3897, 1718)	0.3038314137679891
  (3898, 119)	0.2717430

In [20]:
print(X_test_features)

  (0, 6691)	0.2378963166710904
  (0, 6107)	0.1526579094856178
  (0, 6087)	0.27292074918076303
  (0, 5103)	0.27292074918076303
  (0, 4961)	0.27292074918076303
  (0, 4869)	0.28288110134583955
  (0, 3757)	0.26519489982382094
  (0, 3131)	0.28288110134583955
  (0, 3069)	0.3767198261329908
  (0, 2693)	0.144346572627006
  (0, 577)	0.28288110134583955
  (0, 498)	0.1988257162045253
  (0, 300)	0.28288110134583955
  (0, 46)	0.23488374119219882
  (0, 1)	0.220845413280455
  (1, 4023)	0.529111106073222
  (1, 1369)	0.8485525543123252
  (2, 6606)	0.578935887579241
  (2, 4052)	0.7318399836628194
  (2, 1967)	0.3595044872949999
  (3, 4305)	0.425057570447515
  (3, 3588)	0.5834608962361701
  (3, 3489)	0.2756288659899256
  (3, 1801)	0.6347662346105443
  (4, 6872)	0.4534105552827749
  :	:
  (1666, 5975)	0.3604787569486579
  (1666, 5809)	0.28179901933391455
  (1666, 5406)	0.2596851083357959
  (1666, 3777)	0.3313428584184347
  (1666, 3167)	0.3604787569486579
  (1666, 3125)	0.24393516282776517
  (1666, 1637)	0.

**Logistic Regression**

In [21]:
lr = LogisticRegression()

lr.fit(X_train_features,y_train)

In [22]:
from sklearn.metrics import accuracy_score 

In [23]:
X_train_features_pred = lr.predict(X_train_features)
X_train_features_acc = accuracy_score(y_train,X_train_features_pred)

In [43]:
print('Accuracy for Training data is : ',round((X_train_features_acc),3)*100,'%') 

Accuracy for Training data is :  96.6 %


In [44]:
X_test_features_pred = lr.predict(X_test_features)
X_test_features_acc = accuracy_score(y_test,X_test_features_pred)

In [46]:
print('Accuracy for Test data is : ',round((X_test_features_acc),3)*100,'%') 

Accuracy for Test data is :  96.0 %


**Prediction**

In [48]:
new_input_raw = ["Congratulations! You have won a $1,000 gift card. Click the link to claim your prize: "]



new_input = features.transform(new_input_raw)

new_input_pred = lr.predict(new_input)
new_input_pred


if new_input_pred[0] == 1 :
    print('it is a Spam Mail')
else :
    print('it is Not a Spam Mail')

it is a Spam Mail


In [49]:


def spam_ham_pred(user_input_raw) :

    #user_input_raw_list = user_input_raw.to_list()
    new_input = features.transform(user_input_raw)
    new_input_pred = lr.predict(new_input)
    
    

    if new_input_pred[0] == 1 :
        print('it is a Spam Mail')
    else :
        print('it is Not a Spam Mail')


In [50]:
spam_ham_pred(["Hi, are we still on for the meeting at 3 PM today? Let me know."])

it is Not a Spam Mail


In [52]:
spam_ham_pred(["Congratulations! You have won a $1,000 gift card. Click the link to claim your prize: "])

it is a Spam Mail


In [53]:
spam_ham_pred(["Please review the attached document and share your feedback."])

it is Not a Spam Mail


In [54]:
spam_ham_pred(["Can you send me the report by end of the day? Thanks!"])

it is Not a Spam Mail


In [55]:
spam_ham_pred(["Reminder: Your package will be delivered today between 4 PM and 6 PM."])

it is Not a Spam Mail


In [57]:
spam_ham_pred(["Congratulations! You've been selected to receive a free iPhone. Click here to claim"])

it is a Spam Mail


In [150]:
new_input_raw = ["Exclusive deal! Buy 1, get 3 free. Offer expires tonight!"]



new_input = features.transform(new_input_raw)

new_input_pred = lr.predict(new_input)
new_input_pred


if new_input_pred[0] == 1 :
    print('it is a Spam Mail')
else :
    print('it is Not a Spam Mail')

it is Not a Spam Mail


In [59]:
from sklearn.metrics import f1_score,r2_score,precision_score,recall_score,classification_report,confusion_matrix

In [62]:
print('accuracy_score :',accuracy_score(y_test,X_test_features_pred))
print('f1_score :',f1_score(y_test,X_test_features_pred))
print('r2_score :',r2_score(y_test,X_test_features_pred))
print('precision_score :',precision_score(y_test,X_test_features_pred))
print('recall_score :',recall_score(y_test,X_test_features_pred))
print('classification_report :',classification_report(y_test,X_test_features_pred))
print('confusion_matrix :')
print(confusion_matrix(y_test,X_test_features_pred))

accuracy_score : 0.9599282296650717
f1_score : 0.8329177057356608
r2_score : 0.6658862407430053
precision_score : 0.9940476190476191
recall_score : 0.7167381974248928
classification_report :               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1439
           1       0.99      0.72      0.83       233

    accuracy                           0.96      1672
   macro avg       0.98      0.86      0.91      1672
weighted avg       0.96      0.96      0.96      1672

confusion_matrix :
[[1438    1]
 [  66  167]]
