In [41]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [42]:
# Load the raw spam/ham e-mail dataset.
# The path defaults to the original Colab upload location; set the SPAM_HAM_CSV
# environment variable to point at the file when running outside of Colab.
data = pd.read_csv(os.environ.get('SPAM_HAM_CSV', '/content/spam_ham_dataset.csv'))

In [43]:
# Preview the first rows to check that the CSV was parsed as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [44]:
# Summarise the columns: names, non-null counts and dtypes.
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [45]:
# Count the missing values in each column; the result shows there are none.
data.isna().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [46]:
# List the column labels of the frame.
data.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

In [47]:
# Drop 'Unnamed: 0' and 'label_num'; these two columns are not necessary for the
# rest of the notebook.
# errors='ignore' keeps the cell safe to re-run: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
data = data.drop(columns=['Unnamed: 0', 'label_num'], errors='ignore')

In [48]:
# Show the frame after the column drop.
data

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...
...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...
5169,ham,Subject: industrial worksheets for august 2000...


In [49]:
# Distinct values present in the 'label' column.
data['label'].unique()

array(['ham', 'spam'], dtype=object)

In [50]:
# 'ham' marks e-mails that are not spam.
# 'spam' marks e-mails that are spam.

In [51]:
# Encode the string labels as integers: spam -> 0, ham -> 1.
label_codes = {'spam': 0, 'ham': 1}

# Values that are not keys of label_codes are passed through unchanged, so
# re-running this cell on already-encoded labels is a no-op. A plain
# .map(label_codes) would turn the existing 0/1 codes into NaN on a second run.
data['label'] = data['label'].map(lambda value: label_codes.get(value, value))

# Fail early if any label is something other than 'spam' / 'ham' (or 0 / 1).
assert set(data['label'].unique()) <= {0, 1}, "unexpected values in 'label' column"

In [52]:
# Show the frame again to check the encoded 'label' column.
data

Unnamed: 0,label,text
0,1,Subject: enron methanol ; meter # : 988291\r\n...
1,1,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,1,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,0,"Subject: photoshop , windows , office . cheap ..."
4,1,Subject: re : indian springs\r\nthis deal is t...
...,...,...
5166,1,Subject: put the 10 on the ft\r\nthe transport...
5167,1,Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168,1,Subject: calpine daily gas nomination\r\n>\r\n...
5169,1,Subject: industrial worksheets for august 2000...


In [53]:
# Class labels, used as the prediction target.
# NOTE: this notebook stores the target in `x` and the e-mail text in `y`, which is
# the reverse of the usual X (features) / y (target) naming.
x = data['label']

In [54]:
# Inspect the label Series.
x

0       1
1       1
2       1
3       0
4       1
       ..
5166    1
5167    1
5168    1
5169    1
5170    0
Name: label, Length: 5171, dtype: int64

In [55]:
# Raw e-mail text; this is the model input even though it is named `y`.
y = data['text']

In [56]:
# Inspect the text Series.
y

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object

In [57]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible. Here x carries the labels and y carries the e-mail text.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)

In [58]:
# Cast both label splits to an integer dtype.
x_train, x_test = (split.astype('int') for split in (x_train, x_test))

In [59]:
# Shapes of the full label Series and of its train / test parts, one per line.
print(x.shape, x_train.shape, x_test.shape, sep='\n')

(5171,)
(4136,)
(1035,)


In [60]:
# TF-IDF vectoriser: min_df=1 keeps every term that occurs in at least one
# document, and stop_words='english' removes the built-in English stop-word list.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
)

In [61]:
# Fit the vectoriser on the training text only and convert that text to TF-IDF features.
y_train_feature = feature_extraction.fit_transform(y_train)
# Convert the test text with the already-fitted vectoriser (no refitting on held-out data).
y_test_feature = feature_extraction.transform(y_test)

In [62]:
# Print the sparse TF-IDF training matrix as "(row, column) weight" entries.
# NOTE(review): this lists individual stored entries and produces very long output;
# y_train_feature.shape would give a more compact summary.
print(y_train_feature)

  (0, 689)	0.1653363799711991
  (0, 22059)	0.2409653776301331
  (0, 26214)	0.24343906523658793
  (0, 26206)	0.22646339166164928
  (0, 13473)	0.18055584198524424
  (0, 19293)	0.12264904667090396
  (0, 438)	0.32018497357631426
  (0, 1)	0.4087711220509585
  (0, 1222)	0.17206058287920536
  (0, 39106)	0.20306004048486453
  (0, 39271)	0.19706087045336393
  (0, 967)	0.10645159906229842
  (0, 341)	0.14291082004535052
  (0, 36336)	0.24036604629563116
  (0, 4523)	0.19238314641824092
  (0, 21717)	0.3736750758884378
  (0, 16671)	0.32948933287747906
  (0, 38409)	0.04884489094253212
  (1, 6371)	0.1141241027098648
  (1, 28842)	0.05843573968212522
  (1, 12592)	0.09412228855640664
  (1, 39330)	0.10884564116514943
  (1, 111)	0.07294538418843977
  (1, 36766)	0.10484660965378771
  (1, 132)	0.07678427529616587
  :	:
  (4134, 341)	0.13113591464748253
  (4134, 38409)	0.04482039531767333
  (4135, 15108)	0.1634763393643177
  (4135, 3430)	0.1974915577650012
  (4135, 3414)	0.1974915577650012
  (4135, 3330)	0.379

In [63]:
# Training the model: create a logistic-regression classifier with default settings.
model = LogisticRegression()

In [64]:
# Fit on the TF-IDF training features (y_train_feature) with the labels (x_train) as target.
model.fit(y_train_feature,x_train)

In [65]:
# Predict on the training data so the training accuracy can be checked.
prediction = model.predict(y_train_feature)

In [66]:
# Accuracy on the training data: true labels are x_train, predictions come from the model.
accuracy = accuracy_score(y_true=x_train, y_pred=prediction)
print(accuracy)

0.9956479690522244


In [67]:
# Repeat the evaluation on the held-out test split.
prediction_on_test_data = model.predict(y_test_feature)
accuracy_on_test_data = accuracy_score(
    y_true=x_test,
    y_pred=prediction_on_test_data,
)
print(accuracy_on_test_data)

0.991304347826087


In [68]:
# Predicting the class of a single mail with the trained model.
# Two sample messages are defined; only input_mail1 is passed to the model below,
# input_mail is defined but not scored.
input_mail = ["Subject: re : epgtgloria , the difference between the two pipes for july 2000 is the actuals came in lower than what was nominated and scheduled on mops . there isn ' t anything we can do about that difference , hopefully there is some kind of oba that takes those variances .sabrafrom : barkowsky , gloria g .sent : friday , june 22 , 2001 4 : 50 pmto : garcia , clarissa ; farmer , daren j . ; dinari , sabra l subject : epgtclarissa - thanks so much for all your help with this pipe ! everything looks great . i just have a couple of pathsthat i need to finish it :january 2000 - i need deal # 854688 pathed for epgt and for tetc . according to the invoice , we should have11 , 129 dth on the interconnect .february 2000 - i need deal # 871184 pathed for hpl and chan . hpl should have 3 , 600 dth and chan shouldhave 11 , 500 dth on the interconnect .july 2000 - deal # 871172 has an interconnect issue . according to mops contract # 105124 , they received 8 , 275 dthon the matagorda 624 , but according to epgt , they delivered 10 , 362 dth to hpl ( ? ) could this possiblyneed to be split somehow , or do you have any other ideas ?let me know . thanks , gloria 3 - 7118"]
input_mail1 = ["The Exam fee is 200"]

# Vectorise with the already-fitted TF-IDF vocabulary, then classify.
predict = feature_extraction.transform(input_mail1)
final_prediction = model.predict(predict)
print(final_prediction)

# Label 1 is reported as ham; any other value is reported as spam.
print('Ham mail' if final_prediction[0] == 1 else 'Spam mail')

[1]
Ham mail
