In [1]:
#importing the dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
#text to feature vector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
#Data collection and preprocessing
#csv to pandas data frame
data = pd.read_csv('D:\\Projects\\Machine Learning\\Spam Mail Detection\\mail_data.csv')

In [3]:
print (data)

      Category                                            Message
0          ham  Go until jurong point, crazy.. Available only ...
1          ham                      Ok lar... Joking wif u oni...
2         spam  Free entry in 2 a wkly comp to win FA Cup fina...
3          ham  U dun say so early hor... U c already then say...
4          ham  Nah I don't think he goes to usf, he lives aro...
...        ...                                                ...
10738      ham  Subject: put the 10 on the ft\r\nthe transport...
10739      ham  Subject: 3 / 4 / 2000 and following noms\r\nhp...
10740      ham  Subject: calpine daily gas nomination\r\n>\r\n...
10741      ham  Subject: industrial worksheets for august 2000...
10742     spam  Subject: important online banking alert\r\ndea...

[10743 rows x 2 columns]


In [4]:
#replace the null value from null strings
mail_data = data.where((pd.notnull(data)),'')

In [5]:
#printing first five rows to view the sample of the data we have
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
#checking the number of rows and columns
mail_data.shape

(10743, 2)

In [7]:
#encoding the labels that is the ham as 1 and spam as 0
mail_data.loc[mail_data['Category']=='spam','Category'] = 0
mail_data.loc[mail_data['Category']=='ham','Category'] = 1

In [8]:
# sapm are 0 and ham ar 1
#slpiting the data into text and labels
#taking input as message(x) and output as y

X = mail_data['Message']
Y = mail_data['Category']

In [9]:
print(X)


0        Go until jurong point, crazy.. Available only ...
1                            Ok lar... Joking wif u oni...
2        Free entry in 2 a wkly comp to win FA Cup fina...
3        U dun say so early hor... U c already then say...
4        Nah I don't think he goes to usf, he lives aro...
                               ...                        
10738    Subject: put the 10 on the ft\r\nthe transport...
10739    Subject: 3 / 4 / 2000 and following noms\r\nhp...
10740    Subject: calpine daily gas nomination\r\n>\r\n...
10741    Subject: industrial worksheets for august 2000...
10742    Subject: important online banking alert\r\ndea...
Name: Message, Length: 10743, dtype: object


In [10]:
print(Y)

0        1
1        1
2        0
3        1
4        1
        ..
10738    1
10739    1
10740    1
10741    1
10742    0
Name: Category, Length: 10743, dtype: object


In [11]:
# Splitting the data into training data and test data

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state = 4)

In [12]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(10743,)
(8594,)
(2149,)


In [13]:
feature_extraction = TfidfVectorizer(min_df =1, stop_words='english',lowercase = True) 
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

#converting y train and y test values as integers
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [14]:
print(X_train)

7336     Subject: fw : revised : gas logistics netco re...
3511     I just saw ron burgundy captaining a party boa...
1127     Not tonight mate. Catching up on some sleep. T...
1178                            Just nw i came to hme da..
3057                              Webpage s not available!
                               ...                        
6017     Subject: goat % rdn _ word wheresoever\r\ninve...
709      To review and KEEP the fantastic Nokia N-Gage ...
10679    Subject: special promotion - get a las vegas v...
8366     Subject: info\r\nneh b 27 q 71 tojlmjuob 2 wj ...
1146     Thank you, winner notified by sms. Good Luck! ...
Name: Message, Length: 8594, dtype: object


In [15]:
print(X_train_features)

  (0, 5298)	0.07913618892190016
  (0, 37812)	0.10054727903309521
  (0, 13596)	0.058632116658492134
  (0, 34569)	0.07112473409930432
  (0, 7506)	0.04937728199057548
  (0, 37689)	0.07961878987009147
  (0, 10733)	0.11191432721482159
  (0, 11373)	0.04450669693259767
  (0, 28519)	0.2156381360562637
  (0, 26901)	0.10905770670285507
  (0, 2213)	0.07836652231487516
  (0, 474)	0.04666179255895615
  (0, 1271)	0.09217992155843856
  (0, 209)	0.06000763489347967
  (0, 25817)	0.0662835378736576
  (0, 44534)	0.07244551590205553
  (0, 39629)	0.055697382295382514
  (0, 38327)	0.062479798046863336
  (0, 42269)	0.1773810279994227
  (0, 29948)	0.055129684977550227
  (0, 33054)	0.06344441627549591
  (0, 9473)	0.12206810314027552
  (0, 7229)	0.08444135748075131
  (0, 38673)	0.08667406724528615
  (0, 34572)	0.09178988717154238
  :	:
  (8592, 2179)	0.06047555245281106
  (8592, 1344)	0.04723642110448271
  (8592, 2968)	0.12762887499251446
  (8592, 1846)	0.09447284220896542
  (8592, 1453)	0.05114360770442875
  (

In [16]:
print(Y_train)


7336     1
3511     1
1127     1
1178     1
3057     1
        ..
6017     0
709      0
10679    0
8366     0
1146     0
Name: Category, Length: 8594, dtype: int32


In [17]:
#Logistic Regression

model = LogisticRegression()

In [18]:
#training the logistic regressionmodel with the training daat

model.fit(X_train_features, Y_train)

In [19]:
# Prediction on training data
predictTrain = model.predict(X_train_features)
accuracy_on_trainigData = accuracy_score(Y_train,predictTrain)

In [20]:
print('Accuracy on trining data = ',accuracy_on_trainigData)

Accuracy on trining data =  0.9605538747963696


In [21]:
#Checking the prediction with Test Data

predictTest = model.predict(X_test_features)
accuracy_on_testingData = accuracy_score(Y_test,predictTest)

In [22]:
print('Accuracy on trining data = ',accuracy_on_testingData)

Accuracy on trining data =  0.946021405304793


In [23]:
#Therefore precition is very good

In [24]:
#Building a Predictive system


input_mail = ["Cloud space meets ASUS Second Brain.Let AI make your work twice as efficient.Be the first to experience the three major features of xBrain.Natural language search.Interact with AI through natural language, enhancing search precision and work efficiency.Summarize document quickly.Fast summary of key information for selected documents, capture the essence of the files.Support multi-language document translation.Translate selected documents into Chinese or English, speeding up cross-language communication.Exclusive offer at US$4.5/month for ASUS members.3 Days Only!"]

#convert text to feature vector
input_data_feature = feature_extraction.transform(input_mail)
#making preciction

prediction = model.predict(input_data_feature)



if (prediction[0]==1):
    print('Ham Mail')
else:
    print('Spam Mail')


Ham Mail
