In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
raw_mail_data=pd.read_csv("/content/mail_data.csv")

In [4]:
raw_mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
mail_data=raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [6]:
mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


Labeling data

In [7]:
#label spam mail as 0 and not spam mail as 1

In [8]:
mail_data.loc[mail_data['Category']=='spam','Category',]= 0
mail_data.loc[mail_data['Category']=='ham','Category',]=1

In [9]:
#seperating data as texts and labels

In [10]:
x=mail_data["Message"]
y=mail_data['Category']

In [11]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

Training the data

In [12]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=1)



In [13]:
#converting the text into numerical values using feature extraction

In [14]:
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

In [15]:
x_train_features=feature_extraction.fit_transform(x_train)
x_test_features=feature_extraction.transform(x_test)

In [16]:
print(x_train_features)

  (0, 2177)	0.23309557828846914
  (0, 3264)	0.2631513993394141
  (0, 3619)	0.3488783966153327
  (0, 6409)	0.3326362323256634
  (0, 5295)	0.3326362323256634
  (0, 1516)	0.3488783966153327
  (0, 910)	0.3326362323256634
  (0, 1025)	0.3326362323256634
  (0, 3599)	0.31217352532901926
  (0, 5115)	0.3048700690764884
  (1, 5318)	0.6366191779536516
  (1, 4363)	0.3858585853754368
  (1, 2916)	0.6677044064207481
  (2, 6463)	0.3522847311040471
  (2, 5011)	0.5648129493203621
  (2, 260)	0.3791945214013413
  (2, 4356)	0.5648129493203621
  (2, 6675)	0.3067240579580042
  (3, 4425)	0.7814747758273017
  (3, 3477)	0.6239368355416024
  (4, 7233)	0.30283664176761355
  (4, 1525)	0.2709757410506353
  (4, 2232)	0.23277522691456723
  (4, 4600)	0.22842310336648117
  (4, 5849)	0.2709757410506353
  :	:
  (4453, 4685)	0.3013920827617892
  (4453, 7009)	0.20821674441221627
  (4453, 312)	0.14076796933756813
  (4453, 3788)	0.09911204230079218
  (4453, 6904)	0.11983290605228093
  (4453, 7238)	0.26264651983729037
  (4453,

training the data

In [17]:
y_train.value_counts()

1    3859
0     598
Name: Category, dtype: int64

In [18]:
model=LogisticRegression()

In [19]:
y_train=y_train.astype('int')
y_test=y_test.astype('int')

In [20]:
model.fit(x_train_features,y_train)

In [21]:
#testing the accuracy

In [22]:
x_train_prediction=model.predict(x_train_features)

In [23]:
x_train_accuracy=accuracy_score(x_train_prediction,y_train)

In [24]:
x_train_accuracy

0.9685887368184878

In [25]:
x_test_prediction=model.predict(x_test_features)
x_test_accuracy=accuracy_score(x_test_prediction,y_test)

In [26]:
x_test_accuracy

0.9614349775784753

EVALUATING THE MODEL

In [28]:
input_mail = ["U dun say so early hor... U c already then say..."]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')


[1]
Ham mail
