In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import TfidfVectorizer as tv
from sklearn.linear_model import LogisticRegression as lr
from sklearn.metrics import accuracy_score as acs

### Loading the Data Set

In [2]:
# Read the labelled messages from the CSV file in the working directory.
mail_data = pd.read_csv(filepath_or_buffer='mail_data.csv')

In [3]:
# Display the loaded frame to inspect the Category and Message columns.
mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Pre-Processing

In [4]:
# Keep every non-null cell as is and substitute an empty string for each
# missing value; the result is a new frame, mail_data itself is not modified.
data = mail_data.where(mail_data.notna(), '')

In [5]:
# Preview the first five rows of the cleaned frame.
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Check the number of rows and columns in the DataFrame.
data.shape

(5572, 2)

### Applying Label Encoding (spam = 0, ham = 1)

In [38]:
# Encode the 'spam' label as 0, writing into the Category column in place.
data.loc[data['Category'].eq('spam'), 'Category'] = 0

In [39]:
# Encode the 'ham' label as 1, writing into the Category column in place.
data.loc[data['Category'].eq('ham'), 'Category'] = 1

In [40]:
# Display the frame to confirm Category now holds 0 (spam) / 1 (ham).
data

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [41]:
# Separate the frame into the message texts (x) and their labels (y).
x, y = data['Message'], data['Category']

In [42]:
# Display the message texts that will be vectorized.
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [43]:
# Display the encoded labels; the dtype is still object at this point and is
# cast to int further down, before training.
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

### Splitting Data into 80% Training and 20% Testing Data Sets

In [44]:
# Hold out 20% of the messages for testing; random_state pins the shuffle so
# the split is reproducible.
xTrain, xTest, yTrain, yTest = tts(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [45]:
# Report the sizes of the full set, the test split and the training split,
# one per line.
print(x.shape, xTest.shape, xTrain.shape, sep='\n')

(5572,)
(1115,)
(4457,)


### Feature Extraction

In [46]:
# Build the TF-IDF vectorizer: keep every term that appears in at least one
# document (min_df=1), drop English stop words, and lowercase the text before
# tokenizing. `lowercase` must be a real boolean: the string 'True' only worked
# by being truthy, and scikit-learn releases that validate parameter types
# reject it when the vectorizer is fitted.
feaExtract = tv(min_df=1, stop_words='english', lowercase=True)

In [47]:
# Fit the vectorizer on the training messages only and transform them, then
# transform the test messages with that already-fitted vectorizer. The order
# matters: the test split must not contribute to the fitted vocabulary/weights.
xTrainFea =  feaExtract.fit_transform(xTrain)
xTestFea = feaExtract.transform(xTest)

In [48]:
# Cast both label series (train and test) from object dtype to integers.
yTrain, yTest = yTrain.astype('int'), yTest.astype('int')

In [49]:
# Print the training feature matrix; each entry is shown as a (row, column)
# index pair followed by its TF-IDF weight.
print(xTrainFea)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

### Training the model

In [50]:
# Logistic regression classifier created with scikit-learn's default settings.
model = lr()

In [51]:
# Train the classifier on the TF-IDF training features and integer labels.
model.fit(xTrainFea,yTrain)

LogisticRegression()

### Evaluating the Trained Model

In [52]:
# Score the model on the same data it was fitted on (training accuracy).
predictionOnTraining = model.predict(xTrainFea)
accuracyOnTraining = acs(y_true=yTrain, y_pred=predictionOnTraining)

In [53]:
# Report the training accuracy as a percentage.
print('Accuracy on Training data: ',accuracyOnTraining*100)

Accuracy on Training data:  96.70181736594121


In [54]:
# Score the model on the held-out test split. Despite "Training" appearing in
# the variable names, both values below refer to the test data.
predictionOnTestTraining = model.predict(xTestFea)
accuracyOnTrainingTest = acs(y_true=yTest, y_pred=predictionOnTestTraining)


In [55]:
# Report accuracy on the held-out test split as a percentage. The label used
# to read "Training data", which mislabelled this test-set figure.
print('Accuracy on Test data: ',accuracyOnTrainingTest*100)

Accuracy on Training data:  96.59192825112108
