In [56]:
# Import the dependencies (modules) used throughout the notebook

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split           # splits the data into training and test sets
from sklearn.feature_extraction.text import TfidfVectorizer    # converts text data into numerical (TF-IDF) values
from sklearn.linear_model import LogisticRegression            # classifier used to decide whether a mail is spam or not
from sklearn.metrics import accuracy_score                   # evaluates how well the model's predictions match the true labels


# Data collection and Preprocessing

In [57]:
# Load the data from the .csv file into a pandas DataFrame.
# The path is written as a raw string (r"...") so the backslashes of the
# Windows path are kept literally; without the prefix, "\T" and "\s" are
# invalid escape sequences that newer Python versions warn about.

raw_mail_data=pd.read_csv(r"D:\TSP\spam.csv")
print(raw_mail_data)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [10]:
# Build a cleaned copy of the raw frame in which every missing (NaN) entry
# is replaced by an empty string; cells that already hold a value are kept.

mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')

In [11]:
# preview the first 5 rows of the cleaned data
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# check the number of rows and columns in the mail_data DataFrame;
# .shape is a (rows, columns) tuple

mail_data.shape

(5572, 2)

# Label Encoding 


# spam=0
# ham=1

In [32]:
# Encode the labels in place: spam -> 0 and ham (non-spam) -> 1.
# The row masks are computed first, then each group of rows is overwritten.

is_spam = mail_data['Category'] == 'spam'
is_ham = mail_data['Category'] == 'ham'

mail_data.loc[is_spam, "Category"] = 0
mail_data.loc[is_ham, "Category"] = 1

In [33]:
# Separate the frame into the message texts (X) and their labels (Y).

X, Y = mail_data["Message"], mail_data["Category"]   # X: text messages, Y: labels (Category)


In [58]:
# show the message texts held in X
print("X is Only messages: \n",X,'\n')

X is Only messages: 
 0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object 



In [59]:
# show the encoded labels held in Y (spam=0, ham=1)
print("Y is Only labels :\n",Y)

Y is Only labels :
 0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


# Splitting the data into training data & testing (evaluation) data

In [60]:
# test_size=0.2 holds out 20% of the rows for testing; the remaining 80% are used for training.
# random_state=3 fixes the shuffle, so every run of this cell produces the same split.

split_parts = train_test_split(X, Y, test_size=0.2, random_state=3)
X_train, X_test, Y_train, Y_test = split_parts

# report the sizes: total, training (80%), testing (20%)
for subset in (X, X_train, X_test):
    print(subset.shape)


(5572,)
(4457,)
(1115,)


# Feature Extraction

In [61]:
# Turn the raw message text into TF-IDF feature vectors (numerical values)
# that can be used as input to the Logistic Regression model.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit the vectorizer on the training messages only, then apply the same
# fitted transformation to the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The labels (spam=0, ham=1) are still stored with dtype object;
# cast both label Series to integers.
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

In [41]:
# print the feature matrix built from the training messages
print(X_train_features)  # machines understand numbers better than raw text

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

# Training the Model with the Feature Data
# LogisticRegression


In [42]:
# create the LogisticRegression model instance (constructed with no arguments)

model = LogisticRegression()


In [50]:
# train the LogisticRegression model on the training data

model.fit(X_train_features,Y_train)  # fit on the numerical feature vectors together with their labels (Y_train holds 0/1)


# Evaluating the Trained Model
# checking accuracy on both training & testing data

In [45]:
# predict on the same data the model was trained on

predict_on_training_data = model.predict(X_train_features)

# compare the predicted labels with the true training labels
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=predict_on_training_data,
)


In [47]:
# report the accuracy measured on the training data
print("Accuracy on training data : ",accuracy_on_training_data)

Accuracy on training data :  0.9670181736594121


In [48]:
# Predict on the held-out test data; comparing this score with the training
# accuracy indicates whether the model generalizes or is over-fitted.

predict_on_test_data = model.predict(X_test_features)

accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=predict_on_test_data,
)


In [49]:
# report the accuracy measured on the test data
print("Accuracy on test data : ",accuracy_on_test_data)

Accuracy on test data :  0.9659192825112107


# Building Predictive Model(System)
# spam=0
# ham=1

In [55]:

input_sentence=["URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18"]

# convert the given text to feature vectors with the already-fitted vectorizer
input_data_features = feature_extraction.transform(input_sentence)

# predict one label per input sentence
prediction = model.predict(input_data_features)

print(prediction)

# label 1 means ham (non-spam); any other label is reported as spam
verdict = "It is a Ham (Non-spam) Mail " if prediction[0] == 1 else "It is a Spam Mail "
print(verdict)
    

[0]
It is a Spam Mail 
