Importing the Dependencies

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
pd.set_option('display.max_colwidth', None)

Data Collection & Pre-Processing

In [4]:
data = pd.read_csv('mail_data.csv')

In [5]:
data.shape

(5572, 2)

In [6]:
data.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"
5,spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
8,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030


In [7]:
data.isnull().sum()

Category    0
Message     0
dtype: int64

In [8]:
data.duplicated().sum()

415

In [9]:
data.drop_duplicates(inplace=True)
data.duplicated().sum()

0

In [10]:
data.shape

(5157, 2)

In [11]:
# label spam mail as 0;  ham mail as 1;

data['Category'] = data['Category'].replace({'spam' : 1 , 'ham' : 0})
data.head()

  data['Category'] = data['Category'].replace({'spam' : 1 , 'ham' : 0})


Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives around here though"


In [12]:
data.Category.value_counts()

Category
0    4516
1     641
Name: count, dtype: int64

In [13]:
# separating the data as texts and label

X = data['Message']

Y = data['Category']

In [14]:
X

0                                                        Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
1                                                                                                                                          Ok lar... Joking wif u oni...
2            Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3                                                                                                                      U dun say so early hor... U c already then say...
4                                                                                                          Nah I don't think he goes to usf, he lives around here though
                                                                                      ...                                                                  

In [15]:
# transform the text data to feature vectors that can be used as input to the Logistic regression

vectorizer = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True,  token_pattern=r'(?u)\b\w+\b')

X = vectorizer.fit_transform(X)

# convert Y_train and Y_test values as integers

Y = Y.astype('int')

In [16]:
X

<5157x8478 sparse matrix of type '<class 'numpy.float64'>'
	with 43595 stored elements in Compressed Sparse Row format>

In [17]:
Y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5157, dtype: int32



Splitting the data into training data & test data

In [19]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X, Y = sm.fit_resample(X, Y)

Y.value_counts()

Category
0    4516
1    4516
Name: count, dtype: int64

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)


Training the Model

Logistic Regression

In [22]:
model = LogisticRegression()
model.fit(X_train, Y_train)

In [23]:
# prediction on training data

Y_pred = model.predict(X_train)
print('Accuracy on train data : ',accuracy_score(Y_train, Y_pred))

Accuracy on train data :  0.9936332179930796


In [24]:
# prediction on test data

Y_prediction = model.predict(X_test)
print('Accuracy on test data : ',accuracy_score(Y_test, Y_prediction))

Accuracy on test data :  0.9883785279468733


In [25]:
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# convert text to feature vectors
X = vectorizer.transform(input_mail)

# making prediction
prediction = model.predict(X)
print(prediction)

if (prediction[0]==0):
  print('Not a Spam Mail')

else:
  print('Spam Mail')

[0]
Not a Spam Mail


In [26]:
import pickle

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer,f)

with open('model.pkl', 'wb') as file:
    pickle.dump(model,file)

In [27]:
with open('tfidf_vectorizer.pkl', 'rb') as file:
    vectorizer1 = pickle.load(file)

with open('model.pkl', 'rb') as file:
    model1 = pickle.load(file)

In [28]:
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# convert text to feature vectors
X = vectorizer1.transform(input_mail)

# making prediction
prediction = model1.predict(X)
print(prediction)

if (prediction[0]==0):
  print('Not a Spam Mail')

else:
  print('Spam Mail')

[0]
Not a Spam Mail


In [56]:
df = data.tail(20)
df

Unnamed: 0,Category,Message
5550,0,"Cool, what time you think you can get here?"
5551,0,Wen did you get so spiritual and deep. That's great
5552,0,Have a safe trip to Nigeria. Wish you happiness and very soon company to share moments with
5554,0,Well keep in mind I've only got enough gas for one more round trip barring a sudden influx of cash
5555,0,Yeh. Indians was nice. Tho it did kane me off a bit he he. We shud go out 4 a drink sometime soon. Mite hav 2 go 2 da works 4 a laugh soon. Love Pete x x
5556,0,Yes i have. So that's why u texted. Pshew...missing you so much
5557,0,No. I meant the calculation is the same. That &lt;#&gt; units at &lt;#&gt; . This school is really expensive. Have you started practicing your accent. Because its important. And have you decided if you are doing 4years of dental school or if you'll just do the nmde exam.
5559,0,if you aren't here in the next &lt;#&gt; hours imma flip my shit
5560,0,Anything lor. Juz both of us lor.
5561,0,Get me out of this dump heap. My mom decided to come to lowes. BORING.


In [58]:
df.to_csv("Mails_In_CSV_File.csv")