# Import Libraries

In [1]:
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
import numpy as np
import string
import tensorflow as tf
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score


# Import Data

In [2]:
data=pd.read_csv('spam_or_not_spam.csv')
data

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the NUMBE...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


# Preparing data for Model

In [3]:
data["label"].value_counts()

0    2500
1     500
Name: label, dtype: int64

In [4]:
data1=data.iloc[::, 0: -1]
data1

Unnamed: 0,email
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...
1,martin a posted tassos papadopoulos the greek ...
2,man threatens explosion in moscow thursday aug...
3,klez the virus that won t die already the most...
4,in adding cream to spaghetti carbonara which ...
...,...
2995,abc s good morning america ranks it the NUMBE...
2996,hyperlink hyperlink hyperlink let mortgage le...
2997,thank you for shopping with us gifts for all ...
2998,the famous ebay marketing e course learn to s...


In [5]:
data2=data.iloc[:: ,-1]
data2

0       0
1       0
2       0
3       0
4       0
       ..
2995    1
2996    1
2997    1
2998    1
2999    1
Name: label, Length: 3000, dtype: int64

Lowercase of words

In [6]:
data1['email']=data1['email'].str.lower()
data1['email']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1['email']=data1['email'].str.lower()


0        date wed number aug number number number numb...
1       martin a posted tassos papadopoulos the greek ...
2       man threatens explosion in moscow thursday aug...
3       klez the virus that won t die already the most...
4        in adding cream to spaghetti carbonara which ...
                              ...                        
2995     abc s good morning america ranks it the numbe...
2996     hyperlink hyperlink hyperlink let mortgage le...
2997     thank you for shopping with us gifts for all ...
2998     the famous ebay marketing e course learn to s...
2999     hello this is chinese traditional 子 件 number世...
Name: email, Length: 3000, dtype: object

remove stop words

In [7]:
stop_words=set(stopwords.words('english'))
stop_words
','.join(stop_words)
def remove_stop(x):
 return " " .join([word for word in str(x).split() if word not in stop_words])
data1['email']=data1['email'].apply(lambda x: remove_stop(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1['email']=data1['email'].apply(lambda x: remove_stop(x))


In [8]:
stop_words=set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [10]:
','.join(stop_words)

"once,it,all,shouldn,m,where,doing,and,is,aren't,ve,hasn,hers,doesn't,again,myself,some,did,too,ain,she,that,to,wasn't,we,you,through,t,didn't,me,yourself,when,by,o,can,them,than,you're,wasn,been,so,our,it's,from,because,be,most,shan,who,you've,re,don,wouldn't,no,he,such,ourselves,y,d,those,shan't,while,wouldn,you'd,couldn't,won't,few,they,i,she's,am,why,a,the,here,over,both,but,own,are,as,against,which,this,with,in,was,mustn't,didn,having,my,had,that'll,more,between,should,whom,weren't,other,itself,mightn,shouldn't,mustn,him,himself,after,does,above,should've,needn,your,yours,if,at,being,into,mightn't,have,hadn,will,how,isn't,same,or,do,their,up,off,hadn't,until,isn,herself,what,yourselves,about,for,you'll,nor,each,on,of,these,any,don't,needn't,just,theirs,below,now,out,ll,not,his,ours,only,down,under,its,doesn,ma,weren,hasn't,before,an,there,were,haven,during,couldn,has,her,further,then,haven't,themselves,aren,s,won,very"

In [9]:
def remove_stop(x):
 return " " .join([word for word in str(x).split() if word not in stop_words])
data1['email']=data1['email'].apply(lambda x: remove_stop(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1['email']=data1['email'].apply(lambda x: remove_stop(x))


In [10]:
p=string.punctuation
p

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

#Remove Punctuation

In [11]:
def remove_punct(x):
 return x.translate(str.maketrans("","",p))
data1['email']=data1['email'].apply(lambda x: remove_punct(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1['email']=data1['email'].apply(lambda x: remove_punct(x))


In [14]:
data1


Unnamed: 0,email
0,date wed number aug number number number numbe...
1,martin posted tassos papadopoulos greek sculpt...
2,man threatens explosion moscow thursday august...
3,klez virus die already prolific virus ever kle...
4,adding cream spaghetti carbonara effect pasta ...
...,...
2995,abc good morning america ranks number christma...
2996,hyperlink hyperlink hyperlink let mortgage len...
2997,thank shopping us gifts occasions free gift nu...
2998,famous ebay marketing e course learn sell comp...


# OneHotEncoding

In [12]:
import sklearn.preprocessing as sp
encoder_x=sp.OneHotEncoder()
x_1=encoder_x.fit_transform(data1).toarray()
x_1
pd.DataFrame(x_1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2862,2863,2864,2865,2866,2867,2868,2869,2870,2871
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
encoder_y=sp.OneHotEncoder()
y_1=encoder_y.fit_transform(np.c_[data2]).toarray()
y_1


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [14]:
y_1=pd.DataFrame(y_1)
y_1

Unnamed: 0,0,1
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
2995,0.0,1.0
2996,0.0,1.0
2997,0.0,1.0
2998,0.0,1.0


In [15]:
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y=train_test_split(x_1,y_1,random_state=42)

In [16]:
from tensorflow.keras import models
from tensorflow.keras import layers


network = models.Sequential()
network.add(layers.Flatten())
network.add(layers.Dense(1000 , input_dim=(2872,1) , activation = 'relu' ))
network.add(layers.Dense(512  , activation = 'relu' ))
network.add(layers.Dense(256, activation = 'relu' ))

network.add(layers.Dense(2, activation = 'softmax'))



In [17]:
network.compile(loss = 'categorical_crossentropy',optimizer = 'adam',metrics=['acc'])

In [18]:
network.fit(train_x , train_y , epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2bab3e348e0>

# Model Prediction

In [55]:
a=network.predict(test_x)
a



array([[0.7797355 , 0.22026457],
       [0.9844549 , 0.01554508],
       [0.94582576, 0.05417424],
       ...,
       [0.8846397 , 0.11536036],
       [0.11940142, 0.88059855],
       [0.9666999 , 0.03330004]], dtype=float32)

In [56]:
b=encoder_y.inverse_transform(a)
b


array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
    

# Model Save and Load

In [59]:
network.save("model6.h5")

In [60]:
model=tf.keras.models.load_model('model6.h5')
model

<keras.engine.sequential.Sequential at 0x1e4b3b07dc0>

In [47]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 2872)              0         
                                                                 
 dense_4 (Dense)             (None, 1000)              2873000   
                                                                 
 dense_5 (Dense)             (None, 512)               512512    
                                                                 
 dense_6 (Dense)             (None, 256)               131328    
                                                                 
 dense_7 (Dense)             (None, 2)                 514       
                                                                 
Total params: 3,517,354
Trainable params: 3,517,354
Non-trainable params: 0
_________________________________________________________________


# Model Evaluation

In [67]:
model.evaluate(train_x, train_y)[1]
model.evaluate(test_x, test_y)[1]



0.7906666398048401