In [184]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [186]:
# Load the spam/ham dataset from a CSV file located in the working directory.
data=pd.read_csv("spam_ham_dataset (1).csv")
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,label,text
0,ham,enron methanol ; meter # : 988291\r\nthis is ...
1,ham,"hpl nom for january 9 , 2001\r\n( see attache..."
2,ham,"neon retreat\r\nho ho ho , we ' re around to ..."
3,spam,"photoshop , windows , office . cheap . main t..."
4,ham,re : indian springs\r\nthis deal is to book t...


In [187]:
# Rename the columns to descriptive names (class label, message text).
# NOTE(review): this positional assignment assumes the frame has exactly two
# columns at this point — confirm against the CSV.
data.columns = ["Category","Message"]

In [188]:
data.isnull().sum()

Category    0
Message     0
dtype: int64

In [189]:
# Character count of every message, stored as an extra column.
data["Message Length"] = data["Message"].map(len)

In [190]:
# Summary statistics of the message length, computed separately per class.
ham_rows = data["Category"] == "ham"
spam_rows = data["Category"] == "spam"
ham_desc = data.loc[ham_rows, "Message Length"].describe()
spam_desc = data.loc[spam_rows, "Message Length"].describe()

print("Ham Messege Length Description:\n", ham_desc)
# Four empty lines separate the two reports (three newlines + print's own).
print("\n" * 3)
print("Spam Message Length Description:\n", spam_desc)

Ham Messege Length Description:
 count     3672.000000
mean       968.998366
std       1382.830333
min         10.000000
25%        223.750000
50%        522.000000
75%       1219.250000
max      32250.000000
Name: Message Length, dtype: float64




Spam Message Length Description:
 count     1499.000000
mean      1215.244163
std       1825.991729
min          3.000000
25%        275.000000
50%        568.000000
75%       1245.500000
max      22065.000000
Name: Message Length, dtype: float64


In [191]:
data.describe(include="all")

Unnamed: 0,Category,Message,Message Length
count,5171,5171,5171.0
unique,2,4993,
top,ham,calpine daily gas nomination\r\n>\r\nricky a ...,
freq,3672,20,
mean,,,1040.381551
std,,,1528.517097
min,,,3.0
25%,,,236.0
50%,,,532.0
75%,,,1229.0


In [192]:
data["Category"].value_counts()


ham     3672
spam    1499
Name: Category, dtype: int64

In [193]:
# Count each class once and look the counts up BY LABEL.
# Indexing the value_counts() result with [0] / [1] is positional access on a
# label-indexed Series: it is deprecated in recent pandas and only gives the
# right class while "ham" happens to be the most frequent label.
class_counts = data["Category"].value_counts()
ham_count = class_counts["ham"]
spam_count = class_counts["spam"]

# Total number of rows in the dataset.
total_count = len(data)

print("Ham contains:{:.2f}% of total data.".format(ham_count/total_count*100))
print("Spam contains:{:.2f}% of total data.".format(spam_count/total_count*100))

Ham contains:71.01% of total data.
Spam contains:28.99% of total data.


In [194]:
# Balance the classes by randomly undersampling the majority class (ham)
# down to the size of the minority class (spam).

# Fixed seed so the balanced subset -- and every number computed from it --
# is identical after a kernel restart. 42 matches the random_state used for
# the train/test splits later in this notebook.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Boolean masks selecting each class.
is_spam = data["Category"] == "spam"
is_ham = data["Category"] == "ham"

# Number of rows in the minority and majority class.
minority_len = int(is_spam.sum())
majority_len = int(is_ham.sum())

# Row labels belonging to each class.
minority_indices = data.index[is_spam]
majority_indices = data.index[is_ham]

# Draw as many majority rows as there are minority rows, without repeats.
random_majority_indices = rng.choice(
    majority_indices.to_numpy(),
    size=minority_len,
    replace=False,
)

# Row labels of the balanced dataset: every spam row plus the sampled ham rows.
undersampled_indices = np.concatenate([minority_indices, random_majority_indices])

# Select those rows, shuffle them reproducibly, and renumber from 0.
# reset_index(drop=True) discards the old labels directly instead of
# materialising them as an "index" column that then has to be dropped.
df = (
    data.loc[undersampled_indices]
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

In [195]:
df.shape


(2998, 3)

In [196]:
df["Category"].value_counts()

ham     1499
spam    1499
Name: Category, dtype: int64

In [197]:
df.head()

Unnamed: 0,Category,Message,Message Length
0,ham,"re : mtr 6063 - mokeen lateral\r\nvance ,\r\n...",957
1,spam,= ? iso - 8859 - 7 ? q ? = 5 b = 3 f = 5 d _ ...,608
2,ham,june vacation\r\nplease submit your june vaca...,94
3,spam,alert : spam prevention\r\nr 3 mov 3\r\nsll 0...,212
4,ham,5 th changes @ duke and air liquide\r\n- - - ...,440


In [198]:
# Numeric target: 0 for ham, 1 for spam.
label_codes = {"ham": 0, "spam": 1}
df["Label"] = df["Category"].map(label_codes)

In [199]:
df.head()

Unnamed: 0,Category,Message,Message Length,Label
0,ham,"re : mtr 6063 - mokeen lateral\r\nvance ,\r\n...",957,0
1,spam,= ? iso - 8859 - 7 ? q ? = 5 b = 3 f = 5 d _ ...,608,1
2,ham,june vacation\r\nplease submit your june vaca...,94,0
3,spam,alert : spam prevention\r\nr 3 mov 3\r\nsll 0...,212,1
4,ham,5 th changes @ duke and air liquide\r\n- - - ...,440,0


In [200]:
# Import libraries to perform word tokenization

In [201]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()

In [202]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sweta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [203]:
# Build the stop-word lookup ONCE. The previous version rebuilt
# set(stopwords.words("english")) for every single token of every message,
# which made this cell needlessly slow without changing the result.
# NOTE(review): this needs the NLTK "stopwords" corpus; the notebook only
# downloads "punkt" explicitly -- confirm it is installed on a fresh machine.
english_stopwords = set(stopwords.words("english"))

# Cleaned, stemmed version of every message, in the same order as df.
corpus = []

for message in df["Message"]:
    # Replace everything that is not an ASCII letter with a space, then lowercase.
    letters_only = re.sub("[^a-zA-Z]", " ", message).lower()

    # Split on whitespace, drop stop words, and stem what remains.
    stems = [
        stemmer.stem(token)
        for token in letters_only.split()
        if token not in english_stopwords
    ]

    # Store the message as one space-separated string of stems.
    corpus.append(" ".join(stems))

In [204]:
from tensorflow.keras.preprocessing.text import one_hot
# Size of the integer index space the words are encoded into.
vocab_size = 10000

# Encode every cleaned document as a list of integer word indices.
oneHot_doc = [one_hot(document, n=vocab_size) for document in corpus]

In [205]:
df["Message Length"].describe()

count     2998.00000
mean      1095.43429
std       1679.66201
min          3.00000
25%        243.25000
50%        538.50000
75%       1222.75000
max      32250.00000
Name: Message Length, dtype: float64

In [206]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fixed sequence length fed to the network.
sentence_len = 200

# Bring every encoded document to exactly `sentence_len` entries; shorter
# documents are padded at the front ("pre").
embedded_doc = pad_sequences(oneHot_doc, maxlen=sentence_len, padding="pre")

In [207]:
# One row per message, one column per sequence position.
extract_features = pd.DataFrame(embedded_doc)

# Numeric class label of each message.
target = df["Label"]

In [208]:
# Put features and label side by side. concat(axis=1) aligns rows on the index:
# extract_features gets a default integer index when it is built, and df had
# its index reset after shuffling, so the rows are expected to line up.
# NOTE(review): if either frame is ever re-indexed, this alignment breaks.
df_final=pd.concat([extract_features,target],axis=1)

In [209]:
df_final.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,Label
0,0,0,0,0,0,0,0,0,0,0,...,6698,9134,759,6698,9569,9699,8024,1174,2007,0
1,0,0,0,0,0,0,0,0,0,0,...,7373,8697,6277,2329,2523,7517,8782,7002,5298,1
2,0,0,0,0,0,0,0,0,0,0,...,9074,6698,8971,9796,9074,4714,8024,4990,2007,0
3,0,0,0,0,0,0,0,0,0,0,...,410,9286,5868,8758,3288,6364,3824,3027,3270,1
4,0,0,0,0,0,0,0,0,0,0,...,803,1748,7903,291,5644,1556,291,5644,1556,0


In [210]:
# Feature matrix: every column except the label.
X = df_final.drop(columns=["Label"])
# Target vector.
y = df_final["Label"]

In [211]:
from sklearn.model_selection import train_test_split

In [212]:
# Hold out 15% of the rows as the final test set; the remainder is split
# again into training and validation data.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

In [213]:
# Carve a validation set (15% of the remaining rows) out of the
# train+validation portion.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.15, random_state=42
)

In [214]:
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

In [215]:
model=Sequential()

In [216]:
# Dimensionality of the learned word embedding vectors.
feature_num = 100

# Build the whole network in one expression. The previous version called
# model.add(...) on an already existing model, so re-running this cell stacked
# a second Embedding/LSTM/Dense set onto the same model. Recreating the model
# here makes the cell idempotent.
model = Sequential(
    [
        # Maps each integer word index to a feature_num-dimensional vector.
        Embedding(
            input_dim=vocab_size,
            output_dim=feature_num,
            input_length=sentence_len,
        ),
        # Recurrent layer with 128 units over the embedded sequence.
        LSTM(units=128),
        # Single sigmoid unit: output is read as the spam score in [0, 1].
        Dense(units=1, activation="sigmoid"),
    ]
)


In [217]:
from tensorflow.keras.optimizers import Adam
# Binary classification setup: cross-entropy loss, Adam optimiser, and
# accuracy reported during training.
adam_optimizer = Adam(learning_rate=0.001)
model.compile(
    loss="binary_crossentropy",
    optimizer=adam_optimizer,
    metrics=["accuracy"],
)

In [218]:
# Train for 10 epochs, evaluating on the validation split after each epoch.
model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c206b97580>

In [219]:
# Model scores for the held-out test rows.
test_scores = model.predict(X_test)

# Scores above 0.6 are labelled spam (True), everything else ham (False).
# NOTE(review): classify_message further down cuts at 0.5, not 0.6 -- confirm
# which threshold is intended.
y_pred = test_scores > 0.6



In [220]:
from sklearn.metrics import accuracy_score

In [221]:
# Fraction of test messages whose thresholded prediction matches the label.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Score:{:.2f}%".format(100 * score))

Test Score:97.56%


In [222]:
def classify_message(model, message, threshold=0.5):
    """Classify one raw text message and print whether it is spam.

    The message is pushed through the same preprocessing that produced the
    training corpus in this notebook (letters only, lowercase, stop-word
    removal, Porter stemming, one_hot encoding, pre-padding), so the model
    sees tokens encoded the same way as during training.

    The previous implementation differed from training in several ways:
    it looped over the characters of the message, kept only the last
    sentence, compared whole sentences against the stop-word set (so nothing
    was ever removed), never stemmed, and left ``word`` undefined for empty
    input.

    Parameters
    ----------
    model
        Trained model exposing ``predict``; expected to return one score per
        input row.
    message : str
        Raw message text to classify.
    threshold : float, default 0.5
        Scores strictly above this value are reported as spam.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    # Keep ASCII letters only and lowercase, exactly as for the training corpus.
    letters_only = re.sub("[^a-zA-Z]", " ", message).lower()

    # Drop stop words token by token, then stem the remaining tokens.
    english_stopwords = set(stopwords.words("english"))
    stems = [
        stemmer.stem(token)
        for token in letters_only.split()
        if token not in english_stopwords
    ]

    # Encode the cleaned text as integer word indices (a batch of one document).
    encoded = [one_hot(" ".join(stems), n=vocab_size)]

    # Pad to the fixed length the network was built for.
    text = pad_sequences(encoded, maxlen=sentence_len, padding="pre")

    # predict returns a (1, 1) array for a single document; take the scalar.
    spam_score = float(model.predict(text)[0][0])

    if spam_score > threshold:
        print("It is a spam")
    else:
        print("It is not a spam")

In [223]:
message1="I am having a bad day and I would like to have a break today"
message2="This is to inform you had won a lottery and the subscription will end in a week so call us."

In [224]:
classify_message(model,message1)

It is a spam


In [229]:
# Define the text in the same cell that classifies it. Relying on a `message`
# assigned in a different, later-positioned cell breaks a top-to-bottom run:
# the name would instead resolve to the loop variable left over from the
# corpus-building cell.
message = "You are awarded a Nikon Digital Camera. Call now"
classify_message(model,message)

It is a spam


In [228]:
message = "You are awarded a Nikon Digital Camera. Call now"