In [1]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import nltk
from bs4 import BeautifulSoup
import re,string

import os
import warnings
warnings.filterwarnings('ignore')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
emails=pd.read_csv("/content/gdrive/MyDrive/emails.csv")

In [4]:
emails.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
#Collating all functions together and applying them for the 'IMDb reviews' dataset
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing emojis
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols                                                                         
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

#Text-encoding: UTF-8 encoder
def to_unicode(text):
    if isinstance(text, float):
        text = str(text)
    if isinstance(text, int):
        text = str(text)
    if not isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    return text

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)


#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

#Removing the noisy text
def denoise_text(text):
    text = to_unicode(text)
    text = strip_html(text)
    text = re.sub(r"http\S+", "", text)
    text = deEmojify(text)
    text = text.encode('ascii', 'ignore')
    text = to_unicode(text)
    text = remove_between_square_brackets(text)
    text = remove_special_characters(text)
    text = text.lower()
    return text


In [5]:
emails['text']=emails['text'].apply(denoise_text)

In [6]:
#Removing stopwords
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer

#Tokenization of text
tokenizer=ToktokTokenizer() #for every function

#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [7]:
from nltk.tokenize import word_tokenize,sent_tokenize


stop=set(stopwords.words('english'))
print(stop)

#Removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

#Apply function on review column
emails['text']=emails['text'].apply(remove_stopwords)

{'just', 'doesn', 'she', 'about', 'again', 'them', 'until', 'then', "mustn't", "wasn't", 'when', 'i', 'himself', 'how', 's', "isn't", 'y', 'it', 'there', 'whom', 'mightn', 'hers', 'an', "couldn't", "needn't", 'down', 'while', "hadn't", 'has', 'with', "hasn't", 'but', 'all', 'don', 'aren', 'll', 'yourselves', 'into', 'mustn', 'be', 'through', 'no', 'its', 'after', 'their', 'which', 'my', "should've", 'a', "shouldn't", 'her', 'should', 'and', "it's", 'have', "didn't", 'theirs', 'we', 'on', 'here', "don't", 'such', 'each', 'needn', 'below', 'isn', 'some', 'out', 'what', 'from', 't', 'didn', 'more', 'to', 'can', 'is', 'am', 'above', 'before', 'by', 'too', "shan't", 'yourself', 'in', 'myself', 'off', 'hasn', 'ours', "you'd", 'because', 'his', 'once', 'very', 're', 've', 'they', 'where', 'd', "doesn't", 'ma', "mightn't", "you'll", 'm', "weren't", "haven't", 'o', 'as', 'if', 'any', 'against', 'couldn', 'shouldn', 'or', 'being', "wouldn't", 'further', "aren't", 'were', 'doing', 'herself', 'was

In [8]:
#Stemming and Lemmatization
from nltk.stem import WordNetLemmatizer,SnowballStemmer
nltk.download('wordnet')

def simple_stemmer(text):
    ps=SnowballStemmer(language='english')
    return ' '.join([ps.stem(word) for word in tokenizer.tokenize(text)])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [9]:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

#Lemmatizer example
def lemmatize_all(sentence):
    wnl = WordNetLemmatizer()
    for word, tag in pos_tag(word_tokenize(sentence)):
        if tag.startswith("NN"):
            yield wnl.lemmatize(word, pos='n')
        elif tag.startswith('VB'):
            yield wnl.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):
            yield wnl.lemmatize(word, pos='a')
        else:
            yield word
            
def lemmatize_text(text):
    return ' '.join(lemmatize_all(text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [10]:
emails['text']=emails['text'].apply(lemmatize_text)

In [11]:
#Creating features using Bag of words model and building the Logistic Regression model
#Transformed train reviews
norm_text=emails.text

In [12]:
from sklearn.preprocessing import LabelBinarizer

#Labelling the sentient data
lb=LabelBinarizer()

#Transformed sentiment data
spam=lb.fit_transform(emails['spam'])
print(spam.shape)

(5728, 1)


In [23]:
#Binarisation of spam or ham: ham: 0, spam: 1

In [13]:
#Fitting the bag of words model on the entire dataset
from sklearn.feature_extraction.text import CountVectorizer

#Creating a matrix with reviews in row and unique words as columns and frequency of word in review as values.
#Count vectorizer for bag of words
cv=CountVectorizer()

#Fitting model on entire data
cv_fit = cv.fit(norm_text)

In [14]:
#Normalised train reviews
norm_train_text=emails.text[:3500]
print('train:','\n',norm_train_text[0])
norm_train_cv_text=cv_fit.transform(norm_train_text)

#Normalised test reviews
norm_test_text=emails.text[3500:]
print('test:','\n',norm_test_text[3501])
norm_test_cv_text=cv_fit.transform(norm_test_text)

train: 
 subject naturally irresistible corporate identity lt really hard recollect company market full suqgestions information isoverwhelminq good catchy logo stylish statlonery outstanding website make task much easy promise havinq order iogo company automaticaily become world ieader isguite ciear without good product effective business organization practicable aim hotat nowadays market promise marketing effort become much effective list clear benefit creativeness hand make original logo specially do reflect distinctive company image convenience logo stationery provide format easy use content management system letsyou change website content even structure promptness see logo draft within three business day affordability marketing break make gap budget 100 satisfaction guarantee provide unlimited amount change extra fee surethat love result collaboration look portfolio _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ interested _ 

In [15]:
spam_train=spam[:3500]
spam_test=spam[3500:]

In [16]:
emails.spam.value_counts()

0    4360
1    1368
Name: spam, dtype: int64

In [17]:
from keras.models import Sequential
from keras.layers import Dense

In [18]:
n_t=norm_train_cv_text.shape[1]

In [19]:
model=Sequential()
model.add(Dense(50,input_shape=(n_t,),activation='relu'))
model.add(Dense(20,activation='relu'))
#model.add(Dense(25,activation='relu'))

model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [20]:
history=model.fit(norm_train_cv_text,spam_train,epochs=10,verbose=2,validation_data=(norm_test_cv_text,spam_test))

Epoch 1/10
110/110 - 4s - loss: 0.1993 - accuracy: 0.9380 - val_loss: 0.0390 - val_accuracy: 0.9919
Epoch 2/10
110/110 - 1s - loss: 0.0218 - accuracy: 0.9989 - val_loss: 0.0293 - val_accuracy: 0.9937
Epoch 3/10
110/110 - 1s - loss: 0.0063 - accuracy: 1.0000 - val_loss: 0.0373 - val_accuracy: 0.9933
Epoch 4/10
110/110 - 1s - loss: 0.0027 - accuracy: 1.0000 - val_loss: 0.0538 - val_accuracy: 0.9906
Epoch 5/10
110/110 - 1s - loss: 0.0014 - accuracy: 1.0000 - val_loss: 0.0556 - val_accuracy: 0.9910
Epoch 6/10
110/110 - 1s - loss: 8.5239e-04 - accuracy: 1.0000 - val_loss: 0.0641 - val_accuracy: 0.9910
Epoch 7/10
110/110 - 1s - loss: 5.6315e-04 - accuracy: 1.0000 - val_loss: 0.0673 - val_accuracy: 0.9910
Epoch 8/10
110/110 - 1s - loss: 3.9543e-04 - accuracy: 1.0000 - val_loss: 0.0722 - val_accuracy: 0.9910
Epoch 9/10
110/110 - 1s - loss: 2.9018e-04 - accuracy: 1.0000 - val_loss: 0.0779 - val_accuracy: 0.9910
Epoch 10/10
110/110 - 1s - loss: 2.2007e-04 - accuracy: 1.0000 - val_loss: 0.0800 - 

In [21]:
loss,acc=model.evaluate(norm_test_cv_text,spam_test)
print("acccuracy",acc*100,"%")

acccuracy 99.10233616828918 %


In [59]:
#unseeen external data
t={'text':["Dear Younus Ahmad Dar,Thank you for participating in the Psychometric Test - Gurgaon scheduled by WNS Global Services Private Limited . Your responses were successfully submitted on 15 Sep, 2021 11:49 AM .Best Regards,WNS Global Services Private Limited"]}
y=pd.DataFrame.from_dict(t,orient='columns')
test=cv_fit.transform(y['text'])

In [23]:
model.predict(test[0])

array([[0.59786606]], dtype=float32)

In [24]:
from sklearn.linear_model import LogisticRegression,SGDClassifier

#Training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)

#Fitting the model for the bag of words
lr_bow=lr.fit(norm_train_cv_text,spam_train)
print(lr_bow)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [25]:
#Predicting the model for bag of words
lr_bow_predict=lr.predict(norm_test_cv_text)
print(lr_bow_predict)

[0 0 0 ... 0 0 0]


####unseen external data**

In [58]:
#unseeen external data
d={'text':["you will recive the bitcoins worth $3000 plz share  your bank details with us"]}
x=pd.DataFrame.from_dict(d,orient='columns')
test_1=cv_fit.transform(x['text'])

In [27]:
#Predicting the model for bag of words
lr_bow_predict=lr.predict(test_1)
print(lr_bow_predict)

[1]


In [60]:
frames=[y,x]
data=pd.concat(frames,axis=0,ignore_index=True)
#data=data.reshape(1,)

test_2=cv_fit.transform(data['text'])
#data[0][0]
data.head()

Unnamed: 0,text
0,"Dear Younus Ahmad Dar,Thank you for participat..."
1,you will recive the bitcoins worth $3000 plz s...


In [70]:
test_2.toarray().shape

  #print(i[x])
data_1=pd.DataFrame({"data":[data['text'][0],data['text'][1]]})
  #x=x+1
#data['text'][0]

In [71]:
test_2.toarray().shape
data_1.head()
#data['text'][1]

Unnamed: 0,data
0,"Dear Younus Ahmad Dar,Thank you for participat..."
1,you will recive the bitcoins worth $3000 plz s...


In [75]:
#Predicting the model for bag of words
#for i in data[0]:
  #da=cv_fit.fit_transform(i)
lr_bow_predict=lr.predict(test_2)
print(lr_bow_predict)
lr_bow_predict[0]
lr_bow_predict[1]

[0 1]


1

In [77]:

d={"pred":[lr_bow_predict[0],lr_bow_predict[1]]}
#data_1.head()

In [78]:
#lr_bow_predict.reshape(1,)
prediction=pd.DataFrame.from_dict(d,orient="columns",)
prediction.head()

Unnamed: 0,pred
0,0
1,1


In [79]:
#fram=[data_1,prediction]
pred_data=pd.concat([data_1,prediction],join='outer',axis=1)
pred_data.head()

Unnamed: 0,data,pred
0,"Dear Younus Ahmad Dar,Thank you for participat...",0
1,you will recive the bitcoins worth $3000 plz s...,1


In [45]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

#Accuracy score for bag of words
lr_bow_score=accuracy_score(spam_test,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)

lr_bow_score : 0.9874326750448833
