<a href="https://colab.research.google.com/github/venkateshchettiar/SpamFilter/blob/master/SpamFilterTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##**Importing Package**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras import callbacks
from keras.callbacks import EarlyStopping
from keras.initializers import he_uniform
%matplotlib inline

  import pandas.util.testing as tm
Using TensorFlow backend.


#**Reading .csv file**

In [2]:
file = pd.read_csv("/content/drive/My Drive/DeepLearning/SpamFilter_Cleaned_Data.csv")

In [3]:
file.head()

Unnamed: 0,Question_Text_New,target
0,quebec nationalists see province nation,0
1,adopt dog would encourage people shop,0
2,velocity affect time space geometry,0
3,otto von guericke use magdeburg hemispheres,0
4,convert montra helicon mountain bike change tyres,0


In [4]:
file.shape

(1306122, 2)

In [5]:
file.isnull().sum()

Question_Text_New    404
target                 0
dtype: int64

In [6]:
file.dropna(axis=0, inplace=True)

In [7]:
file.isnull().any()

Question_Text_New    False
target               False
dtype: bool

In [8]:
file['word_count'] = file['Question_Text_New'].apply(lambda x:len(str(x).split(' ')))

In [9]:
file[['Question_Text_New','word_count']].head()

Unnamed: 0,Question_Text_New,word_count
0,quebec nationalists see province nation,5
1,adopt dog would encourage people shop,6
2,velocity affect time space geometry,5
3,otto von guericke use magdeburg hemispheres,6
4,convert montra helicon mountain bike change tyres,7


In [10]:
max(file['word_count']),min(file['word_count'])

(53, 1)

In [11]:
file['char_count'] = file['Question_Text_New'].str.len()

In [12]:
file[['Question_Text_New','char_count']].head()

Unnamed: 0,Question_Text_New,char_count
0,quebec nationalists see province nation,39
1,adopt dog would encourage people shop,37
2,velocity affect time space geometry,35
3,otto von guericke use magdeburg hemispheres,43
4,convert montra helicon mountain bike change tyres,49


In [13]:
max(file['char_count']),min(file['char_count'])

(335, 1)

In [14]:
file['#Tags'] = file['Question_Text_New'].apply(lambda x:len([x for x in x.split() if x.startswith('#')]))

In [15]:
file[['Question_Text_New','#Tags']].head()

Unnamed: 0,Question_Text_New,#Tags
0,quebec nationalists see province nation,0
1,adopt dog would encourage people shop,0
2,velocity affect time space geometry,0
3,otto von guericke use magdeburg hemispheres,0
4,convert montra helicon mountain bike change tyres,0


In [16]:
max(file['#Tags']),min(file['#Tags'])

(0, 0)

In [17]:
file['numeric'] = file['Question_Text_New'].apply(lambda x:len([x for x in x.split() if x.isdigit()]))

In [18]:
file[['Question_Text_New','numeric']].head()

Unnamed: 0,Question_Text_New,numeric
0,quebec nationalists see province nation,0
1,adopt dog would encourage people shop,0
2,velocity affect time space geometry,0
3,otto von guericke use magdeburg hemispheres,0
4,convert montra helicon mountain bike change tyres,0


In [19]:
max(file['numeric']),min(file['numeric'])

(2, 0)

In [20]:
file['Question_Text_New'][file['numeric']==2]

687189    evaluate limit ⁴x ⁴ ³x ³ x approach give function
853043                               remainder ²²² ³ divide
886025                                         solve ³ ⁿ² ²
Name: Question_Text_New, dtype: object

In [21]:
file.head()

Unnamed: 0,Question_Text_New,target,word_count,char_count,#Tags,numeric
0,quebec nationalists see province nation,0,5,39,0,0
1,adopt dog would encourage people shop,0,6,37,0,0
2,velocity affect time space geometry,0,5,35,0,0
3,otto von guericke use magdeburg hemispheres,0,6,43,0,0
4,convert montra helicon mountain bike change tyres,0,7,49,0,0


#**Splitting the data into Training & Validation**

In [22]:
X = file.Question_Text_New
y = file.target

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,stratify = y, random_state = 40)

In [24]:
(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

((1044574,), (261144,), (1044574,), (261144,))

##**Word Embedding**

In [25]:
def tokenization(data):

  """This function creates the vocabulary index based on word frequency and 
  Transforms each text in texts to a sequence of integers and 
  also returns vocabulary length
  """

  tok = Tokenizer(char_level=False,split=' ')
  #this creates the dictionary
  tok.fit_on_texts(data)
  #this transforms the texts in to sequences of indices
  return tok.texts_to_sequences(data), len(tok.index_word.keys())

def padding(sequences_data,maxlen):

  """This function pads variable length sequences.The default padding value is 0.0"""

  return sequence.pad_sequences(sequences_data,maxlen=maxlen)

In [26]:
np.quantile(file["word_count"],0.95)

13.0

In [27]:
maxlen = 13
sequences_train, vocab_len = tokenization(X_train)
sequences_train_matrix = padding(sequences_train,maxlen)

In [28]:
vocab_len

181506

##**The Training datasets contains 181506 Unique Vocabulary**

In [29]:
sequences_test, _ = tokenization(X_test)
sequences_test_matrix = padding(sequences_test,maxlen)

##**Building LSTM Model**

In [30]:
def build_model(input,LSTM_units,nb_classes,finalAct='sigmoid'):

    """This function Builds the LSTM Model using keras Functional API"""

    #Defining basic parameters
    embedding_input_dim = vocab_len
    embedding_output_dim = 200
    initializer = he_uniform(seed=200)
    
    #Input Layer, shape=15.0
    inputs = Input(name='inputs',shape=[input])
    #Embedding Layer
    layer = Embedding(embedding_input_dim+1,embedding_output_dim,input_length=input,
                      mask_zero=True,embeddings_initializer=initializer)(inputs)
    #LSTM Layer
    layer = LSTM(LSTM_units,kernel_initializer=initializer)(layer)
    #Classifier
    layer = Dense(units=64,name='FC1',kernel_initializer=initializer)(layer)
    layer = Activation('relu')(layer)
    #Dropout
    layer = Dropout(0.5)(layer)
    layer = Dense(nb_classes,name='Output_layer',kernel_initializer=initializer)(layer)
    #Final Output Layer
    layer = Activation(finalAct)(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [31]:
model = build_model(input=13,LSTM_units=100,nb_classes=1,finalAct='sigmoid')
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 13)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 13, 200)           36301400  
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               120400    
_________________________________________________________________
FC1 (Dense)                  (None, 64)                6464      
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
Output_layer (Dense)         (None, 1)                 65  

## **Model Compiling**

In [32]:
#Using Adam optimizer with an initial learning rate of 0.0001
opt=Adam(lr=0.001, beta_1=0.91, beta_2=0.999, epsilon=1e-08, decay=0)
#Compile the model
model.compile(loss='binary_crossentropy',optimizer=opt,metrics=['accuracy'])

### **Defining Callback Function**

In [33]:
def myCallbacks():

    """This function returns a list of callbacks"""

    #Model Checkpoint
    file_path = r"/content/drive/My Drive/spamModel.h5"
    checkpoint = callbacks.ModelCheckpoint(file_path,monitor='val_acc',verbose=1,save_best_only=True,mode='auto')

    #ReduceLROnPlateau
    reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss',factor=0.1,patience=5,min_lr=1e-30,cooldown=2,verbose=1)

    # EarlyStopping
    es = callbacks.EarlyStopping(monitor='val_loss',mode='min',verbose=1,patience=10)

    return [checkpoint,reduce_lr, es]

### **Training Model using Keras.fit_generator()**

In [34]:
def train_batch_generator(features, labels, batch_size):
  # Create empty arrays to contain batch of features and labels#
  batch_features = np.zeros((batch_size, 13, ))
  batch_labels = np.zeros((batch_size,))
  while True:
    for i in range(batch_size):
      # choose random index in features
      index = np.random.choice(len(features),1)
      batch_features[i] = features[index]
      batch_labels[i] = labels[index]
    yield batch_features, batch_labels

In [35]:
def validation_batch_generator(features, labels, batch_size):
  # Create empty arrays to contain batch of features and labels#
  batch_features = np.zeros((batch_size, 13, ))
  batch_labels = np.zeros((batch_size,))
  while True:
    for i in range(batch_size):
      # choose random index in features
      index = np.random.choice(len(features),1)
      batch_features[i] = features[index]
      batch_labels[i] = labels[index]
    yield batch_features, batch_labels


In [36]:
#Defining class weights as the dataset is heavily imbalance
class_weight = class_weight.compute_class_weight('balanced',np.unique(y_train),y_train)
class_weight_dict = dict(enumerate(class_weight))
class_weight_dict

{0: 0.532981269165812, 1: 8.08006002568109}

In [37]:
#Training batch size
tbs = 512
#validation batch size
vbs = 64
training_generator = train_batch_generator(sequences_train_matrix, np.asarray(y_train),tbs)
validation_generator = validation_batch_generator(sequences_test_matrix, np.asarray(y_test),vbs)

In [38]:
epochs = 30
steps_per_epoch = int(sequences_train_matrix.shape[0]/512)
validation_steps = int(sequences_test_matrix.shape[0]/64)

## **Training Model**

In [39]:
history = model.fit_generator(generator=training_generator,steps_per_epoch=steps_per_epoch,
    epochs=epochs,class_weight=class_weight_dict,
    validation_data=validation_generator,validation_steps=validation_steps,
    callbacks=myCallbacks(),verbose=1)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/30
Epoch 2/30
   3/2040 [..............................] - ETA: 1:21 - loss: 0.2121 - accuracy: 0.9036



Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 00014: early stopping
