[View in Colaboratory](https://colab.research.google.com/github/suneetsawant/nlp/blob/master/Toxic_Comments.ipynb)

# Import all the libraries 

In [2]:
import pandas as pd 
import numpy as np 
import os,shutil
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import CuDNNGRU,CuDNNLSTM,Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.models import Model,Sequential
import seaborn as sns
from keras.models import model_from_json
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from keras.layers import Bidirectional
import pickle

%reload_ext autoreload
%autoreload 2

Using TensorFlow backend.


# Set up pydrive for uploading and downloading files (model and weights) 

The ***Drive()*** class provides the method ***fileaction()*** to either upload or download a list of files.

### Usage :  
      
     To upload files named f1,f2  
        Drive().fileaction([f1,f2],'up')
     Similarly to download  
        Drive().fileaction([f1,f2],'down')

In [0]:
# Authenticate and create the PyDrive client.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
class Drive():
  """Small PyDrive wrapper that copies files between the working directory
  and the root folder of the authenticated user's Google Drive.

  Files are matched by their Drive ``title``; only non-trashed files whose
  parent is the Drive root are considered.
  """

  def __init__(self):
    """Authenticate the Colab user and create the PyDrive client."""
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    self.drive = GoogleDrive(gauth)

  def fileaction(self, files, op='up'):
    """Upload or download every file named in ``files``.

    Parameters
    ----------
    files : iterable of str
        Local file names; the same string is used as the Drive title.
    op : str
        ``'up'`` uploads each local file and then removes the first
        pre-existing Drive file with the same title. ``'down'`` downloads
        every Drive file whose title matches.

    Raises
    ------
    ValueError
        If ``op`` is neither ``'up'`` nor ``'down'``.
    """
    if op not in ('up', 'down'):
      raise ValueError("op must be 'up' or 'down', got {!r}".format(op))

    file_list = self.drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()
    for filename in files:
      matches = [remote for remote in file_list if remote['title'] == filename]

      if op == 'up':
        # Upload before deleting: if the upload fails (for example because
        # the local file is missing) the previous Drive copy is still there.
        self.upload(filename)
        if matches:
          matches[0].Delete()
      else:
        if not matches:
          # Say so explicitly instead of silently doing nothing; any local
          # copy is left untouched.
          print("'{}' not found in Drive root; nothing downloaded".format(filename))
        for remote in matches:
          self.download(filename, remote)

  def upload(self, filename):
    """Create a Drive file titled ``filename`` from the local file of that name."""
    Uploadfile = self.drive.CreateFile({'title': filename})
    Uploadfile.SetContentFile(filename)
    Uploadfile.Upload()
    print("Saved '{}' to Drive".format(filename))

  def download(self, filename, file1):
    """Write the content of Drive file ``file1`` to the local path ``filename``."""
    downloaded = self.drive.CreateFile({'id': file1['id']})
    downloaded.GetContentFile(filename)
    print("Downloaded '{}' from Drive".format(filename))

# **Download the Dataset**

### Download the dataset from [Toxic Comments](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge). 

Keep a copy of the dataset in the root folder of your Google Drive account.

---



In [4]:
# Copy train.csv from the Drive root into the notebook's working directory.
Drive().fileaction(['train.csv'],'down')

Downloaded 'train.csv' from Drive


### Download the GloVe word vectors

In [5]:
! wget nlp.stanford.edu/data/glove.6B.zip
!unzip -o glove.6B.zip

--2018-06-18 06:20:23--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2018-06-18 06:20:23--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2018-06-18 06:20:45 (37.8 MB/s) - ‘glove.6B.zip.1’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


# Data Preprocessing 

In [0]:
  def prepareData(restore , shuffle,valid_ratio) :
      """Return training/validation comments and labels.

      With ``restore == 0`` the function reads 'train.csv', splits it with
      ``train_test_split`` and stores both parts as 'dftrain.csv' and
      'dfval.csv', uploading each through ``Drive``. For any other value of
      ``restore`` the two split files are downloaded through ``Drive`` and
      read back instead.

      Parameters
      ----------
      restore : int
          0 to create a new split, anything else to reuse the stored one.
      shuffle : bool
          Forwarded to ``train_test_split``.
      valid_ratio : float
          Forwarded to ``train_test_split`` as ``test_size``.

      Returns
      -------
      tuple
          ``(Xtrain, Ytrain, Xval, Yval, classes)``: the 'comment_text'
          column and the label values of each split, plus the label column
          names (every column from the third one onwards).
      """
      if restore != 0:
          # Reuse the split persisted by an earlier run.
          Drive().fileaction(['dftrain.csv'], 'down')
          dftrain = pd.read_csv('dftrain.csv')

          Drive().fileaction(['dfval.csv'], 'down')
          dfval = pd.read_csv('dfval.csv')
      else:
          raw_frame = pd.read_csv('train.csv')
          dftrain, dfval = train_test_split(raw_frame, test_size=valid_ratio, shuffle=shuffle)

          # Persist each part locally, then push it to Drive, training part first.
          for frame, target in ((dftrain, 'dftrain.csv'), (dfval, 'dfval.csv')):
              frame.to_csv(target, index=False)
              Drive().fileaction([target], 'up')

      # The first two columns are not labels; everything after them is.
      classes = dftrain.columns[2:]

      return (dftrain['comment_text'],
              dftrain[classes].values,
              dfval['comment_text'],
              dfval[classes].values,
              classes)
    
  def createTokens(restore,vocab_size, data) :
      """Fit a new Keras ``Tokenizer`` or reload the one stored on Drive.

      Parameters
      ----------
      restore : int
          0 fits a new tokenizer on ``data``, pickles it to 'tokenizer.pkl'
          and uploads that file through ``Drive``. Any other value downloads
          'tokenizer.pkl' through ``Drive`` and unpickles it.
      vocab_size : int
          Passed to ``Tokenizer`` as ``num_words`` (only used when fitting).
      data : iterable of str
          Texts to fit on (only used when fitting).

      Returns
      -------
      tuple
          ``(tokenizer, len(tokenizer.word_index))``.
      """
      if restore == 0:
          tokenizer = Tokenizer(num_words = vocab_size)
          tokenizer.fit_on_texts(data)
          # The context manager closes the file even if pickling fails.
          with open('tokenizer.pkl','wb') as file:
              pickle.dump(tokenizer,file)
          Drive().fileaction(['tokenizer.pkl'],'up')
      else:
          Drive().fileaction(['tokenizer.pkl'],'down')
          # NOTE(review): pickle.load can execute arbitrary code; this is only
          # safe while the Drive copy of tokenizer.pkl is trusted.
          with open('tokenizer.pkl','rb') as file:
              tokenizer = pickle.load(file)
      return tokenizer, len(tokenizer.word_index)
  
  def tokenToSequence(data,tokenzier,max_len_sentence): 
      """Convert texts to integer sequences of length ``max_len_sentence``.

      Parameters
      ----------
      data : iterable of str
          Texts to convert.
      tokenzier : object
          Fitted tokenizer exposing ``texts_to_sequences``. The misspelled
          parameter name is kept so keyword callers keep working.
      max_len_sentence : int
          Passed to ``pad_sequences`` as ``maxlen``.

      Returns
      -------
      The output of ``pad_sequences`` for the tokenized texts.
      """
      # Use the tokenizer that was passed in; the body previously read the
      # notebook-level global `tokenizer` and ignored this argument.
      sequences = tokenzier.texts_to_sequences(data)
      return pad_sequences(sequences,maxlen=max_len_sentence)
    
  def createEmbeddingIndex (path)  :
      """Parse a whitespace-separated embedding file into ``{word: vector}``.

      Each non-blank line is split on whitespace: the first field is the word
      and the remaining fields are converted to a float32 NumPy array.

      Parameters
      ----------
      path : str
          Path of the text file to read (decoded as UTF-8).

      Returns
      -------
      dict
          Maps each word to its coefficient array.
      """
      embeddings_index = {}
      # The context manager closes the file even when a line fails to parse;
      # the explicit encoding avoids depending on the platform default.
      with open(path, encoding='utf-8') as f:
          for line in f:
              values = line.split()
              if not values:
                  # Blank line: there is no word to index.
                  continue
              embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
      return embeddings_index
    
  def EmbeddingMatrix(tokenizer, path , embedding_dim): 
      """Build the embedding weight matrix for the tokenizer's vocabulary.

      Row ``i`` holds the vector that ``createEmbeddingIndex(path)`` provides
      for the word whose index in ``tokenizer.word_index`` is ``i``. Rows for
      words without a vector, and row 0, stay all-zero.

      Returns
      -------
      numpy.ndarray
          Array of shape ``(len(tokenizer.word_index) + 1, embedding_dim)``.
      """
      vocabulary = tokenizer.word_index
      pretrained = createEmbeddingIndex(path)

      matrix = np.zeros((len(vocabulary) + 1, embedding_dim))
      for token, row in vocabulary.items():
          vector = pretrained.get(token)
          if vector is None:
              # No pretrained vector for this word: leave its row at zero.
              continue
          matrix[row] = vector
      return matrix

# Define the Model 

In [0]:
  def Model(restore,loss,optimizer,vocab_size, embedding_matrix, max_len_sentence): 
      """Return a compiled model: restored when ``restore`` is truthy, new otherwise.

      NOTE: this function's name shadows the ``Model`` class imported from
      ``keras.models`` at the top of the notebook.
      """
      if not restore:
          return newModel(loss,optimizer,vocab_size, embedding_matrix,max_len_sentence)
      return restoreModel(loss,optimizer)
 

  def newModel(loss,optimizer, vocab_size, embedding_matrix, max_len_sentence) : 
      """Assemble and compile a new Sequential classifier.

      Layer stack, in order: a frozen ``Embedding`` initialised from
      ``embedding_matrix``, a 75-unit ``CuDNNGRU`` (named "lstm"), a 100-unit
      ReLU ``Dense`` layer, 0.2 ``Dropout`` and a 6-unit sigmoid ``Dense``
      output.

      Parameters
      ----------
      loss, optimizer
          Forwarded to ``model.compile``.
      vocab_size : int
          The embedding ``input_dim`` is ``vocab_size + 1``.
      embedding_matrix : array-like
          Initial embedding weights; its second dimension is the embedding
          ``output_dim``.
      max_len_sentence : int
          Not used by this function.

      Returns
      -------
      The compiled model.
      """
      embedding_layer = Embedding( input_dim = vocab_size+1,
                                   output_dim = embedding_matrix.shape[1],
                                   weights = [embedding_matrix],
                                   trainable = False,
                                   name='Embedding')
      recurrent_layer = CuDNNGRU(75,name ="lstm")
      hidden_layer = Dense(100,activation='relu',name='Dense1')
      dropout_layer = Dropout(0.2,name='Dropout1')
      output_layer = Dense(6,activation='sigmoid',name='Dense2')

      network = Sequential()
      for layer in (embedding_layer, recurrent_layer, hidden_layer,
                    dropout_layer, output_layer):
          network.add(layer)

      network.compile(loss=loss, optimizer=optimizer)
      return network
  
  def restoreModel(loss,optimizer): 
      """Rebuild the model saved on Drive and load its weights.

      Downloads 'model.json' and 'weights.hdf5' through ``Drive``, recreates
      the architecture with ``model_from_json``, compiles it with the given
      ``loss`` and ``optimizer`` and then loads the weights.

      Returns
      -------
      The compiled model with the stored weights loaded.
      """
      Drive().fileaction(['model.json','weights.hdf5'],'down')

      with open('model.json', 'r') as handle:
          architecture = handle.read()

      restored = model_from_json(architecture)
      restored.compile(loss=loss, optimizer=optimizer)
      # Weights are loaded after compiling, as in the original flow.
      restored.load_weights("weights.hdf5")
      print("Loaded saved model")
      return restored
    
  def buildModel(X,Y,epochs,batchsize,loss,optimizer,restore,vocab_size, embedding_matrix,max_len_sentence): 
      """Create or restore a model, fit it, and store it on Drive.

      Parameters
      ----------
      X, Y
          Training inputs and targets passed to ``model.fit``.
      epochs : int
          Number of epochs for this call to ``model.fit``.
      batchsize : int
          Batch size for ``model.fit``.
      loss, optimizer
          Forwarded to ``Model`` for compiling.
      restore
          Forwarded to ``Model``; truthy restores the saved model.
      vocab_size, embedding_matrix, max_len_sentence
          Forwarded to ``Model``.

      Returns
      -------
      The fitted model.

      Side effects: writes 'weights.hdf5' (through the checkpoint callback)
      and 'model.json' to the working directory and uploads both through
      ``Drive``.
      """
      model = Model(restore,loss,optimizer,vocab_size, embedding_matrix,max_len_sentence)
      filepath = "weights.hdf5"
      # Checkpoint on the training loss, keeping only the best weights seen
      # during this call.
      checkpoint = ModelCheckpoint(filepath, monitor='loss',verbose=1,
                                   save_best_only=True, mode='auto')
      callbacks_list = [checkpoint]

      # Use the `batchsize` argument; the body previously read the
      # notebook-level global `batch_size` and ignored this parameter.
      model.fit(X,Y, batch_size=batchsize, epochs=epochs,callbacks=callbacks_list,verbose=1)

      model_json = model.to_json()
      with open("model.json", "w") as json_file:
          json_file.write(model_json)

      Drive().fileaction(['model.json','weights.hdf5'],'up')
      return model
    
  def trainModel(Xtrain,Ytrain,Xval,Yval,epochs,batchsize,loss,optimizer,restore,vocab_size, embedding_matrix,max_len_sentence): 
      """Train one epoch at a time, printing scores after every epoch.

      Each pass calls ``buildModel`` for a single epoch and then prints the
      ``evaluateModel`` score on the training and validation data. Only the
      first pass of a run with ``restore != 1`` asks ``buildModel`` for a new
      model; every other pass asks it to restore the saved one.

      Returns
      -------
      The model returned by the last ``buildModel`` call.
      """
      for epoch_idx in range(epochs):
          # 1 -> restore the saved model, 0 -> start from a new one.
          resume = 1 if (restore == 1 or epoch_idx > 0) else 0

          model = buildModel(Xtrain, Ytrain, 1, batchsize, loss, optimizer,
                             resume, vocab_size, embedding_matrix, max_len_sentence)

          trainScore = evaluateModel(Xtrain, Ytrain, model)
          validScore = evaluateModel(Xval, Yval, model)
          print('Training Score : {} - Validation Score:{}'.format(trainScore,validScore))

      model.summary()
      print('Model is evaluated on metric ROC AUC')

      return model
    
  def evaluateModel(X,Y,model): 
      """Return ``roc_auc_score`` of the model's predictions for ``X`` against ``Y``."""
      return roc_auc_score(Y, model.predict(X, verbose=1))
    
 



# Training Model

Keep **restore** = 0 if training for the first time 

In [8]:
# Training hyper-parameters passed to trainModel below.
batch_size = 64
epochs = 10
loss='binary_crossentropy'
optimizer='adam'
# 0: create a new train/validation split, tokenizer and model.
# 1: reload the split, tokenizer and model that an earlier run stored on Drive.
restore =  1 #set 0 if training for first time
# Forwarded to train_test_split (only used when restore == 0).
shuffle = True
valid_ratio = 0.2
# Given to the Keras Tokenizer as num_words (only used when restore == 0).
max_vocab_size = 20000
# Length every comment is padded/truncated to by pad_sequences.
max_len_sentence = 200 
# Pretrained vectors; embedding_dim must match the vector size in this file.
glove_path  = './glove.6B.300d.txt'
embedding_dim = 300

Xtrain,Ytrain,Xval,Yval,classes = prepareData(restore, shuffle,valid_ratio)

# vocab_size is len(tokenizer.word_index), not max_vocab_size.
tokenizer,vocab_size = createTokens(restore,max_vocab_size,Xtrain)
embedding_matrix = EmbeddingMatrix(tokenizer, glove_path , embedding_dim)
# From here on Xtrain/Xval hold padded integer sequences instead of raw text.
Xtrain =   tokenToSequence(Xtrain,tokenizer,max_len_sentence)
Xval   =   tokenToSequence(Xval,tokenizer,max_len_sentence)

model = trainModel(Xtrain,Ytrain,Xval,Yval,epochs,batch_size,loss,optimizer,
                   restore,vocab_size, embedding_matrix,max_len_sentence)

Downloaded 'dftrain.csv' from Drive
Downloaded 'dfval.csv' from Drive
Downloaded 'tokenizer.pkl' from Drive
Downloaded 'model.json' from Drive
Downloaded 'weights.hdf5' from Drive
Loaded saved model
Epoch 1/1


Epoch 00001: loss improved from inf to 0.01268, saving model to weights.hdf5
Saved 'model.json' to Drive
Saved 'weights.hdf5' to Drive
 27424/127656 [=====>........................] - ETA: 38s

Training Score : 0.9995219720880097 - Validation Score:0.9743328986301636
Downloaded 'model.json' from Drive
Downloaded 'weights.hdf5' from Drive
Loaded saved model
Epoch 1/1
 11584/127656 [=>............................] - ETA: 1:04 - loss: 0.0102


Epoch 00001: loss improved from inf to 0.01200, saving model to weights.hdf5
Saved 'model.json' to Drive
Saved 'weights.hdf5' to Drive
 15456/127656 [==>...........................] - ETA: 44s

Training Score : 0.9996239603067583 - Validation Score:0.9750385821602201
Downloaded 'model.json' from Drive
Downloaded 'weights.hdf5' from Drive
Loaded saved model
Epoch 1/1
  9536/127656 [=>............................] - ETA: 1:07 - loss: 0.0092


Epoch 00001: loss improved from inf to 0.01143, saving model to weights.hdf5
Saved 'model.json' to Drive
Saved 'weights.hdf5' to Drive
 14752/127656 [==>...........................] - ETA: 45s

Training Score : 0.9996200353931622 - Validation Score:0.9738893293048836
Downloaded 'model.json' from Drive
Downloaded 'weights.hdf5' from Drive
Loaded saved model
Epoch 1/1
  9216/127656 [=>............................] - ETA: 1:08 - loss: 0.0107


Epoch 00001: loss improved from inf to 0.01103, saving model to weights.hdf5
Saved 'model.json' to Drive
Saved 'weights.hdf5' to Drive
 14080/127656 [==>...........................] - ETA: 46s

Training Score : 0.9996437930436661 - Validation Score:0.9751721157108482
Downloaded 'model.json' from Drive
Downloaded 'weights.hdf5' from Drive
Loaded saved model
Epoch 1/1
  8768/127656 [=>............................] - ETA: 1:08 - loss: 0.0102


Epoch 00001: loss improved from inf to 0.01055, saving model to weights.hdf5
Saved 'model.json' to Drive
Saved 'weights.hdf5' to Drive
 15392/127656 [==>...........................] - ETA: 45s

Training Score : 0.9997060442276028 - Validation Score:0.97440119164507
Downloaded 'model.json' from Drive
Downloaded 'weights.hdf5' from Drive
Loaded saved model
Epoch 1/1
 10048/127656 [=>............................] - ETA: 1:07 - loss: 0.0089


Epoch 00001: loss improved from inf to 0.01021, saving model to weights.hdf5
Saved 'model.json' to Drive
Saved 'weights.hdf5' to Drive
 13696/127656 [==>...........................] - ETA: 49s

Training Score : 0.999719657140428 - Validation Score:0.9743795230592864
Downloaded 'model.json' from Drive
Downloaded 'weights.hdf5' from Drive
Loaded saved model
Epoch 1/1
  9024/127656 [=>............................] - ETA: 1:10 - loss: 0.0096


Epoch 00001: loss improved from inf to 0.01003, saving model to weights.hdf5
Saved 'model.json' to Drive
Saved 'weights.hdf5' to Drive
 14688/127656 [==>...........................] - ETA: 45s

Training Score : 0.9997763521165157 - Validation Score:0.9743190617960958
Downloaded 'model.json' from Drive
Downloaded 'weights.hdf5' from Drive
Loaded saved model
Epoch 1/1
  9536/127656 [=>............................] - ETA: 1:10 - loss: 0.0085


Epoch 00001: loss improved from inf to 0.00952, saving model to weights.hdf5
Saved 'model.json' to Drive
Saved 'weights.hdf5' to Drive
 14496/127656 [==>...........................] - ETA: 46s

Training Score : 0.9997402334892515 - Validation Score:0.9738869596120708
Downloaded 'model.json' from Drive
Downloaded 'weights.hdf5' from Drive
Loaded saved model
Epoch 1/1
  9024/127656 [=>............................] - ETA: 1:12 - loss: 0.0095


Epoch 00001: loss improved from inf to 0.00936, saving model to weights.hdf5
Saved 'model.json' to Drive
Saved 'weights.hdf5' to Drive
 13984/127656 [==>...........................] - ETA: 46s

Training Score : 0.9998150315530405 - Validation Score:0.9745826791506925
Downloaded 'model.json' from Drive
Downloaded 'weights.hdf5' from Drive
Loaded saved model
Epoch 1/1
  8768/127656 [=>............................] - ETA: 1:13 - loss: 0.0071


Epoch 00001: loss improved from inf to 0.00877, saving model to weights.hdf5
Saved 'model.json' to Drive
Saved 'weights.hdf5' to Drive
 14304/127656 [==>...........................] - ETA: 46s

Training Score : 0.9998063156635374 - Validation Score:0.9736328064248368
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding (Embedding)        (None, None, 300)         54916200  
_________________________________________________________________
lstm (CuDNNGRU)              (None, 75)                84825     
_________________________________________________________________
Dense1 (Dense)               (None, 100)               7600      
_________________________________________________________________
Dropout1 (Dropout)           (None, 100)               0         
_________________________________________________________________
Dense2 (Dense)               (None, 6)                 606       
Total params: 55,009,231
Trainable params: 93,031
Non-trainable params: 54,916,200
_________________________________________________________________
Model is evaluated on metric ROC AUC


# Test the model

In [9]:
# Copy test.csv from the Drive root, load it, and keep the raw comment text
# for inference.
Drive().fileaction(['test.csv'],'down')
dtest = pd.read_csv('test.csv')
Xtest = dtest['comment_text']
dtest.info()


Downloaded 'test.csv' from Drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 2 columns):
id              153164 non-null object
comment_text    153164 non-null object
dtypes: object(2)
memory usage: 2.3+ MB


In [0]:

# Build the padded sequences from dtest rather than from Xtest itself, so
# re-running this cell never feeds already-tokenized data back into the
# tokenizer. tokenToSequence applies the same steps used for the training data.
Xtest = tokenToSequence(dtest['comment_text'], tokenizer, max_len_sentence)

In [11]:
preds = model.predict(Xtest,verbose=1)
# Binarize the predictions in place: values >= 0.5 become 1, values < 0.5 become 0.
# NOTE(review): training is scored with ROC AUC, which uses the raw scores;
# confirm that hard 0/1 labels are what result.csv should contain.
preds[preds>=0.5] = 1
preds[preds<0.5] = 0
preds.shape



(153164, 6)

In [0]:
# One column per label in `classes`, plus the ids copied from the test frame
# (appended as the last column), written to result.csv without the index.
result = pd.DataFrame(preds,columns = classes)
result['id'] = dtest['id']
result.to_csv('result.csv',index=False)


In [13]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 7 columns):
toxic            153164 non-null float32
severe_toxic     153164 non-null float32
obscene          153164 non-null float32
threat           153164 non-null float32
insult           153164 non-null float32
identity_hate    153164 non-null float32
id               153164 non-null object
dtypes: float32(6), object(1)
memory usage: 4.7+ MB


In [0]:
# Download result.csv from the Colab runtime through the browser.
from google.colab import files
files.download('result.csv')