In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Bidirectional,LSTM,RepeatVector,TimeDistributed,Activation
from keras.layers import BatchNormalization, Flatten, Conv1D, MaxPooling1D,GlobalMaxPool1D
from keras.layers import Dropout,SpatialDropout1D
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.models import Model, model_from_json
import tensorflow as tf

Using TensorFlow backend.


In [2]:
# Load the GitHub comments that the saved model will score.
# The path is relative to the notebook's working directory.
df_test = pd.read_csv(filepath_or_buffer='TrainTestData/GithubComments.csv')

In [3]:
# Preview the last ten rows to sanity-check the columns that were read.
df_test.tail(n=10)

Unnamed: 0,id,number,category,comment_text
12607,3613341,4,PullRequest,Fixed a bug in SimpleJson with DateTimeOffset ...
12608,3612528,3,PullRequest,The subject tells it all. I'll merge this imme...
12609,3611240,2,PullRequest,These were copied from GHfW but stripped of an...
12610,2597405,2,PullRequestReviewComment,You can remove the `-e whatever` flags. Those ...
12611,2597536,2,PullRequestReviewComment,Probably want to remove this right?
12612,3610730,1,PullRequest,We plan to release this code under the MIT lic...
12613,2596854,1,PullRequestReviewComment,Fill in the blanks?
12614,2596866,1,PullRequestReviewComment,An extra 't' in targeting.
12615,2596897,1,PullRequestReviewComment,Capitalize Contribute
12616,2597660,1,PullRequestReviewComment,TROLOLOLOL DOH!


In [4]:
# Normalise the raw text in one pass: missing comments become the
# placeholder "unknown", then everything is lower-cased.
df_test['comment_text'] = (
    df_test['comment_text']
    .fillna("unknown")
    .str.lower()
)

In [5]:
# Label names used as the prediction column headers when results are saved.
# The model's final layer has six outputs; presumably they are in this
# order -- TODO confirm against the training notebook.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [6]:
# Number of tokens per comment fed to the network. The model summary in this
# notebook reports an embedding output of (None, 150, 300), so every encoded
# comment is padded (or truncated) to 150 tokens.
MAX_SEQUENCE_LENGTH = 150

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load a tokenizer pickle that was produced by a trusted training run.
# (The original path was an f-string with no placeholders; a plain string
# literal is equivalent and avoids the misleading prefix.)
with open("ModelData/SavedTokenizer.pickle", 'rb') as handle:
    tokenizer = pickle.load(handle)

# Encode the comments with the vocabulary stored in the saved tokenizer.
docs_test = df_test['comment_text'].astype(str)
encoded_docs_test = tokenizer.texts_to_sequences(docs_test)

# Zero-pad short sequences at the end. Sequences longer than the limit are
# truncated with the library default, since `truncating` is not passed.
# NOTE(review): presumably this mirrors the training-time preprocessing -- confirm.
df_test_padded = pad_sequences(
    encoded_docs_test,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)

In [7]:
# Read the serialized network architecture (JSON text) from disk.
with open('ModelData/LSTMModel.json') as arch_file:
    json_savedModel = arch_file.read()

# Rebuild the (still untrained) model from that description and print its
# layer summary so the expected input/output shapes are visible.
model_j = model_from_json(json_savedModel)
model_j.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 300)          63101100  
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 150, 100)          140400    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 70)                7070      
_________________________________________________________________
dropout_2 (Dropout)          (None, 70)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                

In [8]:
# Restore the trained weights into the architecture rebuilt from JSON.
model_j.load_weights('ModelData/LSTMModelWeights.hdf')

# Compile with an Adam optimizer, binary cross-entropy loss and an AUC metric.
# NOTE(review): presumably these mirror the training configuration -- confirm.
Adam_opt = Adam(
    lr=0.0003,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    decay=0.000015,
)
model_j.compile(
    optimizer=Adam_opt,
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC()],
)

# Score every padded comment. The final Dense layer has 6 units, so each
# row of the result holds six scores.
predictions = model_j.predict(df_test_padded)
predictions

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



array([[1.1384487e-04, 8.9406967e-08, 1.6272068e-05, 1.0430813e-06,
        3.8146973e-06, 7.1525574e-07],
       [3.4958124e-05, 0.0000000e+00, 5.3942204e-06, 0.0000000e+00,
        8.3446503e-07, 1.1920929e-07],
       [1.9453764e-03, 1.5795231e-06, 2.1758676e-04, 1.7106533e-05,
        1.1307001e-04, 1.9431114e-05],
       ...,
       [1.4069736e-02, 2.4557114e-05, 1.1205077e-03, 2.4852157e-04,
        1.0867417e-03, 1.9359589e-04],
       [7.1764886e-03, 1.8090010e-05, 8.2802773e-04, 1.5869737e-04,
        7.1296096e-04, 1.4537573e-04],
       [1.0181576e-02, 1.2725592e-05, 1.1458099e-03, 9.1254711e-05,
        9.5714489e-04, 1.5601776e-04]], dtype=float32)

In [9]:
# Start from the sample results file, write the six score columns into it,
# and save the outcome.
# NOTE(review): assumes ResultsSample.csv has one row per comment in the same
# order as df_test -- confirm.
results = pd.read_csv(filepath_or_buffer="TrainTestData/ResultsSample.csv")
results[list_classes] = predictions
results.to_csv(path_or_buf="LSTM-Results.csv", index=False)