In [0]:
# Mount Google Drive inside the Colab VM so the dataset and GloVe archive
# copied by later cells are reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Copy the labelled review CSV from the mounted Drive into the working directory.
# NOTE(review): the source path is relative, so this assumes the working
# directory is /content (where the Drive was mounted) - confirm.
!cp gdrive/My\ Drive/AI-In-Peer-Assessment/problems_expertiza_merged_gabe.csv .

In [0]:
import numpy as np

# To store data
import pandas as pd

# To use regular expressions
import re

#To load and save data
import pickle

from sklearn.model_selection import train_test_split

In [0]:
# Load the raw review dataset that the copy cell placed in the working directory.
reviews = pd.read_csv("problems_expertiza_merged_gabe.csv") 

In [0]:
# Narrow the frame to the review text and its binary tag; the other columns
# are not used by the cells below.
data = reviews.filter(items=["REVIEW", "TAG"])

In [0]:
# Keep only rows whose REVIEW is an actual string. The regex clean-up applied
# to the training reviews needs `str` input, so test for `str` directly instead
# of excluding float/int: that deny-list drops NaN but still lets None or NumPy
# integer scalars through.
data = data.loc[data["REVIEW"].apply(lambda review: isinstance(review, str))]

In [0]:
# Class balance check: number of reviews per TAG value.
data['TAG'].value_counts()

1    5585
0    5585
Name: TAG, dtype: int64

In [0]:
# Hold out 5% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
train_set, test_set = train_test_split(
    data,
    test_size=0.05,
    random_state=42,
)

In [0]:
# Plain Python list of the training review texts (cleaned in place by later cells).
train_reviews = train_set["REVIEW"].tolist()

In [0]:
# Plain Python list of the training tags, in the same row order as train_reviews.
train_labels = train_set["TAG"].tolist()

In [0]:
def _clean_review(text):
  """Normalise one review string before tokenisation.

  Steps, in order:
    1. replace every digit with a space;
    2. drop stand-alone single lowercase letters (a letter preceded by
       whitespace and followed by whitespace or the end of the string);
    3. collapse every run of whitespace into a single space.

  Args:
    text: the raw review text (str).

  Returns:
    The cleaned review text (str).
  """
  # Raw string: '\d' in a normal literal is an invalid escape sequence.
  text = re.sub(r'\d', ' ', text)
  # The lookahead keeps the following whitespace unconsumed, so consecutive
  # single letters ("a b c") are all removed, and `$` really means
  # end-of-string (inside a character class it is a literal dollar sign).
  text = re.sub(r'\s+[a-z](?=\s|$)', ' ', text)
  return re.sub(r'\s+', ' ', text)


# Slice assignment updates the existing list object in place.
train_reviews[:] = [_clean_review(review) for review in train_reviews]

In [0]:
# Substrings that flag a review as possibly containing a link.
url_hints = ('www.', 'http:', 'https:', '.com')
# A run of non-space characters, trimmed back so that it ends in a dot
# followed by three lowercase letters (e.g. ".com", ".org").
url_token = re.compile(r"([^ ]+(?<=\.[a-z]{3}))")

# Only flagged reviews are rewritten; each matching token becomes "<url>".
for idx, review in enumerate(train_reviews):
    if any(hint in review for hint in url_hints):
        train_reviews[idx] = url_token.sub("<url>", review)


# Peek at a few cleaned reviews.
train_reviews[1:5]

['Yes. The team had bulleted list of tasked items to implement in the Proposed Imports Changes section.',
 'Yes the design doc incorporates all the functionality mentioned in the above link.',
 'They have test on most controllers and the user model',
 'Yes, the team has indeed added test case and made test plan for both the issues that they have resolved.']

In [0]:
# Keras pieces used below: Tokenizer / pad_sequences for turning text into
# fixed-length integer sequences, the layer classes and Model for building the
# networks, and ModelCheckpoint for saving the best model during training.
import os
os.environ['KERAS_BACKEND']='tensorflow' # select the TensorFlow backend; set before the keras imports below
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras.models import Model
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [0]:
# Every review is padded or truncated to this many token ids.
MAX_SEQUENCE_LENGTH = 80
# Vocabulary cap handed to Tokenizer(num_words=...). This is a corpus-wide
# limit, not a per-review one: texts_to_sequences only emits the
# (num_words - 1) most frequent words and silently drops all others.
# A value of 80 would leave the models with just 79 distinct words, so the cap
# is set well above the ~8.6k unique tokens in the training reviews.
MAX_NB_WORDS = 20000
# Width of each word vector; must match the GloVe file that is loaded (100-d).
EMBEDDING_DIM = 100

In [0]:
# Fit the vocabulary on the cleaned training reviews, then encode every review
# as a list of integer word ids.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train_reviews)
train_sequences = tokenizer.texts_to_sequences(train_reviews)

# word -> integer id mapping learned by the tokenizer
word_index = tokenizer.word_index
print('Number of Unique Tokens',len(word_index)) # size of the learned vocabulary

Number of Unique Tokens 8577


In [0]:
#Padding
# Bring every encoded review to exactly MAX_SEQUENCE_LENGTH ids, using
# pad_sequences' default padding/truncation settings.
train_sequences_padded = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [0]:
!cp gdrive/My\ Drive/glove100d.zip .
!unzip glove100d.zip

Archive:  glove100d.zip
replace glove.6B.100d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: glove.6B.100d.txt       


In [0]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
embeddings_index = {}
# The context manager closes the file handle once parsing is done; the
# encoding is set explicitly so reading does not depend on the platform default.
with open('glove.6B.100d.txt', encoding='utf-8') as glove_file:
  for line in glove_file:
    # First field is the word, the remaining fields are the vector components.
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [0]:
# Build the initial weights for the Embedding layer: row i holds the GloVe
# vector of the word whose tokenizer id is i.
# The width comes from EMBEDDING_DIM (rather than a literal 100) so the matrix
# always matches the Embedding layer; the extra row is for the reserved id 0.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector
# Words missing from GloVe keep an all-zero row. Instead of giving them random
# vectors, the Embedding layer is created with trainable=True so that these
# rows are learned during training.

In [0]:
# Share of the vocabulary that received a pre-trained vector: a row counts as
# covered when it has at least one non-zero component.
has_pretrained_vector = np.count_nonzero(embedding_matrix, axis=1) > 0
nonzero_elements = np.count_nonzero(has_pretrained_vector)
nonzero_elements / len(word_index)

0.6510434883992072

In [0]:
from sklearn.utils import shuffle
# Shuffle features and labels together so rows stay aligned. The seed makes the
# order reproducible, and with it the rows that `validation_split` holds out
# during training.
x_train, y_train = shuffle(train_sequences_padded, train_labels, random_state=42)

In [0]:
# Features as a NumPy array for Keras.
x_train = np.array(x_train)
# One-hot encode the tags for the 2-unit softmax output:
# tag 1 -> [1, 0], any other tag -> [0, 1].
# Note: this rebinds train_labels to the shuffled, one-hot encoded list.
train_labels = [[1, 0] if tag == 1 else [0, 1] for tag in y_train]
y_train = np.array(train_labels)
# Sanity check: both arrays must have the same number of rows.
len(x_train), len(y_train)

(10611, 10611)

In [0]:
# --- Model 1: GloVe-initialised embedding -> bidirectional LSTM -> softmax ---
# The embedding table has one row per tokenizer id (plus the reserved id 0),
# starts from the GloVe-based embedding_matrix and is fine-tuned during
# training (trainable=True).
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,trainable=True)
# Input: one padded review, i.e. MAX_SEQUENCE_LENGTH integer word ids.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Dropout on the embedded tokens, a bidirectional LSTM with 200 units per
# direction, dropout again, then a 2-way softmax matching the one-hot labels.
net = Dropout(0.3)(embedded_sequences)
net = Bidirectional(LSTM(200,recurrent_dropout=0.4))(net)
net = Dropout(0.3)(net)
output = Dense(2, activation = 'softmax')(net)
model = Model(inputs = sequence_input, outputs = output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

# Checkpoint: write the full model to 'expertiza_rnn_model.h5' only when the
# monitored validation accuracy improves (save_best_only=True).
chkpt=ModelCheckpoint('expertiza_rnn_model.h5',monitor='val_acc',verbose=1,save_best_only=True)
# Train for 10 epochs with 20% of the training arrays held out for validation.
model_history = model.fit(x_train, y_train, batch_size=256, epochs=10, validation_split=0.2,callbacks=[chkpt])






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 80)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 80, 100)           857800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 80, 100)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 400)               481600    
_________________________________________________________________
dropout_2 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 802       
Tota

In [0]:
# --- Model 2: embedding -> Conv1D -> average pooling -> LSTM -> softmax ---
# This cell rebinds embedding_layer, sequence_input and chkpt, replacing the
# objects created for the BiLSTM model.
# NOTE(review): unlike the BiLSTM cell, no `weights=[embedding_matrix]` is
# passed here, so this embedding does not start from the GloVe vectors -
# confirm that this is intentional.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,trainable=True)
# Input: one padded review, i.e. MAX_SEQUENCE_LENGTH integer word ids.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_input)

# Dropout, 50 convolution filters of width 3 ('same' padding keeps the sequence
# length), average pooling over windows of 4 steps, a 100-unit LSTM, dropout,
# then a 2-way softmax matching the one-hot labels.
net1 = Dropout(0.3)(embedded_sequences_1)
net1 = Conv1D(50, 3, padding='same', activation='relu')(net1)
net1 = AveragePooling1D(pool_size=4)(net1)
net1 = LSTM(100, recurrent_dropout=0.3)(net1)
net1 = Dropout(0.2)(net1)
output1 = Dense(2, activation='softmax')(net1)

model5 = Model(inputs = sequence_input, outputs = output1)
model5.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model5.summary()

# Checkpoint: write the full model to 'expertiza_cnn_rnn_model.h5' only when
# the monitored validation accuracy improves (save_best_only=True).
chkpt=ModelCheckpoint('expertiza_cnn_rnn_model.h5',monitor='val_acc',verbose=1,save_best_only=True)
# Train for 10 epochs with 10% of the training arrays held out for validation.
model_history1 = model5.fit(x_train, y_train, batch_size=100, epochs=10, validation_split=0.1,callbacks=[chkpt])


Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 80)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 80, 100)           857800    
_________________________________________________________________
dropout_3 (Dropout)          (None, 80, 100)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 80, 50)            15050     
_________________________________________________________________
average_pooling1d_1 (Average (None, 20, 50)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0  

In [0]:
# Persist the BiLSTM (`model`) as two files so it can be rebuilt later.
# 1) Weights as they are in memory right now. This is separate from the
#    best-validation snapshot that ModelCheckpoint writes to
#    'expertiza_rnn_model.h5'.
model.save_weights('expertiza_rnn_model_weights.h5')

# 2) Network architecture, serialised as JSON.
with open('expertiza_rnn_model_architecture.json', 'w') as architecture_file:
    architecture_file.write(model.to_json())

In [0]:
# Copy the saved BiLSTM artifacts to the mounted Drive so they outlive the VM.
# NOTE(review): assumes the destination folder model/ already exists - confirm.
# Persisting model weights
!cp expertiza_rnn_model_weights.h5 gdrive/My\ Drive/AI-In-Peer-Assessment/model/
# Persisting model architecture
!cp expertiza_rnn_model_architecture.json gdrive/My\ Drive/AI-In-Peer-Assessment/model/