In [1]:

#### Preparing the text data  ####
##First, we will simply iterate over the folders in which our bug samples are stored, and 
##format them into a list of samples. We will also prepare at the same time a list of class 
##indices matching the samples

import os 
import sys

TEXT_DATA_DIR = "$YOUR_PATH/data_latest" #

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    print path
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                t = f.read()
                texts.append(t)
                f.close()
                labels.append(label_id)
    print('Found %s texts.' % len(texts)) #print cumulative number of samples found in corpus    


/home/nvidia/rtaneja/nvbugs/data_latest/General
Found 4822 texts.
/home/nvidia/rtaneja/nvbugs/data_latest/Preferences
Found 9982 texts.
/home/nvidia/rtaneja/nvbugs/data_latest/Theme
Found 14605 texts.
/home/nvidia/rtaneja/nvbugs/data_latest/Toolbars and Customization
Found 20122 texts.


In [2]:
#### Text formatting ####
## We format text sampls and labels into tensors  
## that can be fed to neural network. We use 


import keras # Using Tensorflow Backend
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from numpy import *
from keras.utils import np_utils

#Using Keras tokenizer to generate tokene per unique words in the data set

tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print(' Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences)

labels = np_utils.to_categorical(np.asarray(labels))

print('shape of data tensor' , data.shape)  # data tensor
print('shape of label tensor', labels.shape) # label tensor




Using TensorFlow backend.


 Found 11018 unique tokens.
('shape of data tensor', (20122, 46))
('shape of label tensor', (20122, 4))


In [3]:

## Create Training set and Validation set ##

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(0.2 * data.shape[0])
print nb_validation_samples
x_train = data[:-nb_validation_samples]
print x_train.shape
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
print x_val.shape
y_val = labels[-nb_validation_samples:]




4024
(16098, 46)
(4024, 46)


In [4]:
#### Word embedding layer ####
## Using Glove  popular embedding technique based on 
## factorizing a matrix of word co-occurence statistics.
## Specifically, we will use the 100-dimensional GloVe embeddings 
## of 400k words computed on a 2014 dump of English Wikipedia.
## http://nlp.stanford.edu/data/glove.6B.zip
## It is useful to compare to otheer embedding technqieus like word2vec which 
## we will likely test to see further improvement in accuracy besides changing hyper-parameters


GLOVE_DIR = "$YOUR_PATH" # Path where glove embeddings gets downloaded
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index)) #Output of this cell is embeddings_index dictionary

Found 400000 word vectors.


In [5]:
#### Preparing the embedding layer ####


from keras.layers import Embedding

#Use embedding index dictionary and word index to compute embedding matrix

embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector #if embedding found in Glove embedding dictionary
    else:
        embedding_matrix[i] = np.random.uniform(-0.25, 0.25, 100 ) # if embedding not found, we assign random vector
                                                                   # instead of 0; this greatly affects accuracy.

            
            
# Load this embedding matrix into an Embedding layer.
# trainable=False/True to prevent/allow the weights from being updated during training            
            
embedding_layer = Embedding(len(word_index) + 1,        
                           100,
                           weights =[embedding_matrix],
                           input_length=100,
                           trainable=True) 

In [7]:
#### Training the network ####
# We have created 2 models - 1) 1-Dim CNN (Conv1D) 2) RNN LSTM 
# Conv1-D is commented out as we observe better accuracy with LSTM for current hyper-parameter
# and embedding tunings. There exists some scope of improevement for both models.

from keras.layers import Activation, Dense, Input, Conv1D, MaxPooling1D, Flatten, Dropout
from keras.callbacks import EarlyStopping
from keras.layers.wrappers import Bidirectional 
from keras.models import Model 
from keras.models import Sequential
from keras.layers import LSTM
from keras.optimizers import RMSprop



## Model 1 - Using 1D Convolutional Neural Network ##
## This layer creates a convolution kernel   ## 
## that is convolved with the layer input    ##
## over a single spatial (or temporal)       ##
## dimension to produce a tensor of outputs. ##
## 3 conv1D (ReLu activation) + Maxpooling + Dropout layers    ##


#sequence_input = Input(shape=(100,), dtype='int32')
#embedded_sequences = embedding_layer(sequence_input)
#print embedded_sequences
#x = Conv1D(128,5, activation='tanh')(embedded_sequences)
#x = MaxPooling1D(5)(x)
#x = Dropout(0.75)(x)
#x = Conv1D(128,5, activation='tanh')(x)
#x = MaxPooling1D(5)(x)
#x = Dropout(0.75)(x)
#x = Conv1D(128,2, activation='tanh')(x)
#x = MaxPooling1D(2)(x)
#x = Dropout(0.75)(x)
#x = Flatten()(x)
#x = Dense(128, activation='tanh')(x)
#preds = Dense(len(labels_index), activation='softmax')(x)
#model = Model(sequence_input, preds)
# return_sequences = true


## Model 1 - Using RNN LSTM (Bidirectional) ##
## This layer creates a convolution kernel   ## 
## that is convolved with the layer input    ##
## over a single spatial (or temporal)       ##
## dimension to produce a tensor of outputs. ##
## 2 LStM(ReLu activation) + Maxpooling + Dropout layers    ##

model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=46, trainable=True))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(Dropout(0.5))
model.add(LSTM(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(len(labels_index),activation='softmax'))
model.add(Activation('softmax'))


#Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

print model.summary() # Print model architechture with parameters

#es = EarlyStopping(patience=5)  # this was used during training for monitoring overfitting
                                 # (gap between training vs validataionaccuracy)

    
#Begin training: 40 epochs and batch size of 128
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=40, batch_size=128)
scores = model.evaluate(x_val, y_val) # train and Val scores
print('Test score:', scores[0])
print('Test accuracy:', scores[1])

print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 46, 100)           1101900   
_________________________________________________________________
lstm_3 (LSTM)                (None, 46, 128)           117248    
_________________________________________________________________
dropout_3 (Dropout)          (None, 46, 128)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 516       
__________

In [10]:
from keras.models import model_from_json
import pickle

# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

#save tokenizer
with open('tokenizer.pkl', 'wb') as output:
    pickle.dump(tokenizer, output, pickle.HIGHEST_PROTOCOL)

#save labels
labels = {v:k for k,v in labels_index.items()}
with open('labels.pkl', 'wb') as output:
    pickle.dump(labels, output, pickle.HIGHEST_PROTOCOL)
    
# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

loaded_model.compile(loss='categorical_crossentropy',
             optimizer='rmsprop',
             metrics=['acc'])

##testing pickle
with open('tokenizer.pkl', 'rb') as input:
    tk = pickle.load(input)
text = np.array([texts[12]]) # sample number 12 randomly chosen to predict upon
mydata = tk.texts_to_sequences(text)
print (mydata)
pred = pad_sequences(mydata, maxlen=46)

prediction = loaded_model.predict(np.array(pred))
print (prediction)
print (np.argmax(prediction))

Saved model to disk
Loaded model from disk
[[2647, 2648, 5506, 4, 484, 109, 3392, 17, 2418]]
[[ 0.47536692  0.17487772  0.17487772  0.17487772]]
0
