DeepTriage code from https://bugtriage.mybluemix.net/#code

Some changes were made:
* **Data:** - Adjusted to conform to the DASENet data
* **Architecture:** - Simplified slightly; output changed to DASENet settings.

In [4]:
import numpy as np
np.random.seed(1337)
import json, re, nltk, string
from nltk.corpus import wordnet
from gensim.models import Word2Vec
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge
from keras.optimizers import RMSprop
from keras.utils import np_utils
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ryedida/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [24]:
bugs_json = './chromium.json'

The hyperparameters required for the entire code can be initialized upfront as follows:

In [2]:
#1. Word2vec parameters
min_word_frequency_word2vec = 5
embed_size_word2vec = 200
context_window_word2vec = 5

#2. Classifier hyperparameters
numCV = 10
max_sentence_len = 50
min_sentence_length = 15
rankK = 10
batch_size = 256

The bugs are loaded from the JSON file and the preprocessing is performed as follows:

In [25]:
with open(bugs_json) as data_file:
    data = json.load(data_file, strict=False)

all_data = []
all_owner = []
all_y = []
for item in data:
    #1. Remove \r 
    current_title = item['issue_title'].replace('\r', ' ')
    current_desc = item['description'].replace('\r', ' ')    
    #2. Remove URLs
    current_desc = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', current_desc)    
    #3. Remove Stack Trace
    start_loc = current_desc.find("Stack trace:")
    current_desc = current_desc[:start_loc]    
    #4. Remove hex code
    current_desc = re.sub(r'(\w+)0x\w+', '', current_desc)
    current_title= re.sub(r'(\w+)0x\w+', '', current_title)    
    #5. Change to lower case
    current_desc = current_desc.lower()
    current_title = current_title.lower()    
    #6. Tokenize
    current_desc_tokens = nltk.word_tokenize(current_desc)
    current_title_tokens = nltk.word_tokenize(current_title)
    #7. Strip trailing punctuation marks    
    current_desc_filter = [word.strip(string.punctuation) for word in current_desc_tokens]
    current_title_filter = [word.strip(string.punctuation) for word in current_title_tokens]      
    #8. Join the lists
    current_data = current_title_filter + current_desc_filter
    current_data = filter(None, current_data)
    all_data.append(current_data)
    all_owner.append(item['owner'])
    all_y.append(item['y'])

In [26]:
all_data = [list(x) for x in all_data]

A vocabulary is constructed and the word2vec model is learnt using the preprocessed data. The word2vec model provides a semantic word representation for every word in the vocabulary.

In [27]:
wordvec_model = Word2Vec(all_data, min_count=min_word_frequency_word2vec, size=embed_size_word2vec, window=context_window_word2vec)
vocabulary = wordvec_model.wv.vocab
vocab_size = len(vocabulary)

The ten times chronological cross validation split is performed as follows:

In [28]:
totalLength = len(all_data)
splitLength = int(totalLength / (numCV + 1))

for i in range(1, numCV+1):
    train_data = all_data[:i*splitLength-1]
    test_data = all_data[i*splitLength:(i+1)*splitLength-1]
    train_owner = all_owner[:i*splitLength-1]
    test_owner = all_owner[i*splitLength:(i+1)*splitLength-1]
    train_y = all_y[:i*splitLength-1]
    test_y = all_y[i*splitLength:(i+1)*splitLength-1]

For the ith cross validation set, remove all the words that is not present in the vocabulary

In [29]:
i = 1 # Denotes the cross validation set number
updated_train_data = []    
updated_train_data_length = []    
updated_train_owner = []
updated_train_y = []
final_test_data = []
final_test_owner = []
final_test_y = []
for j, item in enumerate(train_data):
    current_train_filter = [word for word in item if word in vocabulary]
    if len(current_train_filter)>=min_sentence_length:  
        updated_train_data.append(current_train_filter)
        updated_train_owner.append(train_owner[j])
        updated_train_y.append(train_y[j])

for j, item in enumerate(test_data):
    current_test_filter = [word for word in item if word in vocabulary]  
    if len(current_test_filter)>=min_sentence_length:
        final_test_data.append(current_test_filter)  
        final_test_owner.append(test_owner[j])
        final_test_y.append(test_y[j])

For the ith cross validation set, remove those classes from the test set, for whom the train data is not available.

In [30]:
i = 1 # Denotes the cross validation set number
# Remove data from test set that is not there in train set
train_owner_unique = set(updated_train_owner)
test_owner_unique = set(final_test_owner)
train_y_unique = set(updated_train_y)
test_y_unique = set(final_test_y)
unwanted_owner = list(test_owner_unique - train_owner_unique)
unwanted_y = list(test_y_unique - train_y_unique)
updated_test_data = []
updated_test_owner = []
updated_test_y = []
updated_test_data_length = []
for j in range(len(final_test_owner)):
    if final_test_owner[j] not in unwanted_owner:
        updated_test_data.append(final_test_data[j])
        updated_test_owner.append(final_test_owner[j])
        updated_test_y.append(final_test_y[j])

unique_train_label = list(set(updated_train_owner))
unique_train_ys = list(set(updated_train_y))
classes = np.array(unique_train_ys)

In [31]:
X_train = np.empty(shape=[len(updated_train_data), max_sentence_len, embed_size_word2vec], dtype='float32')
Y_train = np.empty(shape=[len(updated_train_y),1], dtype='int32')
# 1 - start of sentence, # 2 - end of sentence, # 0 - zero padding. Hence, word indices start with 3 
for j, curr_row in enumerate(updated_train_data):
    sequence_cnt = 0         
    for item in curr_row:
        if item in vocabulary:
            X_train[j, sequence_cnt, :] = wordvec_model[item] 
            sequence_cnt = sequence_cnt + 1                
            if sequence_cnt == max_sentence_len-1:
                break                
    for k in range(sequence_cnt, max_sentence_len):
        X_train[j, k, :] = np.zeros((1,embed_size_word2vec))        
    Y_train[j,0] = unique_train_ys.index(updated_train_y[j])

X_test = np.empty(shape=[len(updated_test_data), max_sentence_len, embed_size_word2vec], dtype='float32')
Y_test = np.empty(shape=[len(updated_test_y),1], dtype='int32')
# 1 - start of sentence, # 2 - end of sentence, # 0 - zero padding. Hence, word indices start with 3 
for j, curr_row in enumerate(updated_test_data):
    sequence_cnt = 0          
    for item in curr_row:
        if item in vocabulary:
            X_test[j, sequence_cnt, :] = wordvec_model[item] 
            sequence_cnt = sequence_cnt + 1                
            if sequence_cnt == max_sentence_len-1:
                break                
    for k in range(sequence_cnt, max_sentence_len):
        X_test[j, k, :] = np.zeros((1,embed_size_word2vec))        
    Y_test[j,0] = unique_train_ys.index(updated_test_y[j])

y_train = np_utils.to_categorical(Y_train, len(unique_train_ys))
y_test = np_utils.to_categorical(Y_test, len(unique_train_ys))

  X_train[j, sequence_cnt, :] = wordvec_model[item]
  X_test[j, sequence_cnt, :] = wordvec_model[item]


In [84]:
from tensorflow.keras.layers import Wrapper, InputSpec, TimeDistributed, BatchNormalization, Bidirectional
import tensorflow as tf
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

In [71]:
def make_safe(x):
    return K.clip(x, K.common._EPSILON, 1.0 - K.common._EPSILON)

class ProbabilityTensor(Wrapper):
    """ function for turning 3d tensor to 2d probability matrix, which is the set of a_i's """
    def __init__(self, dense_function=None, *args, **kwargs):
        self.supports_masking = True
        self.input_spec = [InputSpec(ndim=3)]
        #layer = TimeDistributed(dense_function) or TimeDistributed(Dense(1, name='ptensor_func'))
        layer = TimeDistributed(Dense(1, name='ptensor_func'))
        super(ProbabilityTensor, self).__init__(layer, *args, **kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.input_spec = [InputSpec(shape=input_shape)]
        if K.backend() == 'tensorflow':
            if not input_shape[1]:
                raise Exception('When using TensorFlow, you should define '
                                'explicitly the number of timesteps of '
                                'your sequences.\n'
                                'If your first layer is an Embedding, '
                                'make sure to pass it an "input_length" '
                                'argument. Otherwise, make sure '
                                'the first layer has '
                                'an "input_shape" or "batch_input_shape" '
                                'argument, including the time axis.')

        if not self.layer.built:
            self.layer.build(input_shape)
            self.layer.built = True
        super(ProbabilityTensor, self).build()

    def get_output_shape_for(self, input_shape):
        # b,n,f -> b,n 
        #       s.t. \sum_n n = 1
        if isinstance(input_shape, (list,tuple)) and not isinstance(input_shape[0], int):
            input_shape = input_shape[0]

        return (input_shape[0], input_shape[1])

    def squash_mask(self, mask):
        if K.ndim(mask) == 2:
            return mask
        elif K.ndim(mask) == 3:
            return K.any(mask, axis=-1)

    def compute_mask(self, x, mask=None):
        if mask is None:
            return None
        return self.squash_mask(mask)

    def call(self, x, mask=None):
        energy = K.squeeze(self.layer(x), 2)
        p_matrix = K.softmax(energy)
        if mask is not None:
            mask = self.squash_mask(mask)
            p_matrix = make_safe(p_matrix * mask)
            p_matrix = (p_matrix / K.sum(p_matrix, axis=-1, keepdims=True))*mask
        return p_matrix

    def get_config(self):
        config = {}
        base_config = super(ProbabilityTensor, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

class SoftAttentionConcat(ProbabilityTensor):
    '''This will create the context vector and then concatenate it with the last output of the LSTM'''
    def get_output_shape_for(self, input_shape):
        # b,n,f -> b,f where f is weighted features summed across n
        return (input_shape[0], 2*input_shape[2])

    def compute_mask(self, x, mask=None):
        if mask is None or mask.ndim==2:
            return None
        else:
            raise Exception("Unexpected situation")

    def call(self, x, mask=None):
        # b,n,f -> b,f via b,n broadcasted
        p_vectors = K.expand_dims(super(SoftAttentionConcat, self).call(x, mask), 2)
        expanded_p = K.repeat_elements(p_vectors, K.int_shape(x)[2], axis=2)
        context = K.sum(expanded_p * x, axis=1)
        last_out = x[:, -1, :]
        return K.concatenate([context, last_out])

In [97]:
inp = Input(shape=(max_sentence_len, embed_size_word2vec))
#sequence_embed = Embedding(vocab_size, embed_size_word2vec, input_length=max_sentence_len)(inp)

forwards_1 = Bidirectional(LSTM(1024, return_sequences=True, recurrent_dropout=0.2))(inp)
attention_1 = SoftAttentionConcat()(forwards_1)
after_dp_forward_5 = BatchNormalization()(attention_1)

after_merge = Dense(1000, activation='relu')(after_dp_forward_5)
after_dp = Dropout(0.4)(after_merge)
output = Dense(2, activation='softmax')(after_dp)                
model = Model(inputs=inp, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-4), metrics=['accuracy'])

In [98]:
model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_25 (InputLayer)        [(None, 50, 200)]         0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 50, 2048)          10035200  
_________________________________________________________________
soft_attention_concat_24 (So (None, 4096)              2049      
_________________________________________________________________
batch_normalization_15 (Batc (None, 4096)              16384     
_________________________________________________________________
dense_13 (Dense)             (None, 1000)              4097000   
_________________________________________________________________
dropout_7 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 2)                 2002

Train the deep learning model and test using the classifier as follows:

In [96]:
y_train = np.argmax(y_train, axis=-1)
y_test = np.argmax(y_test, axis=-1)

y_train = tf.keras.utils.to_categorical(y_train < 5, num_classes=2)
y_test = tf.keras.utils.to_categorical(y_test < 5, num_classes=2)

In [99]:
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
hist = model.fit(X_train, y_train, batch_size=batch_size, epochs=200, callbacks=[early_stopping])              
    
predict = model.predict(X_test)        
accuracy = []
sortedIndices = []
pred_classes = []
for ll in predict:
    sortedIndices.append(sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True))
for k in range(1, rankK+1):
    id = 0
    trueNum = 0
    for sortedInd in sortedIndices:            
        pred_classes.append(classes[sortedInd[:k]])
        if y_test[id] in classes[sortedInd[:k]]:
            trueNum += 1
        id += 1
    accuracy.append((float(trueNum) / len(predict)) * 100)
print('Test accuracy:', accuracy)       

train_result = hist.history        
print(train_result)

Epoch 1/200
  4/390 [..............................] - ETA: 18:18 - loss: 0.4092 - accuracy: 0.8438

KeyboardInterrupt: 