In [17]:
import keras

In [18]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
# Make the parent of the current working directory importable, so that
# packages living one level above the notebook can be imported. The path is
# appended only when it is not already listed.
nb_dir = os.path.dirname(os.getcwd())
already_on_path = nb_dir in sys.path
if not already_on_path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [19]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, \
    GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [20]:
from methods.baseline import Baseline

In [21]:
# Sequence-length caps passed to Baseline(...) further down. T/D presumably
# stand for title/description -- TODO confirm. The trailing numbers look like
# alternative (smaller) settings that were tried -- TODO confirm.
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
# Not referenced in the visible cells; presumably the word-embedding width and
# the vocabulary size limit used later in the notebook -- verify against usage.
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000

In [22]:
# Project whose bug reports are processed by this notebook
DOMAIN = 'eclipse'

# Input locations, relative to the working directory
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
# The dataset CSV lives inside DIR_PAIRS and is named after the domain
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))

# Directory holding the GloVe embeddings
GLOVE_DIR = 'data/embed'

# Names used when saving models. '@number_of_epochs@' looks like a placeholder
# substituted elsewhere -- TODO confirm.
# NOTE(review): the two templates differ only by the underscore after
# 'baseline_feature'; confirm that this is intentional.
SAVE_PATH = 'baseline_feature@number_of_epochs@epochs_64batch({})'.format(DOMAIN)
SAVE_PATH_FEATURE = 'baseline_feature_@number_of_epochs@epochs_64batch({})'.format(DOMAIN)

# Whether to extract the corpus (flag is not consulted in the visible cells)
EXTRACT_CORPUS = False

In [23]:
# Build the project-local Baseline helper from the processed-data directory,
# the dataset CSV path and the two sequence-length caps defined above.
# NOTE(review): what Baseline does with each argument is not visible here.
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

In [24]:
# Ask the Baseline helper to read the bug ids for DIR (presumably this is what
# fills baseline.bug_ids -- confirm in methods.baseline), then display how
# many ids are available.
baseline.load_ids(DIR)
len(baseline.bug_ids)

Reading bug ids


212512

In [25]:
def data_padding(data, max_seq_length):
    """Right-pad (and truncate) token sequences into a 2-D integer array.

    The width of the result is ``min(max(longest sequence, 6), max_seq_length)``.
    Sequences longer than that width are cut; shorter ones are filled with 0.

    Parameters
    ----------
    data : sequence of sequences
        Each inner sequence holds tokens that ``int()`` can convert.
    max_seq_length : int
        Upper bound on the width of the returned array.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(data), width)``.
    """
    # Lower bound applied to the longest-sequence estimate (value kept from the
    # original implementation; presumably a minimum width required downstream
    # -- TODO confirm).
    min_width = 6
    longest = max([len(seq) for seq in data] + [min_width])
    width = min(longest, max_seq_length)

    # Allocate integers directly. The previous ``.astype(np.int)`` relied on an
    # alias of the builtin ``int`` that was deprecated in NumPy 1.20 and removed
    # in 1.24, where it raises AttributeError.
    padded_data = np.zeros(shape=(len(data), width), dtype=int)
    for row, seq in enumerate(data):
        clipped = seq[:width]
        count = len(clipped)
        if count:
            padded_data[row, :count] = [int(token) for token in clipped]
    return padded_data

In [26]:
import _pickle as pickle

def load_bugs(baseline, max_title_length=100, max_desc_length=500):
    """Load every pickled bug, pad its token sequences and index its sentences.

    For each id in ``baseline.bug_ids`` the file
    ``<baseline.DIR>/bugs/<id>.pkl`` is unpickled. Bugs that cannot be loaded
    (or that lack ``title_word`` / ``description_word``) are skipped and
    recorded. Afterwards the function has:

    * reset ``baseline.corpus`` to an empty list;
    * filled ``baseline.bug_set`` with ``{bug_id: bug_dict}``, where
      ``title_word`` / ``description_word`` are replaced by padded arrays;
    * filled ``baseline.sentence_dict`` mapping the comma-joined padded ids to
      the bug's ``title`` / ``description`` text;
    * removed the failed ids from ``baseline.bug_ids`` and, if there were any,
      stored them in ``baseline.removed``.

    Parameters
    ----------
    baseline : object
        Must expose ``bug_ids`` and ``DIR``.
    max_title_length, max_desc_length : int, optional
        Width caps forwarded to ``data_padding`` (defaults keep the previous
        hard-coded 100 / 500).
    """
    removed = []
    # Ids in the same order as the rows handed to data_padding. Zipping against
    # baseline.bug_ids instead would misalign ids and rows as soon as a single
    # bug fails to load.
    loaded_ids = []
    baseline.corpus = []
    baseline.sentence_dict = {}
    baseline.bug_set = {}
    title_tokens, desc_tokens = [], []
    bugs_dir = os.path.join(baseline.DIR, 'bugs')

    for bug_id in tqdm(baseline.bug_ids):
        try:
            # NOTE(security): unpickling can execute arbitrary code; only use
            # this on files produced by the project's own preprocessing.
            with open(os.path.join(bugs_dir, '{}.pkl'.format(bug_id)), 'rb') as f:
                bug = pickle.load(f)
            # Read both fields before appending anything, so that a partially
            # valid bug cannot leave the two lists with different lengths.
            title_word = bug['title_word']
            description_word = bug['description_word']
        except Exception:
            # Best effort: skip unreadable bugs, but let KeyboardInterrupt and
            # SystemExit propagate (a bare ``except:`` swallowed those too).
            removed.append(bug_id)
            continue
        title_tokens.append(title_word)
        desc_tokens.append(description_word)
        baseline.bug_set[bug_id] = bug
        loaded_ids.append(bug_id)

    # Padding
    title_padding = data_padding(title_tokens, max_title_length)
    desc_padding = data_padding(desc_tokens, max_desc_length)

    rows = zip(loaded_ids, title_padding, desc_padding)
    for bug_id, bug_title, bug_desc in tqdm(rows, total=len(loaded_ids)):
        bug = baseline.bug_set[bug_id]
        bug['title_word'] = bug_title
        bug['description_word'] = bug_desc
        baseline.sentence_dict[",".join(bug_title.astype(str))] = bug['title']
        baseline.sentence_dict[",".join(bug_desc.astype(str))] = bug['description']

    if len(removed) > 0:
        for x in removed:
            baseline.bug_ids.remove(x)
        baseline.removed = removed
        print("{} were removed. To see the list call self.removed".format(len(removed)))

In [27]:
%%time

# Unpickle all bugs into baseline.bug_set and build baseline.sentence_dict
# (see load_bugs); the cell is timed because this walks every bug id.
load_bugs(baseline)
# Number of distinct padded title/description keys collected.
# NOTE(review): the recorded output shows no value for this expression --
# confirm whether it is displayed under %%time in the IPython version in use.
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 1min 12s, sys: 2.26 s, total: 1min 14s
Wall time: 1min 13s


In [28]:
# Spot-check one loaded record: the entry stored under bug id 1.
baseline.bug_set[1]

{'bug_severity': '2\n',
 'bug_status': '0\n',
 'component': '351\n',
 'creation_ts': '2001-10-10 21:34:00 -0400',
 'delta_ts': '2012-02-09 15:57:47 -0500',
 'description': 'setup project that contains gif resource release project to organization edit the gif resource with an external editor organization save and close external editor in organization open the icon resource and verify that your changes are there release project nothing to release in organization open the icon resource and verify that your changes are still there person because never refreshed from local the workspace hasn changed so person didn find anything however opening the resource with an external editor found the modified file on disk and showed the changes the real problem occurs if person actually finds something to release but you don spot that some resources are missing this is extremely error prone one of my changes didn made it into build because of this notes eg pm person should do refresh from local before

### Export preprocessed bugs

In [30]:
# Persist the whole bug dictionary (with the padded title/description arrays
# written by load_bugs) as one pickle file inside baseline.DIR.
export_path = os.path.join(baseline.DIR, 'bugs_preprocessed.pkl')
with open(export_path, 'wb') as f:
    pickle.dump(baseline.bug_set, f)