In [1]:
import keras
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
# %matplotlib inline

from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

import keras.backend as K
from keras_bert import load_vocabulary
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import _pickle as pickle
import random

Using TensorFlow backend.


In [2]:
%env epochs 1000
%env base netbeans
%env method deepQL_topics_trainable

env: epochs=1000
env: base=netbeans
env: method=deepQL_topics_trainable


In [3]:
# Experiment-level constants.
# NOTE(review): EMBEDDING_DIM and MAX_NB_WORDS are not referenced by any other
# cell shown in this notebook — presumably kept for parity with sibling
# notebooks; confirm or remove.
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000
# Read from the `epochs` environment variable set by the `%env` cell above.
# Raises KeyError if the variable is unset and ValueError if it is not an integer.
epochs = int(os.environ['epochs'])

In [4]:
# Dataset selection and the (relative) locations of its input/output files.
# DOMAIN comes from the `base` environment variable set by the `%env` cell above.
DOMAIN = os.environ['base']
TOKEN = 'bert'
PREPROCESSING = 'bert'
# Normalized CSV for the selected dataset: data/normalized/<DOMAIN>/<DOMAIN>.csv
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Directory of the preprocessed data: data/processed/<DOMAIN>/<PREPROCESSING>
DIR = 'data/processed/{}/{}'.format(DOMAIN, PREPROCESSING)
# Files written at the end of this notebook (train pairs / test buckets).
TRAIN_PATH = os.path.join(DIR, 'train_chronological.txt')
TEST_PATH = os.path.join(DIR, 'test_chronological.txt')

In [5]:
# Location of the pretrained BERT files, relative to the working directory.
# NOTE(review): the directory name looks like the BERT-Base uncased release —
# confirm which checkpoint is expected here.
pretrained_path = 'uncased_L-12_H-768_A-12'
# Only vocab_path is used by the cells shown in this notebook (load_vocabulary);
# config_path and model_path are defined but not referenced in the visible cells.
config_path = os.path.join(pretrained_path, 'bert_config.json')
model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

In [6]:
token_dict = load_vocabulary(vocab_path)

In [7]:
MAX_SEQUENCE_LENGTH_T = 100
MAX_SEQUENCE_LENGTH_D = 100

In [8]:
# Instantiate the project helper objects used by the rest of the notebook.
# The vocabulary ids of the '[CLS]' and '[SEP]' tokens are handed to Baseline
# (presumably as sequence delimiter ids — confirm in methods.baseline).
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 
                    token_dict['[CLS]'], token_dict['[SEP]'])
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
# Experiment receives the baseline and evaluation objects created above.
experiment = Experiment(baseline, evaluation)

In [9]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

In [10]:
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=181971), HTML(value='')))




### Create split based on clusters 

In [11]:
print("Number of groups {}".format(len(retrieval.buckets)))

Number of groups 181971


In [12]:
buckets = retrieval.buckets

In [13]:
# Split the duplicate groups (buckets) into train and test at bucket
# granularity: a bucket is assigned as a whole to exactly one side.
SPLIT_SIZE_TRAIN = 0.9  # fraction of the buckets assigned to the training set
SPLIT_SEED = 42  # fixed seed so that Restart & Run All reproduces the same split

list_of_buckets = list(buckets.keys())
# Shuffle with a dedicated, seeded generator instead of the unseeded global
# one: the split (and therefore the files saved below) no longer changes from
# run to run, and the global `random` state is left untouched.
# Reproducibility assumes `buckets` yields its keys in a stable order.
random.Random(SPLIT_SEED).shuffle(list_of_buckets)
# NOTE(review): this is a random split, although the files it is later saved
# to are named *_chronological.txt (TRAIN_PATH / TEST_PATH) — confirm intent.
SPLIT_SIZE = int(len(list_of_buckets) * SPLIT_SIZE_TRAIN)
train_buckets = list_of_buckets[:SPLIT_SIZE]
test_buckets = list_of_buckets[SPLIT_SIZE:]

print("Duplicate groups in train: {}".format(len(train_buckets)))
print("Duplicate groups in test: {}".format(len(test_buckets)))

Duplicate groups in train: 163773
Duplicate groups in test: 18198


In [14]:
bucket_stats = { 'train' : { 'bucket_size' : {} }, 'test' : { 'bucket_size' : {} } }
    
def count_bucket_size(split, split_bucket):
    """Tally, per bucket size, how many buckets of a split have that size.

    Reads the module-level ``buckets`` mapping and updates the module-level
    ``bucket_stats`` dictionary in place; nothing is returned.

    Args:
        split: key of ``bucket_stats`` to update (the notebook uses
            'train' and 'test').
        split_bucket: iterable of bucket ids, each a key of ``buckets``.
    """
    for key in split_bucket:
        members = len(buckets[key])
        histogram = bucket_stats[split]['bucket_size']
        # A size seen for the first time starts from zero.
        histogram[members] = histogram.get(members, 0) + 1

count_bucket_size('train', train_buckets)
count_bucket_size('test', test_buckets)

bucket_stats

{'test': {'bucket_size': {1: 16283,
   2: 1303,
   3: 312,
   4: 120,
   5: 57,
   6: 46,
   7: 21,
   8: 11,
   9: 14,
   10: 5,
   11: 7,
   12: 2,
   13: 1,
   14: 2,
   15: 3,
   16: 1,
   17: 2,
   18: 2,
   19: 2,
   25: 1,
   26: 1,
   36: 1,
   41: 1}},
 'train': {'bucket_size': {1: 146638,
   2: 11493,
   3: 2938,
   4: 1178,
   5: 548,
   6: 324,
   7: 186,
   8: 110,
   9: 105,
   10: 53,
   11: 35,
   12: 26,
   13: 29,
   14: 18,
   15: 17,
   16: 3,
   17: 8,
   18: 10,
   19: 8,
   20: 2,
   21: 6,
   22: 4,
   23: 4,
   25: 3,
   26: 1,
   27: 1,
   28: 1,
   29: 2,
   30: 2,
   31: 3,
   32: 1,
   33: 3,
   36: 1,
   37: 1,
   38: 1,
   40: 1,
   41: 1,
   42: 2,
   43: 1,
   44: 1,
   45: 1,
   49: 1,
   54: 1,
   56: 1}}}

In [15]:
train_dups = []
test_dups = []
train_ids = []
test_ids = []

def create_pairs(name, set_buckets, pair_dups, ids):
    """Append every unordered pair of bugs sharing a bucket to ``pair_dups``.

    Reads the module-level ``buckets`` mapping. Both output lists are
    modified in place and nothing is returned.

    Args:
        name: label of the split; not used by the function body.
        set_buckets: iterable of bucket ids, each a key of ``buckets``.
        pair_dups: list receiving one ``[bug_id, dup_id]`` entry per pair.
        ids: list receiving both ids of every pair, so an id is repeated
            once for each pair it takes part in.
    """
    for key in set_buckets:
        members = list(buckets[key])
        total = len(members)
        for left in range(total):
            first = members[left]
            # Pair the current bug with every bug that follows it in the bucket.
            for right in range(left + 1, total):
                second = members[right]
                pair_dups.append([first, second])
                ids.extend((first, second))
                
# Build the duplicate pairs (and the flat id lists) for both splits.
create_pairs("train", train_buckets, train_dups, train_ids)
create_pairs("test", test_buckets, test_dups, test_ids)

# Report the sizes. The id lists hold two entries per pair, so they count
# repeated ids rather than distinct bugs.
print("Train pair size {}".format(len(train_dups)))
print("Test pair size {}".format(len(test_dups)))
print("******* IDS ***********")
print("Train ids size {}".format(len(train_ids)))
print("Test ids size {}".format(len(test_ids)))
# Leakage check: number of bug ids that occur in both the train and test pairs.
print("Train and test with equal instances {}".format(len(set(train_ids) & set(test_ids))))

Train pair size 87532
Test pair size 9904
******* IDS ***********
Train ids size 175064
Test ids size 19808
Train and test with equal instances 0


In [16]:
set(train_ids) & set(test_ids)

set()

In [17]:
train_dups[:20]

[[103384, 90017],
 [55428, 57796],
 [55428, 52229],
 [55428, 59307],
 [55428, 52696],
 [55428, 39449],
 [55428, 53307],
 [57796, 52229],
 [57796, 59307],
 [57796, 52696],
 [57796, 39449],
 [57796, 53307],
 [52229, 59307],
 [52229, 52696],
 [52229, 39449],
 [52229, 53307],
 [59307, 52696],
 [59307, 39449],
 [59307, 53307],
 [52696, 39449]]

In [18]:
test_dups[:20]

[[61739, 60341],
 [62217, 62171],
 [179665, 171610],
 [122522, 124522],
 [25881, 32026],
 [25881, 27569],
 [32026, 27569],
 [59515, 49646],
 [148344, 148047],
 [49112, 48345],
 [118539, 118541],
 [33225, 28703],
 [105526, 105335],
 [124492, 121470],
 [159016, 187495],
 [107396, 61767],
 [107396, 90412],
 [107396, 143408],
 [107396, 117114],
 [107396, 60540]]

## Logs

Output recorded from runs of this notebook, one section per dataset.

## Eclipse
- Number of groups 295713
- Duplicate groups in train: 266141
- Duplicate groups in test: 29572

```
{'test': {'bucket_size': {1: 27249,
   2: 1638,
   3: 402,
   4: 126,
   5: 56,
   6: 39,
   7: 17,
   8: 7,
   9: 11,
   10: 9,
   11: 2,
   12: 3,
   13: 2,
   14: 2,
   15: 3,
   16: 1,
   17: 1,
   21: 1,
   23: 1,
   30: 1,
   35: 1}},
 'train': {'bucket_size': {1: 244652,
   2: 15432,
   3: 3380,
   4: 1237,
   5: 574,
   6: 312,
   7: 167,
   8: 103,
   9: 78,
   10: 47,
   11: 39,
   12: 18,
   13: 23,
   14: 21,
   15: 9,
   16: 6,
   17: 10,
   18: 4,
   19: 3,
   20: 1,
   21: 4,
   22: 1,
   23: 3,
   24: 3,
   25: 1,
   26: 3,
   28: 1,
   29: 1,
   30: 2,
   33: 1,
   34: 1,
   40: 1,
   42: 1,
   45: 1,
   50: 1}}}
```

- Train pair size 76638
- Test pair size 8809
- ******* IDS ***********
- Train ids size 153276
- Test ids size 17618
- Train and test with equal instances 0
- ******* Retrieval ********
- Tested on 6163 group queries

## Netbeans

- Number of groups 181971
- Duplicate groups in train: 163773
- Duplicate groups in test: 18198

```
{'test': {'bucket_size': {1: 16283,
   2: 1303,
   3: 312,
   4: 120,
   5: 57,
   6: 46,
   7: 21,
   8: 11,
   9: 14,
   10: 5,
   11: 7,
   12: 2,
   13: 1,
   14: 2,
   15: 3,
   16: 1,
   17: 2,
   18: 2,
   19: 2,
   25: 1,
   26: 1,
   36: 1,
   41: 1}},
 'train': {'bucket_size': {1: 146638,
   2: 11493,
   3: 2938,
   4: 1178,
   5: 548,
   6: 324,
   7: 186,
   8: 110,
   9: 105,
   10: 53,
   11: 35,
   12: 26,
   13: 29,
   14: 18,
   15: 17,
   16: 3,
   17: 8,
   18: 10,
   19: 8,
   20: 2,
   21: 6,
   22: 4,
   23: 4,
   25: 3,
   26: 1,
   27: 1,
   28: 1,
   29: 2,
   30: 2,
   31: 3,
   32: 1,
   33: 3,
   36: 1,
   37: 1,
   38: 1,
   40: 1,
   41: 1,
   42: 2,
   43: 1,
   44: 1,
   45: 1,
   49: 1,
   54: 1,
   56: 1}}}
```
- Train pair size 87532
- Test pair size 9904
- ******* IDS ***********
- Train ids size 175064
- Test ids size 19808
- Train and test with equal instances 0
- ******* Retrieval ********
- Tested on 5433 group queries

## Open Office
- Number of groups 58572
- Duplicate groups in train: 52714
- Duplicate groups in test: 5858

```
{'test': {'bucket_size': {1: 5228,
   2: 419,
   3: 86,
   4: 45,
   5: 26,
   6: 12,
   7: 11,
   8: 9,
   9: 4,
   10: 3,
   11: 3,
   12: 3,
   13: 1,
   14: 2,
   15: 2,
   16: 2,
   18: 1,
   39: 1}},
 'train': {'bucket_size': {1: 47212,
   2: 3457,
   3: 925,
   4: 373,
   5: 228,
   6: 138,
   7: 84,
   8: 60,
   9: 44,
   10: 22,
   11: 26,
   12: 21,
   13: 19,
   14: 18,
   15: 12,
   16: 13,
   17: 7,
   18: 4,
   19: 5,
   20: 5,
   21: 6,
   22: 3,
   23: 3,
   24: 4,
   25: 3,
   26: 2,
   27: 1,
   28: 2,
   29: 3,
   30: 1,
   31: 1,
   32: 3,
   34: 1,
   35: 1,
   36: 3,
   39: 1,
   47: 1,
   55: 1,
   99: 1}}}
```

- Train pair size 52246
- Test pair size 4116
- ******* IDS ***********
- Train ids size 104492
- Test ids size 8232
- Train and test with equal instances 0
- ******* Retrieval ********
- Tested on 1922 group queries

### Save train pairs

In [19]:
TRAIN_PATH

'data/processed/netbeans/bert/train_chronological.txt'

In [20]:
# Save the training pairs, one "<bug_id> <dup_id>" pair per line.
with open(TRAIN_PATH, 'w') as out_file:
    out_file.writelines(
        "{} {}\n".format(dup_pair[0], dup_pair[1]) for dup_pair in train_dups
    )

### Save test pairs

In [21]:
TEST_PATH

'data/processed/netbeans/bert/test_chronological.txt'

In [22]:
# Regroup the bugs of the test pairs by duplicate bucket.
test_data = {}
for dup_pair in test_dups:
    first_bug, second_bug = int(dup_pair[0]), int(dup_pair[1])
    # The bucket is looked up from the first bug of the pair only.
    group = issues_by_buckets[first_bug]
    test_data.setdefault(group, set()).update((first_bug, second_bug))

# Save one bucket per line, as space-separated bug ids.
with open(TEST_PATH, 'w') as out_file:
    for members in test_data.values():
        out_file.write("{}\n".format(' '.join(map(str, members))))