In [1]:
import keras
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
# %matplotlib inline

from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

import keras.backend as K
from keras_bert import load_vocabulary
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import _pickle as pickle
import random

Using TensorFlow backend.


In [2]:
# Run parameters are passed through environment variables and read back with os.environ.
# 'epochs' and 'base' are read in the configuration cells below.
# NOTE(review): 'method' is not read anywhere in the visible part of this notebook -- confirm it is still needed.
%env epochs 1000
%env base openoffice
%env method deepQL_topics_trainable

env: epochs=1000
env: base=openoffice
env: method=deepQL_topics_trainable


In [3]:
# NOTE(review): EMBEDDING_DIM and MAX_NB_WORDS are not referenced anywhere else in the
# visible part of this notebook -- confirm whether they are still needed.
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000
# Number of epochs, parsed from the 'epochs' environment variable.
# Raises KeyError if the variable is not set.
epochs = int(os.environ['epochs'])

In [54]:
# Dataset name comes from the 'base' environment variable; every path below derives from it.
DOMAIN = os.environ['base']
TOKEN = PREPROCESSING = 'bert'
# Normalized CSV of the dataset and the directory holding its preprocessed artifacts.
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
DIR = 'data/processed/{}/{}'.format(DOMAIN, PREPROCESSING)
# Output files for the train pairs and the test groups.
# NOTE(review): the file names say "chronological", but the split in this notebook is a
# random shuffle of buckets -- confirm this matches what downstream readers expect.
TRAIN_PATH, TEST_PATH = (
    os.path.join(DIR, file_name)
    for file_name in ('train_chronological.txt', 'test_chronological.txt')
)

In [5]:
# Directory of the pretrained checkpoint (relative to the working directory) and the
# three files read from it: model config, checkpoint and vocabulary.
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path, model_path, vocab_path = (
    os.path.join(pretrained_path, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

In [6]:
# Load the vocabulary file of the pretrained model; the result is indexed by token
# string further down (token_dict['[CLS]'], token_dict['[SEP]']).
token_dict = load_vocabulary(vocab_path)

In [7]:
# Sequence-length limits handed to Baseline below.
# presumably T = title and D = description -- TODO confirm against methods.baseline
MAX_SEQUENCE_LENGTH_T = 100
MAX_SEQUENCE_LENGTH_D = 100

In [12]:
# Instantiate the project helpers used by the rest of the notebook. Baseline receives the
# dataset locations, both sequence-length limits and the token_dict entries for the
# '[CLS]' and '[SEP]' tokens.
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 
                    token_dict['[CLS]'], token_dict['[SEP]'])
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [13]:
# Register the retrieval helper with the experiment for this domain.
# NOTE(review): `retrieval.buckets` is read only after this call and get_buckets_for_bugs()
# have run -- presumably one of them populates it; confirm in methods.experiments.
experiment.set_retrieval(retrieval, baseline, DOMAIN)

In [14]:
# Lookup indexed by bug id; the test-export cell uses issues_by_buckets[bug_id] to find
# the bucket (duplicate group) a bug belongs to.
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=58572), HTML(value='')))




### Create split based on clusters 

In [43]:
# Total number of buckets (duplicate groups) held by the retrieval helper.
print("Number of groups {}".format(len(retrieval.buckets)))

Number of groups 58572


In [16]:
# Mapping keyed by bucket id; buckets[bucket_id] is the collection of bug ids in that
# bucket (the cells below use .keys(), len(buckets[id]) and list(buckets[id])).
buckets = retrieval.buckets

In [17]:
# Fraction of buckets (duplicate groups) assigned to the training split.
SPLIT_SIZE_TRAIN = 0.9
# Fixed seed so that the shuffle -- and therefore the train/test files written at the end
# of the notebook -- is reproducible across kernel restarts.
# Reproducibility also assumes `buckets` is built in a stable key order -- TODO confirm.
SPLIT_SEED = 42

list_of_buckets = list(buckets.keys())
# A dedicated Random instance keeps the split independent of any other consumer of the
# global `random` state.
random.Random(SPLIT_SEED).shuffle(list_of_buckets)
# Splitting whole buckets (not single bugs) keeps every duplicate group inside one split.
SPLIT_SIZE = int(len(list_of_buckets) * SPLIT_SIZE_TRAIN)
train_buckets = list_of_buckets[:SPLIT_SIZE]
test_buckets = list_of_buckets[SPLIT_SIZE:]

# Every bucket must land in exactly one of the two splits.
assert len(train_buckets) + len(test_buckets) == len(list_of_buckets)

print("Duplicate groups in train: {}".format(len(train_buckets)))
print("Duplicate groups in test: {}".format(len(test_buckets)))

Duplicate groups in train: 52714
Duplicate groups in test: 5858


In [18]:
# Per-split histogram: bucket_stats[split]['bucket_size'][size] counts the buckets of that
# split that contain `size` bugs. Filled in by count_bucket_size below.
bucket_stats = { 'train' : { 'bucket_size' : {} }, 'test' : { 'bucket_size' : {} } }
    
def count_bucket_size(split, split_bucket):
    """Tally how many buckets of each size belong to one split.

    For every bucket id in ``split_bucket`` the size ``len(buckets[bucket_id])`` is taken
    from the notebook-level ``buckets`` mapping and the matching counter in
    ``bucket_stats[split]['bucket_size']`` is incremented (starting at 1 on first use).

    Args:
        split: Key of the split inside ``bucket_stats`` (called with 'train' and 'test').
        split_bucket: Iterable of bucket ids that belong to that split.

    Returns:
        None. ``bucket_stats`` is updated in place.
    """
    for group_id in split_bucket:
        size = len(buckets[group_id])
        histogram = bucket_stats[split]['bucket_size']
        histogram[size] = histogram.get(size, 0) + 1

# Fill the histogram for both splits.
count_bucket_size('train', train_buckets)
count_bucket_size('test', test_buckets)

# Display the resulting bucket-size distribution per split.
bucket_stats

{'test': {'bucket_size': {1: 5228,
   2: 419,
   3: 86,
   4: 45,
   5: 26,
   6: 12,
   7: 11,
   8: 9,
   9: 4,
   10: 3,
   11: 3,
   12: 3,
   13: 1,
   14: 2,
   15: 2,
   16: 2,
   18: 1,
   39: 1}},
 'train': {'bucket_size': {1: 47212,
   2: 3457,
   3: 925,
   4: 373,
   5: 228,
   6: 138,
   7: 84,
   8: 60,
   9: 44,
   10: 22,
   11: 26,
   12: 21,
   13: 19,
   14: 18,
   15: 12,
   16: 13,
   17: 7,
   18: 4,
   19: 5,
   20: 5,
   21: 6,
   22: 3,
   23: 3,
   24: 4,
   25: 3,
   26: 2,
   27: 1,
   28: 2,
   29: 3,
   30: 1,
   31: 1,
   32: 3,
   34: 1,
   35: 1,
   36: 3,
   39: 1,
   47: 1,
   55: 1,
   99: 1}}}

In [20]:
# Accumulators filled in place by create_pairs: the duplicate pairs of each split and the
# bug ids that occur in those pairs. Four independent lists.
train_dups, test_dups, train_ids, test_ids = [], [], [], []

def create_pairs(name, set_buckets, pair_dups, ids):
    """Append every unordered pair of bugs that share a bucket.

    Each bucket id in ``set_buckets`` is looked up in the notebook-level ``buckets``
    mapping and its members are materialised as a list. Every member is then paired with
    each member that follows it in that list, so buckets with a single bug contribute
    nothing.

    Args:
        name: Label of the split; not used by the function body.
        set_buckets: Iterable of bucket ids to expand.
        pair_dups: List that receives one ``[bug_id, dup_id]`` entry per pair.
        ids: List that receives both ids of every pair, so an id is repeated once for
            each pair it takes part in.

    Returns:
        None. ``pair_dups`` and ``ids`` are extended in place.
    """
    for group_id in set_buckets:
        members = list(buckets[group_id])
        total = len(members)
        for left in range(total):
            first = members[left]
            for right in range(left + 1, total):
                second = members[right]
                pair_dups.append([first, second])
                ids.extend((first, second))
                
# Expand the buckets of each split into duplicate pairs; the four lists are filled in place.
create_pairs("train", train_buckets, train_dups, train_ids)
create_pairs("test", test_buckets, test_dups, test_ids)

# Report the sizes. The id lists hold both members of every pair, so ids repeat; the last
# line counts bug ids present in both splits.
print("\n".join((
    "Train pair size {}".format(len(train_dups)),
    "Test pair size {}".format(len(test_dups)),
    "******* IDS ***********",
    "Train ids size {}".format(len(train_ids)),
    "Test ids size {}".format(len(test_ids)),
    "Train and test with equal instances {}".format(len(set(train_ids) & set(test_ids))),
)))

Train pair size 52246
Test pair size 4116
******* IDS ***********
Train ids size 104492
Test ids size 8232
Train and test with equal instances 0


In [27]:
# Bug ids that occur in both splits; an empty set means the splits share no bug.
set(train_ids) & set(test_ids)

set()

In [28]:
# Preview the first 20 training pairs.
train_dups[:20]

[[7499, 14267],
 [7499, 14451],
 [7499, 14335],
 [14267, 14451],
 [14267, 14335],
 [14451, 14335],
 [109857, 109858],
 [55321, 45346],
 [55321, 25797],
 [55321, 39359],
 [45346, 25797],
 [45346, 39359],
 [25797, 39359],
 [92745, 92738],
 [109864, 109794],
 [50427, 50284],
 [50504, 49782],
 [64488, 62539],
 [64488, 61654],
 [62539, 61654]]

In [29]:
# Preview the first 20 test pairs.
test_dups[:20]

[[68834, 61479],
 [65216, 64835],
 [28992, 26890],
 [28992, 34122],
 [28992, 22287],
 [28992, 20402],
 [28992, 13267],
 [28992, 13492],
 [28992, 30742],
 [28992, 16410],
 [26890, 34122],
 [26890, 22287],
 [26890, 20402],
 [26890, 13267],
 [26890, 13492],
 [26890, 30742],
 [26890, 16410],
 [34122, 22287],
 [34122, 20402],
 [34122, 13267]]

## Logs

## Open Office
- Number of groups 58572
- Duplicate groups in train: 52714
- Duplicate groups in test: 5858

```
    {'test': {'bucket_size': {1: 5228,
   2: 419,
   3: 86,
   4: 45,
   5: 26,
   6: 12,
   7: 11,
   8: 9,
   9: 4,
   10: 3,
   11: 3,
   12: 3,
   13: 1,
   14: 2,
   15: 2,
   16: 2,
   18: 1,
   39: 1}},
 'train': {'bucket_size': {1: 47212,
   2: 3457,
   3: 925,
   4: 373,
   5: 228,
   6: 138,
   7: 84,
   8: 60,
   9: 44,
   10: 22,
   11: 26,
   12: 21,
   13: 19,
   14: 18,
   15: 12,
   16: 13,
   17: 7,
   18: 4,
   19: 5,
   20: 5,
   21: 6,
   22: 3,
   23: 3,
   24: 4,
   25: 3,
   26: 2,
   27: 1,
   28: 2,
   29: 3,
   30: 1,
   31: 1,
   32: 3,
   34: 1,
   35: 1,
   36: 3,
   39: 1,
   47: 1,
   55: 1,
   99: 1}}}
```

- Train pair size 52246
- Test pair size 4116
- `******* IDS ***********`
- Train ids size 104492
- Test ids size 8232
- Train and test with equal instances 0
- `******* Retrieval ********`
- Tested on 1922 group queries

### Save train pairs

In [40]:
# Show the file the training pairs are written to.
TRAIN_PATH

'data/processed/openoffice/bert/train_chronological.txt'

In [41]:
# Write one training pair per line as "<bug_id> <dup_id>"; an existing file is overwritten.
with open(TRAIN_PATH, 'w') as train_file:
    train_file.writelines(
        "{} {}\n".format(pair[0], pair[1]) for pair in train_dups
    )

### Save test pairs

In [55]:
# Show the file the test groups are written to.
TEST_PATH

'data/processed/openoffice/bert/test_chronological.txt'

In [56]:
# Regroup the test pairs by bucket: bucket (looked up through the first bug of each pair)
# -> set of every bug id seen in that bucket's pairs.
test_data = {}
for pair in test_dups:
    bug1, bug2 = int(pair[0]), int(pair[1])
    bucket = issues_by_buckets[bug1]
    members = test_data.setdefault(bucket, set())
    members.add(bug1)
    members.add(bug2)

# Unlike the train file (one pair per line), the test file holds one group per line:
# the group's bug ids separated by single spaces. An existing file is overwritten.
with open(TEST_PATH, 'w') as f:
    for members in test_data.values():
        f.write("{}\n".format(' '.join(map(str, members))))