# Pretrained BERT embeddings for the dataset

Embeddings are computed with the `bert-embedding` package: https://pypi.org/project/bert-embedding/

## Word embedding vocabulary

In [1]:
import fasttext
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
# Put the parent of the current working directory on the import path
# (presumably where the project's `methods` package lives — confirm).
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline

Using TensorFlow backend.


In [3]:
# Length limit for bug titles: used to slice each title and passed to Baseline.
# The trailing "# 40" looks like a previous/alternative value — TODO confirm.
MAX_SEQUENCE_LENGTH_T = 100 # 40
# Length limit for bug descriptions: used to slice each description, passed to
# Baseline, and used as max_seq_length of the BERT embedder.
MAX_SEQUENCE_LENGTH_D = 200 # 200
# NOTE(review): not referenced in the visible cells; the BERT vectors checked at
# the end of the notebook have 768 entries, not 300.
EMBEDDING_DIM = 300
# NOTE(review): not referenced in the visible cells.
MAX_NB_WORDS = 2000

In [4]:
# Name of the dataset/domain; it selects the directories and CSV file below
DOMAIN = 'eclipse'
# Directory handed to Baseline and to load_ids
DIR = 'data/processed/{}'.format(DOMAIN)
# Directory holding the normalized data for this domain
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
# Dataset CSV, located inside the normalized directory
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))

In [5]:
# Build the project's Baseline helper from the processed-data directory, the
# dataset CSV path and the title/description length limits.
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

In [6]:
# Read the bug ids for this domain (presumably this fills baseline.bug_ids,
# which the following cells iterate over — confirm in methods.baseline).
baseline.load_ids(DIR)
# Display how many bug ids are available
len(baseline.bug_ids)

Reading bug ids


361006

In [7]:
import _pickle as pickle

def load_bugs(baseline):
    """Load every pickled bug listed in ``baseline.bug_ids`` into memory.

    For each id, the file ``<baseline.DIR>/bugs/<bug_id>.pkl`` is unpickled
    and stored in ``baseline.bug_set[bug_id]``. Ids whose file cannot be
    opened or unpickled are removed from ``baseline.bug_ids`` (the list is
    modified in place) and collected in ``baseline.removed``; that attribute
    is only set when at least one id was dropped. ``baseline.corpus`` and
    ``baseline.sentence_dict`` are reset to empty containers.

    Parameters
    ----------
    baseline : object
        Must expose ``DIR`` (base directory) and ``bug_ids`` (a list of ids).

    Returns
    -------
    None
    """
    removed = []
    baseline.corpus = []
    baseline.sentence_dict = {}
    baseline.bug_set = {}
    bugs_dir = os.path.join(baseline.DIR, 'bugs')
    for bug_id in tqdm(baseline.bug_ids):
        try:
            # SECURITY: unpickling can execute arbitrary code; only load
            # files produced by this project's own preprocessing.
            with open(os.path.join(bugs_dir, '{}.pkl'.format(bug_id)), 'rb') as f:
                baseline.bug_set[bug_id] = pickle.load(f)
        except Exception:
            # Best effort: a missing or corrupt file drops the id, while
            # KeyboardInterrupt/SystemExit still propagate.
            removed.append(bug_id)

    if removed:
        # Filter once through a set instead of calling list.remove per id.
        dropped = set(removed)
        baseline.bug_ids[:] = [bug_id for bug_id in baseline.bug_ids if bug_id not in dropped]
        baseline.removed = removed
        print("{} were removed. To see the list call self.removed".format(len(removed)))

In [8]:
%%time

# Fill baseline.bug_set from the per-bug pickle files; ids that fail to load
# are dropped from baseline.bug_ids.
load_bugs(baseline)

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))


CPU times: user 11.8 s, sys: 2.3 s, total: 14.1 s
Wall time: 14.1 s


#### Read the corpus from bugs

In [9]:
# Collect the bugs in the same order as baseline.bug_ids, then take the
# length-limited title and description of each one.
# NOTE(review): the slice counts characters if these fields are strings and
# tokens if they are lists — confirm which is intended.
bugs_in_order = [baseline.bug_set[bug_id] for bug_id in baseline.bug_ids]
sent_title = [bug_entry['title'][:MAX_SEQUENCE_LENGTH_T] for bug_entry in bugs_in_order]
sent_desc = [bug_entry['description'][:MAX_SEQUENCE_LENGTH_D] for bug_entry in bugs_in_order]

In [10]:
# Both lists are built from baseline.bug_ids, so the two counts should match
len(sent_title), len(sent_desc)

(361006, 361006)

### BERT embedding

In [11]:
import mxnet as mx
from bert_embedding import BertEmbedding

# Run the model on GPU device 0 (needs a GPU-enabled MXNet install).
ctx = mx.gpu(0)
# A single embedder is used for both titles and descriptions below, so the
# (larger) description limit is passed as max_seq_length.
bert_embedding = BertEmbedding(ctx, batch_size=32, max_seq_length=MAX_SEQUENCE_LENGTH_D)

### Save dataset vocabulary embedding

In [13]:
import _pickle as pickle

In [23]:
# Number of leading bugs to skip when resuming this cell; presumably the
# counts already processed by two earlier, interrupted runs — TODO confirm.
diff = 3208 + 1522

# The context manager closes the progress bar even if embedding or writing fails.
with tqdm(total=max(len(baseline.bug_ids) - diff, 0)) as loop:
    for index, (title, desc, bug_id) in enumerate(zip(sent_title, sent_desc, baseline.bug_ids)):
        # Resume support: skip the first `diff` bugs. The original used `pass`
        # here, which skipped nothing and re-embedded every bug.
        if index < diff:
            continue

        # NOTE(review): bert_embedding is documented to take a list of
        # sentences, and only the first result is used below — confirm that
        # `title`/`desc` have the shape the embedder expects.
        result_title = bert_embedding(title, 'avg')
        result_desc = bert_embedding(desc, 'avg')

        bug = baseline.bug_set[bug_id]
        # Average the per-token vectors of the first result into one vector
        bug['title_bert_embed'] = np.mean(result_title[0][1], 0)
        bug['desc_bert_embed'] = np.mean(result_desc[0][1], 0)

        # Overwrite the bug's pickle so the embeddings are persisted
        with open(os.path.join(baseline.DIR, 'bugs', '{}.pkl'.format(bug_id)), 'wb') as f:
            pickle.dump(bug, f)
        loop.update(1)

HBox(children=(IntProgress(value=0, max=357798), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Spot-check one randomly drawn bug: both stored embedding vectors must
# have 768 entries.
bug_selected = np.random.choice(baseline.bug_ids, 1)[0]
bug = baseline.bug_set[bug_selected]

for embed_key in ('title_bert_embed', 'desc_bert_embed'):
    assert len(bug[embed_key]) == 768