# GloVe embeddings trained on the bug-report dataset

Reference tutorial: https://medium.com/@japneet121/word-vectorization-using-glove-76919685ee0b

In [1]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
# Expose the parent of the current working directory on sys.path so that
# packages living next to the notebook folder can be imported.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline

Using TensorFlow backend.


In [3]:
# Sequence-length caps passed to Baseline; the _T / _D suffixes presumably
# stand for bug title / description -- TODO confirm against methods.baseline.
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
# Embedding dimensionality; the GloVe model in this notebook is trained with
# 300 components.
EMBEDDING_DIM = 300
# Not referenced in the visible part of this notebook; presumably a
# vocabulary-size cap -- TODO confirm.
MAX_NB_WORDS = 2000

In [4]:
# Bug-tracker domain whose data is processed in this notebook
DOMAIN = 'openoffice'

# Input locations: processed bugs, normalized data, and the normalized CSV
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join(DIR_PAIRS, DOMAIN + '.csv')

# Directory holding GloVe embedding files
GLOVE_DIR = 'data/embed'

# Name templates used when saving models
SAVE_PATH = 'baseline_feature@number_of_epochs@epochs_64batch({})'.format(DOMAIN)
SAVE_PATH_FEATURE = 'baseline_feature_@number_of_epochs@epochs_64batch({})'.format(DOMAIN)

In [5]:
# Project helper from methods.baseline, constructed with the processed-data
# directory, the dataset CSV path and the two sequence-length caps.
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

In [6]:
# Read the bug ids for DIR (presumably into baseline.bug_ids -- verify in
# methods.baseline), then display how many ids are available.
baseline.load_ids(DIR)
len(baseline.bug_ids)

Reading bug ids


98070

In [7]:
def data_padding(data, max_seq_length):
    """Right-pad (and truncate) token-id sequences into a dense integer matrix.

    The output width is ``min(max(longest sequence, 6), max_seq_length)``.
    Sequences longer than the width are cut; shorter ones are padded with 0.

    Parameters
    ----------
    data : sequence of sequences
        Token ids; every token must be convertible with ``int()``.
    max_seq_length : int
        Upper bound on the width of the returned matrix.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(data), width)``.
    """
    # 6 is a floor on the width before the max_seq_length cap is applied; it
    # also keeps max() well-defined when `data` is empty.
    longest = max([len(seq) for seq in data] + [6])
    width = min(longest, max_seq_length)
    # `np.int` was removed in NumPy 1.24; it was an alias of the builtin int,
    # so allocating with dtype=int yields the same integer dtype.
    padded_data = np.zeros(shape=[len(data), width], dtype=int)
    for i, seq in enumerate(data):
        for j, token in enumerate(seq[:width]):
            padded_data[i, j] = int(token)
    return padded_data

In [8]:
import _pickle as pickle

def load_bugs(baseline):
    """Load the pickled bugs listed in ``baseline.bug_ids`` into ``baseline.bug_set``.

    Each bug is read from ``<baseline.DIR>/bugs/<bug_id>.pkl``. A bug is kept
    only if the file loads and the loaded object has both the ``'title_word'``
    and ``'description_word'`` entries. Ids that fail are removed from
    ``baseline.bug_ids`` and recorded in ``baseline.removed``.

    Side effects on ``baseline``: ``corpus`` is reset to ``[]``,
    ``sentence_dict`` and ``bug_set`` to ``{}``; ``removed`` is set only when
    at least one bug could not be loaded.

    Parameters
    ----------
    baseline : object
        Must expose ``DIR`` and ``bug_ids`` (a container supporting iteration
        and ``remove``).
    """
    removed = []
    baseline.corpus = []
    baseline.sentence_dict = {}
    baseline.bug_set = {}
    for bug_id in tqdm(baseline.bug_ids):
        try:
            bug_path = os.path.join(baseline.DIR, 'bugs', '{}.pkl'.format(bug_id))
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point this at bug files produced by a trusted pipeline.
            with open(bug_path, 'rb') as bug_file:
                bug = pickle.load(bug_file)
            # Touch both required fields so incomplete bugs are rejected.
            bug['title_word'], bug['description_word']
            baseline.bug_set[bug_id] = bug
        except Exception:
            # Best-effort load: any failure drops this bug, but unlike a bare
            # `except:` this lets KeyboardInterrupt / SystemExit propagate.
            removed.append(bug_id)

    if removed:
        # Removal is deferred so bug_ids is not mutated while being iterated.
        for bug_id in removed:
            baseline.bug_ids.remove(bug_id)
        baseline.removed = removed
        print("{} were removed. To see the list call self.removed".format(len(removed)))

In [9]:
%%time

# Fill baseline.bug_set from the per-bug pickle files.
load_bugs(baseline)

HBox(children=(IntProgress(value=0, max=98070), HTML(value='')))


CPU times: user 3.05 s, sys: 620 ms, total: 3.67 s
Wall time: 3.66 s


### Read the corpus from bugs

In [10]:
# Raw title / description text of every loaded bug, in baseline.bug_ids order.
lines_title = [
    baseline.bug_set[bug_id]['title']
    for bug_id in baseline.bug_ids
]
lines_desc = [
    baseline.bug_set[bug_id]['description']
    for bug_id in baseline.bug_ids
]

# Training corpus: for each bug its title followed by its description, each
# text tokenised by splitting on single spaces.
lines = [
    text.split(' ')
    for title_and_desc in zip(lines_title, lines_desc)
    for text in title_and_desc
]

In [11]:
# Two token lists are expected per bug: one for the title, one for the description.
len(lines)

196140

## GloVe embedding

In [12]:
%%time
# glove-python provides both the co-occurrence builder and the model.
from glove import Corpus, Glove

# Build the word co-occurrence matrix from the tokenised lines, using a
# context window of 3.
corpus = Corpus()
corpus.fit(lines, window=3)

# The embedding size comes from the notebook-level EMBEDDING_DIM constant
# rather than a duplicated literal, so the config and the trained vectors
# cannot drift apart.
glove = Glove(no_components=EMBEDDING_DIM, learning_rate=0.05)

# Train on the co-occurrence matrix, then attach the word -> index dictionary
# so vectors can be looked up by word.
glove.fit(corpus.matrix, epochs=100, no_threads=7, verbose=True)
glove.add_dictionary(corpus.dictionary)
# Uncomment to persist the trained model to disk.
#glove.save('glove.model')

Performing 100 training epochs with 7 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Epoch 86
Epoch 87
Epoch 88
Epoch 89
Epoch 90
Epoch 91
Epoch 92
Epoch 93
Epoch 94
Epoch 95
Epoch 96
Epoch 97
Epoch 98
Epoch 99
CPU times: user 34min 56s, sys: 2.02 s, total: 34min 58s
Wall ti

In [13]:
#glove_load = Glove.load('glove.model')

In [14]:
# Sanity check: fetch the trained vector of one word through its dictionary index.
glove.word_vectors[glove.dictionary['eclipse']]

array([ 6.01867771e-02,  3.32708199e-03,  5.34703701e-02,  3.59755813e-02,
       -1.50028588e-02,  7.06684434e-02,  1.20937319e-02,  4.71609428e-02,
       -2.78733469e-02, -2.81159162e-02, -3.55382513e-02, -1.14766120e-01,
       -9.87747919e-02,  3.94556895e-02, -6.15725544e-03, -1.39234217e-01,
       -1.90051784e-01, -1.48989947e-01,  1.69909657e-01, -3.31296975e-02,
       -1.17696441e-01, -7.53156419e-02, -9.57684318e-03,  2.62665182e-02,
        1.35284587e-01,  1.18351070e-01,  1.17474979e-02,  5.67653469e-02,
        4.53637514e-02,  4.82215382e-02, -4.73141708e-02, -6.45379264e-02,
        3.91995341e-02,  1.08785577e-01,  3.45878978e-02, -4.58123806e-02,
       -3.01455171e-02, -5.60844276e-02,  3.50961465e-02,  3.14311142e-02,
       -1.47080104e-02, -1.70614988e-02,  1.14051187e-01, -2.85323764e-02,
        2.12754614e-02, -1.26887494e-01, -9.57197256e-03, -1.79812675e-02,
       -4.85786148e-02,  6.53073952e-02,  5.48591859e-02,  7.00605318e-02,
        6.12393105e-02, -

### Save dataset vocabulary embedding

In [15]:
import _pickle as pickle

In [16]:
# Map every vocabulary word to its embedding, converted to a plain Python
# list via .tolist(), then pickle the mapping into baseline.DIR.
vocab_embed = {
    word: glove.word_vectors[index].tolist()
    for word, index in glove.dictionary.items()
}

with open(os.path.join(baseline.DIR, 'vocab_embed.pkl'), 'wb') as f:
    pickle.dump(vocab_embed, f)

In [17]:
# Spot-check one exported entry: the embedding of 'eclipse' as a plain list.
vocab_embed['eclipse']

[0.06018677705108694,
 0.0033270819852815473,
 0.05347037007380757,
 0.035975581313017674,
 -0.01500285880764355,
 0.07066844335723264,
 0.012093731948392932,
 0.04716094277272536,
 -0.02787334690425084,
 -0.02811591618614839,
 -0.03553825133506295,
 -0.11476612027247722,
 -0.09877479190689448,
 0.039455689463685426,
 -0.006157255444275719,
 -0.13923421726050955,
 -0.19005178424569263,
 -0.14898994703099985,
 0.16990965694464144,
 -0.03312969751863516,
 -0.11769644130720908,
 -0.0753156418556876,
 -0.009576843180881093,
 0.02626651815130591,
 0.135284587267566,
 0.11835106958835317,
 0.011747497945589856,
 0.05676534686989343,
 0.04536375141429867,
 0.0482215382301911,
 -0.04731417083862177,
 -0.06453792644014465,
 0.03919953408952698,
 0.10878557696307085,
 0.03458789779286799,
 -0.04581238058837084,
 -0.03014551710146009,
 -0.056084427561489854,
 0.035096146543139924,
 0.03143111417889879,
 -0.014708010357184814,
 -0.017061498788677538,
 0.11405118732442267,
 -0.02853237637786268,
 0