# FastText embeddings trained on the dataset

https://github.com/facebookresearch/fastText

In [1]:
import fasttext
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
# Put the parent of the current working directory on the import path so that
# packages living next to this notebook's folder can be imported.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline

Using TensorFlow backend.


In [3]:
# Sequence-length caps handed to Baseline below (presumably T = title and
# D = description; the trailing numbers look like earlier settings — TODO confirm).
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
# Word-vector dimensionality; the fastText training cell trains 300-dimensional vectors too.
EMBEDDING_DIM = 300
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
MAX_NB_WORDS = 2000

In [4]:
# Project (domain) whose bug reports are processed
DOMAIN = 'eclipse'
# Dataset locations, all derived from the domain name
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
# The CSV lives inside DIR_PAIRS and is named after the domain
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))

In [5]:
# Project helper built from the processed-data directory, the dataset CSV path and
# the two sequence-length caps (what the constructor does is not visible here).
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

In [6]:
# Load the bug ids for DIR (presumably into baseline.bug_ids, whose length is
# displayed as the cell output — verify against Baseline.load_ids).
baseline.load_ids(DIR)
len(baseline.bug_ids)

Reading bug ids


212512

In [7]:
def data_padding(data, max_seq_length):
    """Right-pad (and truncate) token sequences into a dense integer matrix.

    The matrix width is ``min(max(longest sequence, 6), max_seq_length)``:
    it never drops below 6 columns unless ``max_seq_length`` itself is
    smaller, and it never exceeds ``max_seq_length``.

    Args:
        data: sequence of token sequences; every token must be accepted by
            ``int()`` (integers or numeric strings).
        max_seq_length: upper bound on the number of columns.

    Returns:
        ``numpy.ndarray`` of shape ``(len(data), width)`` with the platform
        default integer dtype. Row ``i`` holds the first ``width`` tokens of
        ``data[i]``; unused trailing positions are 0.
    """
    seq_lengths = [len(seq) for seq in data]
    # Floor of 6 on the candidate width, so short/empty batches still get columns.
    seq_lengths.append(6)
    max_seq_length = min(max(seq_lengths), max_seq_length)
    # Allocate integers directly: `np.int` was removed in NumPy 1.24, and going
    # through a float64 buffer could round token ids above 2**53.
    padded_data = np.zeros(shape=[len(data), max_seq_length], dtype=int)
    for i, seq in enumerate(data):
        tokens = [int(token) for token in seq[:max_seq_length]]
        if tokens:
            padded_data[i, :len(tokens)] = tokens
    return padded_data

In [8]:
import _pickle as pickle

def load_bugs(baseline):
    """Load every pickled bug listed in ``baseline.bug_ids`` into ``baseline.bug_set``.

    For each id the file ``<baseline.DIR>/bugs/<id>.pkl`` is unpickled. A bug
    is kept only if the file loads and the record exposes both
    ``'title_word'`` and ``'description_word'``; otherwise its id is dropped
    from ``baseline.bug_ids`` and recorded in ``baseline.removed``.

    Side effects on ``baseline``: ``corpus`` is reset to ``[]``,
    ``sentence_dict`` and ``bug_set`` to ``{}`` before loading; ``removed`` is
    only set when at least one bug was dropped.

    Args:
        baseline: object exposing ``DIR`` and a mutable ``bug_ids`` collection
            that supports ``remove``.
    """
    removed = []
    baseline.corpus = []
    baseline.sentence_dict = {}
    baseline.bug_set = {}
    bugs_dir = os.path.join(baseline.DIR, 'bugs')
    for bug_id in tqdm(baseline.bug_ids):
        try:
            # NOTE: unpickling executes arbitrary code; only load trusted files.
            with open(os.path.join(bugs_dir, '{}.pkl'.format(bug_id)), 'rb') as bug_file:
                bug = pickle.load(bug_file)
            # Subscript both token fields so records lacking them are rejected.
            bug['title_word'], bug['description_word']
            baseline.bug_set[bug_id] = bug
        except Exception:
            # Best-effort load: any unreadable or malformed record is skipped.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
            removed.append(bug_id)

    if len(removed) > 0:
        for x in removed:
            baseline.bug_ids.remove(x)
        baseline.removed = removed
        print("{} were removed. To see the list call baseline.removed".format(len(removed)))

In [9]:
%%time

# Fills baseline.bug_set from the per-bug pickle files; ids that cannot be loaded
# are removed from baseline.bug_ids and listed in baseline.removed.
load_bugs(baseline)

HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))


CPU times: user 6.62 s, sys: 1.31 s, total: 7.93 s
Wall time: 7.92 s


### Build the corpus from the loaded bugs

In [10]:
# Titles and descriptions of the loaded bugs, both in baseline.bug_ids order.
lines_title = [baseline.bug_set[bug_id]['title'] for bug_id in baseline.bug_ids]
lines_desc = [baseline.bug_set[bug_id]['description'] for bug_id in baseline.bug_ids]

# Interleave them: each bug contributes its title followed by its description.
lines = [text for pair in zip(lines_title, lines_desc) for text in pair]

In [11]:
# Two corpus entries (title, then description) per loaded bug.
len(lines)

425024

#### Export corpus 

In [12]:
# Plain-text corpus consumed by the fastText training cell: one entry per line.
EXPORT_PATH = os.path.join(DIR, 'corpus_fasttext.txt')
print(EXPORT_PATH)
# Let the file object do the UTF-8 encoding. Formatting `str(row).encode("utf-8")`
# into a str writes the bytes *repr* (b'...' with escaped non-ASCII characters),
# which injects tokens such as "b'word" into the trained vocabulary.
with open(EXPORT_PATH, 'w', encoding='utf-8') as f:
    for row in lines:
        # Flatten embedded line breaks so each entry stays on a single line.
        text = str(row).replace('\r', ' ').replace('\n', ' ')
        f.write(text + '\n')

data/processed/eclipse/corpus_fasttext.txt


## Fasttext embedding

In [13]:
%%time

import os

# Leave one core free for the rest of the system. os.cpu_count() may return
# None, and a single-core machine must still get one training thread.
cpu = max(1, (os.cpu_count() or 1) - 1)

# CBOW model trained with hierarchical softmax on the exported corpus.
model = fasttext.train_unsupervised(EXPORT_PATH,
                                    model='cbow',
                                    minCount=1,          # keep every token, even singletons
                                    neg=10,              # only consulted by loss='ns'; inert with 'hs'
                                    wordNgrams=1,
                                    epoch=15,
                                    dim=EMBEDDING_DIM,   # 300, shared with the config cell
                                    loss='hs',
                                    thread=cpu,
                                    verbose=2)

### Save dataset vocabulary embedding

In [14]:
import _pickle as pickle

In [15]:
# Map every word of the trained vocabulary to its embedding vector.
vocab_embed = {token: model[token] for token in model.words}

# Persist the mapping next to the processed dataset.
with open(os.path.join(baseline.DIR, 'vocab_embed_fasttext.pkl'), 'wb') as f:
    pickle.dump(vocab_embed, f)

In [22]:
# Sanity check: embed a short phrase with the trained model.
# NOTE(review): this cell carries execution count 22, i.e. it was run out of
# order — re-run the notebook top to bottom before trusting the output.
model.get_sentence_vector('test vector')

array([ 0.00175972,  0.02722351, -0.01460259, -0.02026979, -0.01797591,
        0.05911409,  0.0319526 ,  0.02416203,  0.00084752,  0.0302299 ,
        0.03696913, -0.00107778,  0.04835556, -0.00819601, -0.01121616,
       -0.03569226, -0.08328871,  0.04925559,  0.00762468,  0.04186817,
        0.05689619, -0.03952855, -0.04416988,  0.02871136,  0.0231767 ,
       -0.06488574,  0.11846077,  0.00388976,  0.04012141, -0.00241362,
        0.0097272 ,  0.00541798, -0.03951032,  0.03909737,  0.01908095,
       -0.01078507,  0.00778152,  0.02252823,  0.03085625, -0.03297628,
       -0.00321536,  0.023133  ,  0.05466954, -0.01693534,  0.02910661,
       -0.05485218,  0.03256058,  0.01006469, -0.01969906, -0.02642566,
       -0.01864357, -0.03623489,  0.04043628, -0.05952513,  0.11100627,
        0.00285595,  0.04999549, -0.04425278,  0.01423279,  0.01518322,
       -0.01133189,  0.03238859,  0.06133053, -0.10155558, -0.03357397,
        0.06344308, -0.01000919,  0.0115397 ,  0.09718227, -0.00

In [16]:
# Look up the stored vector for the token 'eclipse' (KeyError if it is not in the vocabulary).
vocab_embed['eclipse']

array([-0.2817357 ,  0.07263922, -0.23383522, -0.27178627, -0.46044028,
        0.09930528, -0.08266941, -0.3414438 , -0.12172007, -0.02020703,
        0.04272586,  0.12412646, -0.23061396, -0.02389326, -0.25645465,
        0.51635754,  0.60051656, -0.5477507 , -0.03928194, -0.4381328 ,
       -0.14773576,  0.6296185 , -0.28304514, -0.4619913 ,  0.11263881,
       -0.19993496, -0.09578929,  0.17086042, -0.08002959, -0.0577875 ,
       -0.14353517,  0.20513104, -0.14546664,  0.41664863, -0.00909067,
       -0.44577545, -0.05814071,  0.48839086,  0.4224951 , -0.12613736,
       -0.0060224 , -0.01784417,  0.31477505,  0.13123685, -0.24293622,
       -0.84600616,  0.12673122, -0.09047916, -0.35037407, -0.22508188,
        0.05803216,  0.29191056,  0.0483678 , -0.00171087,  0.01306592,
        0.53967595,  0.23050229, -0.17401344,  0.20859607, -0.03752097,
        0.31133562,  0.13282089,  0.46954548,  0.12900744,  0.4844747 ,
        0.27863738, -0.58140314, -0.47631353, -0.3390309 ,  0.37