# FastText pretrained in the dataset

https://github.com/facebookresearch/fastText

In [1]:
import fasttext
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline

Using TensorFlow backend.


In [3]:
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

In [4]:
# Domain to use
DOMAIN = 'eclipse'
# Dataset paths
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))

In [5]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

In [6]:
baseline.load_ids(DIR)
len(baseline.bug_ids)

Reading bug ids


361006

In [7]:
def data_padding(data, max_seq_length):
    seq_lengths = [len(seq) for seq in data]
    seq_lengths.append(6)
    max_seq_length = min(max(seq_lengths), max_seq_length)
    padded_data = np.zeros(shape=[len(data), max_seq_length])
    for i, seq in enumerate(data):
        seq = seq[:max_seq_length]
        for j, token in enumerate(seq):
            padded_data[i, j] = int(token)
    return padded_data.astype(np.int)

In [8]:
import _pickle as pickle

def load_bugs(baseline):   
    removed = []
    baseline.corpus = []
    baseline.sentence_dict = {}
    baseline.bug_set = {}
    title_padding, desc_padding = [], []
    for bug_id in tqdm(baseline.bug_ids):
        try:
            bug = pickle.load(open(os.path.join(baseline.DIR, 'bugs', '{}.pkl'.format(bug_id)), 'rb'))
            title_padding.append(bug['title_word'])
            desc_padding.append(bug['description_word'])
            baseline.bug_set[bug_id] = bug
            #break
        except:
            removed.append(bug_id)
    
    if len(removed) > 0:
        for x in removed:
            baseline.bug_ids.remove(x)
        baseline.removed = removed
        print("{} were removed. To see the list call self.removed".format(len(removed)))

In [9]:
%%time

load_bugs(baseline)

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))


CPU times: user 11.7 s, sys: 3.05 s, total: 14.8 s
Wall time: 15.1 s


### Read the corpus from bugs

In [10]:
lines_title = [baseline.bug_set[bug_id]['title'] for bug_id in baseline.bug_ids]
lines_desc = [baseline.bug_set[bug_id]['description'] for bug_id in baseline.bug_ids]

lines = []
for title, desc in zip(lines_title, lines_desc):
    lines.append(title)
    lines.append(desc)

In [11]:
len(lines)

722012

#### Export corpus 

In [12]:
EXPORT_PATH = os.path.join(DIR, 'corpus_fasttext.txt')
print(EXPORT_PATH)
with open(EXPORT_PATH, 'w') as f:
    for row in lines:
        f.write("{}\n".format(str(row).encode("utf-8")))

data/processed/eclipse/corpus_fasttext.txt


## Fasttext embedding

In [13]:
%%time

import os

cpu = os.cpu_count() - 1

# Skipgram model :
model = fasttext.train_unsupervised(EXPORT_PATH, 
                                    model='cbow', 
                                    minCount=1,
                                    neg=10,
                                    wordNgrams=3,
                                    epoch=15, 
                                    dim=300, 
                                    loss='hs', 
                                    thread=cpu, 
                                    verbose=2)

CPU times: user 6h 17min 25s, sys: 11.1 s, total: 6h 17min 36s
Wall time: 54min 22s


### Save dataset vocabulary embedding

In [14]:
import _pickle as pickle

In [15]:
vocab_embed = {}
for word in model.words:
    vocab_embed[word] = model[word]
    
with open(os.path.join(baseline.DIR, 'vocab_embed_fasttext.pkl'), 'wb') as f:
      pickle.dump(vocab_embed, f)

In [16]:
vocab_embed['eclipse']

array([ 2.27798879e-01, -3.70379865e-01, -1.78365380e-01,  5.32969192e-04,
       -3.62686247e-01,  3.92668277e-01, -4.49969381e-01,  1.07493997e-01,
        3.36375624e-01,  6.80377245e-01,  6.21302068e-01, -5.13689995e-01,
       -1.99582204e-01, -3.69280457e-01, -2.76777446e-01, -2.62730718e-01,
       -5.77495933e-01,  2.54344255e-01, -6.67263389e-01, -1.20380837e-02,
        4.69201386e-01, -2.46804208e-01,  2.85313725e-01,  1.06585130e-01,
        1.60871550e-01,  2.79145628e-01,  2.87379533e-01,  1.08026102e-01,
       -2.74197608e-01, -4.26531672e-01,  1.68908224e-01,  5.90773761e-01,
       -5.90794861e-01, -2.46319219e-01,  8.95781219e-02, -2.30665982e-01,
        4.25390720e-01, -1.52377427e-01,  4.07322735e-01,  7.29261994e-01,
        9.86797094e-01, -4.78519171e-01, -4.54688251e-01,  2.71126270e-01,
        1.91265941e-01, -8.22717607e-01, -2.86309514e-02, -1.43695042e-01,
       -1.92290708e-01,  5.70440650e-01, -6.63588867e-02,  1.01262629e+00,
        1.05394864e+00, -