# FastText pretrained in the dataset

https://github.com/facebookresearch/fastText

In [1]:
import fasttext
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline

Using TensorFlow backend.


In [3]:
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

In [21]:
# Domain to use
DOMAIN = 'eclipse'
# Dataset paths
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Save model
SAVE_PATH_FEATURE = '{}_word_feature_@number_of_epochs@epochs({})'.format('fasttext', DOMAIN)

In [5]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

In [6]:
baseline.load_ids(DIR)
len(baseline.bug_ids)

Reading bug ids


361006

In [7]:
def data_padding(data, max_seq_length):
    seq_lengths = [len(seq) for seq in data]
    seq_lengths.append(6)
    max_seq_length = min(max(seq_lengths), max_seq_length)
    padded_data = np.zeros(shape=[len(data), max_seq_length])
    for i, seq in enumerate(data):
        seq = seq[:max_seq_length]
        for j, token in enumerate(seq):
            padded_data[i, j] = int(token)
    return padded_data.astype(np.int)

In [8]:
import _pickle as pickle

def load_bugs(baseline):   
    removed = []
    baseline.corpus = []
    baseline.sentence_dict = {}
    baseline.bug_set = {}
    for bug_id in tqdm(baseline.bug_ids):
        try:
            bug = pickle.load(open(os.path.join(baseline.DIR, 'bugs', '{}.pkl'.format(bug_id)), 'rb'))
            baseline.bug_set[bug_id] = bug
            #break
        except:
            removed.append(bug_id)
    
    if len(removed) > 0:
        for x in removed:
            baseline.bug_ids.remove(x)
        baseline.removed = removed
        print("{} were removed. To see the list call self.removed".format(len(removed)))

In [9]:
%%time

load_bugs(baseline)

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))


CPU times: user 11.9 s, sys: 2.19 s, total: 14.1 s
Wall time: 14.1 s


### Read the corpus from bugs

In [10]:
lines_title = [baseline.bug_set[bug_id]['title'] for bug_id in baseline.bug_ids]
lines_desc = [baseline.bug_set[bug_id]['description'] for bug_id in baseline.bug_ids]

lines = []
for title, desc in zip(lines_title, lines_desc):
    lines.append(title)
    lines.append(desc)

In [11]:
len(lines)

722012

#### Export corpus 

In [12]:
EXPORT_PATH = os.path.join(DIR, 'corpus_fasttext.txt')
print(EXPORT_PATH)
with open(EXPORT_PATH, 'w') as f:
    for row in lines:
        f.write("{}\n".format(str(row).encode("utf-8")))

data/processed/eclipse/corpus_fasttext.txt


## Fasttext embedding

In [13]:
# Best configuration to openoffice (recall@25=60)

# model='cbow', 
# minCount=3,
# neg=10,
# wordNgrams=5,
# epoch=25, 
# dim=300, 
# loss='hs', 
# thread=cpu, 
# verbose=2

In [14]:
%%time

import os

cpu = os.cpu_count() - 1

# Skipgram model :
model = fasttext.train_unsupervised(EXPORT_PATH, 
                                    model='cbow', 
                                    minCount=1,
                                    neg=10,
                                    wordNgrams=3,
                                    epoch=15, 
                                    dim=300, 
                                    loss='hs', 
                                    thread=cpu, 
                                    verbose=2)

CPU times: user 11h 53min 58s, sys: 23.7 s, total: 11h 54min 22s
Wall time: 1h 43min 49s


### Save dataset vocabulary embedding

In [15]:
import _pickle as pickle

In [16]:
baseline.DIR

'data/processed/eclipse'

In [17]:
vocab_embed = {}
for word in model.words:
    vocab_embed[word] = model[word]
    
with open(os.path.join(baseline.DIR, 'vocab_embed_fasttext.pkl'), 'wb') as f:
      pickle.dump(vocab_embed, f)

### Top 20 words

In [18]:
import numpy as np
words_selected = np.random.choice(model.words, 20)
for word in words_selected:
    print(word)

stylesheet'
b'gtdnr
gmfruntime
eatures
pinentry
adapt'
dzrdkdde
displyaed
enhancemnt
shadowcolor
delayed'
refired
linkercmd
peviously
choose'
wtpmodels
b'dataset
tvinsertstruct
b'ridiculously
widgtet


In [19]:
len(vocab_embed)

185832

#### Saving model

In [22]:
model.save_model(os.path.join('modelos', SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(25)) + '.bin'))
'Saved!'

'Saved!'