## Classification using embedding model

In [1]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
# Put the parent of the current working directory on the import path so the
# project-local `methods` package can be imported from this notebook.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

Using TensorFlow backend.


## Global settings

In [3]:
# Sequence-length caps for titles (T) and descriptions (D); both are passed to
# Baseline below. The original author noted 40 / 200 as alternative values.
MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D = 100, 500
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

In [4]:
# Which bug tracker and which embedding method this run uses.
DOMAIN, METHOD = 'netbeans', 'doc2vec'

DIR = 'data/processed/{}'.format(DOMAIN)
# The normalized directory also holds the <DOMAIN>.csv dataset file.
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))
SAVE_PATH = '{}_classification({})'.format(METHOD, DOMAIN)
# '@epochs@' is a placeholder replaced with the epoch count when the model is loaded.
PRETRAINED_MODEL = 'modelos/{}_feature_@epochs@epochs({})'.format(METHOD, DOMAIN)

In [5]:
import fasttext
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Load the pretrained embedding model selected by METHOD. The '@epochs@'
# placeholder in PRETRAINED_MODEL is filled with the epoch count that is part
# of each model's file name (60 for fastText, 100 for doc2vec).
if METHOD == 'fasttext':
    model = fasttext.load_model(PRETRAINED_MODEL.replace('@epochs@', '60') + '.bin')
elif METHOD == 'doc2vec':
    model = Doc2Vec.load(PRETRAINED_MODEL.replace('@epochs@', '100'))
else:
    # Fail here rather than with a NameError on `model` in a later cell.
    raise ValueError(
        "Unsupported METHOD {!r}; expected 'fasttext' or 'doc2vec'".format(METHOD))

In [6]:
# Build the project helpers used by the rest of the notebook.
# Baseline receives the processed-data directory, the dataset CSV path and the
# two sequence-length caps.
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
# Experiment is constructed from the baseline and evaluation objects above.
experiment = Experiment(baseline, evaluation)

In [7]:
# Display baseline.info_dict: one integer per categorical bug field
# (presumably the number of distinct values per field — confirm in Baseline).
baseline.info_dict

{'bug_severity': 7,
 'bug_status': 3,
 'component': 473,
 'priority': 4,
 'product': 39,
 'version': 18}

In [8]:
# Register the retrieval helper with the experiment for this domain; the call
# prints "Creating the buckets..." and shows progress bars while it runs.
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=180483), HTML(value='')))




HBox(children=(IntProgress(value=0, max=36232), HTML(value='')))




#### Loading bug ids in memory

In [9]:
# Read the bug ids, then display how many ids baseline.bug_ids holds.
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


216715

#### Loading train bugs

In [10]:
%%time

# Load the bugs through the experiment helper; presumably this fills
# baseline.bug_set, which later cells index by bug id — confirm.
experiment.load_bugs()

HBox(children=(IntProgress(value=0, max=216715), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 1min 31s, sys: 3.33 s, total: 1min 34s
Wall time: 1min 42s


In [11]:
%%time
# Get the bucket assignment of the bugs and build the dataset from it;
# presumably this populates baseline.train_data / baseline.dup_sets_train,
# which the training cell reads — confirm.
issues_by_buckets = experiment.get_buckets_for_bugs()
experiment.prepare_dataset(issues_by_buckets)

HBox(children=(IntProgress(value=0, max=180483), HTML(value='')))


Reading train data
Reading bug ids
CPU times: user 3min 6s, sys: 373 µs, total: 3min 6s
Wall time: 3min 6s


In [12]:
%%time

# Derive the training bug ids from baseline.train_data; they are passed to the
# test-split reader in the next cell.
bug_train_ids = experiment.get_train_ids(baseline.train_data)
len(bug_train_ids)

CPU times: user 16.9 ms, sys: 15 µs, total: 16.9 ms
Wall time: 16.8 ms


In [13]:
%%time

# Read the classification test split. The reader receives the processed-data
# directory, the loaded bug set and the training bug ids (presumably so test
# pairs can be kept apart from training bugs — confirm).
baseline.test_data, baseline.dup_sets_test = experiment.read_test_data_classification(DIR, baseline.bug_set, bug_train_ids)

Reading test data for classification
CPU times: user 9.42 s, sys: 0 ns, total: 9.42 s
Wall time: 9.42 s


### Train

In [18]:
%%time

import random

# Build the training set. Every pair (dup_a, dup_b) in baseline.train_data
# yields two samples:
#   concat(emb(dup_a), emb(dup_b)) -> label 1
#   concat(emb(dup_a), emb(neg))   -> label 0, with `neg` drawn by
#                                     baseline.get_neg_bug
# A bug is embedded from the text "<title> <description>".
# NOTE: no random seed is set, so sample order (and doc2vec inference) varies
# between runs.
train_pair_x = []

for pair in tqdm(baseline.train_data):
    dup_a, dup_b = pair
    bug_a = baseline.bug_set[dup_a]
    bug_b = baseline.bug_set[dup_b]
    text_a = "{} {}".format(bug_a['title'], bug_a['description'])
    text_b = "{} {}".format(bug_b['title'], bug_b['description'])

    # fastText embeds a raw string; doc2vec infers from a token list.
    if METHOD == 'fasttext':
        vec_a = model.get_sentence_vector(text_a)
        vec_b = model.get_sentence_vector(text_b)
    elif METHOD == 'doc2vec':
        vec_a = model.infer_vector(text_a.split(' '))
        vec_b = model.infer_vector(text_b.split(' '))

    neg_bug = baseline.get_neg_bug(baseline.dup_sets_train[dup_a], baseline.bug_ids)
    train_pair_x.append([np.concatenate([vec_a, vec_b], 0).tolist(), 1])
    bug_neg = baseline.bug_set[neg_bug]
    text_neg = "{} {}".format(bug_neg['title'], bug_neg['description'])

    if METHOD == 'fasttext':
        # Pass the string itself: get_sentence_vector does not accept a token list.
        vec_neg = model.get_sentence_vector(text_neg)
    elif METHOD == 'doc2vec':
        vec_neg = model.infer_vector(text_neg.split(' '))

    train_pair_x.append([np.concatenate([vec_a, vec_neg], 0).tolist(), 0])

# Shuffle so positive and negative samples are not strictly interleaved.
random.shuffle(train_pair_x)

X = [row[0] for row in train_pair_x]
y = [row[1] for row in train_pair_x]

HBox(children=(IntProgress(value=0, max=87182), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [22]:
# Sanity check: shapes of the training feature matrix and label vector.
np.asarray(X).shape, np.asarray(y).shape

((174364, 600), (174364,))

### Test

In [25]:
%%time

import random

# Build the test set the same way as the training set. Every pair
# (dup_a, dup_b) in baseline.test_data yields two samples:
#   concat(emb(dup_a), emb(dup_b)) -> label 1
#   concat(emb(dup_a), emb(neg))   -> label 0, with `neg` drawn by
#                                     baseline.get_neg_bug
# A bug is embedded from the text "<title> <description>".
test_pair_x = []

for pair in tqdm(baseline.test_data):
    dup_a, dup_b = pair
    bug_a = baseline.bug_set[dup_a]
    bug_b = baseline.bug_set[dup_b]
    text_a = "{} {}".format(bug_a['title'], bug_a['description'])
    text_b = "{} {}".format(bug_b['title'], bug_b['description'])

    # fastText embeds a raw string; doc2vec infers from a token list.
    if METHOD == 'fasttext':
        vec_a = model.get_sentence_vector(text_a)
        vec_b = model.get_sentence_vector(text_b)
    elif METHOD == 'doc2vec':
        vec_a = model.infer_vector(text_a.split(' '))
        vec_b = model.infer_vector(text_b.split(' '))

    neg_bug = baseline.get_neg_bug(baseline.dup_sets_test[dup_a], baseline.bug_ids)
    test_pair_x.append([np.concatenate([vec_a, vec_b], 0).tolist(), 1])
    bug_neg = baseline.bug_set[neg_bug]
    text_neg = "{} {}".format(bug_neg['title'], bug_neg['description'])

    if METHOD == 'fasttext':
        # Pass the string itself: get_sentence_vector does not accept a token list.
        vec_neg = model.get_sentence_vector(text_neg)
    elif METHOD == 'doc2vec':
        vec_neg = model.infer_vector(text_neg.split(' '))

    test_pair_x.append([np.concatenate([vec_a, vec_neg], 0).tolist(), 0])

# Shuffle so positive and negative samples are not strictly interleaved.
random.shuffle(test_pair_x)

X_test = [row[0] for row in test_pair_x]
y_test = [row[1] for row in test_pair_x]

HBox(children=(IntProgress(value=0, max=1244), HTML(value='')))


CPU times: user 1min 13s, sys: 116 ms, total: 1min 13s
Wall time: 1min 13s


In [26]:
# Sanity check: shapes of the test feature matrix and label vector.
np.asarray(X_test).shape, np.asarray(y_test).shape

((2488, 600), (2488,))

### Classification

In [27]:
from sklearn.neighbors import KNeighborsClassifier
# 1-nearest-neighbour classifier over the concatenated pair embeddings.
# With n_neighbors=1 each training sample is its own nearest neighbour, so
# metrics computed on the training set are trivially perfect (barring identical
# feature vectors with different labels).
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

### Accuracy

In [28]:
# Container for the metrics of this run; starts out empty.
result = dict()

#### train

In [29]:
from sklearn.metrics import accuracy_score

# Accuracy on the training pairs.
y_true = y
# Despite the name, y_scores holds hard class predictions (0/1), not scores.
y_scores = neigh.predict(X)

acc_train = accuracy_score(y_true, y_scores)
acc_train

1.0

#### test

In [30]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out test pairs.
y_true = y_test
# Despite the name, y_scores holds hard class predictions (0/1), not scores.
y_scores = neigh.predict(X_test)

acc_test = accuracy_score(y_true, y_scores)
acc_test

0.5920418006430869

### ROC AUC score

#### train

In [31]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the training pairs.
y_true = y
# roc_auc_score ranks samples by a continuous score, so pass the predicted
# probability of the positive class (column 1, i.e. label 1) instead of hard
# 0/1 predictions, which would reduce the AUC to balanced accuracy.
y_scores = neigh.predict_proba(X)[:, 1]

roc_train = roc_auc_score(y_true, y_scores)
roc_train

1.0

#### test

In [32]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the held-out test pairs.
y_true = y_test
# roc_auc_score ranks samples by a continuous score, so pass the predicted
# probability of the positive class (column 1, i.e. label 1) instead of hard
# 0/1 predictions, which would reduce the AUC to balanced accuracy.
y_scores = neigh.predict_proba(X_test)[:, 1]

roc_test = roc_auc_score(y_true, y_scores)
roc_test

0.5920418006430869

#### Saving the accuracy and ROC AUC

In [33]:
import _pickle as pickle

CLASSIFICATION_PATH = 'classification_{}.pkl'.format(METHOD)

def save_result(result):
    """Record this run's metrics in `result` and pickle it to DIR/CLASSIFICATION_PATH.

    Parameters
    ----------
    result : dict
        Dictionary to extend. It is updated in place, so entries it already
        holds under other keys are kept and written back to disk.

    The metric values (acc_train, acc_test, roc_train, roc_test) and METHOD are
    read from the notebook's global namespace.
    """
    result.update({
        'acc_train' : acc_train,
        'acc_test' : acc_test,
        'roc_train' : roc_train,
        'roc_test' : roc_test,
        'method' : METHOD
    })

    with open(os.path.join(DIR, CLASSIFICATION_PATH), 'wb') as f:
        pickle.dump(result, f)

# Start from a previously saved result when one exists, so its entries are
# merged with the new metrics instead of being discarded.
try:
    with open(os.path.join(DIR, CLASSIFICATION_PATH), 'rb') as f:
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files that this notebook wrote itself.
        previous = pickle.load(f)
    if isinstance(previous, dict):
        result = previous
except FileNotFoundError:
    # First run for this method: nothing to merge, keep the current `result`.
    pass
except Exception as exc:
    # Best effort, as before: an unreadable file must not block saving.
    print("Could not read previous results ({}); overwriting.".format(exc))

save_result(result)

print("All saved.")

All saved.
