## Classification using Siamese Model

In [1]:
import keras
from tensorflow.contrib.tensorboard.plugins import projector

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import sys
# Put the parent of the current working directory on the import path, once.
# Presumably this is what lets the project-local `methods` package be
# imported by a later cell -- confirm against the repository layout.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [3]:
from methods.baseline import Baseline

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Model

## Global Settings

In [5]:
# Global settings. The *_T / *_D lengths are passed to Baseline() and are also
# the widths of the title and description Input layers built further down.
MAX_SEQUENCE_LENGTH_T = 100 # 40 (trailing number is presumably a previously tried value -- confirm)
MAX_SEQUENCE_LENGTH_D = 100 # 200 (same caveat as above)
# Not referenced by any cell visible in this notebook; presumably the embedding
# width and vocabulary cap of the pre-trained model -- TODO confirm.
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

In [12]:
# Dataset locations, relative to the working directory. DIR is handed to
# Baseline(), load_ids() and (via bug_dir) prepare_dataset() in later cells.
DIR = 'data/processed/eclipse'
# NOTE(review): DIR_PAIRS is not used by any cell visible in this notebook.
DIR_PAIRS = 'data/normalized/eclipse'

In [13]:
# Project helper, constructed with the dataset directory and the maximum
# title / description sequence lengths.
baseline = Baseline(DIR, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

In [6]:
# Load the previously trained similarity model by name. The third argument maps
# 'l2_normalize' to Baseline.l2_normalize (presumably the custom objects needed
# to deserialize the model -- confirm against Baseline.load_model).
name = 'baseline_1000epoch_10steps_512batch(eclipse)'
similarity_model = Baseline.load_model('', name, {'l2_normalize' : Baseline.l2_normalize})

Loaded model from disk


#### Loading bug ids in memory

In [15]:
# Ask the helper to load the bug ids for the dataset stored under DIR.
baseline.load_ids(DIR)

#### Generating triplets of batches

In [16]:
%%time

# DIR is already the complete dataset path, so it is used as-is.
bug_dir = DIR
baseline.prepare_dataset(bug_dir)

Reading train data
Wall time: 1.34 s


In [17]:
# Slow step: the recorded run shows a progress bar over 212,512 items that
# took about 21 minutes to complete.
baseline.load_bugs()

100%|█████████████████████████████████| 212512/212512 [20:59<00:00, 168.76it/s]


## Siamese  model

In [7]:
# Freeze weights: mark every layer of the loaded similarity model as
# non-trainable before it is reused inside the classifier.
for pretrained_layer in similarity_model.layers:
    pretrained_layer.trainable = False

In [10]:
from keras.layers import concatenate

# Classifier graph: two bugs (a and b) go in, each as a title sequence and a
# description sequence; a 2-way softmax over the pair comes out.

# Title inputs for bug a and bug b, MAX_SEQUENCE_LENGTH_T entries per sample.
bug_t_a = Input(shape = (MAX_SEQUENCE_LENGTH_T, ), name = 'title_a')
bug_t_b = Input(shape = (MAX_SEQUENCE_LENGTH_T, ), name = 'title_b')

# Description inputs for bug a and bug b, MAX_SEQUENCE_LENGTH_D entries per sample.
bug_d_a = Input(shape = (MAX_SEQUENCE_LENGTH_D, ), name = 'desc_a')
bug_d_b = Input(shape = (MAX_SEQUENCE_LENGTH_D, ), name = 'desc_b')

# Reuse the encoders of the loaded similarity model, looked up by layer name.
title_encoder = similarity_model.get_layer('FeatureLstmGenerationModel')
desc_encoder = similarity_model.get_layer('FeatureCNNGenerationModel')

# model = similarity_model.get_layer('merge_features_in')

# The same encoder objects are applied to both bugs, so both sides of the
# pair go through identical weights.
bugt_t_a = title_encoder(bug_t_a)
bugt_d_a = desc_encoder(bug_d_a)

bugt_t_b = title_encoder(bug_t_b)
bugt_d_b = desc_encoder(bug_d_b)

# One feature vector per bug: encoded title followed by encoded description.
bug_a = concatenate([bugt_t_a, bugt_d_a], name = 'bug_a')
bug_b = concatenate([bugt_t_b, bugt_d_b], name = 'bug_b')

# Pair representation: bug a features followed by bug b features.
x = concatenate([bug_a, bug_b], name='bugs')
# Optional hidden layers, currently disabled.
#x = Dense(64, activation = 'relu')(x)
#x = Dense(32, activation = 'relu')(x)
# Two output units with softmax, matching the two-column one-hot labels
# expected by the categorical cross-entropy loss below.
output = Dense(2, activation = 'softmax', name = 'output')(x)

# Inputs are listed as [title_a, title_b, desc_a, desc_b]; the input names are
# the same strings used as dict keys by the batch generator cell.
model_clf = Model(inputs=[bug_t_a, bug_t_b, bug_d_a, bug_d_b], outputs=[output])

model_clf.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])
model_clf.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title_a (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
desc_a (InputLayer)             (None, 100)          0                                            
__________________________________________________________________________________________________
title_b (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
desc_b (InputLayer)             (None, 100)          0                                            
__________________________________________________________________________________________________
FeatureLst

In [42]:
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

def batch_classification(data, batch_size, n_neg):
    """Build one batch of labelled bug pairs for the pair classifier.

    ``batch_size // 2`` is requested from ``baseline.batch_iterator``. Each
    returned sample is used twice: once paired with its ``input_pos`` entry
    (label 1) and once paired with its ``input_neg`` entry (label 0).

    Parameters
    ----------
    data
        Passed through unchanged to ``baseline.batch_iterator``.
    batch_size : int
        Total number of pairs wanted; half are positive, half negative.
    n_neg
        Passed through unchanged to ``baseline.batch_iterator``.

    Returns
    -------
    tuple
        ``(title_a, title_b, desc_a, desc_b, sim)``. The first half of every
        array belongs to the positive pairs and the second half to the
        negative pairs. ``sim`` is the one-hot label matrix with exactly two
        columns (column 1 is set for positive pairs, column 0 for negative).

    Notes
    -----
    The labels are sized from ``batch_size // 2``, which assumes the iterator
    returns exactly that many rows for each of sample / pos / neg -- TODO
    confirm against ``Baseline.batch_iterator`` (in particular for n_neg > 1).
    """
    half_batch = batch_size // 2
    # The fourth value returned by the iterator is not used: the labels are
    # rebuilt below to match the positive-then-negative stacking order.
    input_sample, input_pos, input_neg, _ = baseline.batch_iterator(data, half_batch, n_neg)

    # 1 for the positive half, 0 for the negative half. The values are already
    # 0/1, so no label encoding step is needed.
    labels = np.concatenate([np.ones(half_batch, dtype=int),
                             np.zeros(half_batch, dtype=int)])
    # Pin the class count so the label width never depends on batch content
    # (inferring it from the data fails on an empty batch).
    sim = to_categorical(labels, num_classes=2)

    # Left side of each pair: the same samples repeated for both halves.
    title_a = np.concatenate([input_sample['title'], input_sample['title']])
    desc_a = np.concatenate([input_sample['description'], input_sample['description']])
    # Right side of each pair: positives first, then negatives.
    title_b = np.concatenate([input_pos['title'], input_neg['title']])
    desc_b = np.concatenate([input_pos['description'], input_neg['description']])

    return title_a, title_b, desc_a, desc_b, sim

def batch_classification_siamese(data, batch_size, n_neg):
    """Endless generator of ``(inputs, labels)`` batches for ``fit_generator``.

    Every iteration calls ``batch_classification`` with the given arguments
    and yields a 2-tuple: a dict of the four feature arrays keyed by
    'title_a', 'title_b', 'desc_a' and 'desc_b', and the label array.
    """
    while True:
        t_a, t_b, d_a, d_b, labels = batch_classification(data, batch_size, n_neg)
        inputs = dict(title_a=t_a, title_b=t_b, desc_a=d_a, desc_b=d_b)
        yield inputs, labels

In [43]:
# Draw one fixed batch, in the same (inputs dict, labels) layout the training
# generator yields, to serve as validation data during fitting.
# NOTE(review): it is drawn with the same arguments (bug_dir, 512, 1) as the
# training generator, so it may not be held-out data -- confirm.
title_a, title_b, desc_a, desc_b, sim = batch_classification(bug_dir, 512, 1)
test_validation = ({ 'title_a' : title_a, 'title_b': title_b, 
        'desc_a' : desc_a, 'desc_b' : desc_b }, sim)

In [44]:
%%time

from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Train the classifier: 100 epochs of 10 generator batches (512 pairs each),
# with the fixed validation batch supplied as validation data.
# An early-stopping callback (callbacks=[early]) is currently not passed.
h_clf = model_clf.fit_generator(
    batch_classification_siamese(bug_dir, 512, 1),
    steps_per_epoch=10,
    validation_data=test_validation,
    epochs=100,
    verbose=True
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100

KeyboardInterrupt: 

In [None]:
# Hand the training history to the project helper (presumably it plots the
# accuracy / loss curves for training vs. validation -- confirm).
Baseline.validation_accuracy_loss(h_clf)

In [None]:
# Persist the trained classifier and its training history under `name`.
# The model saved is `model_clf` (the network trained by fit_generator), not
# the frozen `similarity_model`, and the history is `h_clf` (the value returned
# by fit_generator); a bare `h` is never defined in this notebook.
# NOTE(review): save_model / save_result are not defined or imported in any
# visible cell -- confirm where they come from (possibly Baseline helpers).
# NOTE(review): the name says 16 steps, but training uses steps_per_epoch = 10.
name = 'baseline_classification_100epoch_16steps(eclipse)'
save_model(model_clf, name)
save_result(h_clf, name)