In [1]:
import pytest
import os
import keras
from jobs.data_pipeline import DataPipeline
from utils.keras_utils import KerasUtils
from deep_learning.training.train_config import TrainConfig
from deep_learning.training.train_retrieval import TrainRetrieval
from deep_learning.training.train_classification import TrainClassification

Using TensorFlow backend.


In [2]:
# Shared settings for both stages. EPOCHS_TRAINED is also used below to
# build the file name of the pre-trained retrieval model.
EPOCHS_TRAINED = 1
MODEL_NAME = 'SiameseTA'
DIR = "data/processed/eclipse_test/fake"
DOMAIN = 'eclipse_test'
PREPROCESSING = 'bert'

# Stage 1 - retrieval: every size-like setting is 1 (sequence lengths,
# BERT layers, batch sizes) and the trainer is built for EPOCHS_TRAINED epochs.
retrieval = TrainRetrieval(
    MODEL_NAME, DIR, DOMAIN, PREPROCESSING,
    MAX_SEQUENCE_LENGTH_T=1,
    MAX_SEQUENCE_LENGTH_D=1,
    BERT_LAYERS=1,
    EPOCHS=EPOCHS_TRAINED,
    BATCH_SIZE=1,
    BATCH_SIZE_TEST=1,
).build()

retrieval_preload = retrieval.get_model()

# Stage 2 - classification: same MODEL_NAME / DOMAIN / PREPROCESSING as above.
# The pre-trained model path is TrainConfig's file-name template filled with
# those settings, placed under TrainConfig.OUTPUT_MODELS.
PRETRAINED_MODEL = os.path.join(
    TrainConfig.OUTPUT_MODELS,
    TrainConfig.MODEL_NAME.format(PREPROCESSING, MODEL_NAME, EPOCHS_TRAINED, DOMAIN),
)
train = TrainClassification(
    retrieval_preload, MODEL_NAME, PRETRAINED_MODEL,
    DIR, DOMAIN, PREPROCESSING,
    EPOCHS=2,
    BATCH_SIZE=1,
    BATCH_SIZE_TEST=1,
)
# NOTE(review): presumably loads the model stored at PRETRAINED_MODEL - confirm
# against TrainClassification.pre_load_model.
train.pre_load_model()

Reading bug ids


100%|█████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 936.87it/s]
15it [00:00, 15001.09it/s]
100%|████████████████████████████████████████████████████████████████████████| 7/7 [00:00<?, ?it/s]


Reading train data
Reading bug ids


In [3]:
# Show the layer-by-layer summary of the model held by the classification trainer.
train.model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
categorical (InputLayer)        (None, 9)            0                                            
__________________________________________________________________________________________________
desc_token (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
desc_segment (InputLayer)       (None, 1)            0                                            
__________________________________________________________________________________________________
title_token (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
title_segm

In [4]:
# Second entry of the 'categorical' input layer's output shape, i.e. its feature width.
train.model.get_layer('categorical').output_shape[1]

9

In [15]:
from deep_learning.model.classifier_base import ClassifierBase

from keras.layers import Dense, Dropout, Activation, concatenate
from keras.models import Model
from deep_learning.model.classifier_model import ClassifierModel
import numpy as np

from keras.layers import Input
from utils.util import Util

class ClassifierModel:
    """Create named Keras inputs and run them through a set of feature models.

    NOTE: this in-notebook definition shadows the ``ClassifierModel`` imported
    from ``deep_learning.model.classifier_model`` earlier in the same cell.

    Both arguments are modified in place: each ``input_list`` entry receives
    an ``'input'`` tensor and each ``model_list`` entry receives a ``'feat'``
    output.

    Args:
        input_list: dict mapping an input name to ``{'input_size': int}``.
        model_list: dict mapping a feature key to a dict holding ``'input'``
            (list of names from ``input_list``) and ``'model'`` (a callable
            applied to the list of matching input tensors).

    Attributes:
        inputs: the input tensors, ordered by ``Util.sort_dict_by_key(input_list)``.
        model: the feature outputs, ordered by ``Util.sort_dict_by_key(model_list)``.
    """

    def __init__(self, input_list, model_list):
        # One placeholder per declared input, named after its dictionary key.
        for input_name, spec in input_list.items():
            spec['input'] = Input(shape=(spec['input_size'],), name=input_name)

        # Feed every feature model the tensors it asked for, in the listed order.
        for spec in model_list.values():
            tensors = [input_list[input_name]['input'] for input_name in spec['input']]
            spec['feat'] = spec['model'](tensors)

        # Expose inputs and features in key-sorted order.
        ordered_inputs = Util.sort_dict_by_key(input_list)
        self.inputs = [entry['input'] for entry in ordered_inputs.values()]
        ordered_models = Util.sort_dict_by_key(model_list)
        self.model = [entry['feat'] for entry in ordered_models.values()]


class ClassifierBase:
    """Pairwise bug classifier assembled from the layers of an existing model.

    NOTE: this in-notebook definition shadows the ``ClassifierBase`` imported
    from ``deep_learning.model.classifier_base`` at the top of the cell.

    For each of the two bugs in a pair (input names suffixed ``_0`` and
    ``_1``) the enabled feature encoders are looked up by name in ``model``
    and applied to freshly created inputs; the resulting features go through
    the ``'concatenated_bug_embed'`` layer. The same layer objects are applied
    to both bugs. The two bug embeddings are concatenated, passed through
    ``NUMBER_OF_UNITS`` blocks of Dense -> Dropout -> tanh, and finished with
    a 2-unit softmax layer.

    Args:
        model: object exposing ``get_layer(name)``; must provide
            ``'concatenated_bug_embed'`` plus the encoder layer of every
            enabled feature (``'title_encoder'``, ``'description_encoder'``,
            ``'topic_encoder'``, ``'categorical_encoder'``).
        title_size: length of the title token/segment inputs; ``0`` disables
            the title feature.
        desc_size: length of the description token/segment inputs; ``0``
            disables the description feature.
        categorical_size: length of the categorical input; ``0`` disables it.
        topic_size: length of the topic input; ``0`` disables it.

    Raises:
        ValueError: if no feature size is positive, since there would be
            nothing to feed the bug encoder.
    """

    # Number of Dense -> Dropout -> tanh blocks stacked before the softmax.
    NUMBER_OF_UNITS = 2
    # Width of every hidden Dense layer.
    HIDDEN_UNITS = 64
    # Dropout rate used inside every hidden block.
    DROPOUT_RATE = 0.25

    def __init__(self, model, title_size=0, desc_size=0,
                 categorical_size=0, topic_size=0):
        model_name = 'bug_classifier'

        encoder = model.get_layer('concatenated_bug_embed')

        # (model_list key, encoder layer name, input-name prefixes, input size)
        feature_specs = [
            ('title_feat', 'title_encoder', ('title_token', 'title_segment'), title_size),
            ('desc_feat', 'description_encoder', ('desc_token', 'desc_segment'), desc_size),
            ('topic', 'topic_encoder', ('topic',), topic_size),
            ('categorical', 'categorical_encoder', ('categorical',), categorical_size),
        ]
        # Resolve each enabled encoder layer once; the lookup does not depend
        # on which bug of the pair is being wired.
        enabled = [
            (key, layer_name, model.get_layer(layer_name), prefixes, size)
            for key, layer_name, prefixes, size in feature_specs
            if size > 0
        ]
        if not enabled:
            raise ValueError(
                'At least one of title_size, desc_size, categorical_size or '
                'topic_size must be greater than 0.'
            )

        bugs_inputs = []
        bugs_embed = []
        for i in range(2):  # one pass per bug of the pair
            input_list = {}
            model_list = {}
            for key, layer_name, layer, prefixes, size in enabled:
                input_names = ['{}_{}'.format(prefix, i) for prefix in prefixes]
                for input_name in input_names:
                    input_list[input_name] = {'input_size': size}
                model_list[key] = {
                    'input': input_names,
                    'model': layer,
                    'name': layer_name,
                }

            # Debug trace of the wiring; kept because the notebook's recorded
            # cell output shows it.
            print(input_list)
            bug_feat = ClassifierModel(input_list, model_list)
            bugs_inputs.append(bug_feat.inputs)
            print(bug_feat.model)
            bugs_embed.append(encoder(bug_feat.model))

        x = concatenate(bugs_embed, name='bugs')

        for _ in range(self.NUMBER_OF_UNITS):
            x = Dense(self.HIDDEN_UNITS)(x)
            x = Dropout(self.DROPOUT_RATE)(x)
            x = Activation('tanh')(x)

        # Flatten the per-bug input lists with plain Python so symbolic
        # tensors are never routed through NumPy's array coercion.
        inputs = [tensor for bug_inputs in bugs_inputs for tensor in bug_inputs]
        output = Dense(2, activation='softmax', name='softmax')(x)

        self.model = Model(inputs=inputs, outputs=[output], name=model_name)

    def get_model(self):
        """Return the assembled Keras model."""
        return self.model

class SiameseQATClassifier:
    """Bundle the ``ClassifierBase`` network with its loss and metric names."""

    def __init__(self, model, title_size=0, desc_size=0,
                 categorical_size=0, topic_size=0):
        """Build the classifier network from ``model`` and the feature sizes.

        All arguments are forwarded unchanged to ``ClassifierBase``.
        """
        base = ClassifierBase(
            model,
            title_size=title_size,
            desc_size=desc_size,
            categorical_size=categorical_size,
            topic_size=topic_size,
        )
        self.model = base.get_model()

    def get_model(self):
        """Return the network built in ``__init__``."""
        return self.model

    def get_metrics(self):
        """Return the list of metric names for this classifier."""
        metrics = ['accuracy']
        return metrics

    def get_loss(self):
        """Return the name of the loss for this classifier."""
        loss = 'binary_crossentropy'
        return loss

In [16]:
# SiameseQATClassifier is the class defined inline in the previous cell.
# NOTE(review): the module path below is presumably its packaged equivalent - confirm
# before switching back to the import:
# from deep_learning.model.siameseQAT_classifier import SiameseQATClassifier

# Build the pairwise classifier from the trainer's model, using the input
# sizes recorded on the trainer.
model = SiameseQATClassifier(train.model, 
                            title_size=train.TITLE_SIZE, 
                            desc_size=train.DESC_SIZE, 
                            categorical_size=train.CATEGORICAL_SIZE, 
                            topic_size=train.TOPIC_SIZE)

{'title_segment_0': {'input_size': 1}, 'title_token_0': {'input_size': 1}, 'desc_token_0': {'input_size': 1}, 'categorical_0': {'input_size': 9}, 'desc_segment_0': {'input_size': 1}}
[<tf.Tensor 'categorical_encoder_3/categorical_encoder/dense_3/Tanh:0' shape=(?, 300) dtype=float32>, <tf.Tensor 'description_encoder_3/description_encoder/dense_2/Tanh:0' shape=(?, 300) dtype=float32>, <tf.Tensor 'title_encoder_3/title_encoder/dense_1/Tanh:0' shape=(?, 300) dtype=float32>]
{'categorical_1': {'input_size': 9}, 'desc_token_1': {'input_size': 1}, 'title_token_1': {'input_size': 1}, 'desc_segment_1': {'input_size': 1}, 'title_segment_1': {'input_size': 1}}
[<tf.Tensor 'categorical_encoder_4/categorical_encoder/dense_3/Tanh:0' shape=(?, 300) dtype=float32>, <tf.Tensor 'description_encoder_4/description_encoder/dense_2/Tanh:0' shape=(?, 300) dtype=float32>, <tf.Tensor 'title_encoder_4/title_encoder/dense_1/Tanh:0' shape=(?, 300) dtype=float32>]


In [17]:
from deep_learning.model.compile_model import compile_model
# Compile the classifier wrapper; the returned object exposes `.summary()`
# (used in the next cell).
# NOTE(review): compile_model is project code not shown here - presumably it
# reads get_model() / get_loss() / get_metrics() from the wrapper; confirm.
cls = compile_model(model)

In [18]:
# Show the layer-by-layer summary of the compiled pairwise classifier.
cls.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
categorical_0 (InputLayer)      (None, 9)            0                                            
__________________________________________________________________________________________________
desc_token_0 (InputLayer)       (None, 1)            0                                            
__________________________________________________________________________________________________
desc_segment_0 (InputLayer)     (None, 1)            0                                            
__________________________________________________________________________________________________
title_token_0 (InputLayer)      (None, 1)            0                                            
__________________________________________________________________________________________________
title_segm