<a href="https://colab.research.google.com/github/ymca3735/POP-ON/blob/main/POPON.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DRIVE MOUNT

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True re-mounts even if it is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# UTIL

### Packages & Util Functions

In [None]:
import torch
from torch import nn
import gc

import pandas as pd

import sys
from time import time
import numpy as np


def print_progress(iteration, total, prefix='', suffix='', decimals=1, barLength=100):
    """
    Draw a single-line text progress bar on stdout.

    The line starts with a carriage return so that successive calls overwrite
    each other; a newline is emitted only once ``iteration == total``.

    :param iteration: number of finished steps
    :param total: total number of steps (must be non-zero)
    :param prefix: text printed before the bar
    :param suffix: text printed after the percentage
    :param decimals: number of decimals shown in the percentage
    :param barLength: width of the bar in characters
    """
    percent = "{0:.{1}f}".format(100 * (iteration / float(total)), decimals)
    n_filled = int(round(barLength * iteration / float(total)))
    n_empty = barLength - n_filled
    bar = '#' * n_filled + '-' * n_empty

    out = sys.stdout
    out.write(f'\r{prefix} |{bar}| {percent}% {suffix}')
    if iteration == total:
        out.write('\n')
    out.flush()

def list_to_string(sentence_list):
    """
    Join every token list into one space-separated string.

    :param sentence_list: list of lists, [['A', 'B', 'C'], ... ]
    :return: list of strings, ['A B C', ...]
    """
    return [' '.join(tokens) for tokens in sentence_list]




# PREPROCESSING

### Log Processor

In [None]:
from sklearn.model_selection import train_test_split


def sentencifier(input, evt_col_name, case_col_name, time_col_name, numeric_col_names=None, cartegorical_col_names=None, bag_abstraction=False):
    """
    Turn an event log into prefix "sentences" plus one attribute row per sentence.

    Events are grouped by case and ordered by timestamp. For a case with events
    e0, e1, ..., ek the sentences [e0, e1], [e0, e1, e2], ... are produced
    (single-event prefixes are skipped). Each sentence is paired with the
    attribute row of its last event.

    :param input: pandas.DataFrame, the event log (not modified; a copy is used)
    :param evt_col_name: str, name of the event-name column
    :param case_col_name: str, name of the case-id column
    :param time_col_name: str, name of the timestamp column
    :param numeric_col_names: list of str, columns copied into the attribute rows as-is
    :param cartegorical_col_names: list of str, columns that are one-hot encoded as
        '<column>:<value>' attribute columns
    :param bag_abstraction: bool, if True, events of one case that carry the same
        timestamp as their predecessor are merged into a single token made of the
        sorted event names joined with '-'
    :return: (sentence_set, attr_set, unique_events) where sentence_set is a list of
        token lists, attr_set is a DataFrame with one row per sentence (indexed by the
        label of the originating log row) and unique_events is the set of all tokens
    """
    # None defaults instead of shared mutable list defaults.
    numeric_col_names = list(numeric_col_names) if numeric_col_names is not None else []
    cartegorical_col_names = list(cartegorical_col_names) if cartegorical_col_names is not None else []

    log = input.copy()
    log[time_col_name] = pd.to_datetime(log[time_col_name], utc=True)

    # One-hot column names: one per (categorical column, observed value) pair.
    onehot_col_names = []
    for col in cartegorical_col_names:
        onehot_col_names += [col + ':' + str(category) for category in log[col].unique()]

    # attr_columns: column names of the attribute DataFrame to be returned.
    attr_columns = ['time:elapsed'] + numeric_col_names + onehot_col_names

    sentence_set = []
    unique_events = set()
    # Attribute rows are collected and turned into a DataFrame once at the end;
    # growing a DataFrame row by row is quadratic and DataFrame.append no longer
    # exists in pandas >= 2.0.
    attr_rows = []
    attr_index = []

    # process case by case
    cases = log[case_col_name].unique()
    for case_idx, case in enumerate(cases):
        evt_seq = log.loc[log[case_col_name] == case].sort_values(by=[time_col_name], axis=0)

        evts = []
        attrs = []  # (row label, attribute dict) pairs, parallel to evts
        prev_time = None
        for position, (evt_idx, evt) in enumerate(evt_seq.iterrows()):
            # Elapsed time since the previous event of the same case in timestamp
            # order; zero for the first event or when it cannot be computed.
            # NOTE(review): the integer unit is whatever NumPy picks when
            # converting the pandas Timedelta — confirm before interpreting it.
            time_elapsed = 0
            same_instant = False
            if position > 0:
                try:
                    delta = np.timedelta64(evt[time_col_name] - prev_time).astype('int')
                except ValueError:
                    pass
                else:
                    time_elapsed = delta
                    same_instant = delta == 0
            prev_time = evt[time_col_name]

            # Attribute row of this event: elapsed time, numeric values, one-hot flags.
            attr_row = dict.fromkeys(attr_columns, 0)
            attr_row['time:elapsed'] = time_elapsed
            for col in numeric_col_names:
                attr_row[col] = evt[col]
            for col in cartegorical_col_names:
                attr_row[col + ':' + str(evt[col])] = 1

            if bag_abstraction and same_instant:
                # Merge all events of this case sharing the timestamp into one token.
                simultaneous = evt_seq.loc[evt_seq[time_col_name] == evt[time_col_name], evt_col_name]
                token = '-'.join(simultaneous.sort_values())
                if token in evts:
                    # The bag was already emitted; nothing more to record.
                    continue
                # Replace the previously emitted event with the bag.
                evts.pop()
                attrs.pop()
            else:
                token = evt[evt_col_name]

            evts.append(token)
            attrs.append((evt_idx, attr_row))
            unique_events.add(token)

        # Every prefix of length >= 2 becomes a sentence.
        for idx in range(1, len(evts)):
            sentence_set.append(evts[:idx + 1])
            row_label, attr_row = attrs[idx]
            attr_index.append(row_label)
            attr_rows.append(attr_row)
        print_progress(case_idx + 1, len(cases), 'Progress:', 'Complete', 2, 50)

    attr_set = pd.DataFrame(attr_rows, index=attr_index, columns=attr_columns)
    return sentence_set, attr_set, unique_events


### Custom GPT Tokenizer

In [None]:
import json

class CustomTokenizer(object):
    """
    Minimal lookup tokenizer backed by a JSON file that maps token -> integer id.

    :param vocab_file_path: path of the JSON vocabulary file
    :param eos_token: end-of-sequence token (must be a key of the vocabulary)
    :param bos_token: begin-of-sequence token (must be a key of the vocabulary)
    :param pad_token: padding token (must be a key of the vocabulary)
    :raises KeyError: if one of the special tokens is missing from the vocabulary
    """

    def __init__(self, vocab_file_path, eos_token, bos_token, pad_token):
        # Read with an explicit encoding so the result does not depend on the
        # platform default; the vocabulary file is written as UTF-8.
        with open(vocab_file_path, encoding='utf-8') as json_file:
            self.vocab = json.load(json_file)
        self.eos_token = eos_token
        self.bos_token = bos_token
        self.pad_token = pad_token
        self.eos_token_id = self.vocab[eos_token]
        self.bos_token_id = self.vocab[bos_token]
        self.pad_token_id = self.vocab[pad_token]

    def convert_tokens_to_ids(self, sentence):
        """
        Map every token of ``sentence`` to its id.

        Tokens are converted with ``str()`` before the lookup because JSON keys
        are always strings.

        :raises KeyError: if a token is not in the vocabulary
        """
        return [self.vocab[str(token)] for token in sentence]

    def get_vocab_ids(self):
        """Return all ids of the vocabulary as a list."""
        return list(self.vocab.values())

### Encode

In [None]:
def encode_for_gpt2_input(sentence_set, tokenizer, n_seq=None, padding=True, add_eos_token=True):
    """
    Encode token sentences as id sequences of the form [BOS, tokens..., (EOS)].

    :param sentence_set: sequence of token lists
    :param tokenizer: object providing convert_tokens_to_ids, bos_token_id,
        eos_token_id and pad_token_id
    :param n_seq: target sequence length; if falsy, the longest encoded sequence is used
    :param padding: if True, right-pad every sequence with the pad id up to the target length
    :param add_eos_token: if True, append the EOS id; otherwise a single pad id is
        appended when n_seq is given and the sequence is still shorter than n_seq
    :return: (input_ids, labels); labels[i] is the id of the last token of sentence i.
        An empty sentence_set yields ([], []).
    """
    input_ids = []
    labels = []

    for sentence in sentence_set:
        token_ids = tokenizer.convert_tokens_to_ids(sentence)

        encoded = [tokenizer.bos_token_id]
        encoded += token_ids
        if add_eos_token:
            encoded.append(tokenizer.eos_token_id)
        elif n_seq is not None and len(encoded) < n_seq:
            encoded.append(tokenizer.pad_token_id)
        labels.append(token_ids[-1])
        input_ids.append(encoded)

    # Nothing to report or pad for an empty input.
    if not input_ids:
        return input_ids, labels

    # Progress is reported once, after all sentences have been encoded.
    print_progress(len(input_ids), len(input_ids), 'Progress:', 'Complete', 0, 50)

    max_len = n_seq if n_seq else max(len(encoded) for encoded in input_ids)

    if padding:
        # Sequences already longer than max_len are left untouched
        # (a negative pad length produces an empty pad list).
        for encoded in input_ids:
            encoded += [tokenizer.pad_token_id] * (max_len - len(encoded))

    return input_ids, labels

In [None]:
def encode_for_test(sentence_set, tokenizer, n_seq=None, separate=True, add_eos_token=True):
    """
    Build (context, target) pairs from sentences and encode both sides.

    With ``separate=True`` every proper prefix of a sentence becomes a context and
    the prefix extended by one token becomes its target. With ``separate=False``
    only the full sentence is used: the context is the sentence without its last
    token and the target is the whole sentence.

    :return: (inputs_encoded, labels_encoded, pred_idx); pred_idx[i] is the length
        of context i, i.e. the position of the token to predict within target i
    """
    contexts, targets, pred_idx = [], [], []

    for sentence in sentence_set:
        if not separate:
            contexts.append(sentence[:-1])
            targets.append(sentence)
            pred_idx.append(len(sentence) - 1)
            continue

        for cut in range(1, len(sentence)):
            contexts.append(sentence[:cut])
            targets.append(sentence[:cut + 1])
            pred_idx.append(cut)

    # Only the id sequences are needed; the per-sentence labels are discarded.
    inputs_encoded = encode_for_gpt2_input(contexts, tokenizer, n_seq, add_eos_token=add_eos_token)[0]
    labels_encoded = encode_for_gpt2_input(targets, tokenizer, n_seq, add_eos_token=add_eos_token)[0]
    return inputs_encoded, labels_encoded, pred_idx

# MODELING

### Config

In [None]:
class GPTConfig(object):
    """Plain container holding the hyper-parameters read by the model classes."""

    def __init__(
            self,
            vocab_size=15,
            n_layer=12,

            d_model=64,
            embd_pdrop=0.1,
            
            attr_size=3,
            d_attr = 64,
            attr_pdrop=0.1,

            d_ff=512,
            ff_pdrop=0.1,
            
            n_head=12,
            head_size=64,
            attn_pdrop=0.1,

            layer_norm_epsilon=1e-5,
            
            i_pad=0,
            n_dec_seq=15,
    ):
        # Vocabulary size and number of decoder layers.
        self.vocab_size, self.n_layer = vocab_size, n_layer

        # Model (embedding) width and embedding dropout probability.
        self.d_model, self.embd_pdrop = d_model, embd_pdrop

        # Attribute branch: input width, hidden width, dropout probability.
        self.attr_size, self.d_attr, self.attr_pdrop = attr_size, d_attr, attr_pdrop

        # Position-wise feed-forward net: hidden width and dropout probability.
        self.d_ff, self.ff_pdrop = d_ff, ff_pdrop

        # Attention: number of heads, width per head, dropout probability.
        self.n_head, self.head_size, self.attn_pdrop = n_head, head_size, attn_pdrop

        # Epsilon passed to the layer norms.
        self.layer_norm_epsilon = layer_norm_epsilon

        # Id treated as padding and the decoder sequence length.
        self.i_pad, self.n_dec_seq = i_pad, n_dec_seq

### Multihead Attention

In [None]:
class ScaledDotProductAttention(nn.Module):
    """Masked scaled dot-product attention: softmax(Q K^T * scale) V."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(config.attn_pdrop)
        # 1 / sqrt(head_size)
        self.scale = 1 / (self.config.head_size ** 0.5)

    def forward(self, Q, K, V, attn_mask):
        """
        :param Q: (bs, n_head, n_q_seq, head_size)
        :param K: (bs, n_head, n_k_seq, head_size)
        :param V: (bs, n_head, n_k_seq, d_v)
        :param attn_mask: boolean mask broadcastable to the scores; True marks
            positions that must not be attended to
        :return: context (bs, n_head, n_q_seq, d_v), attn_prob (bs, n_head, n_q_seq, n_k_seq)
        """
        # (bs, n_head, n_q_seq, n_k_seq)
        scores = torch.matmul(Q, K.transpose(-1, -2)) * self.scale
        # Masked positions get a large negative score so softmax sends them to ~0.
        scores = scores.masked_fill(attn_mask, -1e9)
        attn_prob = self.dropout(torch.softmax(scores, dim=-1))
        # (bs, n_head, n_q_seq, d_v)
        context = torch.matmul(attn_prob, V)
        return context, attn_prob


class MultiHeadAttention(nn.Module):
    """Multi-head attention: project Q/K/V, attend per head, merge heads, project back."""

    def __init__(self, config):
        super().__init__()
        self.config = config

        inner_width = self.config.n_head * self.config.head_size
        self.W_Q = nn.Linear(self.config.d_model, inner_width)
        self.W_K = nn.Linear(self.config.d_model, inner_width)
        self.W_V = nn.Linear(self.config.d_model, inner_width)
        self.scaled_dot_attn = ScaledDotProductAttention(self.config)
        self.linear = nn.Linear(inner_width, self.config.d_model)
        self.dropout = nn.Dropout(config.attn_pdrop)

    def _split_heads(self, projected, batch_size):
        # (bs, n_seq, n_head * head_size) -> (bs, n_head, n_seq, head_size)
        return projected.view(batch_size, -1, self.config.n_head, self.config.head_size).transpose(1, 2)

    def forward(self, Q, K, V, attn_mask):
        """
        :param Q, K, V: (bs, n_seq, d_model) inputs for queries, keys and values
        :param attn_mask: (bs, n_q_seq, n_k_seq) mask, repeated for every head
        :return: output (bs, n_q_seq, d_model), attn_prob (bs, n_head, n_q_seq, n_k_seq)
        """
        batch_size = Q.size(0)
        n_head = self.config.n_head

        q_s = self._split_heads(self.W_Q(Q), batch_size)
        k_s = self._split_heads(self.W_K(K), batch_size)
        v_s = self._split_heads(self.W_V(V), batch_size)

        # (bs, n_head, n_q_seq, n_k_seq)
        head_mask = attn_mask.unsqueeze(1).repeat(1, n_head, 1, 1)

        # (bs, n_head, n_q_seq, head_size), (bs, n_head, n_q_seq, n_k_seq)
        context, attn_prob = self.scaled_dot_attn(q_s, k_s, v_s, head_mask)

        # Merge the heads again: (bs, n_q_seq, n_head * head_size)
        merged = context.transpose(1, 2).contiguous().view(
            batch_size, -1, n_head * self.config.head_size)

        # (bs, n_q_seq, d_model)
        output = self.dropout(self.linear(merged))
        return output, attn_prob

### Pos-wise FF Net

In [None]:
class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward net built from two kernel-size-1 convolutions with GELU in between."""

    def __init__(self, config):
        super().__init__()
        self.config = config

        d_model, d_ff = self.config.d_model, self.config.d_ff
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)

        self.active = torch.nn.GELU()
        self.dropout = nn.Dropout(config.ff_pdrop)

    def forward(self, inputs):
        """
        :param inputs: (bs, n_seq, d_model)
        :return: (bs, n_seq, d_model)
        """
        # Conv1d expects channels first: (bs, d_model, n_seq)
        channels_first = inputs.transpose(1, 2)
        # (bs, d_ff, n_seq)
        hidden = self.active(self.conv1(channels_first))
        # back to (bs, n_seq, d_model)
        projected = self.conv2(hidden).transpose(1, 2)
        return self.dropout(projected)

### Attribute Linear Layer

In [None]:
class AttrLinear(nn.Module):
    # This layer is a simple feed forward network: three stacked linear layers
    # followed by one ReLU, whose output is reshaped to (bs, n_dec_seq, d_model).
    # NOTE(review): no activation is applied between the linear layers and
    # self.dropout is constructed but never used in forward() — confirm this is intended.

    def __init__(self, config):
        super().__init__()
        self.config = config

        d_attr = self.config.d_attr
        flat_out = self.config.n_dec_seq * self.config.d_model
        self.linear1 = nn.Linear(self.config.attr_size, d_attr, bias=True)
        self.linear2 = nn.Linear(d_attr, d_attr, bias=True)
        self.linear3 = nn.Linear(d_attr, flat_out, bias=True)
        self.active = torch.nn.ReLU()
        self.dropout = nn.Dropout(config.attr_pdrop)

    def forward(self, input):
        """
        :param input: (bs, attr_size)
        :return: (bs, n_dec_seq, d_model)
        """
        # (bs, n_dec_seq * d_model)
        flat = self.linear3(self.linear2(self.linear1(input)))
        flat = self.active(flat)
        # reshape (bs, n_dec_seq, d_model)
        return flat.reshape(input.size(0), self.config.n_dec_seq, self.config.d_model)

### Decoder Layer, Decoder Block, Multihead Attention

In [None]:
from torch.nn import MultiheadAttention

class DecoderLayer(nn.Module):
    """One decoder block: self-attention plus an attribute projection, then a feed-forward net."""

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.self_attn = MultiHeadAttention(self.config)
        self.attr_linear = AttrLinear(self.config)
        self.ffn = PoswiseFeedForwardNet(self.config)

        norm_eps = self.config.layer_norm_epsilon
        self.layer_norm1 = nn.LayerNorm(self.config.d_model, eps=norm_eps)
        self.layer_norm2 = nn.LayerNorm(self.config.d_model, eps=norm_eps)

    def forward(self, dec_inputs, self_attn_mask, attr_inputs):
        """
        :param dec_inputs: (bs, n_dec_seq, d_model)
        :param self_attn_mask: (bs, n_dec_seq, n_dec_seq), True where attention is blocked
        :param attr_inputs: attribute vectors fed to the attribute projection
        :return: block output (bs, n_dec_seq, d_model),
            self-attention probabilities (bs, n_head, n_dec_seq, n_dec_seq)
        """
        # (bs, n_dec_seq, d_model)
        attr_term = self.attr_linear(attr_inputs)

        # (bs, n_dec_seq, d_model), (bs, n_head, n_dec_seq, n_dec_seq)
        attn_term, self_attn_prob = self.self_attn(dec_inputs, dec_inputs, dec_inputs, self_attn_mask)

        # Residual sum of input, attention output and attribute projection, then norm.
        hidden = self.layer_norm1(dec_inputs + attn_term + attr_term)

        # Feed-forward sub-layer with its own residual connection and norm.
        output = self.layer_norm2(hidden + self.ffn(hidden))

        return output, self_attn_prob


class Decoder(nn.Module):
    """Token + sinusoidal position embedding followed by a stack of DecoderLayer blocks."""

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.dec_emb = nn.Embedding(self.config.vocab_size, self.config.d_model)
        # Row 0 of the table is the position assigned to padding; real tokens use 1..n_dec_seq.
        sinusoid_table = torch.FloatTensor(
            get_sinusoid_encoding_table(self.config.n_dec_seq + 1, self.config.d_model))
        self.pos_emb = nn.Embedding.from_pretrained(sinusoid_table, freeze=True)
        self.layers = nn.ModuleList(
            [DecoderLayer(self.config) for _ in range(0, self.config.n_layer)])

    def forward(self, input_ids, attrs):
        """
        :param input_ids: (bs, n_dec_seq) token ids
        :param attrs: attribute vectors handed unchanged to every layer
        :return: decoder output (bs, n_dec_seq, d_model) and the list of per-layer
            self-attention probabilities
        """
        batch_size, seq_len = input_ids.size(0), input_ids.size(1)

        # 1-based positions; padding tokens are mapped to position 0.
        positions = torch.arange(seq_len, device=input_ids.device, dtype=input_ids.dtype).expand(
            batch_size, seq_len).contiguous() + 1
        positions.masked_fill_(input_ids.eq(self.config.i_pad), 0)

        # (bs, n_dec_seq, d_model)
        hidden = self.dec_emb(input_ids) + self.pos_emb(positions)

        # Block attention to padding keys and to future positions.
        # all masks: (bs, n_dec_seq, n_dec_seq)
        pad_mask = get_attn_pad_mask(input_ids, input_ids, self.config.i_pad)
        causal_mask = get_attn_decoder_mask(input_ids)
        self_attn_mask = torch.gt((pad_mask + causal_mask), 0)

        self_attn_probs = []
        for block in self.layers:
            hidden, attn_prob = block(hidden, self_attn_mask, attrs)
            self_attn_probs.append(attn_prob)

        return hidden, self_attn_probs

### GPT Model, Mul

In [None]:
class GPT(nn.Module):
    """Decoder-only language model whose output projection shares the token embedding weights."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.decoder = Decoder(self.config)

        # Output projection d_model -> vocab_size, tied to the embedding matrix.
        self.linear1 = nn.Linear(self.config.d_model, self.config.vocab_size, bias=False)
        self.linear1.weight = self.decoder.dec_emb.weight

    def forward(self, input_ids, attrs):
        """Return the logits (bs, n_dec_seq, vocab_size); attention maps are discarded."""
        dec_outputs, _ = self.decoder(input_ids, attrs)
        return self.linear1(dec_outputs)


def get_sinusoid_encoding_table(n_seq, d_model):
    """
    Build the (n_seq, d_model) sinusoidal position table.

    Entry (pos, i) is sin(pos / 10000^(2*(i//2)/d_model)) for even i and the
    cosine of the same angle for odd i.
    """
    # One divisor per embedding column; columns 2k and 2k+1 share the same value.
    divisors = [np.power(10000, 2 * (column // 2) / d_model) for column in range(d_model)]

    table = np.array([[position / divisor for divisor in divisors] for position in range(n_seq)])
    table[:, 0::2] = np.sin(table[:, 0::2])  # even columns: sin
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd columns: cos
    return table


def get_attn_pad_mask(seq_q, seq_k, i_pad):
    """
    Return a (bs, len_q, len_k) boolean mask that is True wherever the key token equals i_pad.

    Both inputs must be 2-D (bs, length) id tensors.
    """
    _, query_len = seq_q.size()
    n_batch, key_len = seq_k.size()
    key_is_pad = seq_k.data.eq(i_pad)
    # Repeat the per-key flags for every query position.
    return key_is_pad.unsqueeze(1).expand(n_batch, query_len, key_len)

def get_attn_decoder_mask(seq):
    """
    Return a (bs, n_seq, n_seq) mask with ones strictly above the diagonal.

    A one at [b, i, j] marks a position j > i, i.e. a future token. The mask has
    the dtype and device of ``seq``.
    """
    n_batch, seq_len = seq.size(0), seq.size(1)
    ones = torch.ones_like(seq).unsqueeze(-1).expand(n_batch, seq_len, seq_len)
    return torch.triu(ones, diagonal=1)




# TRAIN

### Train

In [None]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss


def train(model, input_ids, attrs, labels, n_epoch, batch_size, learning_rate, perm):
    """
    Train ``model`` with Adam and cross-entropy for ``n_epoch`` epochs.

    A fresh Adam optimizer is created on every call. Batches are taken in the
    order given by ``perm``; the last batch of an epoch also absorbs the remainder
    that does not fill a complete batch. If there are fewer samples than
    ``batch_size``, a single batch containing all samples is used.

    :param model: network called as model(input_ids_batch, attrs_batch) and
        returning logits of shape (bs, n_seq, vocab)
    :param input_ids: encoded input sequences (list of equal-length id lists)
    :param attrs: attribute vectors, one per sample
    :param labels: encoded target sequences; the first position is dropped
    :param n_epoch: number of passes over the data
    :param batch_size: number of samples per batch
    :param learning_rate: Adam learning rate
    :param perm: index permutation applied to all three inputs
    :return: (accs, losses), one entry per epoch. A sample counts as correct only
        if every position of the sequence is predicted correctly.
    """
    criterion = CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    # Apply the permutation once up front rather than re-indexing the complete
    # tensors for every batch.
    input_ids = torch.tensor(input_ids)[perm]
    attrs = torch.tensor(attrs, dtype=torch.float)[perm]
    labels = torch.tensor(labels)[:, 1:].contiguous()[perm]

    total_size = len(input_ids)
    # At least one iteration, so small data sets are still trained on and the
    # per-epoch loss average never divides by zero.
    iters_per_epoch = max(1, total_size // batch_size)

    losses = []
    accs = []

    # PER EPOCH
    for epoch_idx in range(0, n_epoch):
        time_from = time()

        loss_epoch = 0
        correct_epoch = 0

        model.train()
        for batch_idx in range(0, iters_per_epoch):
            # PER BATCH
            index_from = batch_idx * batch_size
            index_to = (batch_idx + 1) * batch_size if batch_idx < (iters_per_epoch - 1) else total_size
            index_range = slice(index_from, index_to)

            input_ids_batch = input_ids[index_range].to(device)
            attrs_batch = attrs[index_range].to(device)
            target = labels[index_range].to(device)

            # The output at the last position has no target, so it is dropped.
            logits = model(
                input_ids_batch,
                attrs_batch
            )[:, :-1, :].contiguous()

            # LOSS
            loss = criterion(logits.view(-1, logits.size(2)), target.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_epoch += loss.item()

            # PREDICTION
            pred = torch.argmax(logits, dim=2)
            correct_epoch += torch.sum(torch.all(torch.eq(pred, target), dim=1)).item()

            # UPDATE PROGRESS
            print_progress(index_to,
                total_size,
                f'Training',
                f'Complete, {index_to}/{total_size}', 2, 25)

        # LOG EPOCH LOSS, EPOCH ACCURACY
        losses.append(loss_epoch / iters_per_epoch)
        accs.append(correct_epoch / total_size)

        # PRINT EPOCH REPORT
        print('\n')
        print('< EPOCH REPORT >')
        print('Time Elapsed   : ' + '{0:.2f}'.format(time() - time_from) + 's')
        print('Train Accuracy :', '{0:.4f}'.format(accs[-1]))
        print('Train Loss     :', '{0:.4f}'.format(losses[-1]))

    return accs, losses

# TEST

### Evaluation Metrics

In [None]:
# Test metrics are defined here.
# prediction and target should be 1-D torch.Tensor objects of shape (N).
# Reported metrics: Accuracy, Precision, Recall, F1, MCC


from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef, roc_auc_score


def eval_metrics(prediction, target, average='micro', print_report=True):
    """
    Compute accuracy, precision, recall, F1 and MCC of ``prediction`` against ``target``.

    :param prediction: 1-D torch.Tensor of predicted token ids
    :param target: 1-D torch.Tensor of true token ids, same length as prediction
    :param average: averaging mode passed to precision_recall_fscore_support
    :param print_report: if True, print the metrics
    :return: (accuracy, precision, recall, f1, mcc), or None if the two tensors
        differ in length
    """
    # Validate first: the metric functions cannot handle inputs of different length.
    if len(prediction) != len(target):
        print('PREDICTION AND TARGET HAS DIFFERENT LENGTH')
        return None

    y_true = target.tolist()
    y_pred = prediction.tolist()

    # Every id that occurs in either tensor is treated as a class.
    vocab = torch.unique(torch.cat((prediction, target))).tolist()

    accuracy = accuracy_score(y_true, y_pred)
    # ignore support
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true,
        y_pred,
        beta=1,
        labels=vocab,
        average=average,
        zero_division=0
    )
    mcc = matthews_corrcoef(y_true, y_pred)

    if print_report:
        print('\n')
        print('< TEST  REPORT >')
        print('Accuracy     :', '{0:.4f}'.format(accuracy))
        print('Precision    :', '{0:.4f}'.format(precision))
        print('Recall       :', '{0:.4f}'.format(recall))
        print('F1           :', '{0:.4f}'.format(f1))
        print('MCC          :', '{0:.4f}'.format(mcc))

    return accuracy, precision, recall, f1, mcc

### Test

In [None]:
def test(model, input_ids, attrs, labels, pred_idx, metric_average='micro', batch_size=512, print_report=True):
    """
    Evaluate ``model`` on next-token prediction at one position per sample.

    For sample i the prediction is the argmax of the logits at position
    pred_idx[i]; the target is labels[i] (with its first position dropped) at the
    same position. The last batch also absorbs the remainder; if there are fewer
    samples than ``batch_size``, one batch containing all samples is evaluated.

    :param model: network called as model(input_ids_batch, attrs_batch)
    :param input_ids: encoded input sequences
    :param attrs: attribute vectors, one per sample
    :param labels: encoded target sequences
    :param pred_idx: per-sample position to evaluate
    :param metric_average: averaging mode forwarded to eval_metrics
    :param batch_size: number of samples per forward pass
    :param print_report: forwarded to eval_metrics
    :return: (metrics, pred_total, target_total) where metrics is the result of
        eval_metrics and the other two are lists of predicted / true token ids
    """
    # Run on whatever device the model lives on; batches are moved there one at a
    # time instead of placing the complete data set on the device.
    device = next(model.parameters()).device

    input_ids = torch.tensor(input_ids)
    attrs = torch.tensor(attrs, dtype=torch.float)
    labels = torch.tensor(labels)[:, 1:]

    total_size = len(input_ids)
    # At least one iteration so that small data sets are evaluated as well.
    n_iter = max(1, total_size // batch_size)

    model.eval()

    pred_total = []
    target_total = []
    for batch_idx in range(0, n_iter):
        idx_from = batch_size * batch_idx
        idx_to = total_size if batch_idx == n_iter - 1 else batch_size * (batch_idx + 1)
        idx_range = slice(idx_from, idx_to)

        with torch.no_grad():
            logits = model(input_ids[idx_range].to(device), attrs[idx_range].to(device))

        # argmax over the raw logits; applying softmax first does not change the order.
        pred = torch.argmax(logits, dim=2).cpu()

        for row, sample_idx in enumerate(range(idx_from, idx_to)):
            position = pred_idx[sample_idx]
            pred_total.append(pred[row, position].item())
            target_total.append(labels[sample_idx, position].item())

    return eval_metrics(torch.tensor(pred_total), torch.tensor(target_total), average=metric_average, print_report=print_report), pred_total, target_total

# REAL STUFFS

### LOAD DATA

In [None]:
# Event log to load.
LOG_FILE_PATH = '/content/drive/MyDrive/data/csv/BPI_2013/closed_problems1.csv'

# No numeric attribute columns are used for this log.
NUMERICAL_ATTRIBUTES = []

# Columns handed to sentencifier as categorical attributes.
CATEGORICAL_ATTRIBUTES = [
    'concept:name',
    'impact',
    'org:group',
    'org:role',
    'organization involved',
    'product',
    'organization country',
    'resource country',
]

data = pd.read_csv(LOG_FILE_PATH, engine='python')

# Event tokens come from 'lifecycle:transition', grouped by 'case:concept:name'
# and ordered by 'time:timestamp'.
sentence_set, attr_set, unique_events = sentencifier(
    input=data,
    evt_col_name='lifecycle:transition',
    case_col_name='case:concept:name',
    time_col_name='time:timestamp',
    numeric_col_names=NUMERICAL_ATTRIBUTES,
    cartegorical_col_names=CATEGORICAL_ATTRIBUTES,
    bag_abstraction=False,
)


### VOCAB BUILDER

In [None]:
import json
VOCAB_FILE_PATH = '/content/drive/MyDrive/data/json/gpt_vocab.json'


# VOCAB
# unique_events only exists when the log has been processed in this session. If it
# is missing (e.g. after restoring from a backup) the vocabulary file that is
# already on disk is reused unchanged.
try:
    # Sort so that token ids do not depend on set iteration order, which can
    # differ from one interpreter run to the next.
    events_for_vocab = sorted(unique_events, key=str)
except NameError:
    events_for_vocab = None

if events_for_vocab is not None:
    vocab_dict = {
        "<PAD>": 0,
        "<BOS>": 1,
        "<EOS>": 2
    }

    # Ids 0-2 are reserved for the special tokens above.
    for index, item in enumerate(events_for_vocab, start=3):
        vocab_dict[item] = index

    with open(VOCAB_FILE_PATH, 'w', encoding='utf-8') as make_file:
        json.dump(vocab_dict, make_file)

# TOKENIZER
gpt_tkn = CustomTokenizer(VOCAB_FILE_PATH, '<EOS>', '<BOS>', '<PAD>')


### Split

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the sentences and their attribute rows, with a fixed seed.
# NOTE(review): the split is done per sentence, so prefixes of one case can end up
# on both sides — confirm this is intended.
(
    sentence_set_train,
    sentence_set_test,
    attr_set_train,
    attr_set_test,
) = train_test_split(sentence_set, attr_set, test_size=0.2, random_state=42)

### Encode For Input

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Longest sentence over both splits, plus two extra positions for BOS and EOS.
n_seq = max(
    max(len(sentence) for sentence in sentence_set_train),
    max(len(sentence) for sentence in sentence_set_test),
) + 2

# Columns that are min-max scaled; the remaining attribute columns are left as they are.
scaled_columns = NUMERICAL_ATTRIBUTES + ['time:elapsed']

# Encode train set
input_ids_train, labels_train, pred_idx_train = encode_for_test(sentence_set_train, gpt_tkn, n_seq=n_seq, separate=False)
# Work on a copy so the column assignment below does not write into the split result.
attr_set_train = pd.DataFrame(attr_set_train).copy()

# The scaler is fitted on the training split only and re-used for the test split.
scaler = MinMaxScaler()

attr_set_train[scaled_columns] = scaler.fit_transform(attr_set_train[scaled_columns])
# np.float was removed in NumPy 1.24; np.float64 is the dtype it stood for.
attr_set_train = np.array(attr_set_train.values, dtype=np.float64)

# Encode test set
input_ids_test, labels_test, pred_idx_test = encode_for_test(sentence_set_test, gpt_tkn, n_seq=n_seq, separate=False, add_eos_token=True)
attr_set_test = pd.DataFrame(attr_set_test).copy()

attr_set_test[scaled_columns] = scaler.transform(attr_set_test[scaled_columns])
attr_set_test = np.array(attr_set_test.values, dtype=np.float64)


### ※ Use these cells only in case of a runtime crash

In [None]:
# train input backup
# Write every encoded train/test artefact to its own CSV file (train first, then test).
_backup_targets = {
    # train input backup
    '/content/drive/MyDrive/backup/input_ids_train.csv': input_ids_train,
    '/content/drive/MyDrive/backup/labels_train.csv': labels_train,
    '/content/drive/MyDrive/backup/pred_idx_train.csv': pred_idx_train,
    '/content/drive/MyDrive/backup/attr_set_train.csv': attr_set_train,
    # test input backup
    '/content/drive/MyDrive/backup/input_ids_test.csv': input_ids_test,
    '/content/drive/MyDrive/backup/labels_test.csv': labels_test,
    '/content/drive/MyDrive/backup/pred_idx_test.csv': pred_idx_test,
    '/content/drive/MyDrive/backup/attr_set_test.csv': attr_set_test,
}
for _backup_path, _backup_payload in _backup_targets.items():
    pd.DataFrame(_backup_payload).to_csv(_backup_path, index=None)

In [None]:
# load train input backup
# load train input backup
input_ids_train = pd.read_csv('/content/drive/MyDrive/backup/input_ids_train.csv').to_numpy().tolist()
labels_train = pd.read_csv('/content/drive/MyDrive/backup/labels_train.csv').to_numpy().tolist()
# pred_idx was saved as a single column; flatten it back into a plain list of ints.
pred_idx_train = pd.read_csv('/content/drive/MyDrive/backup/pred_idx_train.csv').to_numpy().ravel().tolist()
# Attribute sets are restored as float arrays, the form they have after encoding.
attr_set_train = pd.read_csv('/content/drive/MyDrive/backup/attr_set_train.csv').to_numpy(dtype=np.float64)

# load test input backup
input_ids_test = pd.read_csv('/content/drive/MyDrive/backup/input_ids_test.csv').to_numpy().tolist()
labels_test = pd.read_csv('/content/drive/MyDrive/backup/labels_test.csv').to_numpy().tolist()
pred_idx_test = pd.read_csv('/content/drive/MyDrive/backup/pred_idx_test.csv').to_numpy().ravel().tolist()
attr_set_test = pd.read_csv('/content/drive/MyDrive/backup/attr_set_test.csv').to_numpy(dtype=np.float64)

# The tokenizer needs the JSON vocabulary file, not the CSV event log.
VOCAB_FILE_PATH = '/content/drive/MyDrive/data/json/gpt_vocab.json'
gpt_tkn = CustomTokenizer(VOCAB_FILE_PATH, '<EOS>', '<BOS>', '<PAD>')


### Train and Test

In [None]:
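# The function below initializes a GPT model with randomly generated weights
# (unless a model is passed in), trains and tests it epoch by epoch, and returns
# the best model together with the train and test reports.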

def gpt2_train_and_test(param, config, input_ids_train, attr_set_train, labels_train, input_ids_test, attr_set_test, labels_test, pred_idx_test, metric_average='weighted', model=None):
    """
    Train a GPT model epoch by epoch, test after every epoch and keep the best weights.

    "Best" means highest test accuracy; ties are broken by the higher test F1.

    :param param: dict with the keys 'n_epoch', 'batch_size' and 'learning_rate'
    :param config: GPTConfig used to build the model(s)
    :param input_ids_train, attr_set_train, labels_train: training data, passed to train()
    :param input_ids_test, attr_set_test, labels_test, pred_idx_test: test data, passed to test()
    :param metric_average: averaging mode for the test metrics
    :param model: optional existing model to continue training; if None, a new
        GPT(config) is created on CUDA
    :return: (best_model, train_reports, test_reports) with one report per epoch.
        best_model holds the weights of the best epoch, or the starting weights
        if no epoch scored above zero.
    """
    # Seed before any weights are created so that initialisation and the
    # per-epoch shuffling are both reproducible.
    torch.manual_seed(777)

    if model is None:
        model = GPT(config).cuda()

    n_epoch = param['n_epoch']
    batch_size = param['batch_size']
    learning_rate = param['learning_rate']

    train_reports = []
    test_reports = []

    best_acc = 0
    best_f1 = 0
    # Snapshot holder for the best weights. It starts as a copy of the model's
    # current weights so the returned model is never an unrelated random network.
    best_model = GPT(config)
    best_model.load_state_dict(model.state_dict())

    for i in range(0, n_epoch):
        print('\n')
        print(f'< {i+1} / {n_epoch} >')

        # New shuffling order for every epoch; train() runs exactly one epoch per call.
        perm = torch.randperm(len(input_ids_train))
        train_report = train(model, input_ids_train, attr_set_train, labels_train, 1, batch_size, learning_rate, perm)
        test_report = test(model, input_ids_test, attr_set_test, labels_test, pred_idx_test, metric_average=metric_average)[0]
        test_acc, test_prc, test_rec, test_f1, test_mcc = test_report

        train_reports.append(train_report)
        test_reports.append(test_report)

        better_accuracy = best_acc < test_acc
        better_f1_on_tie = best_acc == test_acc and best_f1 < test_f1
        if better_accuracy or better_f1_on_tie:
            best_acc = test_acc
            best_f1 = test_f1
            if better_accuracy:
                print('\n<Overwriting with better model>')
            else:
                print('<Overwriting with better model>')
            best_model.load_state_dict(model.state_dict())

    return best_model, train_reports, test_reports

### Set Parameters Here

In [None]:
# n_epoch, batch_size, learning_rate
# Training hyper-parameters read by gpt2_train_and_test.
parameter_sets = dict(
    n_epoch=200,
    batch_size=64,
    learning_rate=1e-6,
)


# model config
config = GPTConfig(
    # sizes derived from the data
    vocab_size=len(gpt_tkn.get_vocab_ids()),
    n_dec_seq=len(input_ids_test[0]),
    attr_size=attr_set_test.shape[1],

    # architecture
    n_layer=8,
    n_head=8,
    head_size=16,
    d_model=128,
    d_attr=1024,
    d_ff=1024,

    # dropout probabilities
    attn_pdrop=0.05,
    attr_pdrop=0.05,
    ff_pdrop=0.05,
    embd_pdrop=0.05,
)

### This Is a Stuff!

In [None]:
# Train on the train split, evaluate on the test split after every epoch.
gpt_model, train_reports, test_reports = gpt2_train_and_test(
    parameter_sets,
    config,
    input_ids_train,
    attr_set_train,
    labels_train,
    input_ids_test,
    attr_set_test,
    labels_test,
    pred_idx_test,
)

In [None]:
# One row per epoch, one column per test metric.
test_report_matrix = np.array(test_reports)
result = pd.DataFrame(test_report_matrix)
result.to_csv('/content/drive/MyDrive/result/K1_2.csv')

### Graphical Report

In [None]:
from matplotlib import pyplot as plt


# Train Loss
# Convert the reports once instead of rebuilding the arrays for every plot line.
epoch_axis = range(1, parameter_sets['n_epoch'] + 1)
train_history = np.array(train_reports)   # per epoch: (accuracy, loss)
test_history = np.array(test_reports)     # per epoch: (accuracy, precision, recall, f1, mcc)

# Train Loss
plt.plot(epoch_axis, train_history[:, 1])
plt.title("Train Loss")
plt.show()

# Train Accuracy
plt.plot(epoch_axis, train_history[:, 0])
plt.ylim(0, 1)
plt.title("Train Accuracy")
plt.show()

# Test Report
for metric_column in range(5):
    plt.plot(epoch_axis, test_history[:, metric_column])
plt.ylim(0, 1)
plt.legend(['Accuracy', 'Precision', 'Recall', 'F-1 Score', 'MCC'])
plt.title("Test Evaluation")
plt.show()


# Evaluation Awards
award_labels = [
    'Best Accuracy :',
    'Best Precision:',
    'Best Recall   :',
    'Best F-1 Score:',
    'Best MCC      :',
]
for metric_column, award_label in enumerate(award_labels):
    scores = test_history[:, metric_column]
    # Epochs are reported 1-based.
    print(award_label, f'{max(scores):.4f}', 'at Epoch', np.argmax(scores)+1)

### Model Performance Comparison

In [None]:
from matplotlib import pyplot as plt

# Reference run to compare against; expects the metric columns to be named '0'..'4'.
better_report = pd.read_csv('/content/drive/MyDrive/result/2013_P_close_w_attr_linear.csv')

ylim = (0, 1)
# NOTE(review): the current run is labelled name1 and the loaded file name2 —
# confirm that these labels match the runs they are attached to.
name1 = 'Model with ALL'
name2 = 'Model without ALL'
metrics = ['Accuracy', 'Precision', 'Recall', 'F-1 Score', 'MCC']

current_history = np.array(test_reports)

for idx, metric in enumerate(metrics):
    data1 = current_history[:, idx]
    data2 = better_report[str(idx)].to_numpy()

    # Each curve gets its own 1-based epoch axis, so runs of different length
    # can still be compared.
    epochs1 = range(1, len(data1) + 1)
    epochs2 = range(1, len(data2) + 1)

    fig, ax = plt.subplots(figsize = (8, 6), dpi=400)

    plt.plot(epochs1, data1, linewidth=2.0, color='C3')
    plt.plot(epochs2, data2, linewidth=2.0, color='C0')

    plt.title(metric)

    plt.xlabel('Epoch')
    plt.ylabel(metric)

    # Mark the best value of each curve at the epoch where it occurs.
    for epochs, series in ((epochs1, data1), (epochs2, data2)):
        max_idx = int(np.argmax(series))
        max_value = series[max_idx]
        max_epoch = epochs[max_idx]
        ax.annotate(
            f'{max_value:.4f}',
            xy=(max_epoch, max_value),
            xytext=(max_epoch-30, max_value+0.1),
            arrowprops={
                'facecolor': 'black',
                'width': 1,
                'shrink': 0.1,
                'headwidth': 5
            },
            size=15
        )

    plt.legend([name1, name2], loc='upper left')
    plt.xlim((1, max(len(data1), len(data2)) + 1))
    plt.ylim(ylim)

    plt.show()

# MODEL SAVE & LOAD

Path

In [None]:
# Directory that the save and load cells below use for model checkpoints.
PATH = '/content/drive/MyDrive/model'

Save

In [None]:
# Save the trained model object; the training cell binds it to `gpt_model`.
torch.save(gpt_model, PATH + '/model_BPI_2012_A.pt')

Load

In [None]:
# Restore a previously saved model object from the checkpoint file.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
# NOTE(review): the result is bound to `gpt2_model`, while the training cell uses `gpt_model` — confirm which name is intended.
gpt2_model = torch.load(PATH + '/model_BPI_2012_A.pt')