In [1]:
import os
import pickle
import tensorflow as tf
import time
from models.model_for_kt import TFTransfoXLModel,TFTransfoXLLMHeadModel,TFTransfoXLMLMHeadModel
from transformers import TransfoXLConfig
from tensorflow.keras.utils import register_keras_serializable
from tqdm import tqdm
import datetime
import logging
from tensorboard.plugins import projector
import numpy as np

import mlflow
from mlflow.models import ModelSignature
from mlflow.types.schema import Schema, TensorSpec




        


2024-01-31 14:57:46.522908: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-31 14:57:47.090269: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.4/lib64:
2024-01-31 14:57:47.090311: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.4/lib64:


In [2]:


current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")


# Set up train eval Metric
train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
name='test_accuracy')
test_precision = tf.metrics.Precision()
test_recall = tf.metrics.Recall()
train_auc = tf.keras.metrics.AUC()
test_auc = tf.keras.metrics.AUC()


# Set up logging configuration
logging.basicConfig(level=logging.INFO)

def load_TFdataset(config_xl) :
    tf_train_dir = config_xl.tf_data_dir+'/{}'.format(config_xl.mode)+'/train'
    tf_test_dir = config_xl.tf_data_dir+'/{}'.format(config_xl.mode)+'/test'
    train_dataset = tf.data.experimental.load(tf_train_dir)
    test_dataset = tf.data.experimental.load(tf_test_dir)
    with open(config_xl.tf_data_dir+"/dkeyid2idx.pkl", "rb") as file:
        dkeyid2idx = pickle.load(file) 
    
    return train_dataset,test_dataset,dkeyid2idx





@register_keras_serializable()
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)

        self.warmup_steps = tf.cast(warmup_steps,tf.float32)

    def __call__(self, step):
        step =tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
    
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
    
    def get_config(self):
        return {
            'd_model': self.d_model,
            'warmup_steps': self.warmup_steps
            }


@tf.function
def train_step(model, data1,data2, target, mems, optimizer):
    with tf.GradientTape() as tape:
        outputs = model(concepts=data1,responses=data2, labels=target, mems=mems)
        logit = outputs.logit
        mems = outputs.mems
        logit_mx = target != -100
        logit_value = logit[logit_mx]
        logit_value = tf.reshape(logit_value, [-1, config_xl.R_vocab_size])
        labels = target[logit_mx]

        
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logit_value)
        # batch_loss = tf.reduce_sum(loss) / valid_samples
        mean_loss = tf.reduce_mean(loss)
        train_loss(loss)
        train_accuracy(labels,logit_value)
        predictions =tf.nn.softmax(logit_value)
        train_auc(tf.one_hot(labels, depth=predictions.shape[1]), predictions)

    gradients = tape.gradient(mean_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    return mems,mean_loss


def evaluate(model,test_dataset,config_xl):
    total_loss = 0.0
    num_batches = 0
    evaluation_metrics = []
    test_mems = None

    for input_data, masked_responses, responses in tqdm(test_dataset, desc='eval'):

        outputs = model(concepts=input_data, responses=masked_responses, labels=responses, mems=test_mems, training=False)
        logit = outputs.logit
        test_mems = outputs.mems

        logit_mx = responses != -100
        logit_value = logit[logit_mx]
        logit_value = tf.reshape(logit_value, [-1, config_xl.R_vocab_size])
        labels = responses[logit_mx]

        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logit_value)
        mean_loss = tf.reduce_mean(loss)

        # Update precision and recall metrics
        predicted_labels = tf.argmax(logit_value, axis=1)
        predictions =tf.nn.softmax(logit_value)

        
        test_auc(tf.one_hot(labels, depth=predictions.shape[1]), predictions)
        test_precision(labels, predicted_labels)
        test_recall(labels, predicted_labels)

        test_accuracy(labels, logit_value)
        test_loss(loss)
        
        
        precision = test_precision.result().numpy()
        recall = test_recall.result().numpy()
        f1_score = 2 * (precision * recall) / (precision + recall + 1e-7)

        evaluation_metrics.append(test_accuracy.result().numpy())

        total_loss += mean_loss.numpy()
        num_batches += 1

        mlflow.log_metric('loss', test_loss.result(), step=num_batches)
        mlflow.log_metric('accuracy', test_accuracy.result(), step=num_batches)
        mlflow.log_metric('precision', test_precision.result(), step=num_batches)
        mlflow.log_metric('recall', test_recall.result(), step=num_batches)
        mlflow.log_metric('f1_score', f1_score, step=num_batches)
        mlflow.log_metric('auc', test_auc.result(), step=num_batches)

    # 평균 정밀도, 재현율, F1 점수를 계산합니다.
    average_precision = test_precision.result().numpy()
    average_recall = test_recall.result().numpy()
    average_f1_score = 2 * (average_precision * average_recall) / (average_precision + average_recall + 1e-7)

    average_metric = sum(evaluation_metrics) / len(evaluation_metrics)
    average_loss = total_loss / num_batches

    return average_loss, average_metric, average_precision, average_recall, average_f1_score

# make embedding projector 
def Make_embedding_projector(model,config_xl, dkeyid2idx,):
    log_dir=config_xl.tensorboard_emb_log_dir+'/'+current_time+'_{}ep_{}mem_{}/'.format(config_xl.epoch, config_xl.mem_len, config_xl.mode)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # Save Labels separately on a line-by-line manner.
    with open(os.path.join(log_dir, 'metadata.tsv'), "w") as f:
        for valeu_before_mapping in dkeyid2idx[config_xl.mode]:
            f.write("{}\n".format(valeu_before_mapping))

    weights = tf.Variable(model.transformer.word_emb_C.get_weights()[0])

    checkpoint = tf.train.Checkpoint(embedding=weights)
    checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))
    # Set up config.
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    # The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
    embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
    embedding.metadata_path = 'metadata.tsv'
    projector.visualize_embeddings(log_dir, config)



def train(train_dataset,config_xl):
    try:
        learning_rate = CustomSchedule(config_xl.d_model)

        optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
        model = TFTransfoXLMLMHeadModel(config=config_xl)

        loss_values = []
        num_batches = 0

        for epoch in range(config_xl.epoch):
            start = time.time()
            total_loss = 0.0
            mems = None                   
            for input_data, masked_responses, responses in tqdm(train_dataset, desc='train'):
                mems, loss_value = train_step(model, input_data,masked_responses, responses, mems, optimizer)
                num_batches += 1
                total_loss += loss_value.numpy()
                if num_batches % 100 == 0:
                    loss_values.append(loss_value.numpy())
                    print(f'Epoch {epoch + 1} Batch {num_batches} Loss {loss_value.numpy()}')
                    mlflow.log_metric('loss', train_loss.result(), step=num_batches)
                    mlflow.log_metric('accuracy', train_accuracy.result(), step=num_batches)
                    mlflow.log_metric('auc', train_auc.result(), step=num_batches)


    except Exception as e:
        logging.error(f"Error: {e}")

    return model

2024-01-31 14:57:48.445252: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-01-31 14:57:48.445631: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-01-31 14:57:48.450159: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-01-31 14:57:48.450466: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-01-31 14:57:48.450755: I tensorflow/compiler/xla/stream_executo

In [3]:


    


config_xl = TransfoXLConfig(
    d_embed=128,
    d_head = 32,
    d_model=128,
    mem_len=600,
    n_head=8,
    n_layer=6,
    mask_token=3,
    C_vocab_size=188,
    Q_vocab_size = 12277,
    R_vocab_size = 2,
    epoch = 1,
    mode ='concepts', # concepts or questions 
    tf_data_dir ='/home/jun/workspace/KT/data/ednet/100_sam' ,
    tensorboard_emb_log_dir = '/home/jun/workspace/KT/logs/embedding/',
)

train_dataset,test_dataset,dkeyid2idx=load_TFdataset(config_xl)
model =train(train_dataset.take(1),config_xl)


# Set our tracking server uri for logging
# mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Create a new MLflow Experiment
mlflow.set_experiment("MLflow Quickstart")







Instructions for updating:
Use `tf.data.Dataset.load(...)` instead.


Instructions for updating:
Use `tf.data.Dataset.load(...)` instead.


train:   0%|          | 0/1 [00:00<?, ?it/s]

mlen 600
qlen 140
mlen 600
qlen 140


2024-01-31 14:57:56.917507: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x56157302b000 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-31 14:57:56.917523: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 2080 SUPER, Compute Capability 7.5
2024-01-31 14:57:56.917526: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (1): NVIDIA GeForce RTX 2080 SUPER, Compute Capability 7.5
2024-01-31 14:57:56.920892: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-01-31 14:57:56.997951: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
train: 100%|██████████| 1/1 [00:12<00:00, 12.18s/it]


<Experiment: artifact_location='file:///home/jun/workspace/KT/mlruns/804073929393319485', creation_time=1706512695084, experiment_id='804073929393319485', last_update_time=1706512695084, lifecycle_stage='active', name='MLflow Quickstart', tags={}>

In [4]:
model.save_pretrained('save_model/my_model',from_pt=True)


In [5]:
model = TFTransfoXLMLMHeadModel.from_pretrained("/home/jun/workspace/KT/save_model/my_model")


Some layers from the model checkpoint at /home/jun/workspace/KT/save_model/my_model were not used when initializing TFTransfoXLMLMHeadModel: ['transformer/dense/kernel:0', 'transformer/layer_normalization/beta:0', 'transformer/layer_normalization/gamma:0', 'transformer/embedding_1/embeddings:0', 'transformer/dense/bias:0', 'transformer/embedding/embeddings:0', 'transformer/dense_1/kernel:0']
- This IS expected if you are initializing TFTransfoXLMLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFTransfoXLMLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFTransfoXLMLMHeadModel were not initialized from the model checkpoint at /home/jun/workspace/KT/save_model/m

mlen 600
qlen 1


In [6]:

# # Start an MLflow run
# with mlflow.start_run():
#     # Log the hyperparameters
#     mlflow.log_params(config_xl.to_dict())

    
#     test_loss,test_acc,test_precision, test_recall, test_f1_score = evaluate(model, test_dataset,config_xl)
#     Make_embedding_projector(model,config_xl,dkeyid2idx)
#     logging.info('test_loss:{},test_acc:{},test_precision:{}, test_recall:{}, test_f1_score:{}'.format(test_loss,test_acc,test_precision, test_recall, test_f1_score))
    
#     # Infer the model signature
#     infer_mem = None
#     input_data, masked_responses, responses = next(iter(train_dataset))
#     outputs = model(concepts=input_data, responses=masked_responses, labels=responses, mems=infer_mem, training=False)
#     logit = outputs.logit
#     logit_value = tf.reshape(logit, [-1, config_xl.R_vocab_size])
#     predicted_labels = tf.argmax(logit_value, axis=1)

#     transposed_input = tf.transpose(input_data)
#     transposed_response = tf.transpose(masked_responses)
#     # 모델 입력과 출력에 대한 TensorSpec 정의
#     input_schema = Schema(
#     [
#         TensorSpec(np.dtype(np.int32), (-1,len(transposed_input[0].numpy())), "input_data"),
#         TensorSpec(np.dtype(np.int32), (-1,len(transposed_response[0].numpy())), "responses"),
#     ]
# )
#     output_schema = Schema([TensorSpec(np.dtype(np.int32),predicted_labels.numpy().shape, 'predicted_labels')])


#     signature = ModelSignature(input_schema)


#     # Log the model
#     mlflow.tensorflow.log_model(model, "model", signature=signature,registered_model_name="tracking-quickstart")
