In [None]:
import os
import sys
import warnings
import transformers

In [None]:
# Keep the notebook output quiet: drop Python warnings, raise TensorFlow's
# C++ log threshold, and set the tokenizers parallelism flag explicitly.
warnings.filterwarnings('ignore')
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'TOKENIZERS_PARALLELISM': 'true',
})
# Put the attached source dataset first on the import path
# (presumably where the local modules imported below live - verify).
sys.path.insert(0, '../input/ai4code-source')
# Only errors from the transformers library are logged.
transformers.utils.logging.set_verbosity_error()

In [None]:
import os
import math
import random
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf

In [None]:
from utils import evaluate
from extractor import concat_data

In [None]:
from dataset import load_size
from dataset import get_dataset
from dataset import get_reg_input
from dataset import decode_triplet
from dataset import get_match_input

In [None]:
from model import get_optimizer
from model import get_reg_model
from model import get_match_model
from model import pairwise_cosine_similarity

In [None]:
from IPython.display import FileLink
from kaggle_datasets import KaggleDatasets
from kaggle_secrets import UserSecretsClient

In [None]:
# Seed the Python, NumPy and TensorFlow random generators with the same value.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(0)

In [None]:
# Resolve the TPU cluster, connect to it and initialise the TPU system; the
# resulting `strategy` is the distribution scope used by every training and
# inference cell below.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# `tf.distribute.experimental.TPUStrategy` is a deprecated endpoint; the
# stable `tf.distribute.TPUStrategy` takes the same resolver argument.
strategy = tf.distribute.TPUStrategy(tpu)
# Global batch size: 64 examples for each replica kept in sync.
batch_size = 64 * strategy.num_replicas_in_sync

In [None]:
# Fetch the Google Cloud credential linked to this Kaggle account and register
# it with TensorFlow (presumably so the GCS-hosted TFRecords read later in the
# notebook are accessible - verify).
user_secrets = UserSecretsClient()
# Prerequisite: link a Google account in the notebook menu via
# Add-ons -> Google Cloud SDK -> Link Account
# Background: https://www.kaggle.com/product-feedback/163416
user_credential = user_secrets.get_gcloud_credential()
user_secrets.set_tensorflow_credential(user_credential)

In [None]:
class ModelCallback(tf.keras.callbacks.Callback):
    """Periodically checkpoint model weights and record the per-batch loss.

    Whenever the batch index reported by Keras is a non-zero multiple of
    ``save_every``, the weights are written to
    ``{prefix}{checkpoint:02d}-{loss:.05f}.h5`` and the loss history collected
    so far is written to ``{prefix}history.npz``. When training ends, the
    weights are written to ``{prefix}last-{loss:.05f}.h5`` and the history file
    is written once more.

    Args:
        save_every: Batch-index interval between checkpoints (default 1000).
        prefix: String prepended to every file name written by the callback.
            The default ``''`` keeps the original file names; pass a distinct
            prefix per training run to stop runs overwriting each other's
            ``history.npz``.

    Raises:
        ValueError: If ``save_every`` is not positive.
    """

    def __init__(self, save_every=1000, prefix=''):
        super().__init__()
        if save_every <= 0:
            raise ValueError('save_every must be a positive integer')
        self.save_every = save_every
        self.prefix = prefix

    @staticmethod
    def _loss_from(logs):
        """Return ``logs['loss']``, or NaN when ``logs`` or the loss is missing."""
        loss = (logs or {}).get('loss')
        return float('nan') if loss is None else loss

    def _save_history(self):
        """Write the loss history collected so far to ``{prefix}history.npz``."""
        np.savez_compressed(f'{self.prefix}history.npz', loss=np.array(self.loss_history))

    def on_train_begin(self, logs=None):
        """Reset the loss history and the checkpoint counter for a new run."""
        self.loss_history = []
        self.checkpoint = 1

    def on_batch_end(self, batch, logs=None):
        """Record the batch loss and checkpoint every ``save_every`` batches."""
        loss = self._loss_from(logs)
        # Record first so the history snapshot written with a checkpoint
        # includes the loss that appears in the checkpoint's file name.
        self.loss_history.append(loss)
        if batch % self.save_every == 0 and batch != 0:
            self.model.save_weights(f"{self.prefix}{self.checkpoint:02d}-{loss:.05f}.h5")
            self._save_history()
            self.checkpoint += 1

    def on_train_end(self, logs=None):
        """Save the final weights and the complete loss history."""
        loss = self._loss_from(logs)
        self.model.save_weights(f"{self.prefix}last-{loss:.05f}.h5")
        self._save_history()

In [None]:
# Optimisation hyper-parameters for the regression model.
epochs, warmup_rate = 6, 0.05
learning_rate, weight_decay_rate = 3e-5, 0.01

# Regression arrays for the training and validation splits.
train_data = np.load('../input/ai4code-data/train_reg.npz')
valid_data = np.load('../input/ai4code-data/valid_reg.npz')

# Batches per pass over each split; a trailing partial batch counts as a step.
train_steps = math.ceil(len(train_data['ids']) / batch_size)
valid_steps = math.ceil(len(valid_data['ids']) / batch_size)

# The schedule spans every epoch; warm-up covers its first `warmup_rate` share.
total_steps = train_steps * epochs
warmup_steps = int(total_steps * warmup_rate)

with strategy.scope():

    # Training stream: shuffled over a buffer as large as the split, repeated.
    train_dataset = get_dataset(
        data=get_reg_input(train_data),
        shuffled=True, buf_size=len(train_data['ids']), seed=0,
        repeated=True, batch_size=batch_size, strategy=strategy,
    )

    # Validation stream: repeated but not shuffled.
    valid_dataset = get_dataset(
        data=get_reg_input(valid_data),
        repeated=True, batch_size=batch_size, strategy=strategy,
    )

    optimizer = get_optimizer(learning_rate, warmup_steps, total_steps, weight_decay_rate)
    regressor = get_reg_model('microsoft/codebert-base', pad_token_id=1)
    regressor.compile(loss='mae', optimizer=optimizer)

    # ModelCallback writes periodic checkpoints and the loss history.
    regressor.fit(
        train_dataset,
        epochs=epochs,
        steps_per_epoch=train_steps,
        validation_data=valid_dataset,
        validation_steps=valid_steps,
        callbacks=[ModelCallback()],
        verbose=1,
    )

In [None]:
# Score the trained regressor on the validation split.
valid_data = np.load('../input/ai4code-data/valid_reg.npz')
valid_steps = math.ceil(len(valid_data['ids']) / batch_size)
valid_df = pd.read_feather('../input/ai4code-data/valid.ftr')

with strategy.scope():

    # Build the inputs exactly as the training cell does for validation.
    # The previous call passed an extra `scaler` argument that is never
    # defined in this notebook, so the cell raised NameError on a fresh kernel.
    valid_dataset = get_dataset(data=get_reg_input(valid_data),
                                batch_size=batch_size, strategy=strategy)

    # Keep only the first output column of the prediction matrix.
    reg_ranks = regressor.predict(valid_dataset, steps=valid_steps, verbose=1)[:, 0]

# The same predictions are passed for both rank arguments, with match
# re-ranking disabled.
evaluate(valid_df, reg_ranks, reg_ranks, rerank_match=False)

In [None]:
# Optimisation hyper-parameters for the matching model.
epochs, warmup_rate = 2, 0.05
learning_rate, weight_decay_rate = 3e-5, 0.01

# The TFRecord shards are read from the dataset's GCS bucket; the example
# count comes from the size file shipped with the same dataset.
gcs_path = KaggleDatasets().get_gcs_path('ai4code-tfrecs')
train_size = load_size('../input/ai4code-tfrecs/size.txt')
train_paths = tf.io.gfile.glob(f'{gcs_path}/*.tfrec')

# Batches per epoch (a trailing partial batch counts as a step), then the
# full schedule length and its warm-up share.
train_steps = math.ceil(train_size / batch_size)
total_steps = train_steps * epochs
warmup_steps = int(total_steps * warmup_rate)

with strategy.scope():

    # Records decoded with `decode_triplet`, shuffled over a buffer as large
    # as the training set, repeated.
    train_dataset = get_dataset(
        paths=train_paths,
        decode_fn=decode_triplet,
        shuffled=True,
        buf_size=train_size,
        seed=0,
        repeated=True,
        batch_size=batch_size,
        strategy=strategy,
    )

    # `embedder` is used for inference later; `model` is the object trained here.
    embedder, model = get_match_model(
        'microsoft/unixcoder-base', batch_size, pad_token_id=1, from_pt=True,
    )

    optimizer = get_optimizer(learning_rate, warmup_steps, total_steps, weight_decay_rate)

    # No Keras loss is passed here (presumably the model adds its own
    # loss internally - TODO confirm in `get_match_model`).
    model.compile(loss=None, optimizer=optimizer)

    model.fit(
        train_dataset,
        epochs=epochs,
        steps_per_epoch=train_steps,
        callbacks=[ModelCallback()],
        verbose=1,
    )

In [None]:
# Score the matching model on the validation split.
valid_df = pd.read_feather('../input/ai4code-data/valid.ftr')
valid_data = np.load('../input/ai4code-data/valid_match.npz')

# Indexing an NpzFile re-reads the array from the archive on every access,
# so load each array once here instead of inside the per-notebook loop below.
valid_mark_ids = valid_data['mark_ids']
valid_code_ids = valid_data['code_ids']
valid_mark_nb = valid_data['mark_nb']
valid_code_nb = valid_data['code_nb']
valid_code_pos = valid_data['code_pos']

ctx_steps = math.ceil(len(valid_code_ids) / batch_size)
mark_steps = math.ceil(len(valid_mark_ids) / batch_size)

with strategy.scope():
    # Embed the markdown inputs and the code-context inputs separately.
    mark_dataset = get_dataset(data=valid_mark_ids, batch_size=batch_size, strategy=strategy)
    ctx_dataset = get_dataset(data=valid_code_ids, batch_size=batch_size, strategy=strategy)
    mark_embs = embedder.predict(mark_dataset, steps=mark_steps, verbose=1)
    ctx_embs = embedder.predict(ctx_dataset, steps=ctx_steps, verbose=1)

predicted = []
for i in tqdm(range(valid_mark_nb.max() + 1)):
    # Select the rows that belong to group `i` (presumably one notebook - verify).
    m, c = valid_mark_nb == i, valid_code_nb == i
    scores = pairwise_cosine_similarity(mark_embs[m], ctx_embs[c])
    # For each markdown row take the position of its best-scoring context row.
    predicted.append(valid_code_pos[c][tf.argmax(scores, axis=1).numpy()])

# Small negative offset (presumably so a markdown cell sorts just ahead of the
# code position it matched - verify against `evaluate`).
match_ranks = np.concatenate(predicted) - 0.001
evaluate(valid_df, match_ranks, match_ranks, rerank_match=False)