In [None]:
import os
import sys
import warnings
import transformers

In [None]:
# Quiet the run: suppress Python warnings and set the logging-related
# environment variables before the heavy libraries are used.
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Directory holding the project modules imported by later cells.
SOURCE_DIR = '../input/ai4code-source'

# Keep the directory first on sys.path, but do not stack a duplicate entry
# every time this cell is re-run in the same kernel.
if sys.path[:1] != [SOURCE_DIR]:
    sys.path.insert(0, SOURCE_DIR)

transformers.utils.logging.set_verbosity_error()

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf

In [None]:
from utils import submit

In [None]:
from dataset import get_dataset
from dataset import get_reg_input

In [None]:
from loader import get_pct_ranks
from loader import load_notebooks

In [None]:
from extractor import extract_reg_data
from extractor import extract_match_data

In [None]:
from model import get_reg_model
from model import get_match_model
from model import pairwise_cosine_similarity

In [None]:
from transformers import AutoTokenizer

In [None]:
# Read the test notebooks into `test_df`, then store the output of the
# project's `get_pct_ranks` helper in a new `pct_rank` column.
# NOTE(review): 150000 is presumably an upper bound on notebooks read —
# confirm against `loader.load_notebooks`.
test_df = load_notebooks(
    '../input/AI4Code/test',
    150000,
)
test_df['pct_rank'] = get_pct_ranks(
    test_df,
    ['id', 'cell_type'],
)

In [None]:
# Build the regression model and restore its weights from the .h5 file.
regressor = get_reg_model(
    '../input/ai4code-models/codebert',
    pad_token_id=1,
)
regressor.load_weights('../input/ai4code-weights/reg-last-0.08864-0.8571.h5')

# Tokenizer loaded from the local codebert tokenizer directory; it does not
# depend on the model object above.
tokenizer = AutoTokenizer.from_pretrained(
    '../input/ai4code-tokenizers/codebert'
)

In [None]:
# Extract the regression inputs from the test frame and wrap them in a
# batched dataset for `regressor.predict`.
# NOTE(review): 72 / 22 / 512 look like length limits used by the extractor —
# confirm their meaning in `extractor.extract_reg_data`.
test_data = extract_reg_data(
    test_df, tokenizer, 72, 22, 512,
)
test_dataset = get_dataset(
    batch_size=64,
    data=get_reg_input(test_data),
)

In [None]:
# Run the regressor over the test dataset and keep only column 0 of the
# prediction matrix.
reg_ranks = regressor.predict(test_dataset, verbose=1)
reg_ranks = reg_ranks[:, 0]

# Drop the names that are no longer needed before the next model is built.
del regressor
del test_data, test_dataset

In [None]:
# Keyword arguments forwarded to the project's `get_match_model` factory.
p = {
    'pad_token_id': 1,
    'batch_size': 64,
    'from_pt': False,
}

# The factory returns two objects: `embedder` (used for inference in the next
# cell) and `model` (the object the saved weights are loaded into).
embedder, model = get_match_model('../input/ai4code-models/unixcoder', **p)
model.load_weights('../input/ai4code-weights/match-0.8451.h5')

# Rebind `tokenizer` to the unixcoder tokenizer for the matching stage.
tokenizer = AutoTokenizer.from_pretrained(
    '../input/ai4code-tokenizers/unixcoder'
)

In [None]:
# Extract the matching inputs; the result is indexed by string keys such as
# 'mark_ids' and 'code_ids'.
# NOTE(review): 128 and 7 are opaque extractor arguments — confirm their
# meaning in `extractor.extract_match_data`.
test_data = extract_match_data(test_df, tokenizer, 128, 7)

# One batched dataset per side, built in the same order as before
# (markdown first, then code).
marks_dataset, codes_dataset = [
    get_dataset(data=test_data[ids_key], batch_size=64)
    for ids_key in ('mark_ids', 'code_ids')
]

# Embed both sides with the same `embedder`, markdown first.
marks_embs, codes_embs = [
    embedder.predict(side_dataset, verbose=1)
    for side_dataset in (marks_dataset, codes_dataset)
]

In [None]:
# Offset subtracted from every predicted position (value unchanged from the
# original cell).
# NOTE(review): presumably this places each markdown cell just ahead of the
# code cell it was matched to — confirm against `utils.submit`.
MATCH_RANK_OFFSET = 0.001

predicted = []

# Highest notebook index present on either side. An empty side is skipped so
# `.max()` is never called on empty input; if both sides are empty the result
# is -1 and the loop below does not run.
nb_max_idx = max(
    (nb.max() for nb in (test_data['mark_nb'], test_data['code_nb']) if len(nb) > 0),
    default=-1,
)

for i in tqdm(range(nb_max_idx + 1)):

    # Boolean mask selecting the markdown entries tagged with notebook index i.
    m = test_data['mark_nb'] == i
    if not m.any():
        # No markdown entries for this index: nothing to predict.
        continue

    # Boolean mask selecting the code entries tagged with the same index.
    c = test_data['code_nb'] == i

    if c.any():
        # For every markdown embedding take the code embedding with the
        # highest cosine similarity and use that code entry's position.
        scores = pairwise_cosine_similarity(marks_embs[m], codes_embs[c])
        predicted.append(
            test_data['code_pos'][c][tf.argmax(scores, axis=1).numpy()]
        )
    else:
        # No code entries to match against: fall back to the markdown
        # entries' own positions.
        predicted.append(test_data['mark_pos'][m])

# NOTE(review): `embedder` (returned alongside `model`) is not deleted here,
# so it stays referenced — confirm whether that is intended.
del model, test_data, marks_dataset, codes_dataset, marks_embs, codes_embs

# np.concatenate raises on an empty list, so fall back to an empty float
# array when no notebook produced any prediction.
if predicted:
    match_ranks = np.concatenate(predicted) - MATCH_RANK_OFFSET
else:
    match_ranks = np.empty(0, dtype=float)

In [None]:
# Hand the test frame and both sets of ranks to the project's `submit`
# helper, with the blend coefficients and re-ranking flag spelled out.
submit(
    test_df,
    reg_ranks,
    match_ranks,
    rerank_match=True,
    reg_coef=1.2,
    match_coef=0.8,
)