In [None]:
import os
import sys
import warnings
import transformers

In [None]:
import random
import joblib
import sklearn
import numpy as np
import pandas as pd

In [None]:
# Reproducibility: seed the global RNGs of the standard library and NumPy.
np.random.seed(0)
random.seed(0)

# Put the competition helper sources first on the module search path so the
# `dataset`, `loader` and `extractor` imports in the following cells resolve.
sys.path.insert(0, '../input/ai4code-source')

# Keep cell output quiet: ignore Python warnings, set the logging / tokenizer
# environment switches (presumably read by TensorFlow and the Hugging Face
# tokenizers backend — confirm), and restrict transformers logging to errors.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'TOKENIZERS_PARALLELISM': 'true',
})
warnings.filterwarnings('ignore')
transformers.utils.logging.set_verbosity_error()

In [None]:
from dataset import save_size
from dataset import save_triplets
from dataset import get_match_input

In [None]:
from loader import load_notebooks
from loader import train_test_split
from loader import get_train_pct_ranks

In [None]:
from extractor import extract_reg_data
from extractor import extract_match_data

In [None]:
from transformers import AutoTokenizer
from transformers import TFAutoModel

In [None]:
# Create the four staging folders that later cells fill and upload to Kaggle.
# exist_ok=True keeps the cell safe to re-run.
for staging_dir in ('/tmp/data', '/tmp/tfrecs', '/tmp/models', '/tmp/tokenizers'):
    os.makedirs(staging_dir, exist_ok=True)

In [None]:
# Load the raw competition inputs.
# NOTE(review): 150000 is passed positionally to load_notebooks; presumably a
# cap on the number of notebooks read — confirm against loader.py.
df = load_notebooks('../input/AI4Code/train', 150000)

# Cell-order and notebook-ancestry tables, read in that order.
orders_df, ancestors_df = map(
    pd.read_csv,
    (
        '../input/AI4Code/train_orders.csv',
        '../input/AI4Code/train_ancestors.csv',
    ),
)

In [None]:
# Attach the target column: whatever get_train_pct_ranks derives from the
# notebooks frame and the ground-truth orders table is stored as `pct_rank`.
# NOTE(review): presumably each cell's relative position (0..1) within its
# notebook — confirm against loader.py. Mutates `df` in place.
df['pct_rank'] = get_train_pct_ranks(df, orders_df)

In [None]:
# Split into training and validation frames using the project's own
# train_test_split (imported from `loader`, not scikit-learn's).
# NOTE(review): 0.05 and 0 are positional; presumably the validation fraction
# and the random seed, with ancestors_df used to keep related notebooks on the
# same side of the split — confirm against loader.py.
train_df, valid_df = train_test_split(df, ancestors_df, 0.05, 0)

In [None]:
# Order both splits by notebook id and then by pct_rank, so the rows of each
# notebook are contiguous and follow their ranked order.
sorted_train_df, sorted_valid_df = (
    split.sort_values(['id', 'pct_rank']) for split in (train_df, valid_df)
)

In [None]:
# Persist the sorted validation frame for later evaluation. The index left
# over from sorting is discarded (drop=True) so the file carries a fresh
# 0..n-1 index.
sorted_valid_df.reset_index(drop=True).to_feather('/tmp/data/valid.ftr')

In [None]:
# CodeBERT: build the regression-model inputs and stage the pretrained
# tokenizer and TensorFlow weights under /tmp for upload.
name = 'microsoft/codebert-base'
tokenizer = AutoTokenizer.from_pretrained(name)
model = TFAutoModel.from_pretrained(name)

# Tokenize both splits with identical settings.
# NOTE(review): 72, 22 and 512 are forwarded positionally to extract_reg_data;
# presumably per-cell / context / total token budgets — confirm against
# extractor.py before changing them.
train_data = extract_reg_data(sorted_train_df, tokenizer, 72, 22, 512)
valid_data = extract_reg_data(sorted_valid_df, tokenizer, 72, 22, 512)

# Each result is unpacked with ** into np.savez_compressed, i.e. it is a
# mapping of array name -> array, stored as one compressed .npz per split.
np.savez_compressed('/tmp/data/train_reg.npz', **train_data)
np.savez_compressed('/tmp/data/valid_reg.npz', **valid_data)

# Keep local copies of the tokenizer and model next to the data.
tokenizer.save_pretrained('/tmp/tokenizers/codebert')
model.save_pretrained('/tmp/models/codebert')

In [None]:
# UniXcoder: build the matching-model inputs and stage the pretrained
# tokenizer and TensorFlow weights under /tmp for upload.
name = 'microsoft/unixcoder-base'
tokenizer = AutoTokenizer.from_pretrained(name)
# from_pt=True asks transformers to build the TF model from the checkpoint's
# PyTorch weights.
model = TFAutoModel.from_pretrained(name, from_pt=True)

# Tokenize both splits with identical settings.
# NOTE(review): 128 and 7 are forwarded positionally to extract_match_data;
# presumably a max token length and a per-sample candidate count — confirm
# against extractor.py before changing them.
train_data = extract_match_data(sorted_train_df, tokenizer, 128, 7)
valid_data = extract_match_data(sorted_valid_df, tokenizer, 128, 7)

# Each result is unpacked with ** into np.savez_compressed, i.e. it is a
# mapping of array name -> array, stored as one compressed .npz per split.
np.savez_compressed('/tmp/data/train_match.npz', **train_data)
np.savez_compressed('/tmp/data/valid_match.npz', **valid_data)

# Keep local copies of the tokenizer and model next to the data.
tokenizer.save_pretrained('/tmp/tokenizers/unixcoder')
model.save_pretrained('/tmp/models/unixcoder')

In [None]:
# Shard the matching-model data into 80 TFRecord files.
# The full frame is shuffled (fixed seed), then split with GroupKFold using
# each row's ancestor_id as the group label, so rows sharing an ancestor are
# never spread over two shards.
df = sklearn.utils.shuffle(df, random_state=0)
ancestor_by_id = ancestors_df.set_index('id')['ancestor_id'].to_dict()
ancestors = df['id'].map(ancestor_by_id)
splitter = sklearn.model_selection.GroupKFold(80)
folds = splitter.split(df, groups=ancestors)
tokenizer = AutoTokenizer.from_pretrained('microsoft/unixcoder-base')

# Only the second (held-out) index array of every split is used: those rows
# form shard number i. The per-shard sample counts are collected so the total
# can be written alongside the shards.
shard_sizes = []
for i, (_, idx) in enumerate(folds):
    data = extract_match_data(df.iloc[idx], tokenizer, 128, 7)
    save_triplets(get_match_input(data), f'/tmp/tfrecs/m{i:02}.tfrec')
    shard_sizes.append(len(data['anc_idx']))

total_num = sum(shard_sizes)
save_size(total_num, '/tmp/tfrecs/size.txt')

In [None]:
!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Write the credentials file used by the `kaggle` CLI calls at the end of the
# notebook. Username and key are taken from the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables when they are set, so the real API key never has to
# be pasted into (and saved with) the notebook; when they are absent the
# previous literal values are written unchanged.
import json

kaggle_credentials = {
    'username': os.environ.get('KAGGLE_USERNAME', 'valentinaliferov'),
    'key': os.environ.get('KAGGLE_KEY', '<key>'),
}

# Opening with 'w' truncates an existing file in place, so a file that was
# pre-created with restricted permissions keeps them.
with open(os.path.expanduser('~/.kaggle/kaggle.json'), 'w') as credentials_file:
    json.dump(kaggle_credentials, credentials_file, separators=(',', ':'))

In [None]:
%%writefile /tmp/data/dataset-metadata.json
{"title":"ai4code_data","id":"valentinaliferov/ai4code-data","licenses":[{"name":"CC0-1.0"}]}

In [None]:
%%writefile /tmp/tfrecs/dataset-metadata.json
{"title":"ai4code_tfrecs","id":"valentinaliferov/ai4code-tfrecs","licenses":[{"name":"CC0-1.0"}]}

In [None]:
%%writefile /tmp/models/dataset-metadata.json
{"title":"ai4code_models","id":"valentinaliferov/ai4code-models","licenses":[{"name":"CC0-1.0"}]}

In [None]:
%%writefile /tmp/tokenizers/dataset-metadata.json
{"title":"ai4code_tokenizers","id":"valentinaliferov/ai4code-tokenizers","licenses":[{"name":"CC0-1.0"}]}

In [None]:
!kaggle datasets version -r zip -p /tmp/data -m 'update'
!kaggle datasets version -r zip -p /tmp/tfrecs -m 'update'
!kaggle datasets version -r zip -p /tmp/models -m 'update'
!kaggle datasets version -r zip -p /tmp/tokenizers -m 'update'