<a href="https://colab.research.google.com/github/yonseimath/datascience-biginner-2022-kaggle-competitions/blob/feature%2Fyenakim/yenakim/AI4Code_TF_TPU_CodeBert_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
import glob
import os
from typing import List

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from tqdm.notebook import tqdm

In [None]:
# --- Sizes -------------------------------------------------------------------
# BATCH_SIZE and TOTAL_MAX_LEN are not referenced in the visible cells;
# presumably they are consumed by helper functions defined elsewhere.
BATCH_SIZE = 32
TOTAL_MAX_LEN = 512
# Maximum number of characters kept from each cell's source text.
MD_MAX_LEN = 64
# Number of chunks the markdown rows are split into at inference time.
SLICES = 8

# --- Paths -------------------------------------------------------------------
INPUT_PATH = "../input/AI4Code"
BASE_MODEL = "../input/codebert-base/codebert-base"

# --- Runtime objects ---------------------------------------------------------
# get_strategy() returns the currently active distribution strategy, i.e. the
# one whose scope() is in effect (the default strategy when none was entered).
STRATEGY = tf.distribute.get_strategy()
TOKENIZER = transformers.AutoTokenizer.from_pretrained(BASE_MODEL)

# Collect Data

In [None]:
# One JSON file per test notebook.
paths = glob.glob(os.path.join(INPUT_PATH, "test", "*.json"))

# Stack all notebooks into a single frame (read_notebook is defined elsewhere).
df = pd.concat(list(map(read_notebook, tqdm(paths, desc="Concat"))))

# Make "id" the outer index level, sort by it only (sort_remaining=False leaves
# the inner level untouched), then flatten the index back into columns.
df = df.set_index("id", append=True).swaplevel()
df = df.sort_index(level="id", sort_remaining=False).reset_index()

# Truncate every source string to its first MD_MAX_LEN characters.
df["source"] = df["source"].str[:MD_MAX_LEN]

# 0-based running position of each cell within its (notebook id, cell type) group.
df["rank"] = df.groupby(["id", "cell_type"]).cumcount()
# The same position expressed as a percentile within the group.
df["pct_rank"] = df.groupby(["id", "cell_type"])["rank"].rank(pct=True)

# Feature extraction is delegated to get_features (defined elsewhere).
fts = get_features(df)

# Run Inference

In [None]:
# Build the model and load the trained weights inside the strategy scope.
# get_model is defined elsewhere in the notebook.
with STRATEGY.scope():
    model = get_model()
    model.load_weights("../input/ai4code-codebert-weights/model_0.h5")

# Only markdown cells are scored by the model.
markdown_df = df[df["cell_type"] == "markdown"]

# Seed the list with an empty float32 array so the final result is a float32
# 1-D array even when there is nothing to predict, and so dtype promotion
# matches incremental concatenation onto a float32 accumulator.
predictions = [np.array([], dtype=np.float32)]

# Process the markdown rows in SLICES roughly equal, contiguous chunks so that
# only one chunk is tokenized at a time. Row *positions* are split instead of
# the DataFrame itself: calling np.array_split directly on a DataFrame goes
# through DataFrame.swapaxes, which is deprecated in recent pandas releases.
for positions in tqdm(
    np.array_split(np.arange(len(markdown_df)), SLICES), total=SLICES
):
    # With fewer rows than SLICES some chunks are empty; skip them.
    if positions.size == 0:
        continue

    chunk = markdown_df.iloc[positions]

    # tokenize / get_dataset are defined elsewhere; the visible code only shows
    # that `data` exposes "input_ids", "attention_mask" and "features".
    data = tokenize(chunk, fts)

    dataset = get_dataset(data["input_ids"], data["attention_mask"], data["features"])

    # Flatten the model output to 1-D.
    # NOTE(review): presumably one score per markdown row — the later
    # assignment of `predict` to the markdown rows relies on that.
    predictions.append(model.predict(dataset).reshape(-1))

# Single concatenation at the end instead of re-copying the accumulator on
# every iteration.
predict = np.concatenate(predictions)

# Save Submission

In [None]:
# Replace the rank of markdown cells with the model's predictions; all other
# rows keep the pct_rank they already have.
df.loc[df["cell_type"].eq("markdown"), "pct_rank"] = predict

# Order cells by rank, then collapse each notebook into one space-separated
# string of cell ids named "cell_order".
# NOTE: `df` is rebound here from the per-cell table to the per-notebook Series.
df = (
    df.sort_values("pct_rank")
    .groupby("id")["cell_id"]
    .apply(" ".join)
    .rename("cell_order")
)
df.to_csv("submission.csv")