In [None]:
# # Quick Save
# import os
# if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#     ! touch submission.csv
#     import sys
#     sys.exit(0)

In [None]:
# Install the llm_science_exam wheel from local Kaggle input directories.
# --no-index stops pip from consulting a package index; dependencies are looked up
# in the --find-links directory instead.
! pip install --quiet --use-deprecated=legacy-resolver --no-index /kaggle/input/llm-se-python-wheel/llm_science_exam-0.0.1-py3-none-any.whl --find-links /kaggle/input/llm-se-required-libs-python-wheels

In [None]:
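# NOTE(review): later cells read `checkpoint_path`, but no active definition is visible in this notebook — set it before running.
# checkpoint_path = "/kaggle/input/llm-se-deberta-v3-large-weights"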

In [None]:
import llm_science_exam.model.deberta
import llm_science_exam.data.config
import llm_science_exam.open_book_v2.tf_idf
from torch.utils.data import DataLoader
from datasets import Dataset
import tqdm
import numpy as np
import torch
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import gc
import ctypes


def clean_memory():
    """Release as much host and GPU memory as possible.

    Runs the Python garbage collector, asks glibc to trim its malloc arenas
    (``malloc_trim(0)`` on ``libc.so.6``), and then empties the PyTorch CUDA
    caching allocator.
    """
    gc.collect()
    libc = ctypes.CDLL("libc.so.6")
    libc.malloc_trim(0)
    torch.cuda.empty_cache()

# Turn on the project's memory-efficient forward method for the DeBERTa wrapper.
# NOTE(review): presumably this patches the model's forward pass globally — confirm in llm_science_exam.
llm_science_exam.model.deberta.custom_forward_method.enable_memory_efficient_forward_method()

In [None]:
# Build the run configuration from `checkpoint_path` through the project helper,
# passing drop_log_history=True. The bare `config` expression shows it as the cell output.
config = llm_science_exam.data.config.get_config_from_checkpoint(checkpoint_path, drop_log_history=True)
config

In [None]:
# Load data
#
import os

# Pick the split name from the environment: "test" when KAGGLE_IS_COMPETITION_RERUN
# is set to a non-empty value, "train" otherwise.
dataset_type = "test" if os.getenv('KAGGLE_IS_COMPETITION_RERUN') else "train"
#     
# df = pd.read_csv(f"/kaggle/input/kaggle-llm-science-exam/{dataset_type}.csv")
# 
# 
# # # 4000 validation
# # if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
# #     extra_df = pd.read_csv("/kaggle/input/llm-se-extra-train-datasets/takeshisuzuki/additional-dataset-800articles-4000rows/additional_dataset_800articles_4000rows.csv")
# #     extra_df = extra_df.dropna().reset_index(drop=True)
# # #     df = extra_df
# #     df = pd.concat([df, extra_df]).reset_index(drop=True)
# 
# # # +300 validation
# # if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
# #     extra_df = pd.read_csv("/kaggle/input/llm-se-extra-train-datasets/yalickj/dataset-wiki-new-1/dataset_wiki_new_1_balanced.csv")
# #     df = pd.concat([df, extra_df]).reset_index(drop=True)
# 
# 
# # if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
# #    df = pd.concat([df] * (4000 // len(df))).reset_index(drop=True)
# #    df["id"] = np.arange(len(df))
# 
# # if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
# #     for col in ["prompt", "A", "B", "C", "D", "E"]:
# #         for _ in range(10 - 1):
# #             df[col] = df[col] + df[col]
# 
# 
# id_list = df["id"].to_numpy()
# 
# if "answer" in df.columns:
#     answers = df["answer"].to_numpy()
# else:
#     answers = None
# 
# df

In [None]:
# !cp -r /kaggle/input/all-paraphs-parsed-expanded /kaggle/working/
# 
# df["context"] = llm_science_exam.open_book_v2.tf_idf.get_context(
#     df,
#     wiki_dataset_paths=[
#         "/kaggle/working/all-paraphs-parsed-expanded",
# #         "/kaggle/input/llm-se-additional-wiki-stem-articles"
#     ],
#     num_titles=3,
# #     num_titles=10,
# )
# 
# print(df.iloc[0])
# df.to_csv(f"{dataset_type}.csv", index=False)
# del df
# 
# clean_memory()

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# NOTE(review): `dataset_with_context_path` and `checkpoint_path` have no active
# definition in the visible cells — confirm they are set before this cell runs.
# Read the CSV as a Hugging Face dataset; the single file is registered under the "train" key.
dataset = load_dataset("csv", data_files={"train": dataset_with_context_path})["train"]

# tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
# use_fast=False requests the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, use_fast=False)
# Use the end-of-sequence token as the padding token for this tokenizer instance.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize through the project helper, without answer labels (with_answer=False)
# and with max_length=1024. The commented lines are earlier max_length settings.
tokenized_dataset = llm_science_exam.model.deberta.dataset.map_preprocess(
    dataset,
    tokenizer,
#         max_length=2 * 1024,
#         max_length=int(1.5 * 1024),
    with_answer=False,
    max_length=1024,
    num_proc=None,
)
    
# Drop the raw dataset and this tokenizer before materializing the DataFrame.
del dataset, tokenizer
clean_memory()

# Keep only the pandas copy of the tokenized rows; later cells read it as `df`.
df = tokenized_dataset.to_pandas()
del tokenized_dataset
clean_memory()

In [None]:
# import numpy as np
# probs = np.random.random((len(df), 5))
# probs /= np.sum(probs, axis=1, keepdims=True)

from tqdm.auto import tqdm

def run_model(device: int, df):
    """Run half-precision inference for the rows of ``df`` on one CUDA device.

    Args:
        device: CUDA device ordinal; the model is moved to ``cuda:{device}`` and
            the progress bar is drawn at position ``device + 1``.
        df: pandas DataFrame of preprocessed rows, wrapped with
            ``Dataset.from_pandas`` and fed to the model one row per batch.

    Returns:
        The per-batch softmax outputs (taken over the last logits axis),
        concatenated into a single NumPy array in input order.
    """
    # The checkpoint is loaded inside the function, so every call owns its own model.
    model, tokenizer = llm_science_exam.model.deberta.model.get_model_from_checkpoint(config["model"], checkpoint_path)
    model.to(f"cuda:{device}")
    model.half()
    model.eval()

    rows = Dataset.from_pandas(df)
    collator = llm_science_exam.model.deberta.dataset.DataCollatorForMultipleChoice(tokenizer=tokenizer)
    loader = DataLoader(rows, batch_size=1, shuffle=False, collate_fn=collator)

    collected = []
    progress = tqdm(loader, desc=f"inference on device cuda:{device}", position=device + 1)
    for batch in progress:
        # Move every tensor of the batch onto the model's device, in place.
        for name in batch.keys():
            batch[name] = batch[name].to(model.device)
        with torch.no_grad():
            outputs = model(**batch)
            batch_probs = torch.softmax(outputs.logits, dim=-1)
            collected.append(batch_probs.cpu().detach().numpy())
        # Free the batch tensors right away, before the next batch is processed.
        del batch, outputs
        clean_memory()
    return np.concatenate(collected)


# Run model
#
# Shard the tokenized rows over every visible CUDA device and run one inference
# thread per shard. Shards are contiguous and ordered, so concatenating the
# per-shard results restores the original row order.
num_devices = torch.cuda.device_count()
if num_devices == 0:
    raise RuntimeError("No CUDA device available for inference")

# Split row positions rather than the DataFrame itself: np.array_split on a
# DataFrame relies on the deprecated DataFrame.swapaxes. Empty groups (fewer rows
# than devices) are skipped so no worker receives an empty shard.
row_groups = np.array_split(np.arange(len(df)), num_devices)
shards = [df.iloc[rows[0]:rows[-1] + 1] for rows in row_groups if len(rows) > 0]

with ThreadPoolExecutor(max_workers=max(len(shards), 1)) as executor:
    # executor.map yields results in submission order, i.e. in shard order.
    probs = list(executor.map(run_model, range(len(shards)), shards))
probs = np.concatenate(probs)

In [None]:
# Drop the tokenized DataFrame and reclaim memory, then write the raw probability
# array to probs.csv without an index column.
del df
clean_memory()
pd.DataFrame(probs).to_csv("probs.csv", index=False)

In [None]:
# Save results
df = pd.read_csv(dataset_with_context_path)

n = len(df)
# Each question needs exactly one probability row. Without this check a mismatch
# would silently produce extra rows or missing predictions in the submission.
if len(probs) != n:
    raise ValueError(f"Got {len(probs)} probability rows for {n} questions")

# Rank all five options per question from most to least probable
# (row-wise equivalent of np.argsort(scores)[::-1]).
ranked_options = np.argsort(probs, axis=1)[:, ::-1]
option_letters = np.array(list('ABCDE'))
df['prediction'] = [' '.join(letters) for letters in option_letters[ranked_options]]
df[["id", 'prediction']].to_csv('submission.csv', index=False)

# Display performances if train set is used

def print_map_at_3(df):
    """Print top-1/2/3 hit counts, accuracy and MAP@3 for a prediction frame.

    Args:
        df: DataFrame with a ``prediction`` column of space-separated option
            letters (e.g. ``"A C B D E"``) and an ``answer`` column holding the
            correct letter. Any index is accepted, and the frame is not modified.

    Returns:
        None. Two summary lines are printed; for an empty frame a short notice
        is printed instead.
    """
    n = len(df)
    if n == 0:
        # Nothing to score; also avoids dividing by zero below.
        print("No rows to evaluate.")
        return

    predictions = df["prediction"]
    answers = df["answer"]
    # Letters sit at characters 0, 2 and 4 of the space-separated prediction string.
    top_i = [int((predictions.str[pos] == answers).sum()) for pos in (0, 2, 4)]

    # A hit at rank k contributes 1/k to the average precision of its question.
    map3 = 100 * (top_i[0] + top_i[1] / 2 + top_i[2] / 3) / n
    print(f'top1 : {top_i[0]}/{n}, top2 : {top_i[1]}/{n}, top3 : {top_i[2]}/{n} (total={sum(top_i)} / {n})')
    print(f'Accuracy: {100*top_i[0]/n:.1f}%, map3: {map3:.1f}%')

    
# Scores can only be reported when the loaded file carries ground-truth answers.
if 'answer' in df.columns:
    if len(df) <= 200:
        print("CV:")
    else:
        # With more than 200 rows, report the first 200 rows separately
        # before scoring the full frame.
        print("Old CV:")
        print_map_at_3(df.iloc[:200])

        print("\nNew CV:")
    print_map_at_3(df)