### Initialization

In [None]:
import json
import itertools
from pathlib import Path
import numpy as np
import pandas as pd
from re import search
from typing import List, Tuple, Any, Union
import nltk
import torch
from datasets import Dataset
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, AutoTokenizer
from tqdm.auto import tqdm
import collections
from sklearn.utils import shuffle
import os

def load_json(file: str):
    """Read the file at ``file`` and return its decoded JSON content."""
    raw = Path(file).read_bytes()
    return json.loads(raw)

# Load data (read once; the frame is only used to build the splits below)
p_data = pd.read_csv("./dataset/train.csv").drop(["Unnamed: 6", "total no.: 7987"], axis=1)
split_ids = load_json('./dataset/splitIds__splitBy-id_stratifyBy-s_train-0.6_valid-0.2_test-0.2_seed-42.json')
# Select the rows of each split by id; the "test" split is named train2_data
# because it is merged into the training data right below.
train_data, valid_data, train2_data = [p_data[p_data.id.isin(split_ids[split])] for split in ["train", "valid", "test"]]

# Concat "train" and "test" split into new training data
train_data = pd.concat([train_data, train2_data], axis=0)
print(train_data.shape[0], valid_data.shape[0])

In [None]:
# These are the parameters of the best model
# The total batch size is 32 (8 * 2(accumulate) * 2(two gpus))
# Apart from "max_len", every value is also interpolated into the
# checkpoint / prediction path names used further down.
args = {
    "max_len" : 512,  # length each hand-built training input is padded/cut to
    "batch_size" : 8,  # per-device batch size for training and evaluation
    "model_name" : "janeel/muppet-roberta-base-finetuned-squad",  # used for both tokenizer and model
    "learning_rate" : 3e-5,
    "warmup_ratio" : 0.06,
    "seed" : 26,
    "split" : "6+22",  # only used as a tag in path names in this file
    "special" : "shuffle"  # only used as a tag in path names in this file
}

### Functions

In [None]:
def contains(small, big):
    """Find the first place where ``small`` occurs as a contiguous run in ``big``.

    Returns the inclusive ``(first_index, last_index)`` of that occurrence,
    or ``False`` when ``small`` does not occur in ``big``.
    """
    width = len(small)
    for start in range(len(big) - width + 1):
        if all(big[start + offset] == small[offset] for offset in range(width)):
            return start, start + width - 1
    return False

def keep_continuous(data: pd.DataFrame):
    """Return only the rows whose q' and r' are substrings of q and r.

    The first and last character of every field are cut off before the
    substring test. Rows are selected by position, so the original index
    labels are preserved.
    """
    kept_positions = []
    fields = zip(data["q"], data["q'"], data["r"], data["r'"])
    for position, (q_text, qp_text, r_text, rp_text) in enumerate(fields):
        q_span_found = qp_text[1:-1] in q_text[1:-1]
        r_span_found = rp_text[1:-1] in r_text[1:-1]
        if q_span_found and r_span_found:
            kept_positions.append(position)

    return data.iloc[kept_positions]

def format_data_qp(q: List[int], r: List[int], s: List[int], qp: List[int], rp: List[int]) -> Tuple[List[int], List[int], int, int]:
    """Build one Q' training feature laid out as ``[CLS] q [SEP] r [SEP] s [SEP]``.

    Args:
        q, r, s: token ids of the two passages and of the stance field,
            without special tokens.
        qp: token ids of the Q' answer span to locate.
        rp: unused; kept so both formatters share one signature.

    Returns:
        ``(input_ids, attention_mask, start_position, end_position)``.
        ``input_ids`` and ``attention_mask`` are exactly ``args['max_len']``
        long (padded with the pad id, or cut off if longer). Both positions
        are inclusive token indices into the unpadded sequence, or ``0, 0``
        when ``qp`` is not found.
    """
    max_len = args['max_len']
    q_r_s = [clsid] + q + [sepid] + r + [sepid] + s + [sepid]

    used = min(len(q_r_s), max_len)
    input_id = q_r_s[:max_len] + [padid] * (max_len - used)
    attention_mask = [1] * used + [0] * (max_len - used)

    # `contains` yields an inclusive (start, end) pair or False; search once.
    span = contains(qp, q_r_s)
    start_pos, end_pos = span if span else (0, 0)

    return input_id, attention_mask, start_pos, end_pos

def format_data_rp(q: List[int], r: List[int], s: List[int], qp: List[int], rp: List[int]) -> Tuple[List[int], List[int], int, int]:
    """Build one R' training feature laid out as ``[CLS] r [SEP] q [SEP] s [SEP]``.

    Args:
        q, r, s: token ids of the two passages and of the stance field,
            without special tokens.
        qp: unused; kept so both formatters share one signature.
        rp: token ids of the R' answer span to locate.

    Returns:
        ``(input_ids, attention_mask, start_position, end_position)``.
        ``input_ids`` and ``attention_mask`` are exactly ``args['max_len']``
        long (padded with the pad id, or cut off if longer). Both positions
        are inclusive token indices into the unpadded sequence, or ``0, 0``
        when ``rp`` is not found.
    """
    max_len = args['max_len']
    q_r_s = [clsid] + r + [sepid] + q + [sepid] + s + [sepid]

    used = min(len(q_r_s), max_len)
    input_id = q_r_s[:max_len] + [padid] * (max_len - used)
    attention_mask = [1] * used + [0] * (max_len - used)

    # `contains` yields an inclusive (start, end) pair or False; search once.
    span = contains(rp, q_r_s)
    start_pos, end_pos = span if span else (0, 0)

    return input_id, attention_mask, start_pos, end_pos

### Tokenizer

In [None]:
# Load the tokenizer published under the same name the model is loaded from,
# and keep its special-token ids at module level: the format_data_* helpers
# use them to assemble training inputs by hand.
tokenizer = AutoTokenizer.from_pretrained(args["model_name"])
clsid = tokenizer.cls_token_id
sepid = tokenizer.sep_token_id
padid = tokenizer.pad_token_id
def model_tokenize(text: str) -> List[int]:
    """Tokenize ``text`` (surrounding double quotes removed) into token ids.

    The first and last ids produced by the tokenizer (cls / sep) are dropped.
    """
    unquoted = text.strip('"')
    encoded = tokenizer(unquoted)
    return encoded["input_ids"][1:-1]

In [None]:
def preprocess(data: pd.DataFrame, choice: str):
    """Turn annotated rows into a tokenized span-extraction ``Dataset``.

    Steps: drop rows whose answer span is not a substring of its passage,
    tokenize every field, drop rows whose assembled input would not fit in
    ``args['max_len']`` tokens, then build the features with the formatter
    selected by ``choice``. Progress counts are printed along the way.

    Args:
        data: frame with the columns ``id``, ``q``, ``r``, ``s``, ``q'``, ``r'``.
        choice: ``'qp'`` to train on the Q' span, ``'rp'`` for the R' span.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask``,
        ``start_positions`` and ``end_positions``.

    Raises:
        ValueError: if ``choice`` is neither ``'qp'`` nor ``'rp'``.
    """
    formatters = {'qp': format_data_qp, 'rp': format_data_rp}
    if choice not in formatters:
        # Fail before the expensive tokenization instead of returning a
        # sentinel that would only break later inside the Trainer.
        raise ValueError(f"choice must be 'qp' or 'rp', got {choice!r}")

    # Only keep data with continuous span
    data = keep_continuous(data)
    print('ids left:', data['id'].nunique())
    print('instances left', data.shape[0])
    Q, R, S, QP, RP = [list(map(model_tokenize, data[field])) for field in ["q", "r", "s", "q'", "r'"]]

    # Only keep data whose assembled input fits: the formatters add one
    # [CLS] and three [SEP] tokens around q, r and s.
    n_special_tokens = 4
    keep = [
        i for i in range(len(Q))
        if len(Q[i]) + len(R[i]) + len(S[i]) + n_special_tokens <= args['max_len']
    ]
    print(f"Q+R+S longer than {args['max_len']} tokens:", len(Q) - len(keep), " Remains:", len(keep))
    Q, R, S, QP, RP = [[column[i] for i in keep] for column in (Q, R, S, QP, RP)]

    # Find start end positions; each feature is
    # (input_ids, attention_mask, start_position, end_position).
    features = list(map(formatters[choice], Q, R, S, QP, RP))

    # The key must be 'attention_mask' (the model's argument name); a column
    # under any other name is not passed to the model by the Trainer.
    ds = Dataset.from_dict({
        'input_ids': [feature[0] for feature in features],
        'attention_mask': [feature[1] for feature in features],
        'start_positions': [feature[2] for feature in features],
        'end_positions': [feature[3] for feature in features],
    })
    return ds

In [None]:
# Start preprocess
# Build the Q' (question-span) train / validation datasets.
train_data_qp_done=preprocess(train_data, 'qp')
valid_data_qp_done=preprocess(valid_data, 'qp')

# Build the R' (response-span) train / validation datasets from the same rows.
train_data_rp_done=preprocess(train_data, 'rp')
valid_data_rp_done=preprocess(valid_data, 'rp')

### Model / Collator / Trainer

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator

# Start training Q'
# A fresh copy of the pretrained QA model is loaded for this run.
model = AutoModelForQuestionAnswering.from_pretrained(args["model_name"])
model_args = TrainingArguments(
    # Output directory: the hyper-parameters are encoded in its name, and
    # getPredictFromCkpt later loads checkpoints from this same path.
    f'corrected_models/{args["model_name"]}-qp-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-{args["special"]}',
    evaluation_strategy = "epoch",
    learning_rate=args["learning_rate"],
    per_device_train_batch_size=args['batch_size'],
    per_device_eval_batch_size=args['batch_size'],
    warmup_ratio=args["warmup_ratio"],
    seed=args["seed"],
    num_train_epochs=6,
    weight_decay=0.01,
    gradient_accumulation_steps=2
)
# Inputs are already padded to a fixed length, so the default collator is used.
data_collator = default_data_collator
trainer = Trainer(
    model,
    model_args,
    train_dataset=train_data_qp_done,
    eval_dataset=valid_data_qp_done,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# Start training R'
# Same setup as the Q' run, with a freshly loaded model and the R' datasets.
model = AutoModelForQuestionAnswering.from_pretrained(args["model_name"])
model_args = TrainingArguments(
    # Output directory: identical naming scheme to the Q' run, tagged "-rp-".
    f'corrected_models/{args["model_name"]}-rp-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-{args["special"]}',
    evaluation_strategy = "epoch",
    learning_rate=args["learning_rate"],
    per_device_train_batch_size=args['batch_size'],
    per_device_eval_batch_size=args['batch_size'],
    warmup_ratio=args["warmup_ratio"],
    seed=args["seed"],
    num_train_epochs=6,
    weight_decay=0.01,
    gradient_accumulation_steps=2
)
# Inputs are already padded to a fixed length, so the default collator is used.
data_collator = default_data_collator
trainer = Trainer(
    model,
    model_args,
    train_dataset=train_data_rp_done,
    eval_dataset=valid_data_rp_done,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

### Predict Answer from model checkpoints

In [None]:
def leave_unique_id(data: pd.DataFrame):
    """Keep only the first row of every distinct ``id``, in original order.

    Prints the number of rows kept and the number of distinct ids seen,
    then returns the selected rows (original index labels preserved).
    """
    seen_ids = set()  # set membership keeps the scan linear in the row count
    first_positions = []
    for position, example_id in enumerate(data['id']):
        if example_id not in seen_ids:
            seen_ids.add(example_id)
            first_positions.append(position)
    print(len(first_positions), len(seen_ids))
    return data.iloc[first_positions]

def format_data_post_qp(q: str, r: str, s: str, ids: str):
    """Encode ``q</s>r</s>s`` for Q' inference and keep offsets for ``q`` only.

    The returned encoding carries ``example_id`` and an ``offset_mapping``
    in which every position outside the tokens of ``q`` (including
    position 0) is ``None``.
    """
    joined = '</s>'.join([q, r, s])
    encoded = tokenizer(joined, return_offsets_mapping=True, padding="max_length", max_length=512, truncation=True)

    # Token count of q alone, special tokens included.
    context_len = len(tokenizer(q)["input_ids"])

    encoded["example_id"] = ids
    masked_offsets = [
        pair if position < context_len - 1 else None
        for position, pair in enumerate(encoded["offset_mapping"])
    ]
    masked_offsets[0] = None
    encoded["offset_mapping"] = masked_offsets
    return encoded

def format_data_post_rp(q: str, r: str, s: str, ids: str):
    """Encode ``r</s>q</s>s`` for R' inference and keep offsets for ``r`` only.

    The returned encoding carries ``example_id`` and an ``offset_mapping``
    in which every position outside the tokens of ``r`` (including
    position 0) is ``None``.
    """
    joined = '</s>'.join([r, q, s])
    encoded = tokenizer(joined, return_offsets_mapping=True, padding="max_length", max_length=512, truncation=True)

    # Token count of r alone, special tokens included.
    context_len = len(tokenizer(r)["input_ids"])

    encoded["example_id"] = ids
    masked_offsets = [
        pair if position < context_len - 1 else None
        for position, pair in enumerate(encoded["offset_mapping"])
    ]
    masked_offsets[0] = None
    encoded["offset_mapping"] = masked_offsets
    return encoded
        
def postprocess(data: pd.DataFrame, choice: str):
    """Encode rows for inference and return them as a ``Dataset``.

    Args:
        data: frame with the columns ``id``, ``q``, ``r`` and ``s``.
        choice: ``'qp'`` to predict the Q' span, ``'rp'`` for the R' span.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask``,
        ``offset_mapping`` and ``example_id``.

    Raises:
        ValueError: if ``choice`` is neither ``'qp'`` nor ``'rp'``.
    """
    formatters = {'qp': format_data_post_qp, 'rp': format_data_post_rp}
    if choice not in formatters:
        raise ValueError(f"choice must be 'qp' or 'rp', got {choice!r}")

    ids = list(data.id)
    # Surrounding double quotes are stripped before tokenization, so the
    # character offsets refer to the unquoted text.
    Q, R, S = [[text.strip('"') for text in data[field]] for field in ["q", "r", "s"]]

    features = list(map(formatters[choice], Q, R, S, ids))

    ds = Dataset.from_dict({
        'input_ids': [feature["input_ids"] for feature in features],
        'attention_mask': [feature["attention_mask"] for feature in features],
        'offset_mapping': [feature["offset_mapping"] for feature in features],
        'example_id': [feature["example_id"] for feature in features],
    })
    return ds

# load model
def getPredictFromCkpt(ckpt: str, choice: str, test_post):
    """Load the ``choice`` model at checkpoint ``ckpt`` and predict on ``test_post``.

    Returns whatever ``Trainer.predict`` returns for the given dataset.
    """
    # Model weights and prediction output share the same checkpoint directory.
    ckpt_dir = f'corrected_models/{args["model_name"]}-{choice}-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-{args["special"]}/checkpoint-{ckpt}'
    model = AutoModelForQuestionAnswering.from_pretrained(ckpt_dir)
    test_args = TrainingArguments(
        output_dir=ckpt_dir,
        do_train=False,
        do_predict=True,
        per_device_eval_batch_size=args["batch_size"],
        gradient_accumulation_steps=2,
    )

    # init trainer
    predictor = Trainer(model=model, args=test_args)
    return predictor.predict(test_post)

# turn raw predictions (start/end span) to strings
def postprocess_qa_predictions(examples, features, raw_predictions, choice, n_best_size = 10, max_answer_length = 510):
    """Convert start/end logits into one predicted answer string per example.

    Args:
        examples: frame with ``id`` plus the context column (``q`` or ``r``);
            row ``i`` corresponds to ``features[i]`` and to row ``i`` of the logits.
        features: indexable of mappings exposing ``offset_mapping`` (a list in
            which positions outside the context are ``None``).
        raw_predictions: pair ``(all_start_logits, all_end_logits)``.
        choice: ``'qp'`` (context is ``q``) or ``'rp'`` (context is ``r``).
        n_best_size: how many top start and top end positions to combine.
        max_answer_length: longest allowed answer, in tokens.

    Returns:
        ``OrderedDict`` mapping each example id to its best answer text
        (``""`` when no valid span exists).

    Raises:
        ValueError: if ``choice`` is neither ``'qp'`` nor ``'rp'``.
    """
    context_columns = {'qp': 'q', 'rp': 'r'}
    if choice not in context_columns:
        raise ValueError(f"choice must be 'qp' or 'rp', got {choice!r}")
    context_column = context_columns[choice]

    all_start_logits, all_end_logits = raw_predictions
    predictions = collections.OrderedDict()
    # Let's loop over all the examples!
    for example_index in range(examples.shape[0]):
        example = examples.iloc[example_index]
        # Offsets were computed on text with surrounding quotes stripped via
        # strip('"') in `postprocess`, so the context is built the same way.
        context = example[context_column].strip('"')

        # We grab the predictions of the model for this feature.
        start_logits = all_start_logits[example_index]
        end_logits = all_end_logits[example_index]
        # Maps token positions back to character spans of the context.
        offset_mapping = features[example_index]["offset_mapping"]

        # Go through all possibilities for the `n_best_size` greater start and end logits.
        start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
        end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

        valid_answers = []
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                # to part of the input_ids that are not in the context.
                if (
                    start_index >= len(offset_mapping)
                    or end_index >= len(offset_mapping)
                    or offset_mapping[start_index] is None
                    or offset_mapping[end_index] is None
                ):
                    continue
                # Don't consider answers with a length that is either < 0 or > max_answer_length.
                if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                    continue

                start_char = offset_mapping[start_index][0]
                end_char = offset_mapping[end_index][1]
                valid_answers.append(
                    {
                        "score": start_logits[start_index] + end_logits[end_index],
                        "text": context[start_char: end_char]
                    }
                )

        if valid_answers:
            # max() returns the first of equally scored candidates, like a
            # stable descending sort would.
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        predictions[example["id"]] = best_answer["text"]

    return predictions

# make gold csv from valid/ test set data
def make_gold_csv(data, output_name):
    """Write ``data`` without its ``q``/``r``/``s`` columns to ``output_name``.

    Missing values become empty strings. No header row is written; the row
    index is written as the first column.
    """
    gold = data.drop(columns=['q', 'r', 's']).fillna('')
    gold.to_csv(output_name, header=False)

In [None]:
# Make the evaluation csv. The file name has to be the one the evaluation
# cell reads back as its gold answers ("split6+22.csv").
make_gold_csv(valid_data,"split6+22.csv")

In [None]:
# real_predict_test = 1, if we are going to predict the final submission test
# real_predict_test = 0, if we are only predicting for evaluation
real_predict_test = 0
if real_predict_test == 1:
    # Get test data for final submission
    valid_data = pd.read_csv("./dataset/test.csv")

# Leave only unique id
test_data_unique = leave_unique_id(valid_data)
# Start postprocessing
test_qp_post = postprocess(test_data_unique, 'qp')
test_rp_post = postprocess(test_data_unique, 'rp')

ckpts = list(range(400, 3200, 200))
if real_predict_test == 1:
    # Only one pass is needed; this value is just the tag used in the file name.
    ckpts = ['1600']

output_root = './outputs/corrected_models'
tempname = 'fb-muppet-roberta-base'
# Evaluation predictions go where the evaluation cell reads them from.
eval_output_dir = f'{output_root}/{tempname}/{args["split"]}-b_{args["batch_size"]}-seed_{args["seed"]}'
# The model name has the form "owner/name"; only the last component is used
# in the submission file name, which is what the NA-filling cell reads back.
model_basename = args["model_name"].split("/")[-1]

for ckpt in ckpts:
    if real_predict_test == 1:
        # Get raw predictions from our model (We found combining different checkpoints of q' models and r' models gives us a better result)
        # Checkpoint-1200 of q' and Checkpoint-2400 of r' claims the best result
        raw_predictions_qp = getPredictFromCkpt('1200', 'qp', test_qp_post)
        raw_predictions_rp = getPredictFromCkpt('2400', 'rp', test_rp_post)
    else:
        # Get raw predictions from our model (We use the same checkpoint for both q' r' when evaluating)
        raw_predictions_qp = getPredictFromCkpt(ckpt, 'qp', test_qp_post)
        raw_predictions_rp = getPredictFromCkpt(ckpt, 'rp', test_rp_post)

    # The Trainer hides the columns that are not used by the model, so we set them back
    test_qp_post.set_format(type=test_qp_post.format["type"], columns=list(test_qp_post.features.keys()))
    test_rp_post.set_format(type=test_rp_post.format["type"], columns=list(test_rp_post.features.keys()))

    # Get final predictions
    final_predictions_qp = postprocess_qa_predictions(test_data_unique, test_qp_post, raw_predictions_qp.predictions, 'qp')
    final_predictions_rp = postprocess_qa_predictions(test_data_unique, test_rp_post, raw_predictions_rp.predictions, 'rp')

    # Both mappings are built by iterating test_data_unique in the same
    # order, so their values line up row by row.
    ids = list(final_predictions_qp.keys())
    qp = list(final_predictions_qp.values())
    rp = list(final_predictions_rp.values())

    df = pd.DataFrame({'id': ids, "q'": qp, "r'": rp})
    df = df.replace(np.nan, '', regex=True)

    # Save the predictions as csv (no header row; the row index is still
    # written as the first column).
    if real_predict_test == 1:
        os.makedirs(output_root, exist_ok=True)
        df.to_csv(f'{output_root}/HALF_predict_test_{model_basename}-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-{args["special"]}-checkpoint-{ckpt}.csv', header=False)
    else:
        # Create the directory that is actually written to, if missing.
        os.makedirs(eval_output_dir, exist_ok=True)
        df.to_csv(f'{eval_output_dir}/checkpoint-{ckpt}.csv', header=False)
    print("Predictions to csv done.")

In [None]:
# Only run this block for predicting final submission
# Choose how to fill na if any, {0: Fill null, 1: Fill whole q or r sequence, 2: Fill with other csv predictions}
how_to_fill_na = 1

if real_predict_test == 1:
    pred_df = pd.read_csv(f'./outputs/corrected_models/HALF_predict_test_muppet-roberta-base-finetuned-squad-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-{args["special"]}-checkpoint-{ckpt}.csv', names=["id", "q'", "r'"], dtype=str)
    valid_data = pd.read_csv("./dataset/test.csv")

    print('Any nas?', pred_df.isna().sum().sum())
    if how_to_fill_na == 0:
        pred_df.fillna(value='', inplace=True)
    elif how_to_fill_na == 1:
        # Positions of the missing cells; column 0 is id, 1 is q', 2 is r'.
        na_rows, na_cols = np.where(pred_df.isna())
        print(na_rows, na_cols)
        for row_pos, col_pos in zip(na_rows, na_cols):
            matches = valid_data[valid_data['id'] == int(pred_df.iloc[row_pos, 0])]
            if matches.empty:
                continue
            source = matches.iloc[0]
            # Write with .iat: assigning through pred_df.iloc[row][col] is
            # chained assignment and is not guaranteed to reach pred_df.
            if col_pos == 1:
                pred_df.iat[row_pos, col_pos] = source["q"]
            elif col_pos == 2:
                pred_df.iat[row_pos, col_pos] = source["r"]
    elif how_to_fill_na == 2:
        helper_df = pd.read_csv('./outputs/corrected_models/REAL_predict_test_muppet-roberta-base-finetuned-squad-6+22-b_8-lr_3e-05-warm_0.06-seed_24-checkpoint-2000.csv', names=["id", "q", "r"], dtype=str).fillna('')
        # A file saved with a header row (as this cell does at its end) would
        # contribute a non-numeric "id" row; keep only rows with a numeric id.
        helper_df = helper_df[helper_df["id"].str.isdigit()]
        helper_ids = helper_df["id"].astype(int)
        na_rows, na_cols = np.where(pred_df.isna())
        if not helper_df.empty:
            print(helper_df.shape[0], helper_df.iloc[0]['id'])
        print(na_rows, na_cols)
        for row_pos, col_pos in zip(na_rows, na_cols):
            matches = helper_df[helper_ids == int(pred_df.iloc[row_pos, 0])]
            if matches.empty:
                continue
            getq = matches.iloc[0]["q"].replace('"','')
            getr = matches.iloc[0]["r"].replace('"','')
            if col_pos == 1 and getq != '':
                pred_df.iat[row_pos, col_pos] = getq
            elif col_pos == 2 and getr != '':
                pred_df.iat[row_pos, col_pos] = getr
    print('Any nas?', pred_df.isna().sum().sum())

    pred_df = pred_df.rename({"q'": 'q', "r'": 'r'}, axis=1)
    # Wrap every answer in exactly one pair of double quotes.
    for column in ['q', 'r']:
        pred_df[column] = pred_df[column].map(lambda s: '"' + str(s).strip('"') + '"')
    pred_df.to_csv(f'./outputs/corrected_models/REAL_predict_test_muppet-roberta-base-finetuned-squad-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-checkpoint-{ckpt}.csv', header=True, quotechar='"', index=False, encoding="utf-8")

### Evaluation

In [None]:
from pathlib import Path
import json
from typing import Dict
from typing import List, Tuple, Any, Union
from pathlib import Path
import pandas as pd
import nltk
from transformers import EvalPrediction

In [None]:
def nltk_tokenize(text: str, filter_puncts: bool = True) -> List[str]:
    """Split ``text`` into word tokens with NLTK.

    Surrounding double quotes are stripped first. When ``filter_puncts`` is
    true, tokens that consist of a single listed punctuation character (or
    a single space) are removed.
    """
    # The backslash is escaped explicitly: "\]" is an invalid escape sequence.
    punctuations = set("!\"#$%&'()*+, -./:;<=>?@[\\]^_`{|}~")
    text = text.strip('"') # NOTE: remove the quotes first
    tokens = nltk.tokenize.word_tokenize(text)
    if filter_puncts:
        tokens = [token for token in tokens if token not in punctuations]
    return tokens
    
def longestCommonSubsequence(text1: list, text2: list) -> int:
    """Return the length of the longest common subsequence of two sequences.

    Only two rows of the dynamic-programming table are kept at a time, and
    the shorter sequence is used for the row width.
    """
    if len(text2) > len(text1):
        longer, shorter = text2, text1
    else:
        longer, shorter = text1, text2

    previous = [0] * (len(shorter) + 1)
    for item in longer:
        current = [0]
        for col, other in enumerate(shorter, start=1):
            if item == other:
                current.append(previous[col - 1] + 1)
            else:
                current.append(max(previous[col], current[col - 1]))
        previous = current

    return previous[len(shorter)]

def compute_lcs_score(pred: list, ans: list) -> float:
    """Score ``pred`` against ``ans`` as ``LCS / (len(pred) + len(ans) - LCS)``.

    Returns ``0`` when both sequences are empty.

    Raises:
        ValueError: if the computed score falls outside ``[0, 1]``.
    """
    shared = longestCommonSubsequence(pred, ans)
    combined = len(pred) + len(ans) - shared
    if combined == 0:
        return 0

    ratio = shared / combined
    if ratio < 0 or ratio > 1:
        raise ValueError("LCS score must be between 0 and 1")
    return ratio

def compute_lcs_scores(pred_df: pd.DataFrame, ans_df: pd.DataFrame) -> pd.DataFrame:
    """Score every gold row against the prediction that shares its id.

    For each prediction row, every row of ``ans_df`` with the same ``id`` is
    scored separately for q' and r'. The ids collected in that order must
    equal ``ans_df.id`` exactly (checked with ``assert``).

    Returns a frame with the columns ``id``, ``qp_scores`` and ``rp_scores``.
    """
    records = []
    for _, pred_row in pred_df.iterrows():
        example_id = pred_row["id"]
        pred_q_tokens = nltk_tokenize(pred_row["q'"])
        pred_r_tokens = nltk_tokenize(pred_row["r'"])

        gold_rows = ans_df[ans_df.id == example_id]
        for _, gold_row in gold_rows.iterrows():
            gold_q_tokens = nltk_tokenize(gold_row["q'"])
            gold_r_tokens = nltk_tokenize(gold_row["r'"])
            q_score = compute_lcs_score(pred_q_tokens, gold_q_tokens)
            r_score = compute_lcs_score(pred_r_tokens, gold_r_tokens)
            records.append((example_id, q_score, r_score))

    ids = [record[0] for record in records]
    assert ids == ans_df.id.tolist()
    return pd.DataFrame(data={
        "id": ids,
        "qp_scores": [record[1] for record in records],
        "rp_scores": [record[2] for record in records],
    })

def compute_final_score(lcs_df: pd.DataFrame) -> float:
    """Average, over ids, the best ``qp + rp`` score of each id, scaled to [0, 1].

    Adds a ``total_scores`` column to ``lcs_df`` in place.

    Raises:
        ValueError: if the resulting score is below 0 or above 1.
    """
    lcs_df["total_scores"] = lcs_df["qp_scores"] + lcs_df["rp_scores"]
    best_per_id = lcs_df.groupby("id")["total_scores"].max()
    # Each id can contribute at most 2 (one point per field).
    id_count = len(best_per_id)
    final_score = best_per_id.sum() / (2 * id_count)
    if final_score < 0 or final_score > 1:
        raise ValueError("The final score must be between 0 and 1, please check the implementation.")
    return final_score

In [None]:
how_to_fill_nas = [0, 1]
# Change ckpts if you need to evaluate different ckpts
# ckpts = list(range(200, 2600, 200))

# The gold answers do not depend on the checkpoint, so they are read once.
ans_df = pd.read_csv("split6+22.csv", names=["id", "q'", "r'"], dtype=str)

# Run both fill_na with null and whole sequence of qr
for how_to_fill_na in how_to_fill_nas:
    for ckpt in ckpts:
        pred_df = pd.read_csv(f'./outputs/corrected_models/{tempname}/{args["split"]}-b_{args["batch_size"]}-seed_{args["seed"]}/checkpoint-{ckpt}.csv', names=["id", "q'", "r'"], dtype=str)

        print('Any nas?', pred_df.isna().sum().sum())
        if how_to_fill_na == 0:
            pred_df.fillna(value='', inplace=True)
        elif how_to_fill_na == 1:
            # Positions of the missing cells; column 0 is id, 1 is q', 2 is r'.
            na_rows, na_cols = np.where(pred_df.isna())
            for row_pos, col_pos in zip(na_rows, na_cols):
                matches = valid_data[valid_data['id'] == int(pred_df.iloc[row_pos, 0])]
                if matches.empty:
                    continue
                source = matches.iloc[0]
                # Write with .iat: assigning through pred_df.iloc[row][col] is
                # chained assignment and is not guaranteed to reach pred_df.
                if col_pos == 1:
                    pred_df.iat[row_pos, col_pos] = source["q"]
                elif col_pos == 2:
                    pred_df.iat[row_pos, col_pos] = source["r"]

        # Check whether pred_df has the same nums of evaluation ids
        if len(pred_df) != len(ans_df.groupby("id").size()):
            raise ValueError("The prediction file must have the same number of rows as the number of unique IDs in the answer file")

        # Start evaluating
        lcs_df = compute_lcs_scores(pred_df, ans_df) # has len(ans_df) rows of lcs_q' and lcs_r'
        final_score = compute_final_score(lcs_df) # derive the final score by "1/2N (\sum_i^N(max_j(score_q' + score_r')))"
        # Print score
        print(f'# {tempname}-{args["split"]}-ckpt-{ckpt} final score: {final_score}')
    print('# ============')

### Model combination test

In [None]:
import pandas as pd
from utils import *

# Frame used as the gold answers by the scoring calls in the next cell.
# It is read with pandas' default header handling.
ans_df = pd.read_csv("/nfs/nas-6.1/wclu/AICUP/full_valid.csv")
# Best q-only and r-only scores seen so far across checkpoints.
max_q = 0
max_r = 0

In [None]:
path = "/nfs/nas-6.1/wclu/corrected_models/janeel/muppet-roberta-base-finetuned-squad-62+2-b_8-seed_26"
best_q = best_r = None
for step in range(200, 3001, 200):
    try:
        pred_df = pd.read_csv(f"{path}/checkpoint-{step}.csv", names=['id', "q'", "r'"]).fillna("")
    except FileNotFoundError:
        # Steps without a prediction file are skipped; any other failure is
        # allowed to surface instead of being silently ignored.
        continue

    q = pred_df[['id', "q'"]]
    r = pred_df[['id', "r'"]]

    # Score each field on its own by pairing it with an empty counterpart;
    # the blank column always has as many rows as the prediction file.
    q_frame = q.assign(**{"r'": ""}).reset_index(drop=True)
    r_frame = r.assign(**{"q'": ""}).reset_index(drop=True)

    lcs = compute_lcs_scores(q_frame, ans_df)
    score_q = compute_final_score(lcs)

    lcs = compute_lcs_scores(r_frame, ans_df)
    score_r = compute_final_score(lcs)

    # ">=" means the latest checkpoint wins a tie.
    if score_q >= max_q:
        best_q = q
        step_q = step
        q_path = path
        max_q = score_q
    if score_r >= max_r:
        best_r = r
        step_r = step
        r_path = path
        max_r = score_r
    print(f"ckpt{step} q_score: {score_q} r_score: {score_r}")

if best_q is None or best_r is None:
    raise RuntimeError(f"no usable checkpoint predictions found under {path}")

# Combine the best q' predictions with the best r' predictions and score them together.
best_pred = pd.concat([best_q, best_r["r'"]], axis=1).reset_index(drop=True)
lcs = compute_lcs_scores(best_pred, ans_df)
score = compute_final_score(lcs)
print(f"model for q: {q_path}_{step_q}, score: {max_q}")
print(f"model for r: {r_path}_{step_r}, score: {max_r}")
print(score)