## 데이터셋 관련

- 링크 : https://huggingface.co/datasets/sean0042/KorMedMCQA
- `doctor`, `nurse`, `pharm` 세 개의 subset이 있으며, 각 subset은 train, dev, test split으로 이뤄져 있음.


## Format 예시

- Solar 예시: https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0
- IFT(Instruction Fine-Tuning) 시 사용하는 프롬프트 format
```
<s> ### User:
Hello?

### Assistant:
Hello, how can I assist you today? Please feel free to ask any questions or request help with a specific task.</s>
```

In [None]:
from datasets import load_dataset

In [None]:

# Hugging Face Hub repository id of the KorMedMCQA dataset.
data_path = "sean0042/KorMedMCQA"

# Load the three exam subsets in a fixed order; each result is later indexed
# by split name in merge_data().
doctor, nurse, pharmacist = (
    load_dataset(path=data_path, name=subset)
    for subset in ("doctor", "nurse", "pharm")
)

In [None]:
import pandas as pd

In [None]:
def merge_data(data_type):
    """Stack one split of the doctor, nurse and pharmacist subsets.

    Args:
        data_type: Key of the split to take from each subset; this notebook
            calls it with 'train', 'dev' and 'test'.

    Returns:
        One DataFrame with the doctor rows first, then the nurse rows, then
        the pharmacist rows. Row labels are kept as they are (the
        concatenation does not renumber them).
    """
    frames = [
        pd.DataFrame(subset[data_type])
        for subset in (doctor, nurse, pharmacist)
    ]
    return pd.concat(frames)

In [None]:
# Build one combined (doctor + nurse + pharmacist) frame per split.
train_df, val_df, test_df = (
    merge_data(split) for split in ("train", "dev", "test")
)

In [None]:
# Notebook preview of the merged training frame.
train_df

In [None]:
def refine_text(df):
    """Pair every question with the text of its correct choice.

    Rows are read positionally through ``itertuples()``, whose element 0 is
    the row label, so element k is the k-th column (1-based). Element 11 is
    parsed as an integer answer number, and element ``5 + number`` is taken
    as the answer text.
    NOTE(review): this presumes the layout "question in the 5th column,
    choices from the 6th column on, answer number in the 11th column" --
    confirm against the dataset schema.

    Args:
        df: DataFrame with a "question" column and at least 11 columns.

    Returns:
        A new DataFrame with only the "question" column plus an "output"
        column holding the selected answer text of each row. ``df`` itself
        is not modified.
    """
    answer_pos = 11  # tuple position of the answer number
    choice_base = 5  # the choice for answer number k sits at position 5 + k

    outputs = [
        row[choice_base + int(row[answer_pos])]
        for row in df.itertuples()
    ]
    return df["question"].to_frame().assign(output=outputs)
    

In [None]:
# Reduce every split to (question, output) pairs.
train_set, val_set, test_set = map(refine_text, (train_df, val_df, test_df))

In [None]:
# Notebook preview of the question/output pairs for the training split.
train_set

In [None]:
# Character-length statistics for the training questions and answers.
# The length columns are kept on a separate frame: writing them onto
# train_set would carry them through prepare_dataset() into the saved train
# dataset, leaving it with columns the validation dataset does not have.
length_stats = train_set.assign(
    question_length=train_set['question'].apply(len),
    output_length=train_set['output'].apply(len),
)
print(length_stats.describe())
length_stats

In [None]:
# Maximum number of tokens per example; tokenize() truncates to this length.
cutoff_len = 1024

In [None]:
# IPython magic: restore the variables previously saved with `%store`.
# NOTE(review): model_download_path is not defined in this notebook, so it
# presumably comes from here -- confirm which notebook stores it.
%store -r

In [None]:
# Display the model path to check that it is available in this session.
model_download_path

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer from model_download_path (a value that is not defined
# in this notebook); tokenize() below uses it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_download_path)

In [None]:
def tokenize(prompt, add_eos_token=True):
    """Tokenize one prompt and attach labels for language-model training.

    Uses the notebook-level ``tokenizer`` and ``cutoff_len``.

    Args:
        prompt: Full prompt text to encode.
        add_eos_token: When true, append the tokenizer's EOS id if the
            encoded sequence does not already end with it and is shorter
            than ``cutoff_len``.

    Returns:
        The tokenizer output with an extra "labels" entry that is a copy of
        "input_ids" (the whole sequence is used as the target).
    """
    encoded = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    token_ids = encoded["input_ids"]

    ends_with_eos = token_ids[-1] == tokenizer.eos_token_id
    has_room = len(token_ids) < cutoff_len
    if not ends_with_eos and has_room and add_eos_token:
        # Keep the attention mask the same length as the ids.
        token_ids.append(tokenizer.eos_token_id)
        encoded["attention_mask"].append(1)

    encoded["labels"] = token_ids.copy()
    return encoded

In [None]:
def prepare_dataset(dataset, seed=None):
    """Shuffle a question/answer frame and attach tokenized training columns.

    Each row is rendered into the "### User: / ### Assistant:" prompt
    template and passed to ``tokenize()``.

    Args:
        dataset: DataFrame whose first column is the question text and whose
            second column is the answer text (read by position).
        seed: Optional random state for the shuffle. The default ``None``
            keeps the shuffle unseeded; pass an int for a reproducible row
            order.

    Returns:
        A shuffled copy of ``dataset`` with a fresh 0..n-1 index and three
        added columns: "input_ids", "labels" and "attention_mask".
    """
    shuffled = dataset.sample(frac=1, random_state=seed).reset_index(drop=True)

    # The prompt ends directly with "</s>" (no space before it, no newline
    # after it), as in the format example at the top of this notebook. Any
    # text after "</s>" makes tokenize() append a second EOS, provided the
    # tokenizer maps the literal "</s>" to its EOS id.
    # NOTE(review): the template already spells out "<s>"; if the tokenizer
    # also prepends a BOS token by default, sequences start with two BOS
    # tokens -- confirm and drop one of them.
    encoded = [
        tokenize(f"<s> ### User:\n{question}\n\n### Assistant:\n{answer}</s>")
        for question, answer in zip(shuffled.iloc[:, 0], shuffled.iloc[:, 1])
    ]

    return shuffled.assign(
        input_ids=[item["input_ids"] for item in encoded],
        labels=[item["labels"] for item in encoded],
        attention_mask=[item["attention_mask"] for item in encoded],
    )
        
        

In [None]:
# Shuffle and tokenize the training pairs, then preview the result.
train_dataset = prepare_dataset(train_set)
train_dataset

In [None]:
# Shuffle and tokenize the validation pairs the same way.
val_dataset = prepare_dataset(val_set)

In [None]:
from datasets import Dataset

# Convert the tokenized pandas frames into Hugging Face Dataset objects.
hf_train, hf_val = map(Dataset.from_pandas, (train_dataset, val_dataset))

In [None]:
import os

# Write each split to its own sub-directory under "dataset".
for split_name, split_data in (("train", hf_train), ("val", hf_val)):
    split_data.save_to_disk(os.path.join("dataset", split_name))