In [1]:
import warnings
warnings.filterwarnings('ignore')

## 1. Data 준비

In [2]:
%%capture
! pip install datasets

In [3]:
from datasets import load_dataset

klue_mrc_train = load_dataset('klue', 'mrc', split='train')
klue_mrc_test = load_dataset('klue', 'mrc', split='validation')

df_train = klue_mrc_train.to_pandas()
df_test = klue_mrc_test.to_pandas()

df_train = df_train[['title', 'question', 'context']]
df_test = df_test[['title', 'question', 'context']]

README.md:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17554 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5841 [00:00<?, ? examples/s]

In [4]:
def add_ir_context(df):
  irrelevant_contexts = []
  for idx, row in df.iterrows():
    title = row['title']
    irrelevant_contexts.append(
        df.query(f"title != '{title}'").sample(1)['context'].values[0]
    )

  df['irrelevant_context'] = irrelevant_contexts
  return df

df_train_ir = add_ir_context(df_train)
df_test_ir = add_ir_context(df_test)

In [5]:
from sentence_transformers import InputExample

examples = []
for idx, row in df_train_ir.iterrows():
  examples.append(
      InputExample(texts=[row['question'], row['context']], label=1)
  )
  examples.append(
      InputExample(texts=[row['question'], row['irrelevant_context']], label=0)
  )

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

## 2. Cross Encoder

In [6]:
from sentence_transformers.cross_encoder import CrossEncoder

cross_model = CrossEncoder('klue/roberta-small', num_labels=1)

config.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [7]:
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator

ce_evaluator = CEBinaryClassificationEvaluator.from_input_examples(examples)
ce_evaluator(cross_model)

0.49614343030035046

In [8]:
train_samples = []
for idx, row in df_train_ir.iterrows():
  train_samples.append(
      InputExample(texts=[row['question'], row['context']], label=1)
  )
  train_samples.append(
      InputExample(texts=[row['question'], row['irrelevant_context']], label=0)
  )

In [9]:
from torch.utils.data import DataLoader

train_batch_size = 16
num_epochs = 1
model_save_path = 'output/training_mrc'

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

cross_model.fit(
    train_dataloader=train_dataloader,
    epochs=num_epochs,
    warmup_steps=100,
    output_path=model_save_path,
    show_progress_bar=True
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2195 [00:00<?, ?it/s]

In [10]:
ce_evaluator(cross_model)

0.9996966708962942

In [11]:
cross_model.save(model_save_path)

In [12]:
from huggingface_hub import login, HfApi

login(token='hf_...')
repo_id = 'whatwant/klue-roberta-small-cross-encoder'

api = HfApi()
api.create_repo(repo_id=repo_id)

api.upload_folder(
    folder_path=model_save_path,
    repo_id=repo_id,
    repo_type='model'
)

model.safetensors:   0%|          | 0.00/272M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/whatwant/klue-roberta-small-cross-encoder/commit/959ad295289b4fdd9a0a56427ac0fd9a8205a527', commit_message='Upload folder using huggingface_hub', commit_description='', oid='959ad295289b4fdd9a0a56427ac0fd9a8205a527', pr_url=None, repo_url=RepoUrl('https://huggingface.co/whatwant/klue-roberta-small-cross-encoder', endpoint='https://huggingface.co', repo_type='model', repo_id='whatwant/klue-roberta-small-cross-encoder'), pr_revision=None, pr_num=None)