# Settings

In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB

In [None]:
!pip install -q transformers datasets folium==0.2.1 apex

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.0/70.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.8/75.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?2

In [None]:
import os
import time
import json
import shutil
import argparse
import numpy as np
from tqdm import tqdm
import random
import pandas as pd
from numpy import dot
from numpy.linalg import norm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import DataLoader

from transformers import AutoModel, AutoTokenizer, AutoConfig, BertPreTrainedModel, BertModel, BertConfig, BertTokenizer, BertTokenizerFast
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

from dataset import SelectionDataset
from transform import SelectionSequentialTransform, SelectionJoinTransform, SelectionConcatTransform
from encoder import PolyEncoder, BiEncoder, CrossEncoder

from sklearn.metrics import label_ranking_average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import logging

In [None]:
# Experiment configuration. Kept as a plain dict here; key order is preserved.
args = dict(
    # Model selection and I/O locations
    bert_model="klue/bert-base",
    eval=False,
    model_type="bert",
    output_dir="/content/drive/MyDrive/",
    train_dir="/content/drive/MyDrive/",

    use_pretrain=True,
    architecture="poly",

    # Sequence lengths, batch sizes and logging cadence
    max_contexts_length=128,
    max_response_length=32,
    train_batch_size=32,
    eval_batch_size=32,
    print_freq=100,

    # Number of poly codes
    poly_m=16,

    # Optimizer / scheduler hyper-parameters
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,

    # Run control
    num_train_epochs=10.0,
    seed=12345,
    gradient_accumulation_steps=1,
    fp16=False,
    fp16_opt_level="O1",
    gpu=0,
)

In [None]:
# Wrap the plain settings dict in an EasyDict so later cells can read the
# settings as attributes (e.g. args.seed, args.gpu).
from easydict import EasyDict as edict
args = edict(args)

In [None]:
# Set the root logging level to ERROR.
logging.basicConfig(level=logging.ERROR)

def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch with ``args.seed``.

    Args:
        args: Any object exposing an integer ``seed`` attribute.
    """
    seed = args.seed
    # Same order as before: stdlib random, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

os.environ["CUDA_VISIBLE_DEVICES"] = "%d" % args.gpu  # expose only the GPU index given by args.gpu
set_seed(args)  # seed all random number generators via set_seed


# 데이터셋 및 모델 준비

In [None]:
## Initialise the tokenizer, the input transforms and the datasets
model_name = args.bert_model
tokenizer = BertTokenizer.from_pretrained(model_name)  # BERT tokenizer

# Transforms that turn raw contexts / responses into model inputs
context_transform = SelectionJoinTransform(tokenizer=tokenizer, max_len=args.max_contexts_length)
response_transform = SelectionSequentialTransform(tokenizer=tokenizer, max_len=args.max_response_length)
concat_transform = SelectionConcatTransform(
    tokenizer=tokenizer,
    max_len=args.max_response_length + args.max_contexts_length,
)

banner = '=' * 80
print(banner)
print('Train dir:', args.train_dir)
print('Output dir:', args.output_dir)
print(banner)


def _load_split(pickle_name):
    """Build a SelectionDataset from ``pickle_name`` located inside ``args.train_dir``."""
    return SelectionDataset(
        os.path.join(args.train_dir, pickle_name),
        context_transform,
        response_transform,
        concat_transform,
        sample_cnt=None,
        mode=args.architecture,
    )


if args.eval:
    # Test mode: only the held-out test split is needed.
    val_dataset = _load_split('test.pickle')
else:
    # Training mode: training split plus validation split.
    train_dataset = _load_split('train.pickle')
    val_dataset = _load_split('val.pickle')
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        collate_fn=train_dataset.batchify_join_str,
        shuffle=True,
        num_workers=0,
    )
    # Total number of optimizer steps over the whole run (used by the LR scheduler).
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

val_dataloader = DataLoader(
    val_dataset,
    batch_size=args.eval_batch_size,
    collate_fn=val_dataset.batchify_join_str,
    shuffle=False,
    num_workers=0,
)


In [None]:
# Bookkeeping state needed while training the model.

epoch_start, global_step = 1, 0
best_eval_loss = best_test_loss = float('inf')

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

# Append-mode log file shared by the following cells; the settings are recorded first.
log_path = os.path.join(args.output_dir, 'log.txt')
log_wf = open(log_path, 'a', encoding='utf-8')
log_wf.write(str(args) + '\n')

# Checkpoint file name encodes the architecture and the number of poly codes.
state_save_path = os.path.join(
    args.output_dir,
    f'{args.architecture}_{args.poly_m}_pytorch_model.bin',
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
########################################
## Build the BERT encoder
########################################

# Load the configuration of the pretrained checkpoint named by model_name.
bert_config = BertConfig.from_pretrained(model_name)

# Load the pretrained BERT weights using that configuration.
bert = BertModel.from_pretrained(model_name, config=bert_config)

# NOTE(review): a PolyEncoder is built regardless of args.architecture, while the
# train/eval code also has a 'cross' branch — presumably that needs CrossEncoder; confirm.
model = PolyEncoder(bert_config, bert=bert, poly_m=args.poly_m)

# Resize the token embeddings to the tokenizer's vocabulary size, then move the model to `device`.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# 학습 준비

### 모델 평가 함수 정의

In [None]:
def _recall_hits(logits, k):
    """Count rows of ``logits`` whose candidate at index 0 is among the top ``k`` scores."""
    # Clamp k so batches with fewer than k candidates do not make topk raise.
    k = min(k, logits.size(-1))
    top_indices = torch.topk(logits, k)[1]
    return (top_indices == 0).sum().item()


def eval_running_model(dataloader, test=False):
    """
    Evaluate the global ``model`` on ``dataloader`` and return the metrics.

    The function reads the module-level ``model``, ``device`` and ``args``. When
    ``test`` is False it additionally reads the training-loop globals ``tr_loss``,
    ``nb_tr_steps``, ``epoch`` and ``global_step``, so it must then be called from
    inside the training loop after at least one optimizer step.

    NOTE(review): the recall/MRR metrics treat candidate index 0 as the correct
    response, while the loss uses ``argmax(labels_batch)`` — this presumably agrees
    with how the dataset orders candidates; confirm.

    Args:
        dataloader (DataLoader): DataLoader yielding evaluation batches.
        test (bool, optional): Test mode flag. Defaults to False.

    Returns:
        dict: Evaluation results.
            - 'train_loss' (float): running training loss (only when ``test`` is False).
            - 'eval_loss' (float): mean evaluation loss over batches.
            - 'R1', 'R2', 'R5', 'R10' (float): recall@k.
            - 'MRR' (float): mean of the per-example label ranking average precision
              (the reciprocal rank when exactly one candidate is relevant).
            - 'epoch' (int): current epoch (only when ``test`` is False).
            - 'global_step' (int): current global step (only when ``test`` is False).

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    eval_loss = 0
    nb_eval_steps, nb_eval_examples = 0, 0
    r10 = r2 = r1 = r5 = 0
    mrr = []
    for step, batch in enumerate(dataloader):
        batch = tuple(t.to(device) for t in batch)

        if args.architecture == 'cross':
            text_token_ids_list_batch, text_input_masks_list_batch, text_segment_ids_list_batch, labels_batch = batch
            with torch.no_grad():
                logits = model(text_token_ids_list_batch, text_input_masks_list_batch, text_segment_ids_list_batch)
                loss = F.cross_entropy(logits, torch.argmax(labels_batch, 1))

        else:
            context_token_ids_list_batch, context_input_masks_list_batch, \
            response_token_ids_list_batch, response_input_masks_list_batch, labels_batch = batch

            with torch.no_grad():
                logits = model(context_token_ids_list_batch, context_input_masks_list_batch,
                               response_token_ids_list_batch, response_input_masks_list_batch)
                loss = F.cross_entropy(logits, torch.argmax(labels_batch, 1))

        # Recall@k: number of examples whose index-0 candidate is ranked in the top k.
        r1 += (logits.argmax(-1) == 0).sum().item()
        r2 += _recall_hits(logits, 2)
        r5 += _recall_hits(logits, 5)
        r10 += _recall_hits(logits, 10)

        # MRR: one score per example, with candidate 0 marked as the relevant one.
        logits = logits.data.cpu().numpy()
        for logit in logits:
            y_true = np.zeros(len(logit))
            y_true[0] = 1
            mrr.append(label_ranking_average_precision_score([y_true], [logit]))

        eval_loss += loss.item()
        nb_eval_examples += labels_batch.size(0)
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        raise ValueError("eval_running_model received an empty dataloader")

    metrics = {
        'eval_loss': eval_loss / nb_eval_steps,
        'R1': r1 / nb_eval_examples,
        'R2': r2 / nb_eval_examples,
        'R5': r5 / nb_eval_examples,
        'R10': r10 / nb_eval_examples,
        'MRR': np.mean(mrr),
    }

    if test:
        return metrics

    # Training-time report: same key order as before (train_loss first, step info last).
    return {
        'train_loss': tr_loss / nb_tr_steps,
        **metrics,
        'epoch': epoch,
        'global_step': global_step,
    }

### 학습 및 평가시 파라미터 정의

In [None]:
if args.eval:
    print('Loading parameters from', state_save_path)
    # map_location lets a checkpoint that was saved on GPU be loaded on whatever
    # device this runtime has (e.g. a CPU-only session).
    model.load_state_dict(torch.load(state_save_path, map_location=device))  # load saved parameters
    test_result = eval_running_model(val_dataloader, test=True)  # evaluate in test mode
    print(test_result)  # show the evaluation result
    # Stop here: the optimizer/scheduler setup below needs t_total, which is only
    # defined when not in eval mode.
    exit()

# Parameters whose names contain these substrings are excluded from weight decay.
no_decay = ["bias", "LayerNorm.weight"]

# Two parameter groups: with weight decay and without.
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": args.weight_decay,
    },
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]

# AdamW optimizer
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)

# Linear warmup followed by linear decay over t_total steps
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)

# Set fp16 -> True for mixed-precision training through apex.
if args.fp16:
    try:
        from apex import amp
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)


In [None]:
# Printing and evaluation cadence, both measured in optimizer steps.
# Both values are clamped to at least 1: the training loop uses them as modulo
# divisors, so a value of 0 (tiny dataset, or gradient_accumulation_steps larger
# than the raw frequency) would raise ZeroDivisionError.
print_freq = max(1, args.print_freq // args.gradient_accumulation_steps)
eval_freq = min(len(train_dataloader) // 2, 1000)  # half an epoch, capped at 1000 batches
eval_freq = max(1, eval_freq // args.gradient_accumulation_steps)  # convert batches to optimizer steps

# Report the chosen frequencies
print('Print freq:', print_freq, "Eval freq:", eval_freq)


# Train

In [None]:
for epoch in range(epoch_start, int(args.num_train_epochs) + 1):
    tr_loss = 0  # accumulated training loss for this epoch
    nb_tr_steps = 0  # optimizer steps taken in this epoch
    # Start every epoch with clean gradients (also discards any partial accumulation
    # left over when the batch count is not a multiple of gradient_accumulation_steps).
    model.zero_grad()
    with tqdm(total=len(train_dataloader) // args.gradient_accumulation_steps) as bar:
        for step, batch in enumerate(train_dataloader):
            model.train()  # eval_running_model switches to eval mode, so switch back
            # Gradients are NOT zeroed per micro-batch: they must accumulate until the
            # optimizer step below, which then resets them with model.zero_grad().
            batch = tuple(t.to(device) for t in batch)
            if args.architecture == 'cross':
                text_token_ids_list_batch, text_input_masks_list_batch, text_segment_ids_list_batch, labels_batch = batch
                loss = model(text_token_ids_list_batch, text_input_masks_list_batch, text_segment_ids_list_batch, labels_batch)
            else:
                context_token_ids_list_batch, context_input_masks_list_batch, \
                response_token_ids_list_batch, response_input_masks_list_batch, labels_batch = batch
                loss = model(context_token_ids_list_batch, context_input_masks_list_batch,
                             response_token_ids_list_batch, response_input_masks_list_batch,
                             labels_batch)

            loss = loss / args.gradient_accumulation_steps  # scale for gradient accumulation

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()  # accumulate training loss

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                nb_tr_steps += 1
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                # Advance the bar once per optimizer step so it reaches its total.
                bar.update(1)

                if nb_tr_steps and nb_tr_steps % print_freq == 0:
                    time.sleep(0.02)
                    print(global_step, tr_loss / nb_tr_steps)  # step and running mean training loss
                    log_wf.write('%d\t%f\n' % (global_step, tr_loss / nb_tr_steps))

                if global_step and global_step % eval_freq == 0:
                    val_result = eval_running_model(val_dataloader)
                    print('Global Step %d VAL res:\n' % global_step, val_result)  # validation result
                    log_wf.write('Global Step %d VAL res:\n' % global_step)
                    log_wf.write(str(val_result) + '\n')

                    if val_result['eval_loss'] < best_eval_loss:
                        best_eval_loss = val_result['eval_loss']
                        val_result['best_eval_loss'] = best_eval_loss
                        # save the best model so far
                        print('[Saving at]', state_save_path)
                        log_wf.write('[Saving at] %s\n' % state_save_path)
                        torch.save(model.state_dict(), state_save_path)

            log_wf.flush()


    # EVAL
    # Extra evaluation at the end of every epoch
    val_result = eval_running_model(val_dataloader)
    print('Epoch %d, Global Step %d VAL res:\n' % (epoch, global_step), val_result)  # per-epoch validation result
    log_wf.write('Global Step %d VAL res:\n' % global_step)
    log_wf.write(str(val_result) + '\n')

    if val_result['eval_loss'] < best_eval_loss:
        best_eval_loss = val_result['eval_loss']
        val_result['best_eval_loss'] = best_eval_loss
        # save the best model so far
        print('[Saving at]', state_save_path)
        log_wf.write('[Saving at] %s\n' % state_save_path)
        torch.save(model.state_dict(), state_save_path)
    print(global_step, tr_loss / nb_tr_steps)
    log_wf.write('%d\t%f\n' % (global_step, tr_loss / nb_tr_steps))
    # Flush the end-of-epoch lines too; otherwise the last epoch's entries stay buffered.
    log_wf.flush()


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

Train dir: /content/drive/MyDrive/
Output dir: /content/drive/MyDrive/


Downloading model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Print freq: 100 Eval freq: 136


  labels_batch = torch.tensor(labels_batch, dtype=torch.long)
 37%|███▋      | 100/273 [00:47<01:21,  2.11it/s]

100 16.520170426368715
Global Step 136 VAL res:
 {'train_loss': 13.035817979013219, 'eval_loss': 1.7773750305175782, 'R1': 0.3605504587155963, 'R2': 0.5385321100917431, 'R5': 0.8623853211009175, 'R10': 1.0, 'MRR': 0.5572801077617592, 'epoch': 1, 'global_step': 136}
[Saving at] /content/drive/MyDrive/poly_16_pytorch_model.bin


 73%|███████▎  | 200/273 [01:45<00:39,  1.87it/s]

200 9.829939411878586
Global Step 272 VAL res:
 {'train_loss': 7.919717381105704, 'eval_loss': 1.0534618650163923, 'R1': 0.6504587155963303, 'R2': 0.7954128440366972, 'R5': 0.9504587155963303, 'R10': 1.0, 'MRR': 0.7730482743556137, 'epoch': 1, 'global_step': 272}
[Saving at] /content/drive/MyDrive/poly_16_pytorch_model.bin


 73%|███████▎  | 200/273 [02:30<00:54,  1.33it/s]


Epoch 1, Global Step 273 VAL res:
 {'train_loss': 7.898591902666476, 'eval_loss': 1.059678840637207, 'R1': 0.6577981651376147, 'R2': 0.8, 'R5': 0.9467889908256881, 'R10': 1.0, 'MRR': 0.7768039172855687, 'epoch': 1, 'global_step': 273}
273 7.898591902666476


 37%|███▋      | 100/273 [00:43<01:15,  2.30it/s]

373 2.5271753442287443
Global Step 408 VAL res:
 {'train_loss': 2.4798078598799527, 'eval_loss': 0.7827788659504482, 'R1': 0.7412844036697248, 'R2': 0.8688073394495412, 'R5': 0.9743119266055046, 'R10': 1.0, 'MRR': 0.8387698412698413, 'epoch': 2, 'global_step': 408}
[Saving at] /content/drive/MyDrive/poly_16_pytorch_model.bin


 73%|███████▎  | 200/273 [01:41<00:37,  1.93it/s]

473 2.380475359559059
Global Step 544 VAL res:
 {'train_loss': 2.3349383506387804, 'eval_loss': 0.7343863753335816, 'R1': 0.7532110091743119, 'R2': 0.8807339449541285, 'R5': 0.9724770642201835, 'R10': 1.0, 'MRR': 0.846539245667686, 'epoch': 2, 'global_step': 544}
[Saving at] /content/drive/MyDrive/poly_16_pytorch_model.bin


 73%|███████▎  | 200/273 [02:26<00:53,  1.36it/s]


Epoch 2, Global Step 546 VAL res:
 {'train_loss': 2.3318831217594638, 'eval_loss': 0.7510460704565048, 'R1': 0.7532110091743119, 'R2': 0.8733944954128441, 'R5': 0.9761467889908257, 'R10': 1.0, 'MRR': 0.8456844327945244, 'epoch': 2, 'global_step': 546}
546 2.3318831217594638


 37%|███▋      | 100/273 [00:43<01:15,  2.29it/s]

646 1.8736257553100586
Global Step 680 VAL res:
 {'train_loss': 1.8747487655326502, 'eval_loss': 0.6362113628004279, 'R1': 0.7871559633027523, 'R2': 0.8981651376146789, 'R5': 0.9862385321100917, 'R10': 1.0, 'MRR': 0.8693978447648172, 'epoch': 3, 'global_step': 680}
[Saving at] /content/drive/MyDrive/poly_16_pytorch_model.bin


 73%|███████▎  | 200/273 [01:41<00:37,  1.93it/s]

746 1.866945207118988
Global Step 816 VAL res:
 {'train_loss': 1.859788563074889, 'eval_loss': 0.7295055477480803, 'R1': 0.7770642201834862, 'R2': 0.8889908256880734, 'R5': 0.9779816513761468, 'R10': 1.0, 'MRR': 0.8619531818843745, 'epoch': 3, 'global_step': 816}


 73%|███████▎  | 200/273 [02:25<00:52,  1.38it/s]


Epoch 3, Global Step 819 VAL res:
 {'train_loss': 1.8601788470159957, 'eval_loss': 0.6913559880373733, 'R1': 0.7743119266055046, 'R2': 0.8862385321100917, 'R5': 0.9770642201834863, 'R10': 1.0, 'MRR': 0.8596119120431047, 'epoch': 3, 'global_step': 819}
819 1.8601788470159957


 37%|███▋      | 100/273 [00:43<01:15,  2.29it/s]

919 1.5632219541072845
Global Step 952 VAL res:
 {'train_loss': 1.558136793007528, 'eval_loss': 0.6172345971688629, 'R1': 0.7972477064220184, 'R2': 0.9, 'R5': 0.9825688073394495, 'R10': 1.0, 'MRR': 0.8749453910004368, 'epoch': 4, 'global_step': 952}
[Saving at] /content/drive/MyDrive/poly_16_pytorch_model.bin


 73%|███████▎  | 200/273 [01:40<00:37,  1.94it/s]

1019 1.5656852996349335
Global Step 1088 VAL res:
 {'train_loss': 1.5669995681503892, 'eval_loss': 0.6242391529359987, 'R1': 0.8091743119266055, 'R2': 0.9119266055045872, 'R5': 0.9908256880733946, 'R10': 1.0, 'MRR': 0.883839012669288, 'epoch': 4, 'global_step': 1088}


 73%|███████▎  | 200/273 [02:25<00:52,  1.38it/s]


Epoch 4, Global Step 1092 VAL res:
 {'train_loss': 1.563706514381227, 'eval_loss': 0.6031113281846047, 'R1': 0.8201834862385321, 'R2': 0.9146788990825688, 'R5': 0.9889908256880734, 'R10': 1.0, 'MRR': 0.8899534003203727, 'epoch': 4, 'global_step': 1092}
[Saving at] /content/drive/MyDrive/poly_16_pytorch_model.bin
1092 1.563706514381227


 37%|███▋      | 100/273 [00:43<01:14,  2.31it/s]

1192 1.2603074711561204
Global Step 1224 VAL res:
 {'train_loss': 1.2723094008185647, 'eval_loss': 0.6748590599654043, 'R1': 0.7935779816513762, 'R2': 0.908256880733945, 'R5': 0.9889908256880734, 'R10': 1.0, 'MRR': 0.8755118683559051, 'epoch': 5, 'global_step': 1224}


 73%|███████▎  | 200/273 [01:39<00:37,  1.96it/s]

1292 1.3026585605740548
Global Step 1360 VAL res:
 {'train_loss': 1.3210540164762468, 'eval_loss': 0.6466283461345094, 'R1': 0.8128440366972477, 'R2': 0.9091743119266055, 'R5': 0.9871559633027523, 'R10': 1.0, 'MRR': 0.8855231542158147, 'epoch': 5, 'global_step': 1360}


 73%|███████▎  | 200/273 [02:23<00:52,  1.39it/s]


Epoch 5, Global Step 1365 VAL res:
 {'train_loss': 1.3187179058899372, 'eval_loss': 0.6377437405820404, 'R1': 0.8155963302752294, 'R2': 0.9155963302752294, 'R5': 0.9834862385321101, 'R10': 1.0, 'MRR': 0.8873591087811271, 'epoch': 5, 'global_step': 1365}
1365 1.3187179058899372


 37%|███▋      | 100/273 [00:43<01:15,  2.29it/s]

1465 1.111716669201851
Global Step 1496 VAL res:
 {'train_loss': 1.1088416430786365, 'eval_loss': 0.8406884210383266, 'R1': 0.8045871559633028, 'R2': 0.9009174311926605, 'R5': 0.9844036697247707, 'R10': 1.0, 'MRR': 0.8787137760302898, 'epoch': 6, 'global_step': 1496}


 73%|███████▎  | 200/273 [01:39<00:37,  1.96it/s]

1565 1.0914745186269283
Global Step 1632 VAL res:
 {'train_loss': 1.1032291372840324, 'eval_loss': 0.7435031648991364, 'R1': 0.8027522935779816, 'R2': 0.9100917431192661, 'R5': 0.9853211009174312, 'R10': 1.0, 'MRR': 0.8800469637396243, 'epoch': 6, 'global_step': 1632}


 73%|███████▎  | 200/273 [02:23<00:52,  1.39it/s]


Epoch 6, Global Step 1638 VAL res:
 {'train_loss': 1.1042873465316199, 'eval_loss': 0.7420605697402997, 'R1': 0.8146788990825689, 'R2': 0.9128440366972477, 'R5': 0.9880733944954129, 'R10': 1.0, 'MRR': 0.8866906946264744, 'epoch': 6, 'global_step': 1638}
1638 1.1042873465316199


 37%|███▋      | 100/273 [00:43<01:15,  2.30it/s]

1738 0.9336793851852417
Global Step 1768 VAL res:
 {'train_loss': 0.9158465600930727, 'eval_loss': 0.8451083379024307, 'R1': 0.8155963302752294, 'R2': 0.9100917431192661, 'R5': 0.9889908256880734, 'R10': 1.0, 'MRR': 0.8875702635794378, 'epoch': 7, 'global_step': 1768}


 73%|███████▎  | 200/273 [01:39<00:37,  1.97it/s]

1838 0.9115979804098606
Global Step 1904 VAL res:
 {'train_loss': 0.9139301692856882, 'eval_loss': 0.8213760929219072, 'R1': 0.8128440366972477, 'R2': 0.9036697247706422, 'R5': 0.9889908256880734, 'R10': 1.0, 'MRR': 0.884176860346585, 'epoch': 7, 'global_step': 1904}


 73%|███████▎  | 200/273 [02:23<00:52,  1.39it/s]


Epoch 7, Global Step 1911 VAL res:
 {'train_loss': 0.9173891651324737, 'eval_loss': 0.7986633332995032, 'R1': 0.8110091743119267, 'R2': 0.9045871559633027, 'R5': 0.9889908256880734, 'R10': 1.0, 'MRR': 0.8836984855104122, 'epoch': 7, 'global_step': 1911}
1911 0.9173891651324737


 37%|███▋      | 100/273 [00:43<01:15,  2.29it/s]

2011 0.7593646547198296
Global Step 2040 VAL res:
 {'train_loss': 0.7741528560024823, 'eval_loss': 0.8588739726846273, 'R1': 0.818348623853211, 'R2': 0.9091743119266055, 'R5': 0.9908256880733946, 'R10': 1.0, 'MRR': 0.8880169651958643, 'epoch': 8, 'global_step': 2040}


 73%|███████▎  | 200/273 [01:40<00:37,  1.96it/s]

2111 0.7798859715461731
Global Step 2176 VAL res:
 {'train_loss': 0.7769089806754634, 'eval_loss': 0.8788625310940136, 'R1': 0.8192660550458716, 'R2': 0.9110091743119266, 'R5': 0.9853211009174312, 'R10': 1.0, 'MRR': 0.8886562545507499, 'epoch': 8, 'global_step': 2176}


 73%|███████▎  | 200/273 [02:24<00:52,  1.39it/s]


Epoch 8, Global Step 2184 VAL res:
 {'train_loss': 0.7747613473252936, 'eval_loss': 0.908208957707393, 'R1': 0.8229357798165138, 'R2': 0.9100917431192661, 'R5': 0.9871559633027523, 'R10': 1.0, 'MRR': 0.8906877093344984, 'epoch': 8, 'global_step': 2184}
2184 0.7747613473252936


 37%|███▋      | 100/273 [00:43<01:15,  2.30it/s]

2284 0.6656264357268811
Global Step 2312 VAL res:
 {'train_loss': 0.670833621523343, 'eval_loss': 0.9034207739799188, 'R1': 0.8211009174311926, 'R2': 0.9174311926605505, 'R5': 0.9889908256880734, 'R10': 1.0, 'MRR': 0.8911715450706276, 'epoch': 9, 'global_step': 2312}


 73%|███████▎  | 200/273 [01:39<00:37,  1.97it/s]

2384 0.6780416172742844
Global Step 2448 VAL res:
 {'train_loss': 0.6754563568216382, 'eval_loss': 0.8781988701644098, 'R1': 0.8302752293577982, 'R2': 0.9165137614678899, 'R5': 0.9880733944954129, 'R10': 1.0, 'MRR': 0.8955679335954566, 'epoch': 9, 'global_step': 2448}


 73%|███████▎  | 200/273 [02:23<00:52,  1.39it/s]


Epoch 9, Global Step 2457 VAL res:
 {'train_loss': 0.6761278805933593, 'eval_loss': 0.8808647858227882, 'R1': 0.828440366972477, 'R2': 0.9128440366972477, 'R5': 0.9899082568807339, 'R10': 1.0, 'MRR': 0.893826998689384, 'epoch': 9, 'global_step': 2457}
2457 0.6761278805933593


 37%|███▋      | 100/273 [00:43<01:15,  2.30it/s]

2557 0.5887033489346504
Global Step 2584 VAL res:
 {'train_loss': 0.5986467437950644, 'eval_loss': 0.9157308114373206, 'R1': 0.826605504587156, 'R2': 0.9174311926605505, 'R5': 0.9880733944954129, 'R10': 1.0, 'MRR': 0.8937265181301878, 'epoch': 10, 'global_step': 2584}


 73%|███████▎  | 200/273 [01:40<00:37,  1.95it/s]

2657 0.6133759367465973
Global Step 2720 VAL res:
 {'train_loss': 0.6112691159767343, 'eval_loss': 0.9299156976556333, 'R1': 0.8293577981651377, 'R2': 0.9192660550458716, 'R5': 0.9899082568807339, 'R10': 1.0, 'MRR': 0.8954517984563857, 'epoch': 10, 'global_step': 2720}


 73%|███████▎  | 200/273 [02:24<00:52,  1.39it/s]


Epoch 10, Global Step 2730 VAL res:
 {'train_loss': 0.6136849342739625, 'eval_loss': 0.9301141640927848, 'R1': 0.8293577981651377, 'R2': 0.918348623853211, 'R5': 0.9899082568807339, 'R10': 1.0, 'MRR': 0.8952988932576088, 'epoch': 10, 'global_step': 2730}
2730 0.6136849342739625


# Test

## candidates data

In [None]:
# Mount Google Drive at /content/drive so the files under it can be read.
# NOTE(review): earlier cells already use paths under /content/drive/MyDrive/
# (args.train_dir / args.output_dir), so on a fresh runtime this mount presumably
# has to run before them — confirm the intended cell order.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the candidate answers and drop the stale index column written by a previous to_csv.
df = pd.read_csv('/content/drive/MyDrive/poly-encoder답변모음.csv')
df = df.drop(columns=['Unnamed: 0'])
# Keep everything except the last 4758 rows, then discard missing / single-space answers.
data = df.iloc[:-4758]
data = data[data['answer'].notna() & (data['answer'] != ' ')]
# Filtering leaves gaps in the index; renumber it so that row labels equal row
# positions, which is what positional lookups on data['answer'] rely on.
data = data.reset_index(drop=True)
data

Unnamed: 0,answer
0,"공모주는 상장 후 보유하셔도 되고, 매도하셔도 됩니다."
1,공모가는 개인과 기관 둘 다 동일합니다.
2,주식 계좌의 예수금은 현금으로 출금이 가능하고 주식 매수도 가능한 돈입니다.
3,"계좌 개설이 완료되었다면 그 이후 거래는 별다른 문제가 없을 것으로 보이며, 혹시라..."
4,삼성 증권은 비대면 계좌 개설 20일 제한이 없습니다.
...,...
1618,코넥스는 자본시장을 통한 초기 중소 벤처기업의 성장지원 및 모험자본 선순환 체계 구...
1619,비상장 주식 거래를 손쉽게 할 수 있도록 만들어진 곳이 라고 보면 될 것 같습니다....
1620,주식계좌에 주식구매대금을 이체하는 시점에 현금을 증여한 것으로 보아 이체일이 속하는...
1621,증여세 신고를 한 금전을 재원으로자녀 명의의 계좌에 공모주 투자시자녀 소유의 주식을...


## Vectorizing

In [None]:
# Create the TF-IDF vectorizer object; get_top_similar_candidates uses it as a module-level global.
tfidf_vectorizer = TfidfVectorizer()

In [None]:
def get_top_similar_candidates(context, candidates, top_n=10):
    """
    Rank candidate answers by TF-IDF cosine similarity to ``context`` and return the best ones.

    Each (context, candidate) pair is vectorised on its own with the module-level
    ``tfidf_vectorizer`` and scored with cosine similarity.

    Args:
    - context (str): The query text.
    - candidates (mapping or DataFrame): Object whose ``'answer'`` entry is an iterable
        of candidate answer strings (a list or a pandas column both work).
    - top_n (int, optional): Maximum number of candidates to return. Defaults to 10.

    Returns:
    - top_candidates (list of str): Up to ``top_n`` answers, most similar first.
        Fewer are returned when fewer candidates exist.
    """
    # Materialise the answers as a list so that every later lookup is positional;
    # indexing a pandas column with argsort positions would be label-based and can
    # select the wrong row (or raise KeyError) once rows have been filtered out.
    answers = list(candidates['answer'])

    # Report the number of candidate answers
    print("후보 답변 수:", len(answers))

    # One 1x1 similarity array per candidate
    lst = []
    for candidate in answers:
        try:
            # Vectorise the two sentences together
            tfidf_matrix = tfidf_vectorizer.fit_transform([context, candidate])
        except ValueError:
            # Raised e.g. when neither text yields a token (empty vocabulary);
            # treat such a pair as having no similarity instead of aborting the search.
            lst.append(np.zeros((1, 1)))
            continue
        # Cosine similarity between the context row and the candidate row
        cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
        lst.append(cosine_sim)

    # Report the number of similarity scores
    print("유사도 리스트 길이:", len(lst))

    # Flatten the scores and take the positions of the top_n largest
    flattened_lst = [item.flatten()[0] for item in lst]
    sorted_indices = np.argsort(flattened_lst)[::-1][:top_n]

    # Show the selected candidates; iterating over sorted_indices (not range(top_n))
    # keeps this safe when there are fewer than top_n candidates.
    for index in sorted_indices:
        print("상위", top_n, "유사도:", lst[index])
        print("상위", top_n, "답변:", answers[index])

    # Return the selected candidates, most similar first
    top_candidates = [answers[index] for index in sorted_indices]

    return top_candidates

In [None]:
def context_input(context):
    """
    Turn a context into batched token-id and input-mask tensors.

    The context is passed through the module-level ``context_transform``; each of its
    two outputs is wrapped in a one-element list (adding a leading dimension of size 1)
    and converted to a ``torch.long`` tensor on ``device``.

    Args:
        context (str): Context string to process.

    Returns:
        contexts_token_ids_list_batch: Token-id tensor.
        contexts_input_masks_list_batch: Input-mask tensor.
    """
    token_ids, input_mask = context_transform(context)

    ids_batch = torch.tensor([token_ids], dtype=torch.long, device=device)
    mask_batch = torch.tensor([input_mask], dtype=torch.long, device=device)

    return ids_batch, mask_batch

def response_input(candidates):
    """
    Turn candidate responses into batched token-id and input-mask tensors.

    The candidates are passed through the module-level ``response_transform``; each of
    its two outputs is wrapped in a one-element list (adding a leading dimension of
    size 1) and converted to a ``torch.long`` tensor on ``device``.

    Args:
        candidates (list of str): Candidate response strings to process.

    Returns:
        responses_token_ids_list_batch: Token-id tensor.
        responses_input_masks_list_batch: Input-mask tensor.
    """
    token_ids_list, input_masks_list = response_transform(candidates)

    ids_batch = torch.tensor([token_ids_list], dtype=torch.long, device=device)
    masks_batch = torch.tensor([input_masks_list], dtype=torch.long, device=device)

    return ids_batch, masks_batch

def embs_gen(contexts_token_ids_list_batch, contexts_input_masks_list_batch):
    """
    Build the poly-code context embeddings with the module-level ``model``.

    The context is encoded with ``model.bert``; ``model.poly_m`` learned code
    embeddings then attend over the encoder output through ``model.dot_attention``.
    The poly-code ids are expanded to the actual batch size rather than a fixed 1,
    so batches of several contexts are handled as well (identical for batch size 1).

    Args:
        contexts_token_ids_list_batch (torch.Tensor): Context token ids; the first
            dimension is taken as the batch size.
        contexts_input_masks_list_batch (torch.Tensor): Context input masks.

    Returns:
        embs: Context embedding tensor — presumably [bs, poly_m, dim]; confirm
            against ``model.dot_attention``.
    """
    with torch.no_grad():
        model.eval()

        batch_size = contexts_token_ids_list_batch.shape[0]
        ctx_out = model.bert(contexts_token_ids_list_batch, contexts_input_masks_list_batch)[0]  # [bs, length, dim]
        poly_code_ids = torch.arange(model.poly_m, dtype=torch.long).to(contexts_token_ids_list_batch.device)
        # One row of code ids per context in the batch.
        poly_code_ids = poly_code_ids.unsqueeze(0).expand(batch_size, model.poly_m)
        poly_codes = model.poly_code_embeddings(poly_code_ids)  # [bs, poly_m, dim]
        embs = model.dot_attention(poly_codes, ctx_out, ctx_out)  # [bs, poly_m, dim]

        return embs

def cand_emb_gen(responses_token_ids_list_batch, responses_input_masks_list_batch):
    """
    Build the candidate-response embeddings with the module-level ``model``.

    The 3-D id/mask tensors are flattened to one sequence per row, encoded with
    ``model.bert``, and the vector at sequence position 0 of each row is kept and
    reshaped back to one row of candidates per batch entry.

    Args:
        responses_token_ids_list_batch (torch.Tensor): Candidate token ids, 3-D
            (batch, candidates, sequence length).
        responses_input_masks_list_batch (torch.Tensor): Candidate input masks, same layout.

    Returns:
        cand_emb: Embedding tensor of shape [bs, res_cnt, dim].
    """
    model.eval()
    with torch.no_grad():
        n_batch, n_cand, n_tok = responses_token_ids_list_batch.shape
        flat_ids = responses_token_ids_list_batch.view(-1, n_tok)
        flat_masks = responses_input_masks_list_batch.view(-1, n_tok)
        # First output of the encoder, position 0 of every sequence.
        first_token_vectors = model.bert(flat_ids, flat_masks)[0][:, 0, :]
        return first_token_vectors.view(n_batch, n_cand, -1)

def loss(embs, cand_emb, contexts_token_ids_list_batch, responses_token_ids_list_batch):
    """
    Compute the in-batch-negatives cross-entropy loss for context/response pairs.

    Every context in the batch is scored against every response in the batch;
    the response at the same batch index is the positive and all others act as
    negatives. The loss is the mean negative log-probability of the positives.

    Args:
        embs (torch.Tensor): Context embeddings, [bs, poly_m, dim].
        cand_emb (torch.Tensor): Response embeddings. Either [bs, 1, dim]
            (one response per context; expanded here to [bs, bs, dim]) or an
            already expanded [bs, bs, dim] tensor.
        contexts_token_ids_list_batch (torch.Tensor): Unused; kept so existing
            callers do not break.
        responses_token_ids_list_batch (torch.Tensor): Unused; kept so existing
            callers do not break.

    Returns:
        torch.Tensor: Scalar loss.

    Raises:
        ValueError: If the score matrix is not square ([bs, bs]), which means
            ``cand_emb`` had an unsupported shape.
    """
    batch_size = cand_emb.shape[0]

    # With a single response per context, reuse the other responses in the
    # batch as negatives: cand_emb[i, j] becomes the j-th response for every i.
    if cand_emb.shape[1] == 1:
        cand_emb = cand_emb.permute(1, 0, 2).expand(batch_size, batch_size, -1)  # [bs, bs, dim]

    ctx_emb = model.dot_attention(cand_emb, embs, embs)  # [bs, bs, dim]
    dot_product = (ctx_emb * cand_emb).sum(-1)  # [bs, bs]

    if dot_product.dim() != 2 or dot_product.shape[0] != dot_product.shape[1]:
        raise ValueError(
            "expected a square [bs, bs] score matrix, got shape {}".format(tuple(dot_product.shape))
        )

    log_probs = F.log_softmax(dot_product, dim=-1)

    # The positives sit on the diagonal. Reading them directly (instead of
    # multiplying by an identity mask) avoids NaN from `-inf * 0`.
    return -log_probs.diagonal().mean()

def score(embs, cand_emb):
    """
    Score each response candidate against its context.

    The candidates act as attention queries over the context embeddings
    (``model.dot_attention``); the score is the dot product between each
    attended context vector and the candidate that produced it. Runs without
    gradient tracking and puts the global ``model`` in eval mode.

    Args:
        embs (torch.Tensor): Context embeddings.
        cand_emb (torch.Tensor): Response candidate embeddings.

    Returns:
        torch.Tensor: Scores with the last (embedding) dimension summed out,
        i.e. [bs, res_cnt] for [bs, res_cnt, dim] candidates.
    """
    model.eval()

    with torch.no_grad():
        attended_ctx = model.dot_attention(cand_emb, embs, embs)  # [bs, res_cnt, dim]
        return torch.sum(attended_ctx * cand_emb, dim=-1)


## Example

In [None]:
# Read the user's question and echo it back.
context = input('질문을 입력하세요: ')
print('질문:', context)

# Pre-select candidate answers for the question.
# NOTE(review): get_top_similar_candidates and data are defined in earlier
# cells; presumably this returns a list of candidate answer strings - confirm.
top_similar_candidates = get_top_similar_candidates(context, data)
print('Top Similar Candidates:')


candidates = top_similar_candidates

# Use the first GPU when available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Fine-tuned Poly-Encoder checkpoint (16 poly codes) stored on Google Drive.
PATH = '/content/drive/MyDrive/poly_16_pytorch_model.bin'

bert_name = 'klue/bert-base'
bert_config = BertConfig.from_pretrained(bert_name)

tokenizer = BertTokenizer.from_pretrained(bert_name)
# Register the newline as a special token; the embedding matrix is resized
# below to account for the enlarged vocabulary.
tokenizer.add_tokens(['\n'], special_tokens=True)

context_transform = SelectionJoinTransform(tokenizer=tokenizer, max_len=256)
response_transform = SelectionSequentialTransform(tokenizer=tokenizer, max_len=128)

bert = BertModel.from_pretrained(bert_name, config=bert_config)

model = PolyEncoder(bert_config, bert=bert, poly_m=16)
model.resize_token_embeddings(len(tokenizer))
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime too.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(PATH, map_location=device))
model.to(device)

# Tokenize the question and the candidates, embed both, and score every
# candidate against the question.
contexts_token_ids_list_batch, contexts_input_masks_list_batch = context_input(context)
responses_token_ids_list_batch, responses_input_masks_list_batch = response_input(candidates)
embs = embs_gen(contexts_token_ids_list_batch, contexts_input_masks_list_batch)
cand_emb = cand_emb_gen(responses_token_ids_list_batch, responses_input_masks_list_batch)
score_ = score(embs, cand_emb)

# Pick the highest-scoring candidate as the answer.
max_value, max_index = torch.max(score_, dim=1, keepdim=True)

print('답변:', candidates[max_index.item()])

Cloning into 'Poly-Encoder'...
remote: Enumerating objects: 129, done.[K
remote: Counting objects: 100% (121/121), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 129 (delta 67), reused 107 (delta 61), pack-reused 8[K
Receiving objects: 100% (129/129), 35.10 KiB | 5.01 MiB/s, done.
Resolving deltas: 100% (69/69), done.
/content/Poly-Encoder/Poly-Encoder/Poly-Encoder/Poly-Encoder/Poly-Encoder
질문을 입력하세요: 미성년자
질문: 미성년자
1623
[array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


답변: 미성년자 계좌 계설은 부모님과 함께 가야합니다. 
