# Pretrained Model을 이용한 RAG - vectorDB v1

1. 카테고리 별 balance
2. 질의 응답 1개씩만 포함

In [None]:
# Mount Google Drive under /content/drive; the project paths used later in this
# notebook all live below that mount point (Colab-only cell).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip install  git+https://github.com/huggingface/peft.git
!pip install bitsandbytes
!pip install accelerate==0.21.0
!pip install datasets

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Pytorch Import
import torch
import torch.nn as nn
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

## Transformers Import
import transformers
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, BitsAndBytesConfig
from transformers import AutoTokenizer, AdamW, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForCausalLM

## Accelerate
from accelerate import Accelerator

# Tqdm
from tqdm.auto import tqdm, trange

# HuggingFace peft
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType, prepare_model_for_kbit_training
from peft import PeftModel, PeftConfig

# Dataset
import datasets
from datasets import Dataset, DatasetDict, load_dataset

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
import os

# Project root on the mounted Drive; the paths below are derived from it.
base_path = "/content/drive/MyDrive/contest/dacon_hansol_llm"
# The `data` sub-folder of the project root.
data_path = os.path.join(base_path, "data")
# Checkpoint folder prefix. The trailing slash matters: a later cell appends the
# adapter sub-folder name with plain string concatenation (model_save + '...').
model_save = os.path.join(base_path,'model/KoAlpaca-5.8B_base_preprocessed/')
# sub_path = base_path + 'sub/KoAlaca/'

# 1. Pretrained Model 가져오기
## 1.a. base model 가져오기

In [None]:
model_id = "beomi/KoAlpaca-Polyglot-5.8B"  # repo converted to safetensors

# Quantization settings handed to from_pretrained: 4-bit loading with the "nf4"
# quant type, double quantization enabled, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# device_map={"":0} presumably places the whole model on device 0 — TODO confirm.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/36.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/13 [00:00<?, ?it/s]

model-00001-of-00013.safetensors:   0%|          | 0.00/926M [00:00<?, ?B/s]

model-00002-of-00013.safetensors:   0%|          | 0.00/952M [00:00<?, ?B/s]

model-00003-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00004-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00005-of-00013.safetensors:   0%|          | 0.00/952M [00:00<?, ?B/s]

model-00006-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00007-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00008-of-00013.safetensors:   0%|          | 0.00/952M [00:00<?, ?B/s]

model-00009-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00010-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00011-of-00013.safetensors:   0%|          | 0.00/952M [00:00<?, ?B/s]

model-00012-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00013-of-00013.safetensors:   0%|          | 0.00/515M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

## 1.b. BASE 모델에 Pretrained QLoRa 불러오기

In [None]:
# Attach the LoRA adapter stored under `model_save` to the quantized base model.
# `model_save` already ends with a slash, so plain concatenation yields the folder path.
adapter_dir = model_save + 'output_peft_dir'
model = PeftModel.from_pretrained(
    model=model,
    model_id=adapter_dir,
    device_map={"":0},
)

In [None]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(30080, 5120)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-39): 40 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=5120, out_features=15360, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                

In [None]:
# The tokenizer is loaded from the base model repo (model_id), not from the adapter folder.
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 2. RAG 파이프라인

In [None]:
%%capture
!pip install faiss-gpu
!pip install sentence_transformers
!pip install langchain

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time

import langchain

# loaders
# from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

# splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

# prompts
from langchain import PromptTemplate, LLMChain

# vector stores
from langchain.vectorstores import FAISS

# models
from langchain.llms import HuggingFacePipeline
# from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings

# retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# from peft import (
#     LoraConfig,
#     PeftConfig,
#     get_peft_model,
#     prepare_model_for_kbit_training, # 4bir Qlora
# )
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

print('LangChain:', langchain.__version__)

LangChain: 0.1.10


# 3. RAG Config

In [None]:
class CFG:
    """Static settings shared by the generation pipeline and the retrievers.

    All values are plain class attributes and are read as ``CFG.<name>``; the
    class is never instantiated in this notebook.
    """

    # LLMs
    # model_name = "beomi/llama-2-ko-7b" # "psymon/KoLlama2-7b" -> no 4-bit or 8-bit model available
    # NOTE: these must be plain numbers. A trailing comma after the value
    # (``temperature = 0,``) silently turns it into a one-element tuple.
    temperature = 0  # presumably only consulted when sampling is enabled — TODO confirm
    top_p = 0.95
    repetition_penalty = 1.15

    # splitting
    # split_chunk_size = 800
    # split_overlap = 0

    # embeddings
    embeddings_model_repo = 'distiluse-base-multilingual-cased-v1'
    Embeddings_path = "/content/drive/MyDrive/contest/dacon_hansol_llm/vectordb"

    # similar passages: number of documents each retriever returns
    k = 3

    max_len = 512

In [None]:
# Generation settings forwarded to the text-generation pipeline as keyword arguments.
generation_kwargs = dict(
    pad_token_id=tokenizer.eos_token_id,
    # max_length = CFG.max_len,
    temperature=CFG.temperature,
    top_p=CFG.top_p,
    repetition_penalty=CFG.repetition_penalty,
    max_new_tokens=512,
)

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # device=0  # the model was loaded quantized, so moving it to a device raises an error
    **generation_kwargs,
)

# pipe.save_pretrained(root_path+'/RAG')

# Wrap the Hugging Face pipeline so LangChain chains can use it as their LLM.
llm = HuggingFacePipeline(pipeline=pipe)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCa

# 4. 데이터 전처리

In [None]:
# Load the cleaned training sheet from the project's data folder.
# NOTE(review): the frame displayed right after this cell has no `gpt요약답변`
# column, while the second display of `train_data` and the corpus-building cell
# both rely on one. Presumably `train_data` was re-assigned from a different file
# in a cell that is no longer part of the notebook — confirm which file is
# intended so the notebook survives Restart & Run All.
train_data = pd.read_excel(os.path.join(base_path, "data/train_cleaned_dm.xlsx"))

In [None]:
train_data

Unnamed: 0,id,질문_1,질문_2,category,답변_1,답변_2,답변_3,답변_4,답변_5,적절 질문,특이사항,체크
0,TRAIN_000,면진장치가 뭐야?,,건축구조,면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 ...,"면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을...",면진장치란 지반으로부터 발생하는 진동 에너지를 흡수하여 건물에 전달되는 진동을 줄여...,면진장치는 건물의 지반으로부터 오는 진동 에너지를 흡수하여 건물에 전달되는 진동을 ...,면진장치는 건물에 오는 지반 진동의 영향을 최대한으로 흡수하여 건물에 전달되는 진동...,1,질문2 애매함,1.0
1,TRAIN_001,내진설계의 종류 좀 알려줘,내진설계에는 어떤 종류가 있는지 자세히 알려주실 수 있나요?,건축구조,"내진 설계의 종류로 내진구조, 제진구조, 면진구조가 있습니다.","내진설계에는 내진구조, 제진구조, 면진구조가 있습니다. 내진구조는 건물 구조물이 지...","내진설계에는 주로 내진구조, 제진구조, 면진구조의 세 가지 종류가 있습니다. 이들은...","내진설계에는 주로 내진구조, 제진구조, 면진구조가 사용됩니다. 내진구조는 건물 구조...","내진 설계에는 다양한 종류가 있지만, 대표적으로 내진구조, 제진구조, 면진구조가 있...",12,,
2,TRAIN_002,철골구조의 장점이 뭐야?,철골구조의 장점을 알려줘.,건축구조,철골구조는 건물의 외벽에는 그다지 하중이 걸리지 않기 때문에 고층 건물의 건축이 가...,철골구조의 장점은 건물의 외벽에는 그다지 하중이 걸리지 않기 때문에 고층 건물의 건...,철골구조의 장점은 건물의 외벽에 하중이 적게 걸리기 때문에 고층 건물의 건축이 용이...,"철골구조의 장점은 건물의 외벽이 하중이 걸리지 않아 공간 활용이 용이하고, 고층 건...",철골구조의 장점은 건물의 외벽에 하중이 크게 걸리지 않아 고층 건물을 건축할 수 있...,12,질문2- 알려줘?가 잘못 인식될 수 있지 않을까. (해결),1.0
3,TRAIN_003,철골철근 콘크리트 구조가 뭐야?,철골철근 콘크리트 구조의 장점과 단점에는 무엇이 있을까요?,건축구조,"철근철골콘크리트는 철골과 철근, 그리고 콘크리트를 함께 사용하는 건축 구조입니다. ...","철골철근콘크리트 구조는 건축물을 지탱하는 주요 구조물인 철골과 철근, 그리고 콘크리...",철골철근 콘크리트 구조는 건축물을 지탱하기 위한 구조물에서 일반적으로 사용되는 방식...,"철골철근콘크리트 구조는 철골과 철근, 그리고 콘크리트를 함께 사용하여 만들어지는 건...","철골철근 콘크리트 구조는 강철 골조와 강철 철근, 그리고 콘크리트를 함께 사용하여 ...",12,,
4,TRAIN_004,철골구조는 어떤 방식이 있어?,철골구조의 다양한 방식이 무엇인가요?,건축구조,철골구조는 일반철골구조와 경량철골구조가 있습니다.,철골구조는 일반철골구조와 경량철골구조가 있습니다. 일반철골구조는 주로 대형 건물이나...,철골구조는 주로 일반철골구조와 경량철골구조로 나뉘어집니다. 이들은 건축 시스템에 따...,철골구조는 주로 일반철골구조와 경량철골구조로 구분됩니다. 이외에도 최근에는 고층 건...,철골구조는 일반철골구조와 경량철골구조 두 가지 방식이 주로 사용됩니다. 일반철골구조...,12,,
...,...,...,...,...,...,...,...,...,...,...,...,...
639,TRAIN_639,벽장 부위 결로의 원인이 뭐야?,벽장 부위 결로가 발생하는 주된 원인은 무엇일까요?,타 마감하자,벽장 부위 결로의 원인은 난방이 이웃한 방과동일한 조건이 되나 그 방에 비해 저온인...,"벽장 부위 결로의 원인은 주로 난방이 자리잡은 방이 내부 온도가 낮은 반면, 외부 ...",벽장 부위 결로가 발생하는 원인은 난방이 잘 이루어지지 않아 해당 공간이 저온인 반...,"벽장 부위 결로가 발생하는 원인은 난방이 인접한 방과 같은 조건을 갖추고 있지만, ...",벽장 부위 결로의 주된 원인은 충분한 환기가 이루어지지 않는 환경과 과도한 습기가 ...,12,,
640,TRAIN_640,"AD, PD에 면한 벽체 결로의 원인이 뭐야?",벽체 결로가 AD나 PD에 면한다면 그 원인이 무엇인가요?,타 마감하자,"AD, PD에 면한 벽체 결로의 원인은 외기에 접하는 면 좌, 우측 벽체에는 단열재...","AD, PD에 면한 벽체 결로의 원인은 외기에 접하는 면 좌, 우측 벽체에는 단열재...","AD, PD에 면한 벽체 결로의 주된 원인은 외기에 접하는 면 좌, 우측 벽체에 단...","AD, PD에 면한 벽체 결로의 원인으로는 외기에 접하는 면 좌, 우측 벽체는 일반...","AD, PD에 면한 벽체 결로의 원인으로는 외부 공기에 노출된 벽면이 실내보다 냉각...",12,,
641,TRAIN_641,외벽 모서리 부위에 결로가 발생하는 원인이 뭐야?,,타 마감하자,외벽모서리 부위에 결로가 발생하는 원인은 높은 온도차 때문입니다. 외벽 모서리의 경...,외벽 모서리 부위에 결로가 발생하는 원인은 중요한 역할을 합니다. 외부에 비치는 세...,외벽 모서리 부위에 결로가 발생하는 원인으로는 높은 온도차가 대표적입니다. 외벽 모...,외벽 모서리 부위에 결로가 발생하는 원인은 실내와 외부의 온도차 때문입니다. 외벽 ...,외벽 모서리 부위에 결로가 발생하는 주요 원인은 높은 온도차입니다. 외벽 모서리는 ...,1,원인에 관한 것이므로 답변1이 적절,1.0
642,TRAIN_642,창호 결로의 대책은 뭐야?,창호 결로를 해결하기 위한 가장 효과적인 방법은 무엇인가요?,타 마감하자,창호결로의 대책은 제품을 선정할 때 KS에 규정된 프레임을 선정하고 유리의 열관류율...,창호 결로의 대책은 KS에 규정된 프레임을 선정하고 유리의 열관류율 및 결로 발생 ...,"창호 결로를 예방하기 위한 대책은 KS에 규정된 프레임을 선택하고, 열관류율이 적은...","창호 결로의 대책은 여러 가지가 있습니다. 먼저, 창호를 선택할 때 KS에 규정된 ...","창호 결로를 방지하기 위한 대책으로는 KS에 규정된 프레임을 사용하고, 열관류율 및...",12,,


In [None]:
# Load test.csv from the project's data folder (not referenced by the cells shown in this section).
test_data = pd.read_csv(os.path.join(base_path, "data/test.csv"))

In [None]:
train_data

Unnamed: 0,id,질문_1,질문_2,category,답변_1,답변_2,답변_3,답변_4,답변_5,답변_1_길이,답변_2_길이,답변_3_길이,답변_4_길이,답변_5_길이,답변_sum,답변_2-5_sum,gpt요약답변
0,TRAIN_000,면진장치가 뭐야?,면진장치에 사용되는 주요 기술은 무엇인가요?,건축구조,면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 ...,"면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을...",면진장치란 지반으로부터 발생하는 진동 에너지를 흡수하여 건물에 전달되는 진동을 줄여...,면진장치는 건물의 지반으로부터 오는 진동 에너지를 흡수하여 건물에 전달되는 진동을 ...,면진장치는 건물에 오는 지반 진동의 영향을 최대한으로 흡수하여 건물에 전달되는 진동...,54,100,150,107,97,면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 ...,"면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을...","면진장치는 건물의 안전성과 안정성을 향상시키며, 지진이나 지반 진동으로 인한 피해를..."
1,TRAIN_000,면진장치가 뭐야?,면진장치에 사용되는 주요 기술은 무엇인가요?,건축구조,면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 ...,"면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을...",면진장치란 지반으로부터 발생하는 진동 에너지를 흡수하여 건물에 전달되는 진동을 줄여...,면진장치는 건물의 지반으로부터 오는 진동 에너지를 흡수하여 건물에 전달되는 진동을 ...,면진장치는 건물에 오는 지반 진동의 영향을 최대한으로 흡수하여 건물에 전달되는 진동...,54,100,150,107,97,면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 ...,"면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을...",면진장치는 건물의 지반으로부터 발생하는 진동 에너지를 흡수하여 건물을 보호하고 진동...
2,TRAIN_000,면진장치가 뭐야?,면진장치에 사용되는 주요 기술은 무엇인가요?,건축구조,면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 ...,"면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을...",면진장치란 지반으로부터 발생하는 진동 에너지를 흡수하여 건물에 전달되는 진동을 줄여...,면진장치는 건물의 지반으로부터 오는 진동 에너지를 흡수하여 건물에 전달되는 진동을 ...,면진장치는 건물에 오는 지반 진동의 영향을 최대한으로 흡수하여 건물에 전달되는 진동...,54,100,150,107,97,면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 ...,"면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을...","면진장치는 건물의 안전성과 안정성을 향상시키고, 지진 등의 외부 충격으로부터 보호하..."
3,TRAIN_000,면진장치가 뭐야?,면진장치에 사용되는 주요 기술은 무엇인가요?,건축구조,면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 ...,"면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을...",면진장치란 지반으로부터 발생하는 진동 에너지를 흡수하여 건물에 전달되는 진동을 줄여...,면진장치는 건물의 지반으로부터 오는 진동 에너지를 흡수하여 건물에 전달되는 진동을 ...,면진장치는 건물에 오는 지반 진동의 영향을 최대한으로 흡수하여 건물에 전달되는 진동...,54,100,150,107,97,면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 ...,"면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을...",면진장치는 건물의 안전을 보호하고 지진이나 지반 진동으로 인한 피해를 방지하기 위해...
4,TRAIN_000,면진장치가 뭐야?,면진장치에 사용되는 주요 기술은 무엇인가요?,건축구조,면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 ...,"면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을...",면진장치란 지반으로부터 발생하는 진동 에너지를 흡수하여 건물에 전달되는 진동을 줄여...,면진장치는 건물의 지반으로부터 오는 진동 에너지를 흡수하여 건물에 전달되는 진동을 ...,면진장치는 건물에 오는 지반 진동의 영향을 최대한으로 흡수하여 건물에 전달되는 진동...,54,100,150,107,97,면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 ...,"면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을...","면진장치는 건물의 안전을 보호하고 진동을 줄여주는 장치로, 지진이나 지반 진동으로 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6435,TRAIN_643,"AD, PD에 면한 벽체 결로에 대한 대책은 뭐야?","AD, PD에 면한 벽체 결로에 대한 대책은 어떤 것이 있나요?",타 마감하자,"AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체의 결로에 대한 대책으로는 단열재를 미실하게 시공하여 결로가...",80,210,273,218,270,"AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 다양한 측면을 고려해야 합니다. 먼저,..."
6436,TRAIN_643,"AD, PD에 면한 벽체 결로에 대한 대책은 뭐야?","AD, PD에 면한 벽체 결로에 대한 대책은 어떤 것이 있나요?",타 마감하자,"AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체의 결로에 대한 대책으로는 단열재를 미실하게 시공하여 결로가...",80,210,273,218,270,"AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD와 PD에 면한 벽체 결로에 대한 대책은 다양한 측면을 고려해야 합니다. 먼저,..."
6437,TRAIN_643,"AD, PD에 면한 벽체 결로에 대한 대책은 뭐야?","AD, PD에 면한 벽체 결로에 대한 대책은 어떤 것이 있나요?",타 마감하자,"AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체의 결로에 대한 대책으로는 단열재를 미실하게 시공하여 결로가...",80,210,273,218,270,"AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD와 PD에 면한 벽체 결로에 대한 대책은 다양한 측면을 고려해야 합니다. 먼저,..."
6438,TRAIN_643,"AD, PD에 면한 벽체 결로에 대한 대책은 뭐야?","AD, PD에 면한 벽체 결로에 대한 대책은 어떤 것이 있나요?",타 마감하자,"AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체의 결로에 대한 대책으로는 단열재를 미실하게 시공하여 결로가...",80,210,273,218,270,"AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 단열재를 미실하게 시공하여 결로가 생기...","AD, PD에 면한 벽체 결로에 대한 대책은 다양한 방면에서 접근해야 합니다. 먼저..."


In [None]:
from itertools import product


def build_qa_texts(frame, question_col, answer_col, dedupe=False):
    """Format the rows of ``frame`` as ``"질문: <question> 답변 : <answer>"`` passages.

    Args:
        frame: DataFrame that contains ``question_col`` and ``answer_col``.
        question_col: Name of the question column, e.g. ``"질문_1"``.
        answer_col: Name of the answer column, e.g. ``"답변_1"``.
        dedupe: When True, repeated passages are collapsed (first occurrence
            wins, order preserved). Defaults to False, which keeps one passage
            per usable row.

    Returns:
        list[str]: passages in row order. Rows whose question or answer is
        missing are skipped.
    """
    passages = []
    # Iterate positionally over the two columns so the result does not depend
    # on the frame having a default 0..n-1 index.
    for question, answer in zip(frame[question_col], frame[answer_col]):
        # An empty cell is read back as NaN (a float); concatenating it to a
        # str raises TypeError, so such rows are skipped.
        if pd.isna(question) or pd.isna(answer):
            continue
        passages.append("질문: " + str(question) + " 답변 : " + str(answer))
    if dedupe:
        passages = list(dict.fromkeys(passages))
    return passages


question_cols = [f"질문_{x}" for x in range(1, 3)]
answer_cols = [f"답변_{x}" for x in range(1, 6)]

# Every (question, answer) column pairing, concatenated pair after pair.
# NOTE(review): the displayed frame repeats each id over several rows with the
# same question/answer text, so these corpora presumably contain repeated
# passages and a top-k search may return identical documents. Passing
# dedupe=True would collapse them — confirm before switching, since it changes
# the corpus sizes.
train_data_all = []
for question_col, answer_col in product(question_cols, answer_cols):
    train_data_all.extend(build_qa_texts(train_data, question_col, answer_col))

# First question paired with the first answer.
train_data_short = build_qa_texts(train_data, '질문_1', '답변_1')

# Second question paired with the second answer.
train_data_long = build_qa_texts(train_data, '질문_2', '답변_2')

# Second question paired with the `gpt요약답변` column.
train_data_gpt = build_qa_texts(train_data, '질문_2', 'gpt요약답변')


In [None]:
# Print the number of passages in each corpus, in the order: all, long, short, gpt.
for corpus in (train_data_all, train_data_long, train_data_short, train_data_gpt):
    print(len(corpus))

64400
6440
6440
6440


In [None]:
train_data['category'].value_counts()

마감재       2720
인테리어      1230
시공        1110
마감하자       600
건축구조       310
기타         270
타 마감하자     200
Name: category, dtype: int64

In [None]:
# Keyword arguments for the embedding model: run it on the CUDA device.
model_kwargs = {'device':'cuda'}

# Keyword arguments for encoding: embedding normalization is switched off.
encode_kwargs = {'normalize_embeddings': False}

# Embedding model (repo name taken from CFG) used by the FAISS indexes built in
# the next cell.
embeddings = HuggingFaceEmbeddings(
    model_name=CFG.embeddings_model_repo,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [None]:
def _index_texts(passages):
    """Embed ``passages`` with the shared ``embeddings`` model and return the FAISS store."""
    return FAISS.from_texts(texts=passages, embedding=embeddings)


### create embeddings and DB — one index per corpus variant
# Nothing is written to CFG.Embeddings_path here; each index is rebuilt on every run.
vectordb_all = _index_texts(train_data_all)
vectordb_short = _index_texts(train_data_short)
vectordb_long = _index_texts(train_data_long)
vectordb_gpt = _index_texts(train_data_gpt)

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


# Base prompt: retrieved context followed by the question.
# It has to be defined (not commented out) because the *_PromptBase chains are
# built with `PROMPT`; on a fresh kernel the name would otherwise be undefined.
prompt_template = """
마지막에 질문에 답하려면 다음과 같은 맥락을 사용합니다.
{context}

질문: {question}
답변:"""


PROMPT = PromptTemplate(
    template = prompt_template,
    input_variables = ["context", "question"]
)


# Detailed prompt, variant 1: adds a three-sentence limit and asks the model to
# hold back answers it is not sure about.
prompt_template = """
마지막에 질문에 답하려면 다음과 같은 맥락을 사용합니다.
{context}
3문장 이하로 답변해주세요.
확실하지 않은 답변은 최대한 자제해 주세요.

질문: {question}
답변:"""


PROMPT_detail1 = PromptTemplate(
    template = prompt_template,
    input_variables = ["context", "question"]
)


# Detailed prompt, variant 2: lists formatting rules and includes one worked example.
prompt_template = """
마지막에 질문에 답하려면 다음과 같은 맥락을 사용합니다.
{context}

질문: {question}
답변는 다음과 같은 형식을 따라주세요:
- 답변은 3문장 이하로 작성해주세요.
- 이전 답변의 반복을 피해주세요.
- 프롬프트에 사용된 문구나 관련 문서는 답변에서 제외해주세요.

예시:
질문: 최근 인기 있는 여행지에 대해 알려주세요.
답변: 현재 많은 여행객들이 찾는 인기 있는 여행지 중 하나는 일본의 오사카입니다. 오사카는 독특한 문화와 맛있는 음식으로 유명하며, 오사카성, 도톤보리, 유니버셜 스튜디오 재팬 등의 명소가 있습니다.

답변:"""
PROMPT_detail2 = PromptTemplate(
    template = prompt_template,
    input_variables = ["context", "question"]
)

# The *_PromptDetail chains are built with `PROMPT_detail`, which no cell defines.
# NOTE(review): aliased to variant 1 so the notebook runs top to bottom; which
# variant the recorded PromptDetail results actually used cannot be told from
# the notebook — confirm, and point this at PROMPT_detail2 if that was the one.
PROMPT_detail = PROMPT_detail1

In [None]:
# One similarity retriever per index, each returning the CFG.k closest passages.
# `search_type` is a keyword of as_retriever itself; `search_kwargs` should only
# carry arguments for the underlying search call (here: k).
retriever_all = vectordb_all.as_retriever(search_type = "similarity", search_kwargs = {"k": CFG.k})
retriever_short = vectordb_short.as_retriever(search_type = "similarity", search_kwargs = {"k": CFG.k})
retriever_long = vectordb_long.as_retriever(search_type = "similarity", search_kwargs = {"k": CFG.k})
retriever_gpt = vectordb_gpt.as_retriever(search_type = "similarity", search_kwargs = {"k": CFG.k})

In [None]:
def _stuff_qa_chain(retriever, prompt):
    """Build a verbose "stuff" RetrievalQA chain over ``retriever`` using ``prompt``.

    The chain uses the shared ``llm`` and is configured to return the source
    documents alongside the answer.
    """
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        verbose=True,
        chain_type_kwargs={"prompt": prompt},
    )


# Retrieval over the "all" corpus
qa_chain_RetrievalAll_PromptBase = _stuff_qa_chain(retriever_all, PROMPT)
qa_chain_RetrievalAll_PromptDetail = _stuff_qa_chain(retriever_all, PROMPT_detail)

# Retrieval over the "short" corpus
qa_chain_RetrievalShort_PromptBase = _stuff_qa_chain(retriever_short, PROMPT)
qa_chain_RetrievalShort_PromptDetail = _stuff_qa_chain(retriever_short, PROMPT_detail)

# Retrieval over the "long" corpus
qa_chain_RetrievalLong_PromptBase = _stuff_qa_chain(retriever_long, PROMPT)
qa_chain_RetrievalLong_PromptDetail = _stuff_qa_chain(retriever_long, PROMPT_detail)

# Retrieval over the "gpt" corpus
qa_chain_RetrievalGPT_PromptBase = _stuff_qa_chain(retriever_gpt, PROMPT)
qa_chain_RetrievalGPT_PromptDetail = _stuff_qa_chain(retriever_gpt, PROMPT_detail)

# 답변 테스트

In [None]:
from tqdm import tqdm
import pickle
import random
import json

In [None]:
# Predictions saved by an earlier run, keyed by training id.
path_ = '/content/drive/MyDrive/contest/dacon_hansol_llm/result/V1_Pretrained_KoAlpaca_RAG/train_ans_FineTuned_QLoRa1_RetrievalAll_Prompt_20240225.json'

# Read inside a context manager so the file handle is closed once parsing is done.
with open(path_, encoding="utf-8") as fh:
    pred_before = json.load(fh)
pred_before.keys()


dict_keys(['TRAIN_001', 'TRAIN_002', 'TRAIN_003', 'TRAIN_009', 'TRAIN_019', 'TRAIN_071', 'TRAIN_089', 'TRAIN_090', 'TRAIN_132', 'TRAIN_140', 'TRAIN_149', 'TRAIN_165', 'TRAIN_192', 'TRAIN_298', 'TRAIN_314', 'TRAIN_355', 'TRAIN_418', 'TRAIN_467', 'TRAIN_494', 'TRAIN_553', 'TRAIN_562', 'TRAIN_593', 'TRAIN_643'])

In [None]:
# Take the ids stored in the earlier predictions file as the evaluation sample
# (presumably so the new runs can be compared against it — confirm).
sample_ = list(pred_before.keys())

In [None]:
# sample_ = list(set([1,2,3] + random.sample(range(train_data.shape[0]),20)))
# sample_ = np.sort(sample_)
# sample_

array([  1,   2,   3,   9,  19,  71,  89,  90, 132, 140, 149, 165, 192,
       298, 314, 355, 418, 467, 494, 553, 562, 593, 643])

In [None]:
# Keep the first row of every sampled id, in the order the ids appear in `sample_`.
# The groupby is built once and reused; building it inside the per-id lookup
# would regroup the whole frame for every id.
rows_by_id = train_data.groupby('id')
train_data_sample = pd.concat(
    [rows_by_id.get_group(id_).head(1) for id_ in sample_]
).reset_index(drop=True)

- qa_chain_RetrievalAll_PromptBase
- qa_chain_RetrievalAll_PromptDetail
- qa_chain_RetrievalShort_PromptBase
- qa_chain_RetrievalShort_PromptDetail
- qa_chain_RetrievalLong_PromptBase
- qa_chain_RetrievalLong_PromptDetail
- qa_chain_RetrievalGPT_PromptBase
- qa_chain_RetrievalGPT_PromptDetail

In [None]:
# Answer both question variants of every sampled row with the
# RetrievalAll / PromptBase chain and store answers, sources and references.
result = dict()
# iterrows() has no length, so total= is passed to give tqdm a real progress bar.
for idx, row in tqdm(train_data_sample.iterrows(), total=len(train_data_sample)):
  Q1 = row['질문_1']
  Q2 = row['질문_2']

  # .invoke replaces the deprecated chain(...) call; "query" is RetrievalQA's input key.
  A1 = qa_chain_RetrievalAll_PromptBase.invoke({"query": Q1})
  A2 = qa_chain_RetrievalAll_PromptBase.invoke({"query": Q2})

  result[row['id']] = {'Q1':Q1,
                       'A1':A1['result'],
                       'source1':[str(doc) for doc in A1['source_documents']],
                       'Q2':Q2,
                       'A2':A2['result'],
                       'source2':[str(doc) for doc in A2['source_documents']],
                       # 'truth1' was previously written under a second 'truth2' key,
                       # so the 답변_1 reference was silently overwritten.
                       'truth1':row['답변_1'],
                       'truth2':row['답변_2']}

# Serialize data into file; the context manager closes (and flushes) the handle.
with open(os.path.join(base_path,"train_ans_FineTuned_QLoRa1_RetrievalAll_Prompt_20240225.json"), 'w') as fh:
  json.dump(result, fh)

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


1it [03:55, 235.93s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


2it [06:58, 204.72s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


3it [09:31, 180.75s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


4it [12:08, 171.76s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


5it [14:41, 164.64s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


6it [17:13, 160.51s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


7it [18:22, 130.61s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


8it [20:59, 139.05s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


9it [23:26, 141.36s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


10it [26:03, 146.44s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


11it [28:38, 148.83s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


12it [31:05, 148.37s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


13it [33:38, 149.77s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


14it [36:12, 150.91s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


15it [38:46, 152.03s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


16it [41:24, 153.71s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


17it [43:11, 139.77s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


18it [45:48, 144.79s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


19it [48:17, 146.07s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


20it [50:56, 150.18s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


21it [53:29, 151.03s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


22it [56:08, 153.18s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


23it [58:47, 153.38s/it]


[1m> Finished chain.[0m





In [None]:
# Answer both question variants of every sampled row with the
# RetrievalAll / PromptDetail chain and store answers, sources and references.
result = dict()
# iterrows() has no length, so total= is passed to give tqdm a real progress bar.
for idx, row in tqdm(train_data_sample.iterrows(), total=len(train_data_sample)):
  Q1 = row['질문_1']
  Q2 = row['질문_2']

  # .invoke replaces the deprecated chain(...) call; "query" is RetrievalQA's input key.
  A1 = qa_chain_RetrievalAll_PromptDetail.invoke({"query": Q1})
  A2 = qa_chain_RetrievalAll_PromptDetail.invoke({"query": Q2})

  result[row['id']] = {'Q1':Q1,
                       'A1':A1['result'],
                       'source1':[str(doc) for doc in A1['source_documents']],
                       'Q2':Q2,
                       'A2':A2['result'],
                       'source2':[str(doc) for doc in A2['source_documents']],
                       # 'truth1' was previously written under a second 'truth2' key,
                       # so the 답변_1 reference was silently overwritten.
                       'truth1':row['답변_1'],
                       'truth2':row['답변_2']}

# Serialize data into file; the context manager closes (and flushes) the handle.
with open(os.path.join(base_path,"train_ans_FineTuned_QLoRa1_RetrievalAll_PrompDetail_20240302.json"), 'w') as fh:
  json.dump(result, fh)

0it [00:00, ?it/s]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


1it [02:29, 149.47s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


2it [05:09, 155.80s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


3it [07:23, 145.96s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


4it [10:02, 151.04s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


5it [12:34, 151.44s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


6it [15:07, 151.82s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


7it [17:41, 152.38s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


8it [20:18, 153.85s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


9it [22:44, 151.63s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


10it [25:22, 153.52s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


11it [27:57, 153.88s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


12it [30:26, 152.50s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


13it [31:58, 134.11s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


14it [34:35, 140.94s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


15it [37:13, 146.18s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


16it [39:52, 149.99s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


17it [42:23, 150.49s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


18it [45:00, 152.23s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


19it [47:28, 151.07s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


20it [50:07, 153.36s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


21it [52:38, 152.62s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


22it [55:15, 153.93s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


23it [57:53, 151.04s/it]


[1m> Finished chain.[0m





In [None]:
# Run both questions of every sampled training row through the
# `qa_chain_RetrievalLong_PromptBase` chain and collect, per row id, the
# questions, generated answers, retrieved source documents and ground truths.
result = dict()
# iterrows() is a generator without a length, so give tqdm the total explicitly
# to get a real progress bar / ETA instead of a bare iteration counter.
for idx, row in tqdm(train_data_sample.iterrows(), total=len(train_data_sample)):
  Q1 = row['질문_1']
  Q2 = row['질문_2']

  # Each chain call returns a mapping with the generated text under 'result'
  # and the retrieved documents under 'source_documents'.
  A1 = qa_chain_RetrievalLong_PromptBase(Q1)
  A2 = qa_chain_RetrievalLong_PromptBase(Q2)

  result[row['id']] = {
      'Q1': Q1,
      'A1': A1['result'],
      # Documents are stringified so the record is JSON-serializable.
      'source1': [str(doc) for doc in A1['source_documents']],
      'Q2': Q2,
      'A2': A2['result'],
      'source2': [str(doc) for doc in A2['source_documents']],
      # Fixed: this key used to be a second 'truth2', so the ground truth for
      # question 1 was silently overwritten by the one for question 2.
      'truth1': row['답변_1'],
      'truth2': row['답변_2'],
  }

# Serialize data into file (the context manager flushes and closes the handle):
output_path = os.path.join(base_path, "train_ans_FineTuned_QLoRa1_RetrievalLong_PromptBase_20240302.json")
with open(output_path, 'w') as fp:
  json.dump(result, fp)

0it [00:00, ?it/s]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


1it [01:38, 98.53s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


2it [04:26, 139.57s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


3it [07:08, 149.45s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


4it [09:40, 150.75s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


5it [12:15, 152.10s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


6it [14:49, 152.83s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


7it [17:26, 154.21s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


8it [20:00, 154.08s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


9it [22:34, 154.14s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


10it [25:08, 154.10s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


11it [27:41, 153.85s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


12it [29:07, 133.05s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


13it [31:39, 138.67s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


14it [33:16, 126.22s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


15it [35:51, 134.82s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


16it [38:27, 141.31s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


17it [41:01, 144.99s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


18it [43:35, 147.94s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


19it [46:14, 151.24s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


20it [48:48, 152.10s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


21it [50:22, 134.59s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


22it [52:57, 140.53s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


23it [55:34, 144.99s/it]


[1m> Finished chain.[0m





In [None]:
# Run both questions of every sampled training row through the
# `qa_chain_RetrievalLong_PromptDetail` chain and collect, per row id, the
# questions, generated answers, retrieved source documents and ground truths.
result = dict()
# iterrows() is a generator without a length, so give tqdm the total explicitly
# to get a real progress bar / ETA instead of a bare iteration counter.
for idx, row in tqdm(train_data_sample.iterrows(), total=len(train_data_sample)):
  Q1 = row['질문_1']
  Q2 = row['질문_2']

  # Each chain call returns a mapping with the generated text under 'result'
  # and the retrieved documents under 'source_documents'.
  A1 = qa_chain_RetrievalLong_PromptDetail(Q1)
  A2 = qa_chain_RetrievalLong_PromptDetail(Q2)

  result[row['id']] = {
      'Q1': Q1,
      'A1': A1['result'],
      # Documents are stringified so the record is JSON-serializable.
      'source1': [str(doc) for doc in A1['source_documents']],
      'Q2': Q2,
      'A2': A2['result'],
      'source2': [str(doc) for doc in A2['source_documents']],
      # Fixed: this key used to be a second 'truth2', so the ground truth for
      # question 1 was silently overwritten by the one for question 2.
      'truth1': row['답변_1'],
      'truth2': row['답변_2'],
  }

# Serialize data into file (the context manager flushes and closes the handle):
output_path = os.path.join(base_path, "train_ans_FineTuned_QLoRa1_RetrievalLong_PromptDetail_20240302.json")
with open(output_path, 'w') as fp:
  json.dump(result, fp)

0it [00:00, ?it/s]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


1it [01:32, 92.21s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


2it [04:19, 136.53s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


3it [07:00, 147.70s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


4it [09:31, 148.96s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


5it [12:05, 150.71s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


6it [14:39, 151.92s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


7it [17:16, 153.51s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


8it [19:50, 153.76s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


9it [22:24, 153.88s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


10it [25:00, 154.39s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


11it [26:34, 135.88s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


12it [28:03, 121.55s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


13it [29:32, 111.77s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


14it [32:04, 123.84s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


15it [34:38, 133.11s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


16it [37:14, 140.06s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


17it [39:49, 144.47s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


18it [42:26, 148.13s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


19it [45:07, 152.07s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


20it [47:42, 152.91s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


21it [50:17, 153.69s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


22it [52:53, 154.28s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


In [None]:
# Run both questions of every sampled training row through the
# `qa_chain_RetrievalShort_PromptBase` chain and collect, per row id, the
# questions, generated answers, retrieved source documents and ground truths.
result = dict()
# iterrows() is a generator without a length, so give tqdm the total explicitly
# to get a real progress bar / ETA instead of a bare iteration counter.
for idx, row in tqdm(train_data_sample.iterrows(), total=len(train_data_sample)):
  Q1 = row['질문_1']
  Q2 = row['질문_2']

  # Each chain call returns a mapping with the generated text under 'result'
  # and the retrieved documents under 'source_documents'.
  A1 = qa_chain_RetrievalShort_PromptBase(Q1)
  A2 = qa_chain_RetrievalShort_PromptBase(Q2)

  result[row['id']] = {
      'Q1': Q1,
      'A1': A1['result'],
      # Documents are stringified so the record is JSON-serializable.
      'source1': [str(doc) for doc in A1['source_documents']],
      'Q2': Q2,
      'A2': A2['result'],
      'source2': [str(doc) for doc in A2['source_documents']],
      # Fixed: this key used to be a second 'truth2', so the ground truth for
      # question 1 was silently overwritten by the one for question 2.
      'truth1': row['답변_1'],
      'truth2': row['답변_2'],
  }

# Serialize data into file (the context manager flushes and closes the handle):
output_path = os.path.join(base_path, "train_ans_FineTuned_QLoRa1_RetrievalShort_PromptBase_20240302.json")
with open(output_path, 'w') as fp:
  json.dump(result, fp)

0it [00:00, ?it/s]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


1it [01:27, 87.90s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


2it [03:54, 122.32s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


3it [06:34, 139.39s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


4it [08:59, 141.98s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


5it [11:27, 143.83s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


6it [13:55, 145.32s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


7it [16:21, 145.62s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


8it [18:02, 131.54s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


9it [20:27, 135.56s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


10it [22:03, 123.46s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


11it [24:32, 131.08s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


12it [26:56, 135.04s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


13it [29:22, 138.53s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


14it [29:39, 101.78s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


15it [32:19, 119.37s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


16it [34:47, 127.87s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


17it [37:15, 133.90s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


18it [39:41, 137.66s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


19it [42:08, 140.44s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


20it [44:41, 144.17s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


21it [45:12, 110.36s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


22it [46:38, 103.05s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


23it [49:07, 128.17s/it]


[1m> Finished chain.[0m





In [None]:
# Run both questions of every sampled training row through the
# `qa_chain_RetrievalShort_PromptDetail` chain and collect, per row id, the
# questions, generated answers, retrieved source documents and ground truths.
result = dict()
# iterrows() is a generator without a length, so give tqdm the total explicitly
# to get a real progress bar / ETA instead of a bare iteration counter.
for idx, row in tqdm(train_data_sample.iterrows(), total=len(train_data_sample)):
  Q1 = row['질문_1']
  Q2 = row['질문_2']

  # Each chain call returns a mapping with the generated text under 'result'
  # and the retrieved documents under 'source_documents'.
  A1 = qa_chain_RetrievalShort_PromptDetail(Q1)
  A2 = qa_chain_RetrievalShort_PromptDetail(Q2)

  result[row['id']] = {
      'Q1': Q1,
      'A1': A1['result'],
      # Documents are stringified so the record is JSON-serializable.
      'source1': [str(doc) for doc in A1['source_documents']],
      'Q2': Q2,
      'A2': A2['result'],
      'source2': [str(doc) for doc in A2['source_documents']],
      # Fixed: this key used to be a second 'truth2', so the ground truth for
      # question 1 was silently overwritten by the one for question 2.
      'truth1': row['답변_1'],
      'truth2': row['답변_2'],
  }

# Serialize data into file (the context manager flushes and closes the handle):
output_path = os.path.join(base_path, "train_ans_FineTuned_QLoRa1_RetrievalShort_PromptDetail_20240302.json")
with open(output_path, 'w') as fp:
  json.dump(result, fp)

0it [00:00, ?it/s]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


1it [02:25, 145.72s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


2it [04:52, 146.48s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


3it [07:33, 153.15s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


4it [09:59, 150.38s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


5it [12:27, 149.44s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


6it [13:50, 126.95s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


7it [16:17, 133.50s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


8it [18:46, 138.24s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


9it [21:12, 140.65s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


10it [22:38, 123.93s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


11it [24:16, 115.91s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


12it [25:34, 104.41s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


13it [26:55, 97.32s/it] 


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


14it [29:23, 112.68s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


15it [32:05, 127.39s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


16it [34:33, 133.63s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


17it [37:02, 138.20s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


18it [39:29, 140.86s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


19it [41:56, 142.72s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


20it [43:41, 131.40s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


In [None]:
# Run both questions of every sampled training row through the
# `qa_chain_RetrievalGPT_PromptBase` chain and collect, per row id, the
# questions, generated answers, retrieved source documents and ground truths.
result = dict()
# iterrows() is a generator without a length, so give tqdm the total explicitly
# to get a real progress bar / ETA instead of a bare iteration counter.
for idx, row in tqdm(train_data_sample.iterrows(), total=len(train_data_sample)):
  Q1 = row['질문_1']
  Q2 = row['질문_2']

  # Each chain call returns a mapping with the generated text under 'result'
  # and the retrieved documents under 'source_documents'.
  A1 = qa_chain_RetrievalGPT_PromptBase(Q1)
  A2 = qa_chain_RetrievalGPT_PromptBase(Q2)

  result[row['id']] = {
      'Q1': Q1,
      'A1': A1['result'],
      # Documents are stringified so the record is JSON-serializable.
      'source1': [str(doc) for doc in A1['source_documents']],
      'Q2': Q2,
      'A2': A2['result'],
      'source2': [str(doc) for doc in A2['source_documents']],
      # Fixed: this key used to be a second 'truth2', so the ground truth for
      # question 1 was silently overwritten by the one for question 2.
      'truth1': row['답변_1'],
      'truth2': row['답변_2'],
  }

# Serialize data into file (the context manager flushes and closes the handle):
output_path = os.path.join(base_path, "train_ans_FineTuned_QLoRa1_RetrievalGPT_PromptBase_20240302.json")
with open(output_path, 'w') as fp:
  json.dump(result, fp)

0it [00:00, ?it/s]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


1it [02:39, 159.36s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


2it [05:32, 167.63s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


3it [08:29, 171.90s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


4it [11:12, 168.13s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


5it [13:58, 167.57s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


6it [14:46, 126.72s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


7it [17:30, 138.94s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


8it [20:15, 147.42s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


9it [22:52, 150.44s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


10it [25:34, 153.86s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


11it [28:14, 155.63s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


12it [30:55, 157.36s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


13it [33:29, 156.52s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


14it [36:13, 158.50s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


15it [38:52, 158.73s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


16it [41:32, 159.26s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


17it [44:07, 158.03s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


18it [46:18, 149.71s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


19it [49:02, 154.13s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


20it [51:54, 159.46s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


21it [53:51, 146.57s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


22it [56:37, 152.45s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


23it [59:21, 154.84s/it]


[1m> Finished chain.[0m





In [None]:
# Run both questions of every sampled training row through the
# `qa_chain_RetrievalGPT_PromptDetail` chain and collect, per row id, the
# questions, generated answers, retrieved source documents and ground truths.
result = dict()
# iterrows() is a generator without a length, so give tqdm the total explicitly
# to get a real progress bar / ETA instead of a bare iteration counter.
for idx, row in tqdm(train_data_sample.iterrows(), total=len(train_data_sample)):
  Q1 = row['질문_1']
  Q2 = row['질문_2']

  # Each chain call returns a mapping with the generated text under 'result'
  # and the retrieved documents under 'source_documents'.
  A1 = qa_chain_RetrievalGPT_PromptDetail(Q1)
  A2 = qa_chain_RetrievalGPT_PromptDetail(Q2)

  result[row['id']] = {
      'Q1': Q1,
      'A1': A1['result'],
      # Documents are stringified so the record is JSON-serializable.
      'source1': [str(doc) for doc in A1['source_documents']],
      'Q2': Q2,
      'A2': A2['result'],
      'source2': [str(doc) for doc in A2['source_documents']],
      # Fixed: this key used to be a second 'truth2', so the ground truth for
      # question 1 was silently overwritten by the one for question 2.
      'truth1': row['답변_1'],
      'truth2': row['답변_2'],
  }

# Serialize data into file (the context manager flushes and closes the handle):
output_path = os.path.join(base_path, "train_ans_FineTuned_QLoRa1_RetrievalGPT_PromptDetail_20240302.json")
with open(output_path, 'w') as fp:
  json.dump(result, fp)

0it [00:00, ?it/s]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


1it [01:48, 108.29s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


2it [04:08, 127.21s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


3it [07:07, 150.63s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


4it [09:01, 136.28s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


5it [11:48, 147.35s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


6it [14:34, 153.55s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


7it [17:18, 156.95s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


8it [20:03, 159.72s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


9it [22:40, 158.75s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


10it [24:39, 146.60s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


11it [27:19, 150.49s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


12it [29:59, 153.57s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


13it [32:33, 153.75s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


14it [35:15, 156.21s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


15it [37:53, 156.82s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


16it [40:34, 157.98s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


17it [43:09, 157.15s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


18it [45:14, 147.26s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


19it [47:59, 152.61s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


20it [50:51, 158.46s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


21it [53:38, 160.99s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


22it [56:24, 162.68s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


23it [59:09, 154.35s/it]


[1m> Finished chain.[0m





In [None]:
# Cosine similarity formula applied to a single sample
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Args:
        a: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``, or the integer ``0`` when either
        vector has zero norm (avoids a division by zero).
    """
    inner = np.dot(a, b)
    length_a = np.linalg.norm(a)
    length_b = np.linalg.norm(b)

    # Degenerate case: similarity is undefined for a zero vector, report 0.
    if length_a == 0 or length_b == 0:
        return 0

    return inner / (length_a * length_b)

In [None]:
# Score every (prediction, ground truth) pair by the cosine similarity of their
# embeddings, print each pair with its score, then print the mean score.
# NOTE(review): `preds`, `gts` and `model` are not defined in this cell; `model`
# must expose `.encode(text)` — presumably a sentence-embedding model rather than
# the causal LM loaded at the top of the notebook. Confirm an earlier cell
# defines them so the notebook survives Restart & Run All.
sample_scores = []
# NOTE(review): zip() stops at the shorter input, so a length mismatch between
# `preds` and `gts` would be silently ignored — verify they are aligned.
for pred, gt in zip(preds, gts):
    # Convert the answer texts into embedding vectors
    # (512-dimensional according to the original note — TODO confirm for `model`)
    pred_embed = model.encode(pred)
    gt_embed = model.encode(gt)

    sample_score = cosine_similarity(gt_embed, pred_embed)
    # Treat a negative cosine similarity score as 0
    sample_score = max(sample_score, 0)
    print('예측 : ', pred)
    print('정답 : ', gt)
    print('Cosine Similarity Score : ', sample_score)
    print('-'*20)
    sample_scores.append(sample_score)
# Mean of the clamped per-sample scores over all pairs
print('전체 샘플의 Cosine Similarity Score 평균 : ', np.mean(sample_scores))