# 0. Env

In [None]:
import os
import json
import pandas as pd
from tqdm.auto import tqdm

import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          pipeline,
                          TrainingArguments)
from peft import (LoraConfig,
                  PeftModel)

import faiss

# 1. RAG

In [None]:
# Hugging Face model ID of the LLM used for inference (answer generation)
SLLM_MODEL_ID = 'google/gemma-1.1-2b-it'
# Hugging Face model ID of the sentence-embedding model
EMBE_MODEL_ID = 'snunlp/KR-SBERT-V40K-klueNLI-augSTS'
# Paste your Hugging Face access token here; it is passed as `token=` when
# the LLM and its tokenizer are loaded.
HF_TOKEN = ""

In [None]:
# Create the SentenceBERT embedding model; gen_context() uses it to embed
# the user's query before searching the vector index.
embd_model = SentenceTransformer(EMBE_MODEL_ID)

In [None]:
# Load every chunk text from the chunk database.
# The file is read as JSON Lines: one JSON object per line, each with a
# 'chunk' field. The encoding is pinned to UTF-8 so that reading does not
# depend on the platform's locale default, and blank lines are skipped
# instead of crashing json.loads().
with open("data/chunk_db.json", encoding="utf-8") as f:
    full_chunks = [json.loads(line)['chunk'] for line in f if line.strip()]
# Notebook display: number of chunks loaded
len(full_chunks)

In [None]:
# Load the prebuilt FAISS index that holds the chunk embeddings
faissindex_file = "data/faiss_flat_l2.index"
vdb_index = faiss.read_index(faissindex_file)
# Notebook display: the index type and its `ntotal` (vector count)
type(vdb_index), vdb_index.ntotal

In [None]:
# Declare the 4-bit quantization settings used when loading the LLM:
# NF4 quantization type with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [None]:
# Load the LLM with the 4-bit quantization config declared above.
# Device placement is delegated to device_map='auto'; HF_TOKEN is the
# access token used for the download.
sllm_model = AutoModelForCausalLM.from_pretrained(SLLM_MODEL_ID,
                                                  device_map='auto',
                                                  quantization_config=quantization_config,
                                                  token=HF_TOKEN)

In [None]:
# Load the tokenizer that belongs to the same model ID as the LLM
sllm_tokenizer = AutoTokenizer.from_pretrained(SLLM_MODEL_ID,
                                          add_special_tokens=True,
                                          token=HF_TOKEN)
# Pad sequences on the right-hand side
sllm_tokenizer.padding_side = 'right'

In [None]:
# LLM inference pipeline: text generation with the quantized model and its
# tokenizer, capped at 512 newly generated tokens per call.
# https://huggingface.co/docs/transformers/main_classes/pipelines
pipe = pipeline("text-generation",
                model=sllm_model,
                tokenizer=sllm_tokenizer,
                max_new_tokens=512)
# Notebook display of the pipeline object
pipe

In [None]:
def gen_context(query, top_n):
    """Build the retrieval context for a query.

    The query is embedded with the module-level ``embd_model``, the
    ``top_n`` nearest entries are looked up in ``vdb_index``, and the
    matching texts from ``full_chunks`` are joined with blank lines.

    Args:
        query: The user's question text.
        top_n: Number of nearest chunks to request from the index.

    Returns:
        The retrieved chunk texts joined by ``'\\n\\n'``, in the order the
        index returned them. Empty string if nothing was retrieved.
    """
    # NOTE(review): the query is L2-normalized here; presumably the index
    # was built from normalized embeddings as well - confirm.
    query_embedding = embd_model.encode(query, normalize_embeddings=True)
    # The index is searched with a 2-D array holding one row per query.
    query_embeddings = query_embedding.reshape(1, -1)
    _, indices = vdb_index.search(query_embeddings, top_n)
    # FAISS fills missing results with label -1 when fewer than top_n
    # neighbours exist; without this filter full_chunks[-1] would silently
    # add the last chunk to the context.
    context = [full_chunks[i] for i in indices[0] if i >= 0]
    return '\n\n'.join(context)

In [None]:
def gen_prompt(pipe, context, query):
    """Render the chat-formatted prompt for one RAG question.

    Args:
        pipe: Object whose ``tokenizer`` provides ``apply_chat_template``.
        context: Retrieved reference text to ground the answer on.
        query: The user's question.

    Returns:
        Whatever ``apply_chat_template`` returns for a single user message,
        called with ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    # Instruction line (Korean): answer the 'question' from the material
    # below rather than relying on the model's own knowledge.
    instruction = "당신이 가진 지식을 의존하지 말고 다음 내용을 참고해서 '질문'에 대해서 답변해 주세요.:"
    user_content = f"{instruction}\n\n{context}\n\n질문: {query}"
    conversation = [{"role": "user", "content": user_content}]
    return pipe.tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

In [None]:
def gen_response(pipe, query, top_n=5):
    """Answer a query with retrieval-augmented generation.

    Args:
        pipe: Text-generation pipeline; called with the prompt and the
            sampling options below.
        query: The user's question.
        top_n: Number of chunks to retrieve as context (default 5).

    Returns:
        The first output's ``generated_text`` with its leading
        ``len(prompt)`` characters removed.
    """
    prompt = gen_prompt(pipe, gen_context(query, top_n), query)

    sampling_options = {
        "do_sample": True,
        "temperature": 0.2,
        "top_k": 50,
        "top_p": 0.95,
    }
    generated = pipe(prompt, **sampling_options)[0]["generated_text"]
    # Presumably the generated text starts with the prompt itself, so the
    # slice keeps only the model's answer - confirm against pipeline docs.
    return generated[len(prompt):]

In [None]:
# Interactive question/answer loop; an empty (or whitespace-only) question
# ends the session.
while True:
    doc = input('질문 > ').strip()
    if not doc:
        break
    result = gen_response(pipe, doc)
    print(f'답변 > {result}\n\n')