# RAG 방식을 이용한 번역기 예

In [None]:
# Install the third-party packages this notebook depends on (one pip call per package).
!pip install accelerate 
!pip install einops 
!pip install langchain 
!pip install xformers 
!pip install bitsandbytes 
!pip install sentence_transformers 
!pip install chromadb 
!pip install langchain_community

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Upgrade transformers and tokenizers to the newest releases pip can resolve.
!pip install --upgrade transformers tokenizers

In [None]:
import sys
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
#import chromadb
#from chromadb.config import Settings
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
## transformers must be version 4.34 or later; with older versions various errors may occur.
print(transformers.__version__)

In [None]:
import torch
# Print whether PyTorch reports a usable CUDA device.
print(torch.cuda.is_available())

In [None]:
# Switch the libraries for the M1 (Apple silicon) setup.
!pip install torch torchvision torchaudio

In [None]:
# Pick the Metal (MPS) backend when PyTorch reports it as available,
# otherwise fall back to the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

print(f"using device: {device}")

In [9]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face model identifier used by the loading cells below.
# model_id = 'MLP-KTLim/llama-3-Korean-Bllossom-8B'
model_id = 'Bllossom/llama-3.2-Korean-Bllossom-3B'

# 4-bit NF4 quantization settings with double quantization and bfloat16 compute.
# NOTE(review): `bfloat16` is not imported in this cell; it relies on the earlier
# `from torch import bfloat16` having been executed.
# NOTE(review): bnb_config is not passed to any from_pretrained call visible in
# this notebook — confirm whether quantized loading was intended.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)
# print(cuda.current_device())

In [None]:
# Load the config, model and tokenizer for `model_id`, and report how long it took.
time_start = time()
# NOTE(review): max_new_tokens is a generation setting; confirm that passing it
# to AutoConfig.from_pretrained has the intended effect.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    trust_remote_code=True,
    max_new_tokens=1024
)
# No dtype, device or quantization arguments are given here, so library defaults apply.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config = model_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
time_end = time()
print(f"Prepare model, tokenizer: {round(time_end-time_start, 3)} sec.")

In [None]:
import transformers
import torch
from time import time

# Build the text-generation pipeline used by the rest of the notebook and run
# one sample chat completion as a smoke test.
time_start = time()

# model_id = 'MLP-KTLim/llama-3-Korean-Bllossom-8B'
model_id = 'Bllossom/llama-3.2-Korean-Bllossom-3B'

# Detected for reference only; actual placement is delegated to device_map="auto" below.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Load the model and tokenizer.
model = transformers.AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# The model is already instantiated with its dtype and placement, so the
# pipeline is not given torch_dtype/device_map again (the previous
# device_map={"":0} was a CUDA-style mapping that does not describe MPS/CPU).
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
pipeline.model.eval()
print(f"Prepare model, tokenizer, pipeline: {round(time() - time_start, 3)} sec.")

prompt = "You are a helpful AI assistant. Please answer the user's questions kindly. 당신은 유능한 AI 어시스턴트 입니다. 사용자의 질문에 대해 친절하게 답변해주세요."
question = "서울의 유명한 관광 코스를 만들어줄래?"

# The instruction goes in the system turn and the question in the user turn;
# sending the question as a second system message leaves the chat without a
# user message to answer.
messages = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": question},
]

# Render the chat into a single prompt string that ends with the assistant header.
chat_prompt = pipeline.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
# Stop on either the tokenizer's EOS token or the end-of-turn token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

outputs = pipeline(
    chat_prompt,
    max_new_tokens=2048,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

# The pipeline returns prompt + completion; show only the newly generated part.
print(outputs[0]["generated_text"][len(chat_prompt):])


In [None]:
# Load the dataset.
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader

# df = pd.read_csv("/content/drive/MyDrive/part4/history.csv", encoding='utf-8-sig')
df = pd.read_csv("/rag_csv_data/history.csv", encoding='utf-8-sig')

# Keep only the source/translation columns and the first 100 rows.
df = df[['extracted_text', 'translate_text']].head(100)
print(df.head(10))

# Turn each row into a document whose page content is `extracted_text`.
# From here on `df` holds the loaded documents rather than a DataFrame.
loader = DataFrameLoader(df, page_content_column="extracted_text")
df = loader.load()

In [None]:
from langchain_community.document_loaders import DataFrameLoader
from langchain.document_loaders import TextLoader
from langchain.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import llamacpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from textwrap import fill
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chains import LLMChain
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores.utils import DistanceStrategy
import pandas as pd

# Load the history parallel corpus.
df = pd.read_csv("/rag_csv_data/history.csv", encoding='utf-8-sig')

# Keep only the source/translation columns and the first 100 rows.
df = df[['extracted_text', 'translate_text']]
df = df[:100]
print(df.head(10))
# Turn each row into a document whose page content is `extracted_text`.
loader = DataFrameLoader(df, page_content_column="extracted_text")
# From here on `df` holds the loaded documents rather than a DataFrame.
df = loader.load()

In [None]:
# Split the loaded documents into chunks of at most 1000 characters with a 100-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
all_splits = text_splitter.split_documents(df)
# Display the first chunk as the cell output.
all_splits[0]

In [None]:
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings
)
import torch

# Sentence-embedding model used to index and query the documents.
model_name = "jhgan/ko-sroberta-multitask"
# Use the MPS backend when PyTorch reports it and fall back to the CPU
# otherwise, so the cell does not fail on machines without MPS.
model_kwargs = {'device': 'mps' if torch.backends.mps.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = SentenceTransformerEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [None]:
from langchain_chroma import Chroma

# Embed every chunk and index it in a Chroma vector store.
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings)
# Show up to ten stored records (skipping the first) with their metadata, text and embeddings.
vectordb.get(include=["metadatas", "documents", "embeddings"], limit=10, offset=1)

In [None]:
# Run a similarity search against the vector store with the full instruction text as the query.
query = " Please translate the following sentence. Please construct the translation by referring to metadat. \n\"new\" British history and Atlantic history in the early 1970"
docs = vectordb.similarity_search(query)

# Print the text and metadata of the first returned document.
print(docs[0].page_content)
print(docs[0].metadata)


In [None]:
# Wrap the local text-generation pipeline as a LangChain LLM and build a
# retrieval QA chain on top of the vector store.
retriever = vectordb.as_retriever()
llm = HuggingFacePipeline(pipeline=pipeline)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
    # Needed so that later cells can read response["source_documents"].
    return_source_documents=True,
)
# Display the configured chain as the cell output.
qa

In [None]:
# Ask the QA chain to translate the sentence into Korean and print the full response.
query = "Please translate the following sentences into Korean. Please construct the translation by referring to metadat. \n\"new\" British history and Atlantic history in the early 1970"

response = qa(query)
print(response)

In [None]:
# Source documents returned for the previous query.
print(response["source_documents"])

# Run the chain again with only the bare sentence as the query.
query = "\"new\" British history and Atlantic history in the early 1970"

response = qa(query)
result = response['source_documents']
print(result)

# Narrow down to the first source document.
result = result[0]
print(result)

In [None]:
# Load the second corpus (original text paired with its modern translation).
df = pd.read_csv("/rag_csv_data/train_0.csv", encoding='utf-8-sig')
# Keep only the two text columns and the first 100 rows.
df = df[['original', 'modern translation']]
df = df[:100]
print(df.head(10))
# NOTE(review): page content is the "modern translation" column here, while the
# queries used below look like original-style text — confirm which column
# should be indexed.
loader = DataFrameLoader(df, page_content_column="modern translation")
# From here on `df` holds the loaded documents rather than a DataFrame.
df = loader.load()

In [None]:
# Split the loaded documents into chunks of at most 1000 characters with a 100-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
all_splits = text_splitter.split_documents(df)

In [None]:
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings
)
# Recreate the embedding model, this time requesting the CPU device.
model_name = "jhgan/ko-sroberta-multitask"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = SentenceTransformerEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [None]:
from langchain_chroma import Chroma

# Index the second corpus in its own collection. With the default collection
# name, the in-process Chroma client would reuse the collection that already
# holds the history documents, and retrieval would mix the two datasets.
vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    collection_name="train_0",
)
# Show up to ten stored records (skipping the first) with their metadata, text and embeddings.
vectordb.get(include=["metadatas", "documents", "embeddings"], limit=10, offset=1)

query = "전하 그 덕망을 승히 여기사 벼슬을 돋우어 이조판서로 좌의정을 하게 하시니, 승상이 국은을 감동하여 갈충보국하니 사방에 일이 업고 도적이 없으매 시화연풍하여 나라가 태평하더라"
docs = vectordb.similarity_search(query)

# Print the text and metadata of the first returned document.
print(docs[0].page_content)
print(docs[0].metadata)

In [None]:
# Rebuild the retriever and QA chain on top of the current `vectordb`,
# reusing the same text-generation pipeline as the LLM.
retriever = vectordb.as_retriever()
llm = HuggingFacePipeline(pipeline=pipeline)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
    # Later cells read response['source_documents'].
    return_source_documents=True,
)

In [None]:
# Send the sentence to the QA chain and print the full response.
query = "전하 그 덕망을 승히 여기사 벼슬을 돋우어 이조판서로 좌의정을 하게 하시니, 승상이 국은을 감동하여 갈충보국하니 사방에 일이 업고 도적이 없으매 시화연풍하여 나라가 태평하더라"
response = qa(query)
print(response)

# NOTE(review): the chain is invoked a second time with the same query here;
# confirm the repeated call is intentional.
response = qa(query)
result = response['source_documents']
print(response['source_documents'])


In [None]:
# Print only the first source document of the last response.
result = response['source_documents'][0]
print(result)