# RAG 실습
- RAG with ChromaDB
- RAG CSV

## RAG with ChromaDB

In [2]:
!pip uninstall -y pyarrow
!pip install -U datasets langchain_community

Found existing installation: pyarrow 19.0.0
Uninstalling pyarrow-19.0.0:
  Successfully uninstalled pyarrow-19.0.0
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-19.0.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (3.3 kB)
Using cached pyarrow-19.0.0-cp312-cp312-macosx_12_0_arm64.whl (30.7 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-19.0.0


In [3]:
!pip install google-labs-html-chunker



In [4]:
from google_labs_html_chunker.html_chunker import HtmlChunker

from urllib.request import urlopen

# Download the Gemma announcement blog post and decode the raw bytes as UTF-8.
BLOG_POST_URL = (
    "https://developers.googleblog.com/en/gemma-family-and-toolkit-expansion-io-2024/"
)
with urlopen(BLOG_POST_URL) as response:
    html = response.read().decode("utf-8")

# Split the page into passages; tags that carry no readable content are excluded.
chunker = HtmlChunker(
    html_tags_to_exclude={"style", "script", "noscript"},
    greedily_aggregate_sibling_nodes=True,
    max_words_per_aggregate_passage=200,
)
passages = chunker.chunk(html)

In [5]:
# Print each extracted passage on its own line to inspect the chunking result.
for chunk_text in passages:
    print(chunk_text)

Introducing PaliGemma, Gemma 2, and an Upgraded Responsible AI Toolkit
            
            
            - Google Developers Blog
Products Develop Android Chrome ChromeOS Cloud Firebase Flutter Google Assistant Google Maps Platform Google Workspace TensorFlow YouTube Grow Firebase Google Ads Google Analytics Google Play Search Web Push and Notification APIs Earn AdMob Google Ads API Google Pay Google Play Billing Interactive Media Ads Solutions Events Learn Community Groups Google Developer Groups Google Developer Student Clubs Woman Techmakers Google Developer Experts Tech Equity Collective Programs Accelerator Solution Challenge DevFest Stories All Stories Developer Program Blog Search English English Español (Latam) Bahasa Indonesia 日本語 한국어 Português (Brasil) 简体中文
Products More Solutions Events Learn Community More Developer Program Blog Develop Android Chrome ChromeOS Cloud Firebase Flutter Google Assistant Google Maps Platform Google Workspace TensorFlow YouTube Grow Firebase 

In [5]:
!pip install chromadb



In [10]:
import chromadb

# Chroma client created with default settings. Documents added below are
# embedded by Chroma's default embedding function (the all-MiniLM-L6-v2 ONNX
# model downloaded in this cell's output).
chroma_client = chromadb.Client()

# get_or_create_collection + upsert keep this cell re-runnable in a live
# kernel: create_collection() raises once "cookbook_collection" exists, which
# previously required deleting the collection by hand before re-running.
collection = chroma_client.get_or_create_collection(name="cookbook_collection")

# Each passage is stored under its list position (as a string id).
collection.upsert(documents=passages, ids=[str(i) for i in range(len(passages))])

/Users/wonik/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:10<00:00, 7.71MiB/s]


In [11]:
# Prompt skeleton: the first placeholder receives the user question, the second
# the retrieved context.
prompt_template = """You are an expert in answering user questions. You always understand user questions well, and then provide high-quality answers based on the information provided in the context.

If the provided context does not contain relevent information, just respond "I could not find the answer based on the context you provided."

User question: {}

Context:
{}
"""

user_question = "how many parameters does Gemma 2 have?"

# Ask the collection for the three passages closest to the question.
results = collection.query(query_texts=user_question, n_results=3)

# Number the hits starting from 1 and put one hit per line.
context = "\n".join(
    f"{rank}. {hit}" for rank, hit in enumerate(results["documents"][0], start=1)
)
prompt = prompt_template.format(user_question, context)

In [12]:
# Show the assembled prompt: instructions, user question, and numbered context.
print(prompt)

You are an expert in answering user questions. You always understand user questions well, and then provide high-quality answers based on the information provided in the context.

If the provided context does not contain relevent information, just respond "I could not find the answer based on the context you provided."

User question: how many parameters does Gemma 2 have?

Context:
1. Gemma 2 is still pretraining. This chart shows performance from the latest Gemma 2 checkpoint along with benchmark pretraining metrics. Source: Hugging Face Open LLM Leaderboard (April 22, 2024) and Grok announcement blog
2. Stay tuned for the official launch of Gemma 2 in the coming weeks! Expanding the Responsible Generative AI Toolkit For this reason we're expanding our Responsible Generative AI Toolkit to help developers conduct more robust model evaluations by releasing the LLM Comparator in open source. The LLM Comparator is a new interactive and visual tool to perform effective side-by-side evaluat

### Generate the answer
- 4bit 양자화 gemma 이용

In [None]:
import huggingface_hub
# Authenticate against the Hugging Face Hub before downloading model weights.
# NOTE(review): presumably needed because the Gemma checkpoints are gated — confirm.
huggingface_hub.login()

In [None]:
!pip install bitsandbytes accelerate

In [None]:
from transformers import AutoTokenizer
import transformers
import torch
import bitsandbytes, accelerate

# Checkpoint id, reused for both the tokenizer and the generation pipeline.
model = "google/gemma-1.1-7b-it"

tokenizer = AutoTokenizer.from_pretrained(model)

# Model-loading options: float16 torch dtype plus a 4-bit quantization config.
quantized_load_kwargs = dict(
    torch_dtype=torch.float16,
    quantization_config=dict(load_in_4bit=True),
)
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    model_kwargs=quantized_load_kwargs,
)

In [None]:
# Wrap the RAG prompt as a single user turn and render it with the tokenizer's
# chat template. The rendered text is kept in its own variable so `prompt` is
# not overwritten: re-running this cell would otherwise template an
# already-templated prompt.
messages = [{"role": "user", "content": prompt}]
chat_prompt = pipeline.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Low-temperature sampling, capped at 256 newly generated tokens.
outputs = pipeline(chat_prompt, max_new_tokens=256, do_sample=True, temperature=0.1)

# generated_text starts with the input text; slice it off to print only the answer.
print(outputs[0]["generated_text"][len(chat_prompt) :])

## RAG CSV 이용

In [None]:
# Mount Google Drive (skip this cell if it is not needed)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install datasets langchain_community

In [None]:
from datasets import load_dataset
import pandas as pd
from tqdm.notebook import tqdm

# Download the "MongoDB/embedded_movies" dataset.
# NOTE(review): nothing visible in this notebook reads `dataset` afterwards;
# the working data below comes from the CSV instead — confirm this download is
# still needed.
dataset = load_dataset("MongoDB/embedded_movies")

# Working data: a CSV read from the mounted Google Drive.
TRAIN_CSV_PATH = '/content/drive/MyDrive/example_pdfs/train_0.csv'
df = pd.read_csv(TRAIN_CSV_PATH)
dataset_df = df

In [None]:
# Discard rows that have no "original" text.
dataset_df = dataset_df.dropna(subset=["original"])
# Keep only the columns needed downstream.
main_dataset_df = dataset_df.loc[:, ["original", "modern translation"]]

In [None]:
# Display the two-column frame to sanity-check the column selection.
main_dataset_df

In [None]:
from langchain_community.document_loaders import DataFrameLoader

# Convert the DataFrame into LangChain Document objects for further processing.
# The "original" column becomes each document's page_content; later cells read
# the "modern translation" value back from the document metadata.
loader = DataFrameLoader(main_dataset_df, page_content_column="original")
dataset_docs = loader.load()

dataset_docs

In [None]:
#chunking using embedding model
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

# Separators handed to the text splitter, listed from coarse (Markdown
# headings, code fences, horizontal rules, blank lines) down to single
# characters.
# NOTE(review): several entries look like regex patterns, but split_documents
# does not pass is_separator_regex=True to the splitter — confirm whether they
# are meant to be matched literally.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    # Backslashes are written explicitly: "\*" is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions. The resulting
    # string is unchanged.
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

# "thenlper/gte-small" is used as the embedding model.
# NOTE(review): the earlier comment called this a "512 dimensional embedding";
# 512 may be the model's maximum sequence length rather than its embedding
# size — confirm against the model card.
EMB_MODEL_CKP = "thenlper/gte-small" # embedding model
# Load the tokenizer that belongs to the embedding model
embedding_tokenizer = AutoTokenizer.from_pretrained(EMB_MODEL_CKP)

def split_documents(chunk_size, KB, tokenizer=embedding_tokenizer):
    """
    Split documents into chunks of at most `chunk_size` tokens and drop duplicates.

    Args:
        chunk_size: Maximum chunk length, counted in tokens of `tokenizer`.
            Consecutive chunks overlap by `chunk_size / 10` tokens (rounded down).
        KB: Iterable of LangChain documents (the knowledge base) to split.
        tokenizer: Hugging Face tokenizer used to count tokens, or a checkpoint
            name (str) from which the tokenizer is loaded. Defaults to the
            notebook-level `embedding_tokenizer`.

    Returns:
        List of chunk documents in their original order. When several chunks
        share identical `page_content`, only the first one is kept.
    """
    # Honor the `tokenizer` argument (it used to be ignored in favour of the
    # global). A checkpoint name is accepted too, so callers that pass a model
    # id string keep working.
    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,  # tokenizer used to measure chunk length in tokens
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,  # record each chunk's start index in its metadata
        strip_whitespace=True,  # trim leading/trailing whitespace of every chunk
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in KB:
        docs_processed.extend(text_splitter.split_documents([doc]))

    # Remove chunks whose text was already seen, keeping first occurrences.
    seen_texts = set()
    docs_processed_unique = []
    for doc in tqdm(docs_processed):
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    return docs_processed_unique

In [None]:
# Split the loaded documents into chunks of at most 512 tokens. The third
# argument is the tokenizer object itself, not the checkpoint name string.
docs_processed_tok = split_documents(512, dataset_docs, embedding_tokenizer)

In [None]:
# Let's visualize the chunk sizes we would have in tokens from a common model
import matplotlib.pyplot as plt

# Draw one token-length histogram per document collection: first the documents
# as loaded, then the chunks produced by split_documents.
for docs, title in (
    (
        dataset_docs,
        "Distribution of document lengths in the knowledge base (in count of tokens) before",
    ),
    (
        docs_processed_tok,
        "Distribution of document lengths in the knowledge base (in count of tokens) after chunking",
    ),
):
    lengths = [len(embedding_tokenizer.encode(doc.page_content)) for doc in tqdm(docs)]
    fig = pd.Series(lengths).hist()
    plt.title(title)
    plt.show()

### FAISS 이용

In [None]:
!pip install faiss-cpu


In [None]:
!pip install sentence-transformers

In [None]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy


# Embedding model wrapper built on the EMB_MODEL_CKP checkpoint.
# NOTE(review): the device is hard-coded to "cuda"; this presumably fails on a
# runtime without a GPU — confirm the target environment.
embedding_model = HuggingFaceEmbeddings(
    model_name = EMB_MODEL_CKP,
    multi_process = True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
)

# Build the FAISS vector store with the cosine distance strategy.
# NOTE(review): this indexes `dataset_docs` (the unsplit documents), not the
# chunked `docs_processed_tok` produced earlier — confirm this is intentional.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    dataset_docs, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

In [None]:

def get_search_result(query, vector_db, k=3):
    """
    Retrieve the `k` entries most similar to `query` and format them as context.

    Args:
        query: Text to search for.
        vector_db: Vector store exposing `similarity_search(query=..., k=...)`;
            each result must provide `page_content` and a `metadata` mapping.
        k: Number of results to retrieve. Defaults to 3.

    Returns:
        A string with one line per hit, of the form
        "original: <text>, Modern Translation: <translation>". An empty text,
        or a missing/empty "modern translation" entry, is rendered as "N/A".
        Returns an empty string when nothing is retrieved.
    """
    # Search with the `query` argument rather than a notebook-level variable,
    # so the function returns results for whatever the caller passes in.
    retrieved_docs = vector_db.similarity_search(query=query, k=k)

    lines = []
    for result in retrieved_docs:
        original_text = result.page_content or "N/A"
        # .get() avoids a KeyError for documents without a translation entry.
        translation = result.metadata.get("modern translation") or "N/A"
        lines.append(f"original: {original_text}, Modern Translation: {translation}")

    # One hit per line, so consecutive entries do not run into each other.
    return "\n".join(lines)

### Gemma 모델 불러오기

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required to download the Gemma checkpoint — confirm.
notebook_login()


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

# Checkpoint shared by the tokenizer and the causal language model.
GEMMA_CKPT = "google/gemma-2b-it"

tokenizer = AutoTokenizer.from_pretrained(GEMMA_CKPT)

# GPU setup: device_map="auto" lets the loader decide device placement.
# For a CPU-only run, call from_pretrained(GEMMA_CKPT) without device_map.
model = AutoModelForCausalLM.from_pretrained(GEMMA_CKPT, device_map="auto")

In [None]:
# Conduct query with retrieval of sources
#user_query = "What is a good romance movie to watch and why?"
# NOTE(review): the query is an empty string here, so retrieval and generation
# run on empty text — presumably a placeholder to be filled in; confirm.
user_query = ""
retrieved_results = get_search_result(user_query, KNOWLEDGE_VECTOR_DATABASE)
# Final user message: instruction, then the retrieved context, then the query.
combined_information = f"Please answer the following query using the context provided. Please find the translation. \n :\n{retrieved_results}. \n : {user_query}"
#chat template for gemma model conversation
chat = [
    { "role": "user", "content": combined_information },
]
# Render the chat as plain text (not token ids), ending with the generation prompt.
prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

In [None]:
# Moving tensors to GPU
input_ids = tokenizer(combined_information, return_tensors="pt").to("cuda")

pipe = pipeline(model=model,
                tokenizer = tokenizer,
                task="text-generation",
                return_full_text=False,
                max_new_tokens=500,
                do_sample=True,
                temperature=2.1,
                top_k=50,
                top_p=1,
                #repetition_penalty=1.1,
                num_return_sequences=1,
                #add_special_tokens=True,
                )
print(f"Query: {user_query}\n")
print(pipe(prompt)[0]["generated_text"])

In [None]:
# Inspect the exact user message that was wrapped in the chat template.
combined_information

In [None]:
# Conduct query with retrieval of sources
# The query quotes a passage and asks (in Korean) for it to be interpreted.
user_query = "'신즁의부인의도도ᄒᆞᆫ고집을ᄋᆡ달나무슈히ᄎᆞ탄ᄒᆞ시고외당으로나오시니마ᄎᆞᆷ시비춘셤이상을드리거날좌우고요' 해석해줘"
retrieved_results = get_search_result(user_query, KNOWLEDGE_VECTOR_DATABASE)
# Final user message: instruction, then the retrieved context, then the query.
combined_information = f"Please answer the following query using the context provided. Please find the translation. \n :\n{retrieved_results}. \n : {user_query}"
#chat template for gemma model conversation
chat = [
    { "role": "user", "content": combined_information },
]
# Render the chat as plain text (not token ids), ending with the generation prompt.
prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

In [None]:
# Moving tensors to GPU
input_ids = tokenizer(combined_information, return_tensors="pt").to("cuda")

pipe = pipeline(model=model,
                tokenizer = tokenizer,
                task="text-generation",
                return_full_text=False,
                max_new_tokens=500,
                do_sample=True,
                temperature=2.1,
                #top_k=50,
                #top_p=1,
                #repetition_penalty=1.1,
                num_return_sequences=1,
                #add_special_tokens=True,
                )
print(f"Query: {user_query}\n")
print(pipe(prompt)[0]["generated_text"])

In [None]:
# Inspect the context string returned by get_search_result for the last query.
retrieved_results