# RAG 기초

## Environment (.env)

In [78]:
# Load the .env file.
from dotenv import load_dotenv
load_dotenv()

True

## Evaluation Dataset Generation

#### 1) (pydantic) Schema

In [3]:
from typing import Optional
from langchain_core.pydantic_v1 import BaseModel, Field

# Schema shared by the synthetic-data generator and the extraction chains.
# Every field is Optional with default None, so a Car can be created even when
# some attributes are unknown.
# NOTE(review): the class docstring and the Field descriptions are presumably
# forwarded to the LLM as part of the structured-output schema — confirm
# before rewording them.
class Car(BaseModel):
    """Information about a car."""
    make: Optional[str] = Field(default=None, description="The make of the car")
    model_name: Optional[str] = Field(default=None, description="The model name of the car")
    model_year: Optional[int] = Field(
        default=None, description="The year the car model was manufactured"
    )
    color: Optional[str] = Field(default=None, description="The color of the car")
    price: Optional[float] = Field(default=None, description="The price of the car")
    mileage: Optional[float] = Field(default=None, description="The mileage of the car")


#### 2) Synthetic Data

https://python.langchain.com/v0.2/docs/tutorials/data_generation/

In [35]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI


# Two hand-written sample rows used as few-shot demonstrations; the second one
# deliberately leaves `color` as None.
examples = [
    {"example": "make: 현대, model_name: 소나타, model_year: 2022, color: 흰색, price: 25000000, mileage: 15000.0"},
    {"example": "make: 기아, model_name: K5, model_year: 2021, color: None, price: 23000000, mileage: 20000.0"},
]

# Each example dict is rendered verbatim through this one-variable template.
OPENAI_TEMPLATE = PromptTemplate(template="{example}", input_variables=["example"])

# Few-shot prompt: imported prefix + rendered examples + imported suffix.
# `subject` and `extra` are supplied when generate() is called below.
prompt_template = FewShotPromptTemplate(
    examples=examples,
    example_prompt=OPENAI_TEMPLATE,
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
)

# Generator that produces objects following the Car schema.
synthetic_data_generator = create_openai_data_generator(
    prompt=prompt_template,
    llm=ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo-0125"),
    output_schema=Car,
)

# Request 10 runs; the extra instructions ask for Korean values, unusual
# choices, and roughly 30% None values.
synthetic_results = synthetic_data_generator.generate(
    runs=10,
    subject="car data",
    extra=(
        "Use Korean language. "
        "Make it something you wouldn't normally choose. "
        "Around 30 percent of the values should be None at random. "
    ),
)

len(synthetic_results)

10

In [79]:
# Show the prefix template text imported from langchain_experimental.
SYNTHETIC_FEW_SHOT_PREFIX

'This is a test about generating synthetic data about {subject}. Examples below:'

In [80]:
# Show the suffix template text imported from langchain_experimental.
SYNTHETIC_FEW_SHOT_SUFFIX

'Now you generate synthetic data about {subject}. Make sure to {extra}:'

In [36]:
# Display the generated Car objects.
synthetic_results

[Car(make='르노', model_name='메간', model_year=2019, color='파랑', price=18000000.0, mileage=25000.0),
 Car(make='쌍용', model_name='티볼리', model_year=2020, color=None, price=27000000.0, mileage=18000.0),
 Car(make='기아', model_name='스팅어', model_year=2017, color='은색', price=21000000.0, mileage=32000.0),
 Car(make='르노', model_name='카프쳐', model_year=2019, color=None, price=24000000.0, mileage=25000.0),
 Car(make='쌍용', model_name='티볼리', model_year=2015, color='블루', price=18000000.0, mileage=38000.0),
 Car(make='기아', model_name='K7', model_year=2017, color='그레이', price=21000000.0, mileage=30000.0),
 Car(make='르노삼성', model_name='SM6', model_year=2014, color='화이트', price=None, mileage=45000.0),
 Car(make='쌍용', model_name='렉스턴', model_year=2015, color='블랙', price=18000000.0, mileage=65000.0),
 Car(make='기아', model_name='모하비', model_year=2012, color='은색', price=25000000.0, mileage=72000.0),
 Car(make='쌍용', model_name='티볼리', model_year=2018, color='그레이', price=20000000.0, mileage=80000.0)]

In [37]:
import pandas as pd

# Convert every generated Car into a plain dict so pandas can build one row per car.
car_dicts = [generated_car.dict() for generated_car in synthetic_results]

# Build the table, write it to car_data.csv without the index column, then display it.
df = pd.DataFrame(data=car_dicts)
df.to_csv("car_data.csv", index=False)
df

Unnamed: 0,make,model_name,model_year,color,price,mileage
0,르노,메간,2019,파랑,18000000.0,25000.0
1,쌍용,티볼리,2020,,27000000.0,18000.0
2,기아,스팅어,2017,은색,21000000.0,32000.0
3,르노,카프쳐,2019,,24000000.0,25000.0
4,쌍용,티볼리,2015,블루,18000000.0,38000.0
5,기아,K7,2017,그레이,21000000.0,30000.0
6,르노삼성,SM6,2014,화이트,,45000.0
7,쌍용,렉스턴,2015,블랙,18000000.0,65000.0
8,기아,모하비,2012,은색,25000000.0,72000.0
9,쌍용,티볼리,2018,그레이,20000000.0,80000.0


In [48]:
from langchain_experimental.synthetic_data import DatasetGenerator

# Dataset Generator (OpenAI): produce one text per synthetic Car using the
# sentence preferences given below.
model = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo-0125")
generator = DatasetGenerator(
    model,
    {
        "style": "informal",
        "minimal length": 300,
        "language": "Korean",
    },
)
dataset = generator(synthetic_results)

len(dataset)

10

In [49]:
# Inspect the first sample produced by the OpenAI-backed generator.
dataset[0]

{'fields': Car(make='르노', model_name='메간', model_year=2019, color='파랑', price=18000000.0, mileage=25000.0),
 'preferences': {'style': 'informal',
  'minimal length': 300,
  'language': 'Korean'},
 'text': '르노 메간 2019년식 차량은 파랑색으로, 25000km 주행한 중고차로 1800만 원에 판매 중이에요.'}

In [74]:
from langchain_community.chat_models import ChatOllama

# Same generation step as the OpenAI run, but driven by the local Ollama
# `qwen2` model with identical sentence preferences.
model2 = ChatOllama(temperature=0.7, model='qwen2')
generator2 = DatasetGenerator(
    model2,
    {
        "style": "informal",
        "minimal length": 300,
        "language": "Korean",
    },
)
dataset2 = generator2(synthetic_results)

len(dataset2)

10

In [76]:
# Inspect the first sample produced by the Ollama-backed generator.
dataset2[0]

{'fields': Car(make='르노', model_name='메간', model_year=2019, color='파랑', price=18000000.0, mileage=25000.0),
 'preferences': {'style': 'informal',
  'minimal length': 300,
  'language': 'Korean'},
 'text': '르노 제조사에서 제작된 메간 2019년형은 파란색으로 빛나며, 가격이 무려 1800만원에 판매되고 있습니다. 이 차량은 누적 마일리지가 약 2만5천㎞에 달하여 사용 경험이 상당히 풍부한 상황입니다. 저렴한 가격과 고성능, 그리고 멋진 디자인이 결합된 이 차량은 당신의 취향을 충족시킬 수 있을 것입니다.'}

## Extraction 

#### 1) Prompt

In [51]:
from langchain_core.prompts import ChatPromptTemplate

# Extraction prompt: a fixed system instruction followed by the raw input text
# as the human turn (`{text}` is filled in at invoke time).
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are an expert extraction algorithm. Only extract relevant information from the text. "
        "If you do not know the value of an attribute asked to extract, return null for the attribute's value.",
    ),
    ("human", "{text}"),
])

#### 2) OpenAI

In [52]:
from langchain_openai import ChatOpenAI

# OpenAI chat model used for extraction, with temperature set to 0.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Pipe the prompt into the model with structured output following the Car schema.
extract_chain = prompt | llm.with_structured_output(schema=Car)

# Run the chain on the text of the first generated sample.
extract_result = extract_chain.invoke({"text": dataset[0]["text"]})

extract_result

Car(make='르노', model_name='메간', model_year=2019, color='파랑', price=18000000.0, mileage=25000.0)

## Evaluation

In [55]:
# Exact-match check: compare the extracted Car with the 'fields' Car stored in the sample.
extract_result == dataset[0]['fields']

True

In [56]:
# Run the same chain on the second sample's text; this rebinds `extract_result`.
extract_result = extract_chain.invoke({"text": dataset[1]['text']})

extract_result

Car(make='쌍용', model_name='티볼리', model_year=2020, color=None, price=27000000.0, mileage=18000.0)

In [57]:
# Exact-match check for the second sample.
extract_result == dataset[1]['fields']

True

## Ollama

In [71]:
from langchain_core.prompts import PromptTemplate
from langchain_experimental.llms.ollama_functions import OllamaFunctions

# NOTE: this cell rebinds `prompt`, `llm`, `extract_chain` and `extract_result`,
# which the OpenAI extraction cells also define. Re-run those cells before
# re-evaluating the OpenAI results.
#
# The prompt intentionally contains no Llama 3 header tokens
# (<|start_header_id|>, <|eot_id|>, ...). OllamaFunctions is a chat-model
# wrapper, so the model's chat template is applied by the Ollama server;
# hand-written header tokens would be sent as literal message content and the
# turn would be templated twice.
prompt = PromptTemplate.from_template(
    """You are an expert extraction algorithm. Only extract relevant information from the text.
If you do not know the value of an attribute asked to extract, return null for the attribute's value.

TEXT: {text}
JSON:
"""
)

# Local llama3 model served by Ollama, requested in JSON output mode.
llm = OllamaFunctions(model="llama3", format="json")

# Pipe the prompt into the model with structured output following the Car schema.
extract_chain = prompt | llm.with_structured_output(schema=Car)

# Run the chain on the text of the first generated sample.
extract_result = extract_chain.invoke({"text": dataset[0]['text']})

extract_result

Car(make='르노 메간', model_name=None, model_year=2019, color='파랑색', price=18000000.0, mileage=25000.0)

In [72]:
# Exact-match check of the Ollama extraction against the 'fields' Car of the first sample.
extract_result == dataset[0]['fields']

False

In [73]:
# Show the expected Car for the first sample, for comparison with the extraction result.
dataset[0]['fields']

Car(make='르노', model_name='메간', model_year=2019, color='파랑', price=18000000.0, mileage=25000.0)