In [1]:
import os
from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
print(OPENAI_API_KEY[:2])

gs


### CommaSeparatedListOutputParser

In [6]:
from langchain_core.output_parsers import CommaSeparatedListOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
import csv
from pprint import pprint

# 콤마로 구분된 리스트 출력 파서 초기화
output_parser = CommaSeparatedListOutputParser()

# 출력 형식 지침 가져오기
format_instructions = output_parser.get_format_instructions()

# 프롬프트 템플릿 설정
prompt = PromptTemplate(
    template="List five {subject}.\n{format_instructions}",
    input_variables=["subject"],
    partial_variables={"format_instructions": format_instructions},
)
pprint(prompt.partial_variables)

{'format_instructions': 'Your response should be a list of comma separated '
                        'values, eg: `foo, bar, baz` or `foo,bar,baz`'}


In [7]:

# OpenAI 모델 설정
#model = ChatOpenAI(temperature=0)
model = ChatOpenAI(
    base_url="https://api.groq.com/openai/v1",  # Groq API 엔드포인트
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0
)

# 프롬프트, 모델, 출력 파서를 연결하여 체인 생성
chain = prompt | model | output_parser

# "AI 관련 기술"에 대한 체인 호출 실행
result = chain.invoke({"subject": "AI 관련 기술"})

# 쉼표로 구분된 리스트 출력
print(" AI 관련 기술 목록:")
print(result)

# 결과 활용 예시: CSV 파일로 저장
csv_filename = "./data/ai_technologies.csv"
with open(csv_filename, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["AI 기술"])  # 헤더 추가
    for item in result:
        writer.writerow([item])

print(f" '{csv_filename}' 파일로 저장 완료!")


 AI 관련 기술 목록:
['Machine Learning', 'Deep Learning', 'Natural Language Processing', 'Computer Vision', 'Robotics']
 './data/ai_technologies.csv' 파일로 저장 완료!


### JsonOutputParser

In [8]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI
import json

# JSON 출력 파서 초기화
parser = JsonOutputParser()

# 프롬프트 템플릿을 설정합니다.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "당신은 과학 분야 전문가 AI입니다. 질문에 대해 체계적이고 간결한 답변을 JSON 형식으로 제공하세요."),
        ("user", "#Format: {format_instructions}\n\n#Question: {question}"),
    ]
)

# JSON 출력 형식 지침을 프롬프트에 적용
prompt = prompt.partial(format_instructions=parser.get_format_instructions())
print(prompt)

input_variables=['question'] input_types={} partial_variables={'format_instructions': 'Return a JSON object.'} messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='당신은 과학 분야 전문가 AI입니다. 질문에 대해 체계적이고 간결한 답변을 JSON 형식으로 제공하세요.'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['format_instructions', 'question'], input_types={}, partial_variables={}, template='#Format: {format_instructions}\n\n#Question: {question}'), additional_kwargs={})]


In [9]:

# OpenAI 모델 설정
#model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
model = ChatOpenAI(
    base_url="https://api.groq.com/openai/v1",  # Groq API 엔드포인트
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0
)

# 프롬프트, 모델, 출력 파서를 연결하는 체인 생성
chain = prompt | model | parser

# 질문 설정 (우주 탐사 관련 질문)
question = "최근 10년간 진행된 주요 우주 탐사 미션 3가지를 알려주세요. \
각 미션의 이름은 `mission_name`에, 목표는 `goal`에, 주관 기관은 `agency`에 담아 주세요."

# 체인 실행 및 JSON 응답 받기
response = chain.invoke({"question": question})

# JSON 데이터 출력
print(json.dumps(response, indent=4, ensure_ascii=False))


[
    {
        "mission_name": "뉴호라이즌스",
        "goal": "명왕성 탐사",
        "agency": "NASA"
    },
    {
        "mission_name": "카시니-호이겐스",
        "goal": "토성의 위성 타이탄 탐사",
        "agency": "NASA, ESA, 이탈리아 우주국"
    },
    {
        "mission_name": "창어 4호",
        "goal": "달의 뒷면 탐사",
        "agency": "중국 우주국"
    }
]


### PandasDataFrameOutputParser

In [3]:
import pandas as pd
from langchain.output_parsers import PandasDataFrameOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
import re

# Titanic 데이터셋 로드
df = pd.read_csv('data/titanic.csv')

# Pandas DataFrame Output Parser 설정
parser = PandasDataFrameOutputParser(dataframe=df)

# 형식 지침 출력
format_instructions = parser.get_format_instructions()
print("Format Instructions:\n", format_instructions)


Format Instructions:
 The output should be formatted as a string as the operation, followed by a colon, followed by the column or row to be queried on, followed by optional array parameters.
1. The column names are limited to the possible columns below.
2. Arrays must either be a comma-separated list of numbers formatted as [1,3,5], or it must be in range of numbers formatted as [0..4].
3. Remember that arrays are optional and not necessarily required.
4. If the column is not in the possible columns or the operation is not a valid Pandas DataFrame operation, return why it is invalid as a sentence starting with either "Invalid column" or "Invalid operation".

As an example, for the formats:
1. String "column:num_legs" is a well-formatted instance which gets the column num_legs, where num_legs is a possible column.
2. String "row:1" is a well-formatted instance which gets row 1.
3. String "column:num_legs[1,2]" is a well-formatted instance which gets the column num_legs for rows 1 and 2,

In [4]:

# ChatOpenAI 모델 초기화
model = ChatOpenAI(temperature=0, model_name="gpt-4o")
# model = ChatOpenAI(
#     base_url="https://api.groq.com/openai/v1",  # Groq API 엔드포인트
#     model="meta-llama/llama-4-scout-17b-16e-instruct",
#     temperature=0
# )

# 프롬프트 템플릿 설정
prompt = PromptTemplate(
    template=""" 
    You are a helpful assistant that interacts with a Pandas DataFrame.
    The DataFrame contains the following columns: {columns}.
    
    Your task is to answer the user's query by generating a command in the following format:
    {format_instructions}
    
    User Query: {query}    
    """,
    input_variables=["query"],
    partial_variables={
        "format_instructions": format_instructions,
        "columns": ", ".join(df.columns)
    },
)
print(prompt.partial_variables['columns'])

Survived, Pclass, Name, Sex, Age, Siblings/Spouses Aboard, Parents/Children Aboard, Fare


In [None]:

# 체인 생성
chain = prompt | model | parser

# 모델 응답 받기
try:
    # **Name 열을 표시하십시오.**
    print('Name 컬럼 출력')
    df_query = "Show the Name column"

    parser_output = chain.invoke({"query": df_query})
    print(type(parser_output))
    print(parser_output)

    # **첫번째 행을 표시하십시오.**
    print('첫번째 행 출력')
    df_query2 = "Show first row"

    parser_output2 = chain.invoke({"query": df_query2})
    print(parser_output2)

except Exception as e:
    print(f"오류 발생: {e}")

Name 컬럼 출력
<class 'dict'>
{'Name': 0                                 Mr. Owen Harris Braund
1      Mrs. John Bradley (Florence Briggs Thayer) Cum...
2                                  Miss. Laina Heikkinen
3            Mrs. Jacques Heath (Lily May Peel) Futrelle
4                                Mr. William Henry Allen
                             ...                        
882                                 Rev. Juozas Montvila
883                          Miss. Margaret Edith Graham
884                       Miss. Catherine Helen Johnston
885                                 Mr. Karl Howell Behr
886                                   Mr. Patrick Dooley
Name: Name, Length: 887, dtype: object}
첫번째 행 출력
오류 발생: Unsupported request type '```
row'.                         Please check the format instructions.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 


In [7]:
import pandas as pd
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# OpenAI 모델 초기화
model = ChatOpenAI(temperature=0, model_name="gpt-4o")
# model = ChatOpenAI(
#     base_url="https://api.groq.com/openai/v1",  # Groq API 엔드포인트
#     model="meta-llama/llama-4-scout-17b-16e-instruct",
#     temperature=0
# )

# 응답 스키마 정의 {data : [{},{},{}] }
response_schemas = [
    ResponseSchema(name="data", description="A list of dictionaries representing table rows."),
]

# Output Parser 설정
parser = StructuredOutputParser.from_response_schemas(response_schemas)

# 프롬프트 템플릿 설정
prompt = PromptTemplate(
    template="""
    You are an AI assistant that generates tabular data. 
    You must return the data in JSON format that follows this schema:
    
    {format_instructions}
        
    **User Query:**
    {query}
    """,
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)
print(prompt)

input_variables=['query'] input_types={} partial_variables={'format_instructions': 'The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"data": string  // A list of dictionaries representing table rows.\n}\n```'} template='\n    You are an AI assistant that generates tabular data. \n    You must return the data in JSON format that follows this schema:\n\n    {format_instructions}\n\n    **User Query:**\n    {query}\n    '


In [9]:

# 체인 생성 (프롬프트 → 모델 → OutputParser)
chain = prompt | model | parser

# 실행 함수
def generate_dataframe(user_query):
    try:
        # 모델 호출
        json_response = chain.invoke({"query": user_query})
        print(json_response)
        
        # 모델이 반환한 JSON을 Pandas DataFrame으로 변환
        df = pd.DataFrame(json_response["data"])

        # 결과 출력
        print("\n🔹 Generated DataFrame:\n")
        return df

    except Exception as e:
        print(f"❌ 오류 발생: {e}")
        return None

In [13]:
# [예제 1] 2024년 상반기 서울 아파트 평균 매매 가격 데이터 생성
print('2024년 하반기 서울 아파트 평균 매매 가격 데이터 생성')
df_seoul_housing = generate_dataframe(
    "Create a dataset of the average apartment sale prices in Seoul for the second half of 2024 with columns: District (구), Average Price (in KRW), Number of Transactions, and Year-over-Year Change (%)."
)
print(df_seoul_housing.shape)
df_seoul_housing

2024년 하반기 서울 아파트 평균 매매 가격 데이터 생성
{'data': [{'District': 'Gangnam-gu', 'Average Price': 1500000000, 'Number of Transactions': 1200, 'Year-over-Year Change (%)': 3.5}, {'District': 'Jongno-gu', 'Average Price': 950000000, 'Number of Transactions': 800, 'Year-over-Year Change (%)': 2.1}, {'District': 'Mapo-gu', 'Average Price': 1100000000, 'Number of Transactions': 950, 'Year-over-Year Change (%)': 4.0}, {'District': 'Seocho-gu', 'Average Price': 1450000000, 'Number of Transactions': 1100, 'Year-over-Year Change (%)': 3.8}, {'District': 'Songpa-gu', 'Average Price': 1300000000, 'Number of Transactions': 1050, 'Year-over-Year Change (%)': 3.2}, {'District': 'Yongsan-gu', 'Average Price': 1250000000, 'Number of Transactions': 900, 'Year-over-Year Change (%)': 2.9}, {'District': 'Gwanak-gu', 'Average Price': 850000000, 'Number of Transactions': 700, 'Year-over-Year Change (%)': 1.5}, {'District': 'Dongdaemun-gu', 'Average Price': 800000000, 'Number of Transactions': 750, 'Year-over-Year Chan

Unnamed: 0,District,Average Price,Number of Transactions,Year-over-Year Change (%)
0,Gangnam-gu,1500000000,1200,3.5
1,Jongno-gu,950000000,800,2.1
2,Mapo-gu,1100000000,950,4.0
3,Seocho-gu,1450000000,1100,3.8
4,Songpa-gu,1300000000,1050,3.2
5,Yongsan-gu,1250000000,900,2.9
6,Gwanak-gu,850000000,700,1.5
7,Dongdaemun-gu,800000000,750,1.8


In [15]:
print('2024년 서울 지하철역별 유동 인구 데이터')
# [예제 2] 2024년 서울 지하철역별 유동 인구 데이터
df_seoul_subway = generate_dataframe(
    "Generate a dataset of the top 10 busiest subway stations in Seoul in 2024 with columns: Station Name, Line Number, Daily Passenger Volume, and Weekday vs Weekend Ratio."
)
if df_seoul_subway is not None:
    #print(df_seoul_subway.shape)
    df_seoul_subway.head()

2024년 서울 지하철역별 유동 인구 데이터
{'data': [{'Station Name': 'Gangnam', 'Line Number': 'Line 2', 'Daily Passenger Volume': 150000, 'Weekday vs Weekend Ratio': '1.2'}, {'Station Name': 'Jamsil', 'Line Number': 'Line 2', 'Daily Passenger Volume': 140000, 'Weekday vs Weekend Ratio': '1.1'}, {'Station Name': 'Seoul Station', 'Line Number': 'Line 1', 'Daily Passenger Volume': 130000, 'Weekday vs Weekend Ratio': '1.3'}, {'Station Name': 'Hongdae', 'Line Number': 'Line 2', 'Daily Passenger Volume': 125000, 'Weekday vs Weekend Ratio': '1.0'}, {'Station Name': 'Samseong', 'Line Number': 'Line 2', 'Daily Passenger Volume': 120000, 'Weekday vs Weekend Ratio': '1.2'}, {'Station Name': 'Express Bus Terminal', 'Line Number': 'Line 3', 'Daily Passenger Volume': 115000, 'Weekday vs Weekend Ratio': '1.1'}, {'Station Name': 'Yeouido', 'Line Number': 'Line 5', 'Daily Passenger Volume': 110000, 'Weekday vs Weekend Ratio': '1.4'}, {'Station Name': 'Dongdaemun', 'Line Number': 'Line 4', 'Daily Passenger Volume': 105

In [16]:
print('한국 5대 편의점 브랜드별 2024년 매출 및 점포 수')
# [예제 3] 한국 5대 편의점 브랜드별 2024년 매출 및 점포 수
df_korean_convenience_stores = generate_dataframe(
    "Create a dataset of the top 5 convenience store brands in Korea in 2024 with columns: Brand Name, Number of Stores, Total Revenue (in billion KRW), and Market Share (%)."
)
df_korean_convenience_stores.head()

한국 5대 편의점 브랜드별 2024년 매출 및 점포 수
{'data': [{'Brand Name': 'CU', 'Number of Stores': 15000, 'Total Revenue (in billion KRW)': 5000, 'Market Share (%)': 35.0}, {'Brand Name': 'GS25', 'Number of Stores': 14000, 'Total Revenue (in billion KRW)': 4800, 'Market Share (%)': 33.5}, {'Brand Name': '7-Eleven', 'Number of Stores': 10000, 'Total Revenue (in billion KRW)': 3000, 'Market Share (%)': 20.0}, {'Brand Name': 'Emart24', 'Number of Stores': 5000, 'Total Revenue (in billion KRW)': 1500, 'Market Share (%)': 7.5}, {'Brand Name': 'Ministop', 'Number of Stores': 2000, 'Total Revenue (in billion KRW)': 700, 'Market Share (%)': 4.0}]}

🔹 Generated DataFrame:



Unnamed: 0,Brand Name,Number of Stores,Total Revenue (in billion KRW),Market Share (%)
0,CU,15000,5000,35.0
1,GS25,14000,4800,33.5
2,7-Eleven,10000,3000,20.0
3,Emart24,5000,1500,7.5
4,Ministop,2000,700,4.0


### PydanticOutputParser 

In [None]:
# poetry add pydantic
# %pip install pydantic 

from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser

from pydantic import BaseModel, Field
from typing import List

In [5]:

# 출력 구조를 정의하는 Pydantic 모델
class MovieRecommendation(BaseModel):
    movie_title: str = Field(description="추천 영화 제목")
    reason: str = Field(description="추천 이유")
    genre: List[str] = Field(description="영화 장르")
    estimated_rating: float = Field(description="10점 만점에서 예상 평점")
    
# Pydantic 출력 파서 초기화
parser = PydanticOutputParser(pydantic_object=MovieRecommendation)

# 프롬프트 템플릿 설정
template = """
다음 사용자 요청에 따라 영화를 추천해주세요.
요청: {query}

{format_instructions}
"""

prompt = ChatPromptTemplate.from_template(template)

# 파서의 지시사항을 프롬프트에 주입
prompt = prompt.partial(
    format_instructions=parser.get_format_instructions()
)
print(prompt)

input_variables=['query'] input_types={} partial_variables={'format_instructions': 'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"movie_title": {"description": "추천 영화 제목", "title": "Movie Title", "type": "string"}, "reason": {"description": "추천 이유", "title": "Reason", "type": "string"}, "genre": {"description": "영화 장르", "items": {"type": "string"}, "title": "Genre", "type": "array"}, "estimated_rating": {"description": "10점 만점에서 예상 평점", "title": "Estimated Rating", "type": "number"}}, "required": ["movie_title", "reason", "genre", "estimated_rating"]}\n```'} messag

In [6]:

# ChatOpenAI 모델 초기화
#model = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo")
model = ChatOpenAI(
    base_url="https://api.groq.com/openai/v1",  # Groq API 엔드포인트
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0.7
)

# 체인 구성 및 실행
query = "1990년대 클래식한 느낌의 공포 영화 추천해줘"
chain = prompt | model | parser
output = chain.invoke({"query": query})

# 결과 출력
print(f"추천 영화: {output.movie_title}")
print(f"추천 이유: {output.reason}")
print(f"장르: {', '.join(output.genre)}")
print(f"예상 평점: {output.estimated_rating}/10")

추천 영화: The Sixth Sense (1999)
추천 이유: 1990년대 클래식한 느낌의 공포 영화로, 반전의 묘미가 있는 작품입니다.
장르: 공포, 미스터리, 스릴러
예상 평점: 8.5/10
