In [1]:
import os
import pandas as pd
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PandasDataFrameOutputParser
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Load environment variables from the .env file.
# OPENAI_API_KEY must be defined in that .env file.
load_dotenv()

# Build a small, made-up Titanic-style passenger table used as the running example.
# Every column holds exactly ten values, one per passenger.
data = {
    "PassengerId": range(1, 11),
    "Survived": [0, 1, 1, 1, 0, 0, 0, 1, 0, 1],
    "Pclass": [3, 1, 3, 1, 3, 1, 3, 2, 3, 2],
    "Name": [
        "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley",
        "Heikkinen, Miss. Laina", "Futrelle, Mrs. Jacques Heath",
        "Allen, Mr. William Henry", "Moran, Mr. James",
        "McCarthy, Mr. Timothy J", "Palsson, Master. Gosta Leonard",
        "Johnson, Mrs. Oscar W", "Nasser, Mrs. Nicholas",
    ],
    "Sex": [
        "male", "female", "female", "female", "male",
        "male", "male", "male", "female", "female",
    ],
    "Age": [22, 38, 26, 35, 35, 27, 54, 2, 27, 14],
}

# Column order in the frame follows the key order of `data`.
df = pd.DataFrame(data)

In [2]:
# Print a structural summary of the sample frame: index range, column names,
# non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  10 non-null     int64 
 1   Survived     10 non-null     int64 
 2   Pclass       10 non-null     int64 
 3   Name         10 non-null     object
 4   Sex          10 non-null     object
 5   Age          10 non-null     int64 
dtypes: int64(4), object(2)
memory usage: 612.0+ bytes


In [4]:
# Initialize PandasDataFrameOutputParser, bound to the sample DataFrame.
# NOTE: despite its name, this parser does not turn LLM text into a DataFrame.
# It expects the LLM to emit a request string in the "operation:column-or-row"
# format described by the instructions printed below, and the parsed result
# comes back as a dict (e.g. {'mean': 28.0}).
parser = PandasDataFrameOutputParser(dataframe=df)
print(parser.get_format_instructions())

The output should be formatted as a string as the operation, followed by a colon, followed by the column or row to be queried on, followed by optional array parameters.
1. The column names are limited to the possible columns below.
2. Arrays must either be a comma-separated list of numbers formatted as [1,3,5], or it must be in range of numbers formatted as [0..4].
3. Remember that arrays are optional and not necessarily required.
4. If the column is not in the possible columns or the operation is not a valid Pandas DataFrame operation, return why it is invalid as a sentence starting with either "Invalid column" or "Invalid operation".

As an example, for the formats:
1. String "column:num_legs" is a well-formatted instance which gets the column num_legs, where num_legs is a possible column.
2. String "row:1" is a well-formatted instance which gets row 1.
3. String "column:num_legs[1,2]" is a well-formatted instance which gets the column num_legs for rows 1 and 2, where num_legs is a p

In [5]:
# Render the whole DataFrame as one plain-text string; this is the form that is
# passed to the prompt as `df_as_string`.
df.to_string()

'   PassengerId  Survived  Pclass                            Name     Sex  Age\n0            1         0       3         Braund, Mr. Owen Harris    male   22\n1            2         1       1      Cumings, Mrs. John Bradley  female   38\n2            3         1       3          Heikkinen, Miss. Laina  female   26\n3            4         1       1    Futrelle, Mrs. Jacques Heath  female   35\n4            5         0       3        Allen, Mr. William Henry    male   35\n5            6         0       1                Moran, Mr. James    male   27\n6            7         0       3         McCarthy, Mr. Timothy J    male   54\n7            8         1       2  Palsson, Master. Gosta Leonard    male    2\n8            9         0       3           Johnson, Mrs. Oscar W  female   27\n9           10         1       2           Nasser, Mrs. Nicholas  female   14'

In [None]:
# Build the prompt template.
# `format_instructions` is filled in from the parser so the model is told the
# exact "operation:column-or-row[optional rows]" request syntax it must emit.
# The instruction line deliberately asks for that request string only: asking the
# model to "generate a DataFrame" contradicts the parser's format instructions.
prompt_template = """
주어진 타이타닉 데이터를 기반으로 질문에 답하세요.
답변은 설명 없이 아래에 지정된 형식의 요청 문자열 하나만 출력해야 합니다.

{format_instructions}

데이터:
{df_as_string}

질문: {query}
"""

# Create the PromptTemplate. `df_as_string` and `query` are supplied per call;
# `format_instructions` is bound once up front.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["df_as_string", "query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Initialize the chat model; temperature=0 keeps the emitted request string stable.
model = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")

# Pipe prompt -> model -> parser. The parser applies the model's request string
# to `df` and returns the outcome as a dict.
chain = prompt | model | parser

# Ask something the parser's request syntax can express: one operation over one
# column (optionally restricted to a set of rows). A group-by summary with custom
# column names cannot be written in that syntax; when it was asked for here, the
# chain came back with the overall mean ({'mean': 28.0}) instead of per-group
# values. Do grouped aggregations directly with pandas.
query_text = "Age 컬럼의 평균을 구해줘."
result = chain.invoke(
    {
        "df_as_string": df.to_string(),
        "query": query_text,
    }
)
# Backward-compatible alias for the previous variable name. Note that the value
# is a dict keyed by the requested operation/column, not a DataFrame.
result_df = result

# Report the input data, the parsed result and its actual type.
print("--- 원본 데이터 ---")
print(df)
print("\n" + "=" * 50 + "\n")
print("--- 파서가 반환한 결과 (dict) ---")
print(result)
print("\n" + "=" * 50 + "\n")
print(f"결과 타입: {type(result)}")

--- 원본 데이터 ---
   PassengerId  Survived  Pclass                            Name     Sex  Age
0            1         0       3         Braund, Mr. Owen Harris    male   22
1            2         1       1      Cumings, Mrs. John Bradley  female   38
2            3         1       3          Heikkinen, Miss. Laina  female   26
3            4         1       1    Futrelle, Mrs. Jacques Heath  female   35
4            5         0       3        Allen, Mr. William Henry    male   35
5            6         0       1                Moran, Mr. James    male   27
6            7         0       3         McCarthy, Mr. Timothy J    male   54
7            8         1       2  Palsson, Master. Gosta Leonard    male    2
8            9         0       3           Johnson, Mrs. Oscar W  female   27
9           10         1       2           Nasser, Mrs. Nicholas  female   14


--- LLM이 생성한 DataFrame ---
{'mean': 28.0}


결과 타입: <class 'dict'>
