In [2]:
# Select the model
import getpass
import os

# Credentials must never live in the notebook source: use the exported
# OPENAI_API_KEY when present, otherwise prompt for it without echoing.
# NOTE(review): the key that used to be hardcoded on this line has been
# published with the notebook and should be revoked.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OPENAI_API_KEY: ')

# Default OpenAI-compatible endpoint; the URL must end with /v1.
# An OPENAI_API_BASE that is already exported takes precedence.
os.environ.setdefault('OPENAI_API_BASE', 'https://api.34ku.com/v1')

from langchain_openai import ChatOpenAI

# Shared chat model used by the agent built later in the notebook.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


In [None]:
import pandas as pd
from typing import Optional

class GlobalState:
    """Mutable state shared by all tool functions defined in this notebook.

    Attributes:
        df: The currently loaded dataset, or ``None`` before any CSV is loaded.
        models: Fitted models keyed by their ``model_id``.
    """

    # Immutable class-level default, so ``df`` reads as None until a tool sets it.
    df: Optional[pd.DataFrame] = None

    def __init__(self) -> None:
        # Built per instance: a class-level ``{}`` is a single dict shared by
        # every GlobalState object, so models would leak between instances.
        self.models = {}  # model_id -> fitted model


# Single shared instance that the tool functions read and write.
STATE = GlobalState()


In [4]:
# Define the tools
import io
import json
import uuid
import numpy as np
import matplotlib.pyplot as plt
from typing import List, Dict, Any, Optional
from langchain.tools import StructuredTool
from pydantic import BaseModel, Field

# 1) Load a CSV file
# Argument schema for the `load_csv` tool.
class LoadCSVInput(BaseModel):
    # Required: location of the CSV file on the local filesystem.
    path: str = Field(
        ...,
        description="本地 CSV 文件路径，比如 './titanic_cleaned.csv'",
    )

def load_csv_tool(path: str) -> Dict[str, Any]:
    """Read a CSV file into the shared state and describe its shape.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A dict with ``rows`` (number of rows) and ``columns`` (list of column
        names) of the loaded frame.
    """
    loaded = pd.read_csv(path)
    # Replaces whatever dataset was loaded before; later tools read STATE.df.
    STATE.df = loaded
    n_rows = loaded.shape[0]
    column_names = loaded.columns.tolist()
    return {"rows": n_rows, "columns": column_names}

# Wrap load_csv_tool as a structured tool registered under the name "load_csv".
load_csv = StructuredTool.from_function(
    name="load_csv",
    func=load_csv_tool,
    args_schema=LoadCSVInput,
    description="加载一个CSV文件到内存，后续所有操作都基于此数据集。",
)

# 2) Data summary (mean, variance, extrema, etc.; numeric columns only)
# Argument schema for the `data_summary` tool.
class SummaryInput(BaseModel):
    # Optional: when omitted, every numeric column is summarised.
    columns: Optional[List[str]] = Field(
        default=None,
        description="要统计的列名列表，空则使用所有数值列",
    )

def data_summary_tool(columns: Optional[List[str]] = None) -> Dict[str, Any]:
    """Summarise numeric columns of the loaded dataset.

    Args:
        columns: Numeric columns to summarise. ``None`` or an empty list
            selects every numeric column.

    Returns:
        ``{"summary": {column: {"mean", "var", "min", "max"}}}``. The summary
        is empty when the dataset has no numeric columns.

    Raises:
        AssertionError: If no dataset has been loaded yet.
        ValueError: If a requested column is missing or is not numeric.
    """
    assert STATE.df is not None, "请先用 load_csv 加载数据"
    num_df = STATE.df.select_dtypes(include=[np.number])
    if columns:
        # Name the offending columns instead of surfacing a bare KeyError from
        # the indexing below; the caller is usually an LLM that needs the hint.
        unusable = [c for c in columns if c not in num_df.columns]
        if unusable:
            raise ValueError(f"以下列不存在或不是数值列: {unusable}")
        num_df = num_df[columns]
    if num_df.shape[1] == 0:
        # Nothing numeric to aggregate.
        return {"summary": {}}
    desc = num_df.agg(["mean", "var", "min", "max"]).to_dict()
    return {"summary": desc}

# Wrap data_summary_tool as a structured tool registered under the name "data_summary".
data_summary = StructuredTool.from_function(
    name="data_summary",
    func=data_summary_tool,
    args_schema=SummaryInput,
    description="对数值列做统计汇总：mean/var/min/max。",
)

# 3) Mean imputation of missing values (numeric columns only; columns may be specified)
# Argument schema for the `impute_mean` tool.
class ImputeMeanInput(BaseModel):
    # Optional: when omitted, every numeric column is imputed.
    columns: Optional[List[str]] = Field(
        default=None,
        description="要填充的数值列，空则为所有数值列",
    )

def impute_mean_tool(columns: Optional[List[str]] = None) -> Dict[str, Any]:
    """Fill missing values in numeric columns with each column's mean.

    The loaded frame in ``STATE.df`` is modified in place.

    Args:
        columns: Columns to impute. ``None`` or an empty list selects every
            numeric column. Unknown or non-numeric names are skipped.

    Returns:
        A dict with ``updated_columns`` (columns that were processed),
        ``n_imputed`` (number of missing cells that were filled) and
        ``skipped_columns`` (requested names that were missing or non-numeric).

    Raises:
        AssertionError: If no dataset has been loaded yet.
    """
    assert STATE.df is not None, "请先用 load_csv 加载数据"
    df = STATE.df
    if columns:
        # De-duplicate while keeping order so a repeated name is not counted twice.
        requested = list(dict.fromkeys(columns))
    else:
        requested = list(df.select_dtypes(include=[np.number]).columns)

    # Filter before any indexing: selecting an unknown column from the frame
    # raises KeyError, which previously happened before the per-column guard ran.
    target_cols = [
        c for c in requested
        if c in df.columns and pd.api.types.is_numeric_dtype(df[c])
    ]
    skipped = [c for c in requested if c not in target_cols]

    before_na = df[target_cols].isna().sum().sum()
    for c in target_cols:
        # A column that is entirely missing has a NaN mean and stays unchanged.
        df[c] = df[c].fillna(df[c].mean())
    after_na = df[target_cols].isna().sum().sum()

    STATE.df = df
    return {
        "updated_columns": target_cols,
        "n_imputed": int(before_na - after_na),
        "skipped_columns": skipped,
    }

# Wrap impute_mean_tool as a structured tool registered under the name "impute_mean".
impute_mean = StructuredTool.from_function(
    name="impute_mean",
    func=impute_mean_tool,
    args_schema=ImputeMeanInput,
    description="将数值列中的缺失值用该列均值进行填充。",
)

# 4) Plot a distribution (Survived by default), save it as a PNG and return the path
# Argument schema for the `plot_distribution` tool.
class PlotDistInput(BaseModel):
    # Column whose distribution is drawn.
    column: str = Field(default="Survived", description="要绘制分布的列名")
    # Number of histogram bins.
    bins: int = Field(
        default=20,
        description="直方图箱数，类别变量自动用条形图",
    )

def plot_distribution_tool(column: str = "Survived", bins: int = 20) -> Dict[str, Any]:
    """Plot the distribution of one column and save it as a PNG file.

    Numeric columns are drawn as a histogram; all other columns as a bar chart
    of value counts. The image is written to the current working directory.

    Args:
        column: Name of the column to plot.
        bins: Number of histogram bins (used for numeric columns only).

    Returns:
        ``{"image_path": <relative path of the saved PNG>}``.

    Raises:
        AssertionError: If no dataset has been loaded yet.
        ValueError: If ``column`` is not present in the dataset.
    """
    import re  # local import: only needed to build a safe file name

    assert STATE.df is not None, "请先用 load_csv 加载数据"
    df = STATE.df
    if column not in df.columns:
        raise ValueError(f"列 {column} 不在数据中")

    # Column names come from the data/LLM and may contain path separators or
    # other characters that are invalid in file names; keep word characters,
    # dots and dashes and replace everything else.
    safe_name = re.sub(r"[^\w.-]+", "_", str(column))
    path = f"plot_{safe_name}.png"

    fig, ax = plt.subplots()
    try:
        series = df[column]
        if pd.api.types.is_numeric_dtype(series):
            series.plot(kind="hist", bins=bins, edgecolor="black", ax=ax)
        else:
            series.value_counts().plot(kind="bar", ax=ax)
        ax.set_xlabel(column)
        ax.set_ylabel("Count")
        ax.set_title(f"Distribution of {column}")
        fig.tight_layout()
        fig.savefig(path, dpi=150)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

    return {"image_path": path}

# Wrap plot_distribution_tool as a structured tool registered under the name "plot_distribution".
plot_distribution = StructuredTool.from_function(
    name="plot_distribution",
    func=plot_distribution_tool,
    args_schema=PlotDistInput,
    description="绘制某一列的分布图，数值列用直方图，类别列用条形图。将图片保存为本地PNG并返回路径。",
)

# 5) Train a model and evaluate it (by default, logistic regression predicting Survived)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

# Argument schema for the `train_model` tool; every field has a default.
class TrainModelInput(BaseModel):
    target: str = Field(default="Survived", description="目标列名")
    features: Optional[List[str]] = Field(
        default=None,
        description="特征列列表，空则自动选择除目标外的所有列",
    )
    model_type: str = Field(default="logreg", description="模型类型。目前支持 logreg")
    test_size: float = Field(default=0.2, description="测试集比例")
    random_state: int = Field(default=42, description="随机种子")

def train_model_tool(
    target: str = "Survived",
    features: Optional[List[str]] = None,
    model_type: str = "logreg",
    test_size: float = 0.2,
    random_state: int = 42,
) -> Dict[str, Any]:
    """Train a classifier on the loaded dataset and evaluate it on a held-out split.

    Numeric features are standardised and all remaining features are one-hot
    encoded before fitting. The fitted pipeline is stored in ``STATE.models``
    under the returned ``model_id``.

    Args:
        target: Name of the column to predict.
        features: Feature columns to use. ``None`` or an empty list selects
            every column except ``target``.
        model_type: Model family; only ``"logreg"`` is supported.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed passed to the train/test split.

    Returns:
        A dict with ``model_id``, ``metrics`` (``accuracy``, weighted ``f1``
        and, for a two-class target when it can be computed, ``roc_auc``) and
        ``confusion_matrix`` as nested lists.

    Raises:
        AssertionError: If no dataset has been loaded yet.
        ValueError: If ``target`` or a requested feature is missing, if no
            feature remains after removing ``target``, or if ``model_type``
            is unsupported.
    """
    import warnings  # local import: only used to report best-effort failures

    assert STATE.df is not None, "请先用 load_csv 加载数据"
    df = STATE.df.copy()
    if target not in df.columns:
        raise ValueError(f"目标列 {target} 不在数据中")
    if model_type != "logreg":
        raise ValueError("暂只支持 logreg")

    if features:
        missing = [c for c in features if c not in df.columns]
        if missing:
            raise ValueError(f"特征列 {missing} 不在数据中")
        unique_features = list(dict.fromkeys(features))
        # Using the target as a feature leaks the label and makes every metric
        # meaningless, so it is removed rather than trusted.
        feature_cols = [c for c in unique_features if c != target]
        if len(feature_cols) != len(unique_features):
            warnings.warn(f"Target column {target!r} was removed from the feature list.")
        if not feature_cols:
            raise ValueError("特征列不能只包含目标列")
        X = df[feature_cols]
    else:
        # None and [] both mean "auto", matching the tool's schema description.
        X = df.drop(columns=[target])
    y = df[target]

    # Split the features into numeric and categorical (everything else) columns.
    num_cols = list(X.select_dtypes(include=[np.number]).columns)
    cat_cols = [c for c in X.columns if c not in num_cols]

    pre = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )
    model = LogisticRegression(max_iter=200)
    pipe = Pipeline(steps=[("pre", pre), ("clf", model)])

    # Stratify only for a small number of classes, and only when every class
    # has at least two rows; a stratified split rejects singleton classes.
    class_counts = y.value_counts()
    can_stratify = len(class_counts) <= 20 and class_counts.min() >= 2
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state,
        stratify=y if can_stratify else None,
    )
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    metrics = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "f1": float(f1_score(y_test, y_pred, average="weighted")),
    }
    # ROC-AUC is best-effort: it needs class probabilities and a binary target.
    # A failure here must not discard the metrics above, but it is reported.
    try:
        if y.nunique() == 2 and hasattr(pipe, "predict_proba"):
            y_proba = pipe.predict_proba(X_test)[:, 1]
            metrics["roc_auc"] = float(roc_auc_score(y_test, y_proba))
    except Exception as exc:
        warnings.warn(f"ROC-AUC could not be computed and was omitted: {exc}")

    cm = confusion_matrix(y_test, y_pred).tolist()
    # Short random identifier under which the fitted pipeline is stored.
    model_id = str(uuid.uuid4())[:8]
    STATE.models[model_id] = pipe

    return {"model_id": model_id, "metrics": metrics, "confusion_matrix": cm}

# Wrap train_model_tool as a structured tool registered under the name "train_model".
train_model = StructuredTool.from_function(
    name="train_model",
    func=train_model_tool,
    args_schema=TrainModelInput,
    description="训练一个用于分类的模型（默认逻辑回归）并返回评估指标与 model_id。",
)

# 6) Predict (preview of the first N rows)
# Argument schema for the `predict` tool.
class PredictInput(BaseModel):
    # Required: identifier returned when the model was trained.
    model_id: str = Field(
        ...,
        description="训练时返回的 model_id",
    )
    # Number of leading rows to predict and return.
    head: int = Field(default=10, description="返回前 N 条预测")

def predict_tool(model_id: str, head: int = 10) -> Dict[str, Any]:
    """Predict with a stored model on the first rows of the loaded dataset.

    Args:
        model_id: Key of a model previously stored in ``STATE.models``.
        head: Number of leading rows to predict.

    Returns:
        ``{"preview": [...]}`` with one record per row, holding the feature
        values passed to the model plus a ``pred`` entry.

    Raises:
        AssertionError: If no dataset has been loaded yet.
        ValueError: If ``model_id`` is unknown, or the loaded dataset lacks a
            column the model was fitted on.
    """
    assert STATE.df is not None, "请先用 load_csv 加载数据"
    if model_id not in STATE.models:
        raise ValueError(f"找不到模型 {model_id}")
    model = STATE.models[model_id]
    df = STATE.df

    # Prefer the columns the model was actually fitted on. Dropping a hardcoded
    # "Survived" is only right for the default target; for any other target it
    # removes a feature and leaves the real label in the input.
    trained_on = getattr(model, "feature_names_in_", None)
    if trained_on is not None:
        feature_cols = list(trained_on)
        missing = [c for c in feature_cols if c not in df.columns]
        if missing:
            raise ValueError(f"当前数据缺少模型所需的特征列: {missing}")
        X = df[feature_cols].head(head)
    else:
        # Fallback for models that do not record their input columns.
        X = df.drop(columns=[c for c in ["Survived"] if c in df.columns]).head(head)

    preds = model.predict(X).tolist()
    out = X.copy()
    out["pred"] = preds
    return {"preview": out.to_dict(orient="records")}

# Wrap predict_tool as a structured tool registered under the name "predict".
predict = StructuredTool.from_function(
    name="predict",
    func=predict_tool,
    args_schema=PredictInput,
    description="使用已训练的模型进行预测，返回前N条样例的预测结果。",
)

# Every tool made available to the agent.
TOOLS = [
    load_csv,
    data_summary,
    impute_mean,
    plot_distribution,
    train_model,
    predict,
]



In [None]:
# Create the agent
from langchain.agents import initialize_agent, AgentType
from langchain_core.callbacks.base import BaseCallbackHandler

class PrintCallback(BaseCallbackHandler):
    """Callback handler that prints chain and tool events to stdout."""

    def on_chain_start(self, *args, **kwargs):
        """Print a marker when a chain starts."""
        print("=== Chain start ===")

    def on_tool_start(self, serialized, input_str, **kwargs):
        """Print the tool name and the raw input it was called with."""
        # NOTE(review): guarded in case a caller passes no serialized payload — confirm.
        tool_name = (serialized or {}).get('name')
        print(f"[Tool start] {tool_name}  input={input_str}")

    def on_tool_end(self, output, **kwargs):
        """Print at most the first 300 characters of the tool output."""
        text = str(output)
        # Show the ellipsis only when something was actually cut off.
        suffix = " ..." if len(text) > 300 else ""
        print(f"[Tool end]   output={text[:300]}{suffix}")

    def on_chain_end(self, outputs, **kwargs):
        """Print a marker when a chain finishes."""
        print("=== Chain end ===")

# Handlers attached to the agent executor.
callbacks = [PrintCallback()]

system_prompt = (
    "You are a data-exploration assistant. "
    "You MUST use the provided tools to load data, summarize, impute, plot, train and predict. "
    "If the user request is ambiguous, ask for clarification or make sensible defaults and state them."
)

agent = initialize_agent(
    tools=TOOLS,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    handle_parsing_errors=True,
    # The structured-chat agent assembles its system message from
    # prefix + tool list + format instructions + suffix. It has no
    # `system_message` option, so the instructions go in `prefix`, which must
    # still introduce the tool list that follows it.
    agent_kwargs={"prefix": f"{system_prompt} You have access to the following tools:"},
    callbacks=callbacks,
)


In [8]:
# Ask the agent to load ./titanic_cleaned.csv with a tool and report the row count and all column names.
agent.run("用工具把 ./titanic_cleaned.csv 加载进来，并告诉我有多少行，以及所有列名。")


=== Chain start ===


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to load the CSV file to check the number of rows and the column names. 

Action:
```
{
  "action": "load_csv",
  "action_input": {
    "path": "./titanic_cleaned.csv"
  }
}
```
[0m
Observation: [36;1m[1;3m{'rows': 891, 'columns': ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']}[0m
Thought:[32;1m[1;3mI have successfully loaded the CSV file and obtained the required information.

Action:
```
{
  "action": "Final Answer",
  "action_input": "The dataset contains 891 rows and the following columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']."
}
```
[0m
=== Chain end ===

[1m> Finished chain.[0m


"The dataset contains 891 rows and the following columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']."

In [9]:
# Ask the agent for a statistical summary (mean, variance, minimum, maximum) of the numeric columns.
agent.run("请给出数值列的统计汇总，包括均值、方差、最小值和最大值。")


=== Chain start ===


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to load the CSV file first to access the numerical columns for statistical summary. I will ask for the file path to proceed.

Action:
```
{
  "action": "load_csv",
  "action_input": {
    "path": "./titanic_cleaned.csv"
  }
}
```
[0m
Observation: [36;1m[1;3m{'rows': 891, 'columns': ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']}[0m
Thought:[32;1m[1;3mI have successfully loaded the CSV file and can now access the numerical columns for statistical summary. I will proceed to generate the summary statistics including mean, variance, minimum, and maximum for all numerical columns.

Action:
```
{
  "action": "data_summary",
  "action_input": {
    "columns": null
  }
}
```
[0m
Observation: [33;1m[1;3m{'summary': {'PassengerId': {'mean': 446.0, 'var': 66231.0, 'min': 1.0, 'max': 891.0}, 'Survived': {'mean': 0.3838383838

'I have successfully generated the statistical summary for the numerical columns, which includes the mean, variance, minimum, and maximum values. Here are the results:\n\n- **PassengerId**: \n  - Mean: 446.0\n  - Variance: 66231.0\n  - Min: 1.0\n  - Max: 891.0\n\n- **Survived**: \n  - Mean: 0.384\n  - Variance: 0.237\n  - Min: 0.0\n  - Max: 1.0\n\n- **Pclass**: \n  - Mean: 2.309\n  - Variance: 0.699\n  - Min: 1.0\n  - Max: 3.0\n\n- **Age**: \n  - Mean: 29.699\n  - Variance: 169.052\n  - Min: 0.42\n  - Max: 80.0\n\n- **SibSp**: \n  - Mean: 0.523\n  - Variance: 1.216\n  - Min: 0.0\n  - Max: 8.0\n\n- **Parch**: \n  - Mean: 0.382\n  - Variance: 0.650\n  - Min: 0.0\n  - Max: 6.0\n\n- **Fare**: \n  - Mean: 32.204\n  - Variance: 2469.437\n  - Min: 0.0\n  - Max: 512.329\n\nIf you need any further analysis or information, feel free to ask!'

In [10]:
# Ask the agent to plot the distribution of the Survived column and report where the image was saved.
agent.run("画一下 Survived 的分布图，并告诉我图片的保存路径。")


=== Chain start ===


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to plot the distribution of the "Survived" column and save the image, then provide the path to the saved image. First, I will load the dataset to access the "Survived" column.

Action:
```
{
  "action": "load_csv",
  "action_input": {
    "path": "./titanic_cleaned.csv"
  }
}
```
[0m
Observation: [36;1m[1;3m{'rows': 891, 'columns': ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']}[0m
Thought:[32;1m[1;3mI have successfully loaded the dataset and can now plot the distribution of the "Survived" column.

Action:
```
{
  "action": "plot_distribution",
  "action_input": {
    "column": "Survived",
    "bins": 20
  }
}
```

[0m
Observation: [36;1m[1;3m{'image_path': 'plot_Survived.png'}[0m
Thought:[32;1m[1;3mI have plotted the distribution of the "Survived" column and saved the image. The image is saved at the followin

'I have plotted the distribution of the "Survived" column and saved the image. The image is saved at the following path:\n\n'

In [11]:
# Ask the agent to train a logistic-regression model predicting Survived and to report
# accuracy, F1, ROC-AUC when available, and the model_id; then print the final answer.
resp = agent.run("用逻辑回归训练一个预测 Survived 的模型，给出准确率、F1、如可用再给 ROC-AUC，并返回 model_id。")
print(resp)


=== Chain start ===


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: 首先，我需要加载数据集，然后训练一个逻辑回归模型来预测 "Survived"。在训练模型时，我将使用默认的特征列和测试集比例。训练完成后，我将获取模型的评估指标，包括准确率、F1 分数和 ROC-AUC（如果可用）。接下来，我将返回 model_id。

Action:
```
{
  "action": "load_csv",
  "action_input": {
    "path": "./titanic_cleaned.csv"
  }
}
```
[0m
Observation: [36;1m[1;3m{'rows': 891, 'columns': ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']}[0m
Thought:[32;1m[1;3m我已经成功加载了数据集。接下来，我将训练一个逻辑回归模型来预测 "Survived"，并获取模型的评估指标。

Action:
```
{
  "action": "train_model",
  "action_input": {
    "target": "Survived",
    "features": null,
    "model_type": "logreg",
    "test_size": 0.2,
    "random_state": 42
  }
}
```

[0m
Observation: [33;1m[1;3m{'model_id': 'a313126e', 'metrics': {'accuracy': 0.8268156424581006, 'f1': 0.8248588228685629, 'roc_auc': 0.8545454545454545}, 'confusion_matrix': [[98, 12], [19, 50]]}[0m
Thought:[32;1m[1;3m我已经成功

In [13]:
# Ask the agent to predict the first 10 samples with the previously trained model.
# NOTE(review): the model_id is hardcoded in the prompt, but train_model_tool derives
# each id from uuid4, so this value presumably has to be updated after retraining.
agent.run("用上一步训练好的模型进行预测，model_id为a313126e，返回前10条样例的预测结果。")


=== Chain start ===


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```
{
  "action": "predict",
  "action_input": {
    "model_id": "a313126e",
    "head": 10
  }
}
```
[0m
Observation: [38;5;200m[1;3m{'preview': [{'PassengerId': 1, 'Pclass': 3, 'Name': 'Braund, Mr. Owen Harris', 'Sex': 'male', 'Age': 22.0, 'SibSp': 1, 'Parch': 0, 'Ticket': 'A/5 21171', 'Fare': 7.25, 'Cabin': 'C92', 'Embarked': 'S', 'pred': 0}, {'PassengerId': 2, 'Pclass': 1, 'Name': 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 'Sex': 'female', 'Age': 38.0, 'SibSp': 1, 'Parch': 0, 'Ticket': 'PC 17599', 'Fare': 71.2833, 'Cabin': 'C85', 'Embarked': 'C', 'pred': 1}, {'PassengerId': 3, 'Pclass': 3, 'Name': 'Heikkinen, Miss. Laina', 'Sex': 'female', 'Age': 26.0, 'SibSp': 0, 'Parch': 0, 'Ticket': 'STON/O2. 3101282', 'Fare': 7.925, 'Cabin': 'C92', 'Embarked': 'S', 'pred': 1}, {'PassengerId': 4, 'Pclass': 1, 'Name': 'Futrelle, Mrs. Jacques Heath (Lily May Peel)', 'Sex': 'female', 'Age': 35.0,

'I have successfully used the trained model with model_id "a313126e" to make predictions. Here are the predictions for the first 10 samples:\n\n1. **PassengerId**: 1, **Predicted Survival**: 0\n2. **PassengerId**: 2, **Predicted Survival**: 1\n3. **PassengerId**: 3, **Predicted Survival**: 1\n4. **PassengerId**: 4, **Predicted Survival**: 1\n5. **PassengerId**: 5, **Predicted Survival**: 0\n6. **PassengerId**: 6, **Predicted Survival**: 0\n7. **PassengerId**: 7, **Predicted Survival**: 0\n8. **PassengerId**: 8, **Predicted Survival**: 0\n9. **PassengerId**: 9, **Predicted Survival**: 1\n10. **PassengerId**: 10, **Predicted Survival**: 1\n\nIf you need further analysis or details, feel free to ask!'