In [None]:
from pathlib import Path
from multi_agents import BusinessAnalyst
from autogen import ConversableAgent, AssistantAgent, LLMConfig, UserProxyAgent, GroupChatManager, GroupChat
from autogen.coding.jupyter import LocalJupyterServer, JupyterCodeExecutor
from utils.utils import request_clarification

def custom_speaker_selection_func(last_speaker: ConversableAgent, group_chat: GroupChat):
    """Choose the next speaker for the data-exploration group chat.

    Rules, checked in order:

    1. No message or only the opening message so far -> "Data_Explorer".
    2. The latest message has role "tool" -> "User".
    3. "Data_Explorer" just spoke -> "CodeExecutor" when its message contains
       the text "python", otherwise "Project_Manager".
    4. "CodeExecutor" or "User" just spoke -> the agent named on the message
       before theirs, so control goes back to whoever they were answering.
       If that message carries no name, fall through to rule 5.
    5. Anything else -> "Project_Manager".

    Args:
        last_speaker: Agent that produced the latest message.
        group_chat: Group chat whose ``messages`` history and
            ``agent_by_name`` lookup drive the decision.

    Returns:
        Whatever ``group_chat.agent_by_name`` returns for the chosen name.
    """
    history = group_chat.messages
    if len(history) <= 1:
        return group_chat.agent_by_name("Data_Explorer")

    latest = history[-1]
    if latest.get("role") == "tool":
        return group_chat.agent_by_name("User")

    if last_speaker.name == "Data_Explorer":
        # ``content`` may be missing or None (e.g. a message that only carries
        # tool calls); normalise it so the substring test cannot raise.
        content = latest.get("content") or ""
        if not isinstance(content, str):
            content = str(content)
        next_name = "CodeExecutor" if "python" in content else "Project_Manager"
        return group_chat.agent_by_name(next_name)

    if last_speaker.name in ("CodeExecutor", "User"):
        previous_name = history[-2].get("name")
        if previous_name:
            return group_chat.agent_by_name(previous_name)
        # No author recorded on the previous message: use the default below.

    return group_chat.agent_by_name("Project_Manager")

# Model/provider settings for the LLM-backed agents, read from a JSON file.
llm_config = LLMConfig.from_json(path="configs/llm_config.json")

# Directory handed to the Jupyter code executor as its output_dir.
output_dir = Path("./artifacts")
output_dir.mkdir(parents=True, exist_ok=True)

# The server is given a log path inside ./logs; create that directory first so
# opening the log file cannot fail on a fresh checkout (same treatment as
# ./artifacts above).
log_dir = Path("./logs")
log_dir.mkdir(parents=True, exist_ok=True)

server = LocalJupyterServer(log_file='./logs/jupyter_gateway.log')
executor = JupyterCodeExecutor(server, output_dir=output_dir)

# Agent with no model attached; it is wired to the Jupyter executor so that it
# can run the code other agents post.
code_executor = ConversableAgent(
    name="CodeExecutor",
    code_execution_config={"executor": executor},  # runs fenced ```python blocks
    human_input_mode="NEVER",
    llm_config=False,  # tool-only / non-reasoning
)

# LLM-backed agent that writes exploratory-analysis code for the dataset.
# custom_speaker_selection_func routes its messages to "CodeExecutor" whenever
# they contain the text "python", which is why the prompt asks for fenced
# ```python cells. It never asks for human input.
# NOTE(review): the prompt's last sentence ends with a dangling ", " and looks
# unfinished - confirm whether text was lost.
data_explorer = AssistantAgent(
    name="Data_Explorer",
    llm_config=llm_config,
    human_input_mode="NEVER",
    system_message="""You are the data explorer. Given a dataset and a task, please write code to explore and understand the properties of the dataset.
        For example, you can:
        - get the shape of the dataset
        - get the first several rows of the dataset
        - get the information of the dataset use `df.info()` or `df.describe()`
        - plot the plots as needed (i.e. histogram, distribution)
        - check the missing values
        Only perform necessary data exploration steps.
        When you need to execute Python (load CSVs, transform data, plot), write a complete Python cell fenced with ```python ...``` so the CodeExecutor can run it, 
    """,
)

# LLM-backed coordinator: answers the other agents' questions and reviews
# their results. It is the default speaker chosen by
# custom_speaker_selection_func, and it is the only agent given the
# request_clarification tool (imported from utils.utils).
# NOTE(review): "agets'" in the prompt below is presumably a typo for
# "agents'"; it is runtime text sent to the model, so it is left untouched here.
project_manager = AssistantAgent(
    name="Project_Manager",
    llm_config=llm_config,
    human_input_mode="NEVER",
    system_message="""
        You are the project manager. Your role is to answer other agets' questions and evaluate the results.
        if you are not sure about something, use the request_clarification tool to ask the user for more specifics
    """,
    functions = [request_clarification]
)

# Proxy for the human user; any code it executes runs without Docker.
user = UserProxyAgent(
    name="User",
    code_execution_config={"use_docker": False},
)

# Task description intended as the opening message of the group chat.
task_prompt = "Explore the dataset data/data_science_salaries/data_science_salaries.csv and provide insights."

# Four-agent group chat, capped at 10 rounds, whose turn order is decided by
# custom_speaker_selection_func.
pattern = GroupChat(
    max_round=10,
    speaker_selection_method=custom_speaker_selection_func,
    agents=[user, data_explorer, code_executor, project_manager],
)

# Disabled group-chat entry point: it would wrap `pattern` in a
# GroupChatManager and open the chat with `task_prompt`.
# group_chat_manager = GroupChatManager(pattern, llm_config=llm_config)

# result = user.initiate_chat(group_chat_manager, message=task_prompt)

# Live entry point: the User agent talks directly to a BusinessAnalyst instance
# (imported from multi_agents). Neither `pattern` nor `task_prompt` defined
# earlier in this cell is used on this path.
# NOTE(review): the message names no dataset and "Analyst for me this data"
# reads like a typo for "Analyze this data for me" - confirm the intended text.
bis_analyst = BusinessAnalyst()
user.initiate_chat(bis_analyst, message="Analyst for me this data")

In [None]:
import json
import os
from io import StringIO
from typing import Optional, List, Dict

import pandas as pd
from pydantic import BaseModel, Field
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from autogen import (
    ConversableAgent,
    UserProxyAgent,
    LLMConfig,
)
from autogen.agentchat import initiate_group_chat
from autogen.agentchat.group import (
    AgentTarget,
    AgentNameTarget,
    RevertToUserTarget,
    ContextVariables,
)
from autogen.agentchat.group.patterns import DefaultPattern

# Placeholder LLM configuration for the pipeline agents defined below; replace
# the provider/model values with your real settings before running.
# This rebinds the `llm_config` name that the first cell loads from
# configs/llm_config.json.
# NOTE(review): cache_seed=None presumably disables response caching - confirm
# against the autogen documentation.
llm_config = LLMConfig(api_type="openai", model="gpt-4", cache_seed=None)

# --- Pydantic Models for Data Science Workflow ---

# Structured return value of ingest_data(). Its failure paths report
# data_loaded=False, zero row/column counts and a populated error_message;
# on success error_message keeps its default of None.
class DataIngestionResult(BaseModel):
    """Result of the data ingestion stage."""
    data_loaded: bool = Field(..., description="Indicates if the data was loaded successfully.")
    # ingest_data() stores the file path it was given here.
    dataset_name: str = Field(..., description="Name of the dataset loaded.")
    num_rows: int = Field(..., description="Number of rows in the loaded dataset.")
    num_columns: int = Field(..., description="Number of columns in the loaded dataset.")
    error_message: Optional[str] = Field(None, description="Error message if ingestion failed.")

# Structured return value of clean_data(). On failure that function reports
# zero counts, False for every flag and the exception text in error_message.
class DataCleaningResult(BaseModel):
    """Result of the data cleaning stage."""
    cleaning_successful: bool = Field(..., description="Indicates if data cleaning was successful.")
    rows_after_cleaning: int = Field(..., description="Number of rows remaining after cleaning.")
    columns_after_cleaning: int = Field(..., description="Number of columns remaining after cleaning.")
    # clean_data() sets this to True on every successful run.
    missing_values_handled: bool = Field(..., description="Indicates if missing values were handled.")
    # clean_data() sets this to True only when dropping duplicates reduced the row count.
    duplicates_removed: bool = Field(..., description="Indicates if duplicate rows were removed.")
    error_message: Optional[str] = Field(None, description="Error message if cleaning failed.")

# Structured return value of engineer_features(). On failure that function
# reports an empty feature_list, an empty target_variable and the exception
# text in error_message.
class FeatureEngineeringResult(BaseModel):
    """Result of the feature engineering stage."""
    features_created: bool = Field(..., description="Indicates if new features were successfully created.")
    # engineer_features() lists every dataframe column except the target here.
    feature_list: List[str] = Field(..., description="List of features for model training.")
    target_variable: str = Field(..., description="The target variable for the model.")
    error_message: Optional[str] = Field(None, description="Error message if feature engineering failed.")

# Structured return value of train_model(). On failure that function reports
# an empty model_type, 0.0 accuracy and the exception text in error_message.
class ModelTrainingResult(BaseModel):
    """Result of the model training stage."""
    model_trained: bool = Field(..., description="Indicates if the model was trained successfully.")
    model_type: str = Field(..., description="Type of model that was trained (e.g., RandomForestClassifier).")
    # train_model() scores the model on the same split it was fitted on.
    training_accuracy: float = Field(..., description="Accuracy of the model on the training set.")
    error_message: Optional[str] = Field(None, description="Error message if training failed.")

# Structured return value of evaluate_model(). Unlike the other result models
# it has no error_message field: on failure evaluate_model() places the
# exception text in model_performance_summary and returns an empty report.
class ModelEvaluationResult(BaseModel):
    """Result of the model evaluation stage."""
    evaluation_complete: bool = Field(..., description="Indicates if the model evaluation is complete.")
    test_accuracy: float = Field(..., description="Accuracy of the model on the test set.")
    # Filled from sklearn's classification_report(..., output_dict=True).
    classification_report: Dict = Field(..., description="Classification report including precision, recall, f1-score.")
    model_performance_summary: str = Field(..., description="A summary of the model's performance.")


# --- Shared Context for the Pipeline ---

# State shared by every pipeline stage. The stage tools read their inputs from
# it and write their outputs and error details back into it.
shared_context = ContextVariables(
    data={
        # Progress flags, all initially unset.
        **dict.fromkeys(
            (
                "pipeline_started",
                "data_ingestion_completed",
                "data_cleaning_completed",
                "feature_engineering_completed",
                "model_training_completed",
                "model_evaluation_completed",
            ),
            False,
        ),
        # Error bookkeeping written by the stage tools' exception handlers.
        "has_error": False,
        "error_message": "",
        "error_stage": "",
        # Artifacts handed from one stage to the next; None until produced.
        **dict.fromkeys(
            ("raw_dataframe", "cleaned_dataframe", "features", "target", "model")
        ),
    }
)


# --- Pipeline Stage Functions (Tools for Agents) ---

def ingest_data(file_path: str, context_variables: ContextVariables):
    """Load the CSV at ``file_path`` and publish it to the shared context.

    On success the dataframe is stored as JSON under ``raw_dataframe`` and
    ``data_ingestion_completed`` is set. On any failure the ``has_error``,
    ``error_stage`` and ``error_message`` entries are filled in instead; the
    function never raises.

    Args:
        file_path: Path of the CSV file to read.
        context_variables: Shared pipeline state, updated in place.

    Returns:
        DataIngestionResult describing the loaded data, or a failed result
        (``data_loaded=False``, zero counts) carrying the error message.
    """
    # Ingestion is the first stage, so reaching it marks the pipeline as
    # started; no other stage sets this flag.
    context_variables["pipeline_started"] = True
    try:
        df = pd.read_csv(file_path)
        context_variables["raw_dataframe"] = df.to_json()
        context_variables["data_ingestion_completed"] = True
        return DataIngestionResult(
            data_loaded=True,
            dataset_name=file_path,
            num_rows=len(df),
            num_columns=len(df.columns),
        )
    except Exception as exc:
        # A missing file gets a friendlier message than the raw exception text.
        if isinstance(exc, FileNotFoundError):
            error_msg = f"Error: The file was not found at the path: {file_path}"
        else:
            error_msg = str(exc)
        context_variables["has_error"] = True
        context_variables["error_stage"] = "data_ingestion"
        context_variables["error_message"] = error_msg
        return DataIngestionResult(
            data_loaded=False,
            dataset_name=file_path,
            num_rows=0,
            num_columns=0,
            error_message=error_msg,
        )


def clean_data(context_variables: ContextVariables):
    """Clean the ingested data and publish the result to the shared context.

    Reads the JSON stored under ``raw_dataframe``, fills missing ``age``
    values with the column median (when that column exists), drops duplicate
    rows, and stores the cleaned frame as JSON under ``cleaned_dataframe``.
    Any failure is recorded in the context's error entries; the function
    never raises.

    Args:
        context_variables: Shared pipeline state, updated in place.

    Returns:
        DataCleaningResult with the post-cleaning shape, or a failed result
        carrying the error message.
    """
    try:
        raw_json = context_variables["raw_dataframe"]
        if raw_json is None:
            raise ValueError("No raw data available; run ingest_data first.")
        # Wrap the payload in a buffer: passing literal JSON text to read_json
        # is deprecated in pandas.
        df = pd.read_json(StringIO(raw_json))

        # Handle missing values. Assign the filled column back rather than
        # using inplace=True on the column selection, which does not reliably
        # modify the parent frame (it is a no-op under Copy-on-Write).
        if 'age' in df.columns:
            df['age'] = df['age'].fillna(df['age'].median())
        # Only 'age' is imputed; the flag is reported as True on every
        # successful run.
        missing_handled = True

        # Remove duplicates
        initial_rows = len(df)
        df = df.drop_duplicates()
        duplicates_removed = initial_rows > len(df)

        context_variables["cleaned_dataframe"] = df.to_json()
        context_variables["data_cleaning_completed"] = True
        return DataCleaningResult(
            cleaning_successful=True,
            rows_after_cleaning=len(df),
            columns_after_cleaning=len(df.columns),
            missing_values_handled=missing_handled,
            duplicates_removed=duplicates_removed,
        )
    except Exception as e:
        context_variables["has_error"] = True
        context_variables["error_stage"] = "data_cleaning"
        context_variables["error_message"] = str(e)
        return DataCleaningResult(
            cleaning_successful=False,
            rows_after_cleaning=0,
            columns_after_cleaning=0,
            missing_values_handled=False,
            duplicates_removed=False,
            error_message=str(e),
        )

def engineer_features(context_variables: ContextVariables):
    """Derive the model inputs from the cleaned data.

    Reads the JSON stored under ``cleaned_dataframe``, one-hot encodes the
    ``department`` column when present, and splits the frame into features
    (every column except the target) and the ``has_purchased`` target. Both
    are stored as JSON under ``features`` and ``target``. Any failure,
    including a missing target column, is recorded in the context's error
    entries; the function never raises.

    Args:
        context_variables: Shared pipeline state, updated in place.

    Returns:
        FeatureEngineeringResult listing the feature columns and the target,
        or a failed result carrying the error message.
    """
    try:
        cleaned_json = context_variables["cleaned_dataframe"]
        if cleaned_json is None:
            raise ValueError("No cleaned data available; run clean_data first.")
        # Wrap the payload in a buffer: passing literal JSON text to read_json
        # is deprecated in pandas.
        df = pd.read_json(StringIO(cleaned_json))

        # One-hot encode categorical features
        if 'department' in df.columns:
            df = pd.get_dummies(df, columns=['department'], drop_first=True)

        target_variable = 'has_purchased'
        if target_variable not in df.columns:
            raise ValueError(f"Target variable '{target_variable}' not found in the dataframe.")

        feature_list = [col for col in df.columns if col != target_variable]

        context_variables["features"] = df[feature_list].to_json()
        context_variables["target"] = df[target_variable].to_json()
        context_variables["feature_engineering_completed"] = True

        return FeatureEngineeringResult(
            features_created=True,
            feature_list=feature_list,
            target_variable=target_variable,
        )
    except Exception as e:
        context_variables["has_error"] = True
        context_variables["error_stage"] = "feature_engineering"
        context_variables["error_message"] = str(e)
        return FeatureEngineeringResult(
            features_created=False,
            feature_list=[],
            target_variable="",
            error_message=str(e),
        )


def train_model(context_variables: ContextVariables):
    """Fit a RandomForestClassifier on the engineered features.

    Reads ``features`` and ``target`` from the context, keeps the 80% training
    part of a fixed-seed split, fits the model, and stores the fitted model
    object under ``model``. Any failure is recorded in the context's error
    entries; the function never raises.

    Args:
        context_variables: Shared pipeline state, updated in place.

    Returns:
        ModelTrainingResult with the accuracy measured on the training split,
        or a failed result carrying the error message.
    """
    try:
        features_json = context_variables["features"]
        target_json = context_variables["target"]
        if features_json is None or target_json is None:
            raise ValueError("No engineered features available; run engineer_features first.")
        # Wrap the payloads in buffers: passing literal JSON text to read_json
        # is deprecated in pandas.
        X = pd.read_json(StringIO(features_json))
        y = pd.read_json(StringIO(target_json), typ='series')

        # evaluate_model repeats this split with the same test_size and
        # random_state, so it scores on the rows held out here.
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

        model = RandomForestClassifier(random_state=42)
        model.fit(X_train, y_train)

        # The fitted estimator is kept in the context as a live Python object.
        context_variables["model"] = model
        context_variables["model_training_completed"] = True

        train_accuracy = accuracy_score(y_train, model.predict(X_train))

        return ModelTrainingResult(
            model_trained=True,
            model_type="RandomForestClassifier",
            training_accuracy=train_accuracy,
        )
    except Exception as e:
        context_variables["has_error"] = True
        context_variables["error_stage"] = "model_training"
        context_variables["error_message"] = str(e)
        return ModelTrainingResult(
            model_trained=False,
            model_type="",
            training_accuracy=0.0,
            error_message=str(e),
        )

def evaluate_model(context_variables: ContextVariables):
    """Score the trained model on the held-out test split.

    Reads ``features``, ``target`` and ``model`` from the context, rebuilds
    the split with the same ``test_size`` and ``random_state`` that
    train_model uses, and computes accuracy plus a classification report on
    the 20% test part. Any failure is recorded in the context's error entries;
    the function never raises.

    Args:
        context_variables: Shared pipeline state, updated in place.

    Returns:
        ModelEvaluationResult with the metrics. On failure the result has
        ``evaluation_complete=False``, an empty report, and the exception text
        in ``model_performance_summary``.
    """
    try:
        features_json = context_variables["features"]
        target_json = context_variables["target"]
        model = context_variables["model"]
        if features_json is None or target_json is None:
            raise ValueError("No engineered features available; run engineer_features first.")
        if model is None:
            raise ValueError("No trained model available; run train_model first.")

        # Wrap the payloads in buffers: passing literal JSON text to read_json
        # is deprecated in pandas.
        X = pd.read_json(StringIO(features_json))
        y = pd.read_json(StringIO(target_json), typ='series')

        _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        predictions = model.predict(X_test)

        test_accuracy = accuracy_score(y_test, predictions)
        report = classification_report(y_test, predictions, output_dict=True, zero_division=0)

        summary = f"Model achieved a test accuracy of {test_accuracy:.2f}."

        context_variables["model_evaluation_completed"] = True

        return ModelEvaluationResult(
            evaluation_complete=True,
            test_accuracy=test_accuracy,
            classification_report=report,
            model_performance_summary=summary,
        )
    except Exception as e:
        context_variables["has_error"] = True
        context_variables["error_stage"] = "model_evaluation"
        context_variables["error_message"] = str(e)
        return ModelEvaluationResult(
            evaluation_complete=False,
            test_accuracy=0.0,
            classification_report={},
            model_performance_summary=str(e),
        )

# --- Conversable Agents for Each Pipeline Stage ---

def _make_stage_agent(agent_name, instructions, tool):
    """Build the agent for one pipeline stage, exposing a single tool to it."""
    return ConversableAgent(
        name=agent_name,
        system_message=instructions,
        llm_config=llm_config,
        functions=[tool],
    )


# One agent per stage, created in pipeline order inside the llm_config context.
with llm_config:
    entry_agent = _make_stage_agent(
        "Data_Ingestion_Agent",
        "Your task is to start the data science pipeline by loading the data from a user-provided file path. Call the ingest_data tool with the file path.",
        ingest_data,
    )

    cleaning_agent = _make_stage_agent(
        "Data_Cleaning_Agent",
        "Your task is to clean the dataset. Handle missing values and remove duplicates by calling the clean_data tool.",
        clean_data,
    )

    feature_agent = _make_stage_agent(
        "Feature_Engineering_Agent",
        "Your task is to perform feature engineering. Create new features from the cleaned data by calling the engineer_features tool.",
        engineer_features,
    )

    training_agent = _make_stage_agent(
        "Model_Training_Agent",
        "Your task is to train a machine learning model on the engineered features. Call the train_model tool.",
        train_model,
    )

    evaluation_agent = _make_stage_agent(
        "Model_Evaluation_Agent",
        "Your task is to evaluate the trained model's performance on the test data. Call the evaluate_model tool and provide a summary.",
        evaluate_model,
    )

# --- User Proxy Agent ---
# Non-interactive stand-in for the user: never prompts for input, never
# executes code, and auto-replies at most 10 times in a row.
user_proxy = UserProxyAgent(
    name="user_proxy",
    # Stop once a message contains "FINAL REPORT". `content` can be present
    # but None, so fall back to an empty string before the substring test.
    is_termination_msg=lambda x: "FINAL REPORT" in (x.get("content") or ""),
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    code_execution_config=False,
)

# Stages in execution order; each one hands off to its right-hand neighbour
# once its own work is done.
_STAGE_SEQUENCE = (
    entry_agent,
    cleaning_agent,
    feature_agent,
    training_agent,
    evaluation_agent,
)

for _current_stage, _next_stage in zip(_STAGE_SEQUENCE, _STAGE_SEQUENCE[1:]):
    _current_stage.handoffs.set_after_work(AgentTarget(_next_stage))

# The final stage has no successor and returns control to the user agent.
evaluation_agent.handoffs.set_after_work(RevertToUserTarget())



# --- Define the Agent Flow ---
agent_pattern = DefaultPattern(
    context_variables=shared_context,
    user_agent=user_proxy,
    agents=list(_STAGE_SEQUENCE),
    initial_agent=entry_agent,
)


# --- Function to run the pipeline ---
def _find_evaluation_report(chat_history):
    """Return the newest message payload that looks like an evaluation result.

    Walks ``chat_history`` from the end and returns the first message content
    that parses as a JSON object containing ``model_performance_summary``.
    Returns ``None`` when no message qualifies.
    """
    for message in reversed(chat_history):
        content = message.get("content") if isinstance(message, dict) else None
        if not isinstance(content, str):
            continue  # nothing to parse (missing or None content)
        try:
            payload = json.loads(content)
        except json.JSONDecodeError:
            continue
        if isinstance(payload, dict) and "model_performance_summary" in payload:
            return payload
    return None


def run_data_science_pipeline(file_path: str):
    """Run the multi-agent pipeline on ``file_path`` and print a report.

    Starts the group chat defined by ``agent_pattern`` (at most 20 rounds),
    then inspects the final context variables. If a stage recorded an error,
    the failing stage and message are printed. Otherwise the evaluation
    summary, test accuracy and classification report are printed when an
    evaluation result can be found in the chat history.

    Args:
        file_path: Path of the data file, passed to the agents in the opening
            message.
    """
    print(f"🚀 Starting Data Science Pipeline for file: {file_path}...\n")

    # initiate_group_chat returns a 3-tuple: the chat result, the final
    # context variables and the last agent to speak.
    chat_result, final_context, _last_agent = initiate_group_chat(
        pattern=agent_pattern,
        messages=f"Start the data science pipeline. The data is located at: {file_path}",
        max_rounds=20,
    )

    print("\n🏁 Data Science Pipeline Finished.\n")

    if final_context["has_error"]:
        print(f"❌ Pipeline failed at stage: {final_context['error_stage']}")
        print(f"   Error: {final_context['error_message']}")
        return

    print("✅ Pipeline completed successfully!\n")
    print("--- FINAL REPORT ---")

    # Search the whole history instead of assuming the evaluation result sits
    # at a fixed position from the end.
    chat_history = chat_result.chat_history or []
    result_json = _find_evaluation_report(chat_history)
    if result_json is None:
        print(
            "Could not parse final report from agent message. "
            "Error: no JSON evaluation result found in the chat history"
        )
        if chat_history:
            print("Last message content:", chat_history[-1].get('content', ''))
        return

    summary = result_json.get('model_performance_summary', "No summary found.")
    test_accuracy = result_json.get('test_accuracy', 0)
    report = result_json.get('classification_report', {})

    print(f"📄 Summary: {summary}")
    try:
        print(f"🎯 Test Accuracy: {float(test_accuracy):.4f}")
    except (TypeError, ValueError):
        # Non-numeric value in the payload: show it unformatted.
        print(f"🎯 Test Accuracy: {test_accuracy}")
    print("\n📊 Classification Report:")
    print(json.dumps(report, indent=2))


# Script entry point: run the full pipeline on data/house_prices/train.csv.
# NOTE(review): engineer_features() fails unless the data has a
# 'has_purchased' column - confirm this file actually contains one.
if __name__ == "__main__":
    run_data_science_pipeline(file_path="data/house_prices/train.csv")
