In [None]:
!pip install pyautogen google-generativeai pandas matplotlib autogen

Collecting autogen
  Downloading autogen-0.9.6-py3-none-any.whl.metadata (24 kB)
Collecting ag2==0.9.6 (from autogen)
  Downloading ag2-0.9.6-py3-none-any.whl.metadata (35 kB)
Collecting asyncer==0.0.8 (from ag2==0.9.6->autogen)
  Downloading asyncer-0.0.8-py3-none-any.whl.metadata (6.7 kB)
Collecting diskcache (from ag2==0.9.6->autogen)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting docker (from ag2==0.9.6->autogen)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting python-dotenv (from ag2==0.9.6->autogen)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading autogen-0.9.6-py3-none-any.whl (13 kB)
Downloading ag2-0.9.6-py3-none-any.whl (859 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 859.2/859.2 kB 19.2 MB/s eta 0:00:00
Downloading asyncer-0.0.8-py3-none-any.whl (9.2 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Any, Optional
import autogen_agentchat as autogen
from autogen import AssistantAgent, UserProxyAgent, GroupChat, GroupChatManager
import google.generativeai as genai
from io import StringIO
import warnings
warnings.filterwarnings('ignore')

# Configuration for Gemini
class GeminiConfig:
    """Hold the Gemini API key and model name used to build LLM configs.

    Constructing an instance also calls ``genai.configure`` with the key.
    """

    def __init__(self, api_key: str, model_name: str = "gemini-1.5-flash"):
        """Configure the ``genai`` client and remember the key and model.

        Args:
            api_key: Gemini API key.
            model_name: Name of the Gemini model to request.
        """
        genai.configure(api_key=api_key)
        # Keep the key so get_config() hands the real credential to callers
        # instead of a placeholder string.
        self.api_key = api_key
        self.model_name = model_name

    def get_config(self) -> Dict[str, Any]:
        """Return one ``config_list`` entry for this model.

        Returns:
            A dict with the ``model``, ``api_key`` and ``api_type`` keys.
        """
        return {
            "model": self.model_name,
            "api_key": self.api_key,
            "api_type": "gemini",
        }

class EDAMultiAgentSystem:
    def __init__(self, gemini_api_key: str):
        """Create the Gemini configuration, empty state and the agents.

        Args:
            gemini_api_key: API key forwarded to ``GeminiConfig``.
        """
        self.gemini_config = GeminiConfig(gemini_api_key)
        # No dataset is loaded yet; each phase stores its output in results.
        self.data, self.results = None, {}
        self.agents = {}
        self.setup_agents()

    def setup_agents(self):
        """Initialize all agents with their specific roles and configurations.

        Populates ``self.agents`` with five ``AssistantAgent`` instances
        (keys ``data_prep``, ``eda``, ``report``, ``critic``, ``executor``)
        and one ``UserProxyAgent`` (key ``admin``). Every assistant receives
        the same ``llm_config`` dict built from ``self.gemini_config``.
        """

        # Base configuration for all agents (the same dict object is passed
        # to each AssistantAgent below).
        base_config = {
            "config_list": [self.gemini_config.get_config()],
            "temperature": 0.1,
            "timeout": 300,
        }

        # 1. Data Preparation Agent
        self.agents['data_prep'] = AssistantAgent(
            name="DataPrepAgent",
            system_message="""You are a Data Preparation Agent specialized in data cleaning and preprocessing.
            Your responsibilities include:
            - Loading and inspecting datasets
            - Handling missing values
            - Detecting and treating outliers
            - Data type conversions
            - Feature engineering basics
            - Data validation and quality checks

            Always provide clear explanations for your preprocessing decisions and document any assumptions made.
            Return clean, well-structured data ready for analysis.""",
            llm_config=base_config
        )

        # 2. EDA Agent
        self.agents['eda'] = AssistantAgent(
            name="EDAAgent",
            system_message="""You are an Exploratory Data Analysis Agent specialized in statistical analysis and visualization.
            Your responsibilities include:
            - Generating descriptive statistics
            - Creating informative visualizations
            - Identifying patterns and trends
            - Correlation analysis
            - Distribution analysis
            - Feature importance assessment

            Focus on generating actionable insights and clear visualizations that tell a story about the data.
            Always explain what the visualizations reveal about the data.""",
            llm_config=base_config
        )

        # 3. Report Generator Agent
        self.agents['report'] = AssistantAgent(
            name="ReportAgent",
            system_message="""You are a Report Generator Agent specialized in creating comprehensive EDA reports.
            Your responsibilities include:
            - Structuring findings into a coherent report
            - Summarizing key insights and discoveries
            - Creating executive summaries
            - Organizing visualizations with clear captions
            - Providing actionable recommendations

            Create well-formatted, professional reports that are accessible to both technical and non-technical audiences.
            Include methodology, findings, and recommendations sections.""",
            llm_config=base_config
        )

        # 4. Critic Agent
        self.agents['critic'] = AssistantAgent(
            name="CriticAgent",
            system_message="""You are a Critic Agent responsible for quality assurance and feedback.
            Your responsibilities include:
            - Reviewing analysis quality and accuracy
            - Checking for logical inconsistencies
            - Ensuring completeness of analysis
            - Providing constructive feedback
            - Suggesting improvements
            - Validating conclusions

            Be thorough but constructive in your feedback. Focus on improving clarity, accuracy, and actionability.
            Always provide specific suggestions for improvement.""",
            llm_config=base_config
        )

        # 5. Executor Agent
        self.agents['executor'] = AssistantAgent(
            name="ExecutorAgent",
            system_message="""You are an Executor Agent responsible for code validation and result verification.
            Your responsibilities include:
            - Executing and validating code
            - Checking for errors and bugs
            - Ensuring reproducibility
            - Verifying statistical calculations
            - Testing code functionality

            Focus on technical accuracy and ensure all code runs correctly and produces valid results.
            Report any issues or inconsistencies found during execution.""",
            llm_config=base_config
        )

        # 6. Admin Agent (User Proxy)
        # Configured without llm_config, never asks for human input, and is
        # given a code execution config pointing at "eda_output" with Docker
        # disabled.
        self.agents['admin'] = UserProxyAgent(
            name="AdminAgent",
            system_message="""You are the Admin Agent overseeing the EDA workflow.
            Your responsibilities include:
            - Coordinating between agents
            - Managing the overall workflow
            - Ensuring alignment with project goals
            - Making final decisions on analysis direction
            - Facilitating communication between agents

            Keep the analysis focused and ensure all agents work toward the common goal of comprehensive EDA.""",
            human_input_mode="NEVER",
            max_consecutive_auto_reply=3,
            code_execution_config={"work_dir": "eda_output", "use_docker": False}
        )

    def load_data(self, data_path: str) -> pd.DataFrame:
        """Load data from various formats"""
        try:
            if data_path.endswith('.csv'):
                self.data = pd.read_csv(data_path)
            elif data_path.endswith('.xlsx') or data_path.endswith('.xls'):
                self.data = pd.read_excel(data_path)
            elif data_path.endswith('.json'):
                self.data = pd.read_json(data_path)
            else:
                raise ValueError("Unsupported file format")

            print(f"Data loaded successfully. Shape: {self.data.shape}")
            return self.data
        except Exception as e:
            print(f"Error loading data: {e}")
            return None

    def create_sample_data(self) -> pd.DataFrame:
        """Create sample data for demonstration"""
        np.random.seed(42)
        n_samples = 1000

        self.data = pd.DataFrame({
            'age': np.random.randint(18, 80, n_samples),
            'income': np.random.normal(50000, 15000, n_samples),
            'education_years': np.random.randint(8, 20, n_samples),
            'experience_years': np.random.randint(0, 40, n_samples),
            'satisfaction_score': np.random.uniform(1, 10, n_samples),
            'department': np.random.choice(['Engineering', 'Sales', 'Marketing', 'HR'], n_samples),
            'city': np.random.choice(['New York', 'San Francisco', 'Chicago', 'Boston'], n_samples),
            'performance_rating': np.random.choice(['Poor', 'Fair', 'Good', 'Excellent'], n_samples)
        })

        # Add some missing values
        missing_indices = np.random.choice(n_samples, size=int(0.05 * n_samples), replace=False)
        self.data.loc[missing_indices, 'satisfaction_score'] = np.nan

        # Add some outliers
        outlier_indices = np.random.choice(n_samples, size=20, replace=False)
        self.data.loc[outlier_indices, 'income'] = np.random.uniform(200000, 500000, 20)

        print(f"Sample data created successfully. Shape: {self.data.shape}")
        return self.data

    def setup_group_chat(self) -> GroupChatManager:
        """Create a round-robin group chat over all agents and its manager.

        Returns:
            A ``GroupChatManager`` named ``EDAManager`` wrapping a
            ``GroupChat`` limited to 20 rounds.
        """
        participants = [agent for agent in self.agents.values()]
        chat = GroupChat(
            agents=participants,
            messages=[],
            max_round=20,
            speaker_selection_method="round_robin",
        )
        manager_llm_config = {"config_list": [self.gemini_config.get_config()]}
        return GroupChatManager(
            groupchat=chat,
            llm_config=manager_llm_config,
            name="EDAManager",
        )

    def run_data_preparation(self) -> Dict[str, Any]:
        """Run data preparation phase"""
        print("Starting Data Preparation Phase...")

        if self.data is None:
            print("No data loaded. Creating sample data...")
            self.create_sample_data()

        # Data preparation tasks
        prep_tasks = {
            "data_info": self.get_data_info(),
            "missing_values": self.handle_missing_values(),
            "outliers": self.detect_outliers(),
            "data_types": self.optimize_data_types()
        }

        self.results['data_preparation'] = prep_tasks
        print("Data Preparation Phase completed.")
        return prep_tasks

    def get_data_info(self) -> Dict[str, Any]:
        """Get basic information about the dataset"""
        info = {
            "shape": self.data.shape,
            "columns": list(self.data.columns),
            "dtypes": self.data.dtypes.to_dict(),
            "memory_usage": self.data.memory_usage(deep=True).sum(),
            "null_counts": self.data.isnull().sum().to_dict()
        }
        return info

    def handle_missing_values(self) -> Dict[str, Any]:
        """Handle missing values in the dataset"""
        missing_summary = self.data.isnull().sum()
        missing_percentage = (missing_summary / len(self.data)) * 100

        # Simple imputation strategy
        for column in self.data.columns:
            if self.data[column].isnull().sum() > 0:
                if self.data[column].dtype in ['int64', 'float64']:
                    self.data[column].fillna(self.data[column].median(), inplace=True)
                else:
                    self.data[column].fillna(self.data[column].mode()[0], inplace=True)

        return {
            "missing_counts": missing_summary.to_dict(),
            "missing_percentage": missing_percentage.to_dict(),
            "imputation_strategy": "median for numeric, mode for categorical"
        }

    def detect_outliers(self) -> Dict[str, Any]:
        """Detect outliers using IQR method"""
        numeric_columns = self.data.select_dtypes(include=[np.number]).columns
        outlier_info = {}

        for column in numeric_columns:
            Q1 = self.data[column].quantile(0.25)
            Q3 = self.data[column].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            outliers = ((self.data[column] < lower_bound) | (self.data[column] > upper_bound)).sum()
            outlier_info[column] = {
                "count": outliers,
                "percentage": (outliers / len(self.data)) * 100,
                "bounds": {"lower": lower_bound, "upper": upper_bound}
            }

        return outlier_info

    def optimize_data_types(self) -> Dict[str, Any]:
        """Optimize data types for better performance"""
        original_memory = self.data.memory_usage(deep=True).sum()

        # Convert object columns to category if cardinality is low
        for column in self.data.select_dtypes(include=['object']).columns:
            if self.data[column].nunique() < 0.5 * len(self.data):
                self.data[column] = self.data[column].astype('category')

        new_memory = self.data.memory_usage(deep=True).sum()

        return {
            "original_memory": original_memory,
            "optimized_memory": new_memory,
            "memory_reduction": ((original_memory - new_memory) / original_memory) * 100,
            "optimized_dtypes": self.data.dtypes.to_dict()
        }

    def run_eda_analysis(self) -> Dict[str, Any]:
        """Run comprehensive EDA analysis"""
        print("Starting EDA Analysis Phase...")

        eda_results = {
            "descriptive_stats": self.get_descriptive_statistics(),
            "correlations": self.analyze_correlations(),
            "distributions": self.analyze_distributions(),
            "categorical_analysis": self.analyze_categorical_variables(),
            "insights": self.generate_insights()
        }

        self.results['eda_analysis'] = eda_results
        print("EDA Analysis Phase completed.")
        return eda_results

    def get_descriptive_statistics(self) -> Dict[str, Any]:
        """Generate descriptive statistics"""
        numeric_stats = self.data.describe()
        categorical_stats = self.data.describe(include=['object', 'category'])

        return {
            "numeric_summary": numeric_stats.to_dict(),
            "categorical_summary": categorical_stats.to_dict() if not categorical_stats.empty else {},
            "unique_counts": self.data.nunique().to_dict(),
            "value_counts": {col: self.data[col].value_counts().head().to_dict()
                             for col in self.data.select_dtypes(include=['object', 'category']).columns}
        }

    def analyze_correlations(self) -> Dict[str, Any]:
        """Analyze correlations between numeric variables"""
        numeric_data = self.data.select_dtypes(include=[np.number])
        correlation_matrix = numeric_data.corr()

        # Find strong correlations
        strong_correlations = []
        for i in range(len(correlation_matrix.columns)):
            for j in range(i+1, len(correlation_matrix.columns)):
                corr_value = correlation_matrix.iloc[i, j]
                if abs(corr_value) > 0.5:  # Threshold for strong correlation
                    strong_correlations.append({
                        'var1': correlation_matrix.columns[i],
                        'var2': correlation_matrix.columns[j],
                        'correlation': corr_value
                    })

        return {
            "correlation_matrix": correlation_matrix.to_dict(),
            "strong_correlations": strong_correlations
        }

    def analyze_distributions(self) -> Dict[str, Any]:
        """Analyze distributions of variables"""
        distribution_analysis = {}

        for column in self.data.select_dtypes(include=[np.number]).columns:
            distribution_analysis[column] = {
                "mean": self.data[column].mean(),
                "median": self.data[column].median(),
                "std": self.data[column].std(),
                "skewness": self.data[column].skew(),
                "kurtosis": self.data[column].kurtosis(),
                "min": self.data[column].min(),
                "max": self.data[column].max()
            }

        return distribution_analysis

    def analyze_categorical_variables(self) -> Dict[str, Any]:
        """Analyze categorical variables"""
        categorical_analysis = {}

        for column in self.data.select_dtypes(include=['object', 'category']).columns:
            categorical_analysis[column] = {
                "unique_count": self.data[column].nunique(),
                "most_frequent": self.data[column].mode().iloc[0] if not self.data[column].mode().empty else None,
                "value_counts": self.data[column].value_counts().to_dict(),
                "missing_count": self.data[column].isnull().sum()
            }

        return categorical_analysis

    def generate_insights(self) -> List[str]:
        """Generate key insights from the analysis"""
        insights = []

        # Data quality insights
        missing_data = self.data.isnull().sum().sum()
        if missing_data > 0:
            insights.append(f"Dataset contains {missing_data} missing values across all columns")

        # Distribution insights
        numeric_columns = self.data.select_dtypes(include=[np.number]).columns
        for column in numeric_columns:
            skew = self.data[column].skew()
            if abs(skew) > 1:
                insights.append(f"{column} shows {'positive' if skew > 0 else 'negative'} skewness ({skew:.2f})")

        # Correlation insights
        numeric_data = self.data.select_dtypes(include=[np.number])
        if len(numeric_data.columns) > 1:
            corr_matrix = numeric_data.corr()
            max_corr = corr_matrix.abs().unstack().sort_values(ascending=False)
            max_corr = max_corr[max_corr < 1.0].iloc[0]
            if max_corr > 0.7:
                insights.append(f"Strong correlation detected (r={max_corr:.2f}) between variables")

        # Categorical insights
        categorical_columns = self.data.select_dtypes(include=['object', 'category']).columns
        for column in categorical_columns:
            unique_ratio = self.data[column].nunique() / len(self.data)
            if unique_ratio > 0.8:
                insights.append(f"{column} has high cardinality ({self.data[column].nunique()} unique values)")

        return insights

    def create_visualizations(self) -> Dict[str, Any]:
        """Render the standard EDA figures as PNG files under ``eda_output/``.

        Produces, when the data allows: a correlation heatmap (two or more
        numeric columns), histograms of the first four numeric columns and
        bar charts of the first two object/category columns.

        Returns:
            A dict mapping a figure key to the PNG file name written inside
            ``eda_output``.
        """
        print("Creating visualizations...")

        # savefig does not create missing directories, and
        # run_complete_analysis calls this method before generate_report
        # creates the folder.
        os.makedirs('eda_output', exist_ok=True)

        # Set style
        plt.style.use('seaborn-v0_8')
        sns.set_palette("husl")

        visualizations = {}

        # 1. Correlation heatmap
        numeric_data = self.data.select_dtypes(include=[np.number])
        if len(numeric_data.columns) > 1:
            plt.figure(figsize=(10, 8))
            sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0)
            plt.title('Correlation Matrix')
            plt.tight_layout()
            plt.savefig('eda_output/correlation_heatmap.png', dpi=300, bbox_inches='tight')
            plt.close()
            visualizations['correlation_heatmap'] = 'correlation_heatmap.png'

        # 2. Distribution plots
        numeric_columns = numeric_data.columns[:4]  # Limit to first 4 columns
        if len(numeric_columns) > 0:
            fig, axes = plt.subplots(2, 2, figsize=(12, 10))
            axes = axes.flatten()

            for ax, column in zip(axes, numeric_columns):
                # Drop NaN first: the histogram range cannot be derived from
                # data that contains missing values.
                ax.hist(self.data[column].dropna(), bins=30, alpha=0.7, color='skyblue')
                ax.set_title(f'Distribution of {column}')
                ax.set_ylabel('Frequency')

            plt.tight_layout()
            plt.savefig('eda_output/distributions.png', dpi=300, bbox_inches='tight')
            plt.close()
            visualizations['distributions'] = 'distributions.png'

        # 3. Categorical variable analysis
        categorical_columns = self.data.select_dtypes(include=['object', 'category']).columns
        if len(categorical_columns) > 0:
            fig, axes = plt.subplots(1, min(2, len(categorical_columns)), figsize=(12, 5))
            if len(categorical_columns) == 1:
                # With a single subplot, plt.subplots returns one Axes
                # object rather than an array.
                axes = [axes]

            for ax, column in zip(axes, categorical_columns[:2]):
                value_counts = self.data[column].value_counts()
                ax.bar(value_counts.index, value_counts.values, alpha=0.7)
                ax.set_title(f'Distribution of {column}')
                ax.tick_params(axis='x', rotation=45)

            plt.tight_layout()
            plt.savefig('eda_output/categorical_analysis.png', dpi=300, bbox_inches='tight')
            plt.close()
            visualizations['categorical_analysis'] = 'categorical_analysis.png'

        return visualizations

    def generate_report(self) -> str:
        """Generate comprehensive EDA report"""
        print("Generating comprehensive report...")

        report = f"""
# Exploratory Data Analysis Report

## Executive Summary
This report presents a comprehensive analysis of the dataset containing {self.data.shape[0]} observations and {self.data.shape[1]} variables.

## Dataset Overview
- **Shape**: {self.data.shape[0]} rows × {self.data.shape[1]} columns
- **Memory Usage**: {self.data.memory_usage(deep=True).sum() / 1024**2:.2f} MB
- **Missing Values**: {self.data.isnull().sum().sum()} total missing values

## Data Quality Assessment

### Missing Values
{self._format_missing_values_summary()}

### Data Types
{self._format_data_types_summary()}

### Outliers
{self._format_outliers_summary()}

## Statistical Summary

### Numeric Variables
{self._format_numeric_summary()}

### Categorical Variables
{self._format_categorical_summary()}

## Key Insights
{self._format_insights()}

## Correlations
{self._format_correlation_analysis()}

## Recommendations
{self._generate_recommendations()}

## Methodology
This analysis was conducted using a multi-agent system with the following components:
- Data Preparation Agent: Handled data cleaning and preprocessing
- EDA Agent: Performed statistical analysis and visualizations
- Report Generator: Created this comprehensive report
- Critic Agent: Provided quality assurance and feedback
- Executor Agent: Validated code and results
- Admin Agent: Coordinated the overall workflow

## Conclusion
{self._generate_conclusion()}
"""

        # Save report
        os.makedirs('eda_output', exist_ok=True)
        with open('eda_output/eda_report.md', 'w') as f:
            f.write(report)

        return report

    def _format_missing_values_summary(self) -> str:
        """Format missing values summary"""
        missing_summary = self.data.isnull().sum()
        if missing_summary.sum() == 0:
            return "No missing values detected in the dataset."

        missing_info = []
        for column, count in missing_summary.items():
            if count > 0:
                percentage = (count / len(self.data)) * 100
                missing_info.append(f"- {column}: {count} ({percentage:.1f}%)")

        return "\n".join(missing_info)

    def _format_data_types_summary(self) -> str:
        """Format data types summary"""
        type_summary = self.data.dtypes.value_counts()
        return "\n".join([f"- {dtype}: {count} columns" for dtype, count in type_summary.items()])

    def _format_outliers_summary(self) -> str:
        """Format outliers summary"""
        if 'data_preparation' not in self.results:
            return "Outlier analysis not available."

        outlier_info = self.results['data_preparation']['outliers']
        outlier_summary = []

        for column, info in outlier_info.items():
            if info['count'] > 0:
                outlier_summary.append(f"- {column}: {info['count']} outliers ({info['percentage']:.1f}%)")

        return "\n".join(outlier_summary) if outlier_summary else "No significant outliers detected."

    def _format_numeric_summary(self) -> str:
        """Format numeric variables summary"""
        numeric_data = self.data.select_dtypes(include=[np.number])
        if numeric_data.empty:
            return "No numeric variables found in the dataset."

        summary = []
        for column in numeric_data.columns:
            stats = numeric_data[column].describe()
            summary.append(f"**{column}**:")
            summary.append(f"  - Mean: {stats['mean']:.2f}")
            summary.append(f"  - Median: {stats['50%']:.2f}")
            summary.append(f"  - Std Dev: {stats['std']:.2f}")
            summary.append(f"  - Range: [{stats['min']:.2f}, {stats['max']:.2f}]")
            summary.append("")

        return "\n".join(summary)

    def _format_categorical_summary(self) -> str:
        """Format categorical variables summary"""
        categorical_data = self.data.select_dtypes(include=['object', 'category'])
        if categorical_data.empty:
            return "No categorical variables found in the dataset."

        summary = []
        for column in categorical_data.columns:
            unique_count = self.data[column].nunique()
            most_frequent = self.data[column].mode().iloc[0] if not self.data[column].mode().empty else "N/A"
            summary.append(f"**{column}**:")
            summary.append(f"  - Unique values: {unique_count}")
            summary.append(f"  - Most frequent: {most_frequent}")
            summary.append("")

        return "\n".join(summary)

    def _format_insights(self) -> str:
        """Format key insights"""
        if 'eda_analysis' not in self.results:
            return "Insights not available."

        insights = self.results['eda_analysis']['insights']
        return "\n".join([f"- {insight}" for insight in insights])

    def _format_correlation_analysis(self) -> str:
        """Format correlation analysis"""
        if 'eda_analysis' not in self.results:
            return "Correlation analysis not available."

        strong_correlations = self.results['eda_analysis']['correlations']['strong_correlations']
        if not strong_correlations:
            return "No strong correlations (>0.5) detected between variables."

        corr_summary = []
        for corr in strong_correlations:
            corr_summary.append(f"- {corr['var1']} ↔ {corr['var2']}: {corr['correlation']:.3f}")

        return "\n".join(corr_summary)

    def _generate_recommendations(self) -> str:
        """Generate recommendations based on analysis"""
        recommendations = []

        # Data quality recommendations
        missing_data = self.data.isnull().sum().sum()
        if missing_data > 0:
            recommendations.append("- Address missing values through appropriate imputation or collection strategies")

        # Distribution recommendations
        numeric_columns = self.data.select_dtypes(include=[np.number]).columns
        for column in numeric_columns:
            skew = self.data[column].skew()
            if abs(skew) > 1:
                recommendations.append(f"- Consider transformation for {column} due to high skewness")

        # Correlation recommendations
        if 'eda_analysis' in self.results:
            strong_correlations = self.results['eda_analysis']['correlations']['strong_correlations']
            if strong_correlations:
                recommendations.append("- Investigate multicollinearity issues in predictive modeling")

        # General recommendations
        recommendations.extend([
            "- Validate findings with domain experts",
            "- Consider feature engineering opportunities",
            "- Plan for appropriate statistical modeling approaches"
        ])

        return "\n".join(recommendations)

    def _generate_conclusion(self) -> str:
        """Generate conclusion"""
        return f"""
The dataset contains {self.data.shape[0]} observations across {self.data.shape[1]} variables with varying data types and quality characteristics. The analysis reveals important patterns, relationships, and data quality issues that should inform subsequent modeling and analysis decisions.

Key findings include data distribution patterns, correlation structures, and quality assessments that provide a solid foundation for further analytical work. The multi-agent approach ensured comprehensive coverage of all critical EDA components while maintaining quality and consistency throughout the analysis process.
"""

    def run_complete_analysis(self, data_path: Optional[str] = None) -> Dict[str, Any]:
        """Run loading, preparation, EDA, plotting and reporting in sequence.

        Args:
            data_path: File to analyse; sample data is generated when this
                is empty or ``None``.

        Returns:
            A dict with the output of every phase plus ``data_shape`` and an
            ``analysis_summary``. If any phase raises, the exception is
            printed and ``{"error": <message>}`` is returned instead.
        """
        print("Starting Complete EDA Analysis...")

        # Load data
        if not data_path:
            self.create_sample_data()
        else:
            self.load_data(data_path)

        # Run analysis phases
        try:
            prep_results = self.run_data_preparation()   # Phase 1
            eda_results = self.run_eda_analysis()        # Phase 2
            visualizations = self.create_visualizations()  # Phase 3
            report = self.generate_report()              # Phase 4

            frame = self.data
            numeric_total = len(frame.select_dtypes(include=[np.number]).columns)
            categorical_total = len(frame.select_dtypes(include=['object', 'category']).columns)
            summary = {
                "total_variables": frame.shape[1],
                "total_observations": frame.shape[0],
                "missing_values": frame.isnull().sum().sum(),
                "numeric_variables": numeric_total,
                "categorical_variables": categorical_total,
            }

            # Compile final results
            final_results = {
                "data_preparation": prep_results,
                "eda_analysis": eda_results,
                "visualizations": visualizations,
                "report": report,
                "data_shape": frame.shape,
                "analysis_summary": summary,
            }

            print("Complete EDA Analysis finished successfully!")
            print("Results saved to: eda_output/")
            print("Report available at: eda_output/eda_report.md")

            return final_results

        except Exception as e:
            print(f"Error during analysis: {e}")
            return {"error": str(e)}

# Usage Example and Demo
def main():
    """Run the EDA system on sample data and print a summary.

    The Gemini API key is read from the ``GEMINI_API_KEY`` environment
    variable. When it is not set, the original placeholder string is used.
    """

    # Prefer the environment over a key written into the source file.
    GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "your_gemini_api_key_here")

    try:
        # Create EDA system
        eda_system = EDAMultiAgentSystem(GEMINI_API_KEY)

        # Run complete analysis with sample data
        results = eda_system.run_complete_analysis()

        # Print summary
        if "error" not in results:
            summary = results['analysis_summary']
            print("\n" + "="*60)
            print("ANALYSIS SUMMARY")
            print("="*60)
            print(f"Dataset Shape: {results['data_shape']}")
            print(f"Total Variables: {summary['total_variables']}")
            print(f"Total Observations: {summary['total_observations']}")
            print(f"Missing Values: {summary['missing_values']}")
            print(f"Numeric Variables: {summary['numeric_variables']}")
            print(f"Categorical Variables: {summary['categorical_variables']}")
            print("\nFiles generated:")
            print("- eda_output/eda_report.md")
            for viz_file in results['visualizations'].values():
                print(f"- eda_output/{viz_file}")

            print("\nKey Insights:")
            for insight in results['eda_analysis']['insights']:
                print(f"- {insight}")

        else:
            print(f"Analysis failed: {results['error']}")

    except Exception as e:
        # Top-level boundary of the demo: report the failure and exit.
        print(f"System initialization failed: {e}")
        print("Please ensure you have a valid Gemini API key and all required packages installed.")

# Additional utility functions for advanced analysis
class AdvancedEDAFeatures:
    """Extended features for advanced EDA analysis.

    Wraps a DataFrame and offers three independent analyses: drift
    detection against a reference dataset, feature-importance ranking for
    a target column, and rule-based textual insights.
    """

    def __init__(self, data: pd.DataFrame):
        # The DataFrame every analysis method reads; it is never modified.
        self.data = data

    @staticmethod
    def _is_numeric(series: pd.Series) -> bool:
        """Return True for any numeric dtype except boolean.

        Comparing the dtype against ``['int64', 'float64']`` misses int32,
        float32 and similar columns; booleans stay on the categorical path.
        """
        dtype = series.dtype
        return bool(
            pd.api.types.is_numeric_dtype(dtype)
            and not pd.api.types.is_bool_dtype(dtype)
        )

    def detect_data_drift(self, reference_data: pd.DataFrame) -> Dict[str, Any]:
        """Detect data drift between current and reference datasets.

        Only columns present in both frames are compared. Numeric columns
        use a two-sample Kolmogorov-Smirnov test; all other columns use a
        chi-square test of homogeneity on the category counts.

        Args:
            reference_data: Dataset to compare ``self.data`` against.

        Returns:
            Mapping of column name to a dict with keys ``'test'``,
            ``'statistic'``, ``'p_value'`` and ``'drift_detected'``
            (``p_value < 0.05``). Columns with no usable values on either
            side are omitted.
        """
        # Imported up front: importing inside the numeric branch only left
        # `stats` unbound whenever a categorical column was processed first.
        from scipy import stats

        drift_results = {}

        for column in self.data.columns:
            if column not in reference_data.columns:
                continue

            current = self.data[column]
            reference = reference_data[column]

            if self._is_numeric(current) and self._is_numeric(reference):
                # Statistical test for numeric data
                current_values = current.dropna()
                reference_values = reference.dropna()
                if current_values.empty or reference_values.empty:
                    # ks_2samp cannot run on an empty sample.
                    continue

                statistic, p_value = stats.ks_2samp(current_values, reference_values)
                drift_results[column] = {
                    'test': 'Kolmogorov-Smirnov',
                    'statistic': statistic,
                    'p_value': p_value,
                    'drift_detected': p_value < 0.05
                }
            else:
                # Chi-square test for categorical data
                current_counts = current.value_counts()
                reference_counts = reference.value_counts()

                # Align categories in first-seen order, dropping categories
                # that never occur (e.g. unused categorical levels), which
                # would produce zero expected frequencies.
                categories = [
                    cat
                    for cat in dict.fromkeys([*current_counts.index, *reference_counts.index])
                    if current_counts.get(cat, 0) + reference_counts.get(cat, 0) > 0
                ]
                current_aligned = [int(current_counts.get(cat, 0)) for cat in categories]
                reference_aligned = [int(reference_counts.get(cat, 0)) for cat in categories]

                if sum(current_aligned) > 0 and sum(reference_aligned) > 0:
                    # A contingency-table test compares the two samples
                    # directly. Passing raw reference counts as expected
                    # frequencies to `chisquare` fails when the datasets
                    # differ in size or a category is missing on one side.
                    result = stats.chi2_contingency([current_aligned, reference_aligned])
                    chi2, p_value = result[0], result[1]
                    drift_results[column] = {
                        'test': 'Chi-square',
                        'statistic': chi2,
                        'p_value': p_value,
                        'drift_detected': p_value < 0.05
                    }

        return drift_results

    def feature_importance_analysis(self, target_column: str) -> Dict[str, Any]:
        """Analyze feature importance using various methods.

        Args:
            target_column: Name of the column to explain.

        Returns:
            ``{"error": ...}`` when the target column does not exist.
            Otherwise a dict with ``'mutual_information'`` (feature ->
            score, highest first) and, for a numeric target,
            ``'correlation_importance'`` (feature -> absolute correlation,
            highest first). ``'mutual_information'`` is empty when no
            feature columns or no complete rows are available.
        """
        if target_column not in self.data.columns:
            return {"error": f"Target column '{target_column}' not found"}

        target = self.data[target_column]
        numeric_target = self._is_numeric(target)
        importance_results = {}

        # Correlation-based importance for numeric target
        if numeric_target:
            numeric_features = self.data.select_dtypes(include=[np.number]).columns
            numeric_features = numeric_features.drop(target_column, errors='ignore')

            correlations = {
                feature: abs(self.data[feature].corr(target))
                for feature in numeric_features
            }

            # NaN correlations (e.g. constant columns) are ranked last
            # instead of landing at arbitrary positions in the ordering.
            importance_results['correlation_importance'] = dict(
                sorted(
                    correlations.items(),
                    key=lambda item: -1.0 if pd.isna(item[1]) else item[1],
                    reverse=True,
                )
            )

        # Mutual information for all features
        from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
        from sklearn.preprocessing import LabelEncoder

        # Prepare features
        encoded_features = self.data.drop(columns=[target_column]).copy()

        # Encode categorical variables; missing values become their own
        # 'nan' category through the string cast.
        for column in encoded_features.select_dtypes(include=['object', 'category']).columns:
            encoded_features[column] = LabelEncoder().fit_transform(
                encoded_features[column].astype(str)
            )

        # The mutual-information estimators reject NaN input, so keep only
        # rows that are complete in every feature (and in a numeric target).
        complete_rows = encoded_features.notna().all(axis=1)
        if numeric_target:
            complete_rows &= target.notna()
        encoded_features = encoded_features[complete_rows]
        target = target[complete_rows]

        if encoded_features.empty:
            importance_results['mutual_information'] = {}
            return importance_results

        # Calculate mutual information
        if numeric_target:
            mi_scores = mutual_info_regression(encoded_features, target)
        else:
            encoded_target = LabelEncoder().fit_transform(target.astype(str))
            mi_scores = mutual_info_classif(encoded_features, encoded_target)

        mi_importance = dict(zip(encoded_features.columns, mi_scores))
        importance_results['mutual_information'] = dict(
            sorted(mi_importance.items(), key=lambda x: x[1], reverse=True)
        )

        return importance_results

    def automated_insights_generator(self) -> List[str]:
        """Generate automated insights using statistical analysis.

        Checks, in order: dataset size, the column with the most missing
        values, cardinality of text/categorical columns, skewness of
        numeric columns, near-perfect correlations (|r| > 0.95) and IQR
        outliers (flagged above 5% of rows).

        Returns:
            Human-readable insight strings in the order the checks ran.
        """
        insights = []
        n_rows = len(self.data)

        # Dataset size insights
        if n_rows < 100:
            insights.append("Small dataset size may limit statistical power of analyses")
        elif n_rows > 100000:
            insights.append("Large dataset provides good statistical power but may require sampling for visualization")

        # Missing data patterns
        missing_pattern = self.data.isnull().sum()
        if missing_pattern.sum() > 0:
            worst_missing = missing_pattern.idxmax()
            worst_pct = (missing_pattern.max() / n_rows) * 100
            insights.append(f"'{worst_missing}' has the highest missing data rate at {worst_pct:.1f}%")

        # Cardinality insights. Skipped for an empty frame, where 0 unique
        # values == 0 rows would mark every column as a unique identifier.
        if n_rows > 0:
            for column in self.data.select_dtypes(include=['object', 'category']).columns:
                cardinality = self.data[column].nunique()
                if cardinality == n_rows:
                    insights.append(f"'{column}' appears to be a unique identifier")
                elif cardinality > 0.5 * n_rows:
                    insights.append(f"'{column}' has high cardinality ({cardinality} unique values)")

        numeric_data = self.data.select_dtypes(include=[np.number])

        # Numeric distribution insights
        for column in numeric_data.columns:
            values = self.data[column]
            skewness = values.skew()
            if abs(skewness) > 2:
                insights.append(f"'{column}' is highly skewed (skewness: {skewness:.2f})")

            # Check for potential log-normal distribution; the log is only
            # taken when every value is strictly positive.
            if values.min() > 0 and skewness > 1:
                log_skewness = np.log(values).skew()
                if abs(log_skewness) < abs(skewness):
                    insights.append(f"'{column}' may benefit from log transformation")

        # Correlation insights
        if len(numeric_data.columns) > 1:
            corr_matrix = numeric_data.corr()
            corr_columns = corr_matrix.columns

            # Find near-perfect correlations in the upper triangle
            # (excluding the diagonal) so each pair is reported once.
            perfect_corr = []
            for i, first in enumerate(corr_columns):
                for j in range(i + 1, len(corr_columns)):
                    if abs(corr_matrix.iloc[i, j]) > 0.95:
                        perfect_corr.append((first, corr_columns[j]))

            if perfect_corr:
                insights.append(f"Near-perfect correlations detected: {perfect_corr}")

        # Outlier insights (1.5 * IQR rule)
        for column in numeric_data.columns:
            values = self.data[column]
            Q1 = values.quantile(0.25)
            Q3 = values.quantile(0.75)
            IQR = Q3 - Q1
            outliers = ((values < (Q1 - 1.5 * IQR)) |
                        (values > (Q3 + 1.5 * IQR))).sum()

            if outliers > 0.05 * n_rows:  # More than 5% outliers
                insights.append(f"'{column}' has {outliers} outliers ({outliers/n_rows*100:.1f}%)")

        return insights

# Integration with existing system
def enhanced_eda_system_example():
    """Demonstrate the advanced EDA helpers on a synthetic dataset.

    Builds a seeded 1000-row frame with correlated, skewed, categorical,
    unique-ID and binary columns, blanks out 50 values of ``feature3``,
    then prints the automated insights and the top feature-importance
    scores for the ``target`` column.
    """

    # Fixed seed and sample count; the random draws below happen in a
    # fixed order so the generated data is reproducible.
    np.random.seed(42)
    n_samples = 1000

    # Base signal, a feature correlated with it, and a skewed feature
    base = np.random.normal(0, 1, n_samples)
    correlated = 0.8 * base + np.random.normal(0, 0.5, n_samples)
    skewed = np.random.exponential(2, n_samples)

    # Remaining columns, drawn in the same order they appear in the frame
    target_values = 2 * base + 1.5 * correlated + np.random.normal(0, 0.5, n_samples)
    categories = np.random.choice(['A', 'B', 'C'], n_samples, p=[0.5, 0.3, 0.2])
    unique_ids = [f'ID_{i}' for i in range(n_samples)]
    binary_values = np.random.choice([0, 1], n_samples)

    enhanced_data = pd.DataFrame({
        'feature1': base,
        'feature2': correlated,
        'feature3': skewed,
        'target': target_values,
        'category': categories,
        'high_cardinality': unique_ids,
        'binary_feature': binary_values,
    })

    # Knock out 50 distinct rows of the skewed feature
    missing_rows = np.random.choice(n_samples, 50, replace=False)
    enhanced_data.loc[missing_rows, 'feature3'] = np.nan

    advanced_eda = AdvancedEDAFeatures(enhanced_data)

    # Automated insights
    print("Enhanced EDA - Automated Insights:")
    for insight in advanced_eda.automated_insights_generator():
        print(f"- {insight}")

    # Feature importance analysis
    importance_results = advanced_eda.feature_importance_analysis('target')

    print("\nFeature Importance Analysis:")
    sections = (
        ('correlation_importance', "Top correlated features:"),
        ('mutual_information', "Top mutual information features:"),
    )
    for result_key, heading in sections:
        if result_key not in importance_results:
            continue
        print(heading)
        # Show at most the five highest-ranked features of each method
        top_five = list(importance_results[result_key].items())[:5]
        for feature, importance in top_five:
            print(f"  {feature}: {importance:.3f}")

# Configuration and Setup Instructions
def setup_instructions():
    """Print setup instructions for the EDA system"""

    instructions = """
# Multi-Agent EDA System Setup Instructions

## Required Dependencies

Install the following packages: