# 教育数据分析流程

本notebook展示了完整的教育数据分析流程，包括：
1. 环境准备
2. 数据库连接
3. 数据收集
4. 数据处理
5. 数据分析
6. 数据可视化

## 1. 环境准备

首先安装必要的依赖包：

导入所需的模块：

In [None]:
!pip install pandas numpy matplotlib seaborn plotly psycopg2-binary pymongo python-dotenv eurostat statsmodels


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import sys
import os

# Add scripts directory to Python path
scripts_dir = os.path.join(os.path.dirname(os.getcwd()), 'scripts')
sys.path.append(scripts_dir)

from data_processing.db_manager import DatabaseManager
from data_collection.eurostat_collector import EurostatCollector
from data_processing.data_processor import EducationDataProcessor
from analysis.education_analyzer import EducationAnalyzer
from visualization.data_visualizer import EducationVisualizer

## 2. 数据库连接

测试数据库连接：

In [4]:
# Initialize database manager
db_manager = DatabaseManager()

# Test connections
pg_success = db_manager.connect_postgres()
mongo_success = db_manager.connect_mongodb()

print(f"PostgreSQL connection: {'Success' if pg_success else 'Failed'}")
print(f"MongoDB connection: {'Success' if mongo_success else 'Failed'}")

# Setup PostgreSQL tables
if pg_success:
    db_manager.setup_postgres_tables()

2024-12-14 20:50:10,314 - INFO - Successfully connected to PostgreSQL
2024-12-14 20:50:10,330 - INFO - Successfully connected to MongoDB


PostgreSQL connection: Success
MongoDB connection: Success


2024-12-14 20:50:11,427 - INFO - Successfully set up PostgreSQL tables


## 3. 数据收集

从Eurostat收集教育数据：

In [11]:
import sys
import logging
import os
import time
import pandas as pd
import eurostat

# Configure logging
logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Get the project root directory
notebook_dir = os.getcwd()  # Get current notebook directory
project_root = os.path.dirname(os.path.dirname(notebook_dir))  # Get parent directory
sys.path.append(project_root)  # Add to Python path

from data_processing.db_manager import DatabaseManager

def get_education_data(indicator: str, start_year: int = 2010):
    """
    Retrieve education data for a specific indicator with proper filtering.
    """
    try:
        logger.info(f"Collecting data for indicator: {indicator}")
        
        # Get data with specific parameters
        data = eurostat.get_data_df(indicator)
        
        if data is None or data.empty:
            logger.error(f"No data retrieved for indicator {indicator}")
            return None
            
        # Ensure we have the required columns
        if 'geo' not in data.columns:
            # Try to reset the index if geo is in the index
            if 'geo' in data.index.names:
                data = data.reset_index()
            else:
                logger.error("Missing 'geo' column in dataset")
                return None
                
        # Filter for years >= start_year
        if 'time' in data.columns:
            data = data[data['time'].astype(str).astype(int) >= start_year]
        
        return data
        
    except Exception as e:
        logger.error(f"Error collecting data: {str(e)}")
        return None

def main():
    """Main execution function"""
    try:
        # Initialize database manager
        db_manager = DatabaseManager()
        
        # Test database connections
        if db_manager.test_connections():
            print("Database connections successful")
            
            # Define indicators
            indicators = {
                'educ_uoe_enrt01': 'Students by education level',
                'educ_uoe_perp01': 'Teaching staff',
                'educ_uoe_fina01': 'Education finance'
            }
            
            # Collect and process data for each indicator
            for code, description in indicators.items():
                logger.info(f"Processing {description} (Code: {code})")
                df = get_education_data(code)
                
                if df is not None and not df.empty:
                    # Store in databases
                    db_manager.store_in_postgres(df, code)
                    db_manager.store_in_mongodb(df.to_dict('records'), code)
                    logger.info(f"Successfully processed and stored data for {code}")
                else:
                    logger.warning(f"Skipping {code} due to collection failure")
                    
    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
    finally:
        db_manager.close_connections()
        logger.info("Closed all database connections")

if __name__ == "__main__":
    main()

# Execute the main function
main()

2024-12-14 21:00:54,785 - ERROR - Error in main execution: 'DatabaseManager' object has no attribute 'test_connections'
2024-12-14 21:00:54,787 - INFO - Closed all database connections
2024-12-14 21:00:54,787 - INFO - Closed all database connections
2024-12-14 21:00:54,788 - ERROR - Error in main execution: 'DatabaseManager' object has no attribute 'test_connections'
2024-12-14 21:00:54,789 - INFO - Closed all database connections
2024-12-14 21:00:54,789 - INFO - Closed all database connections


## 4. 数据处理

处理和清洗收集的数据：

In [None]:
# Initialize processor
processor = EducationDataProcessor()

# Process all datasets
processed_data = processor.process_indicators(collected_data)

# Calculate statistics for each dataset
for code, df in processed_data.items():
    print(f"\nStatistics for {collector.base_indicators[code]}:")
    stats = processor.calculate_statistics(df, code)
    for stat, value in stats.items():
        if isinstance(value, (int, float)):
            print(f"{stat}: {value:.2f}")

## 5. 数据分析

分析处理后的数据：

In [None]:
# Initialize analyzer
analyzer = EducationAnalyzer()

# Select countries for analysis
countries = ['DE', 'FR', 'ES', 'IT']

for code, df in processed_data.items():
    print(f"\nAnalysis for {collector.base_indicators[code]}:")
    
    # Analyze trends
    for country in countries:
        trends = analyzer.analyze_trends(df, country)
        print(f"\nTrends for {country}:")
        for metric, value in trends.items():
            print(f"{metric}: {value:.2f}")
    
    # Generate forecasts
    print("\nForecasts:")
    for country in countries:
        forecast, conf_int = analyzer.generate_forecast(df, country)
        if forecast:
            print(f"{country}: Next 5 years: {[f'{x:.2f}' for x in forecast]}")
    
    # Compare countries
    comparison = analyzer.compare_countries(df, countries)
    print("\nCountry Comparison:")
    print(comparison)

## 6. 数据可视化

创建可视化图表：

In [None]:
# Initialize visualizer
visualizer = EducationVisualizer()

for code, df in processed_data.items():
    indicator_name = collector.base_indicators[code]
    
    # Plot trends
    visualizer.plot_trend(df, countries, 
                         f"Education {indicator_name} Trends",
                         f"plots/{code}_trends.png")
    
    # Plot forecasts for each country
    for country in countries:
        country_data = df[df['geo'] == country]['values'].tolist()
        forecast, conf_int = analyzer.generate_forecast(df, country)
        if forecast:
            visualizer.plot_forecast(country_data, forecast, conf_int,
                                    f"{indicator_name} Forecast for {country}",
                                    f"plots/{code}_{country}_forecast.html")
    
    # Plot country comparison
    comparison = analyzer.compare_countries(df, countries)
    visualizer.plot_comparison(comparison, 'latest_value',
                              f"{indicator_name} by Country",
                              f"plots/{code}_comparison.png")