In [1]:
"""
Education Data Analysis Project
This script contains the complete code for analyzing education data across different countries.
The analysis includes data collection, storage, and analysis using both SQL and NoSQL databases.
"""

# 1. Required Imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX
import eurostat
import logging
from datetime import datetime
import os
from dotenv import load_dotenv
from pymongo import MongoClient
import psycopg2
from psycopg2.extras import execute_values
import time
from pymongo import UpdateOne
from statsmodels.tsa.arima.model import ARIMA


In [9]:
import sys
import logging
import os
import time
import pandas as pd
import eurostat

# Configure logging
logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Get the project root directory and set up the Python path correctly
notebook_dir = os.getcwd()  # Get current notebook directory
project_root = os.path.dirname(notebook_dir)  # Get parent directory
scripts_dir = os.path.join(project_root, 'scripts')  # Get scripts directory
sys.path.append(scripts_dir)  # Add scripts directory to Python path

# Now we can import from the scripts directory
from data_processing.db_manager import DatabaseManager

def get_education_data(indicator: str, start_year: int = 2010):
    """
    Retrieve education data for a specific indicator with proper filtering.
    """
    try:
        logger.info(f"Collecting data for indicator: {indicator}")
        
        # Get data with specific parameters
        data = eurostat.get_data_df(indicator)
        
        if data is None or data.empty:
            logger.error(f"No data retrieved for indicator {indicator}")
            return None
            
        # Ensure we have the required columns
        if 'geo' not in data.columns:
            # Try to reset the index if geo is in the index
            if 'geo' in data.index.names:
                data = data.reset_index()
            else:
                logger.error("Missing 'geo' column in dataset")
                return None
                
        # Filter for years >= start_year
        if 'time' in data.columns:
            data = data[data['time'].astype(str).astype(int) >= start_year]
        
        return data
        
    except Exception as e:
        logger.error(f"Error collecting data: {str(e)}")
        return None

# Initialize database manager
db_manager = DatabaseManager()

# Define indicators
indicators = {
    'educ_uoe_enrt01': 'Students by education level',
    'educ_uoe_perp01': 'Teaching staff',
    'educ_uoe_fina01': 'Education finance'
}

# Collect and process data for each indicator
for code, description in indicators.items():
    logger.info(f"Processing {description} (Code: {code})")
    df = get_education_data(code)
    
    if df is not None and not df.empty:
        # Store in databases
        db_manager.store_in_postgres(df, code)
        db_manager.store_in_mongodb(df.to_dict('records'), code)
        logger.info(f"Successfully processed and stored data for {code}")
        
        # Display sample of the data
        print(f"\nSample data for {description}:")
        print(df.head())
    else:
        logger.warning(f"Skipping {code} due to collection failure")

# Close database connections
db_manager.close_connections()
logger.info("Closed all database connections")

ModuleNotFoundError: No module named 'data_processing'

In [4]:

        # Connect to databases
        pg_conn = get_postgres_connection()
        mongo_db = get_mongodb_connection()
        
        if pg_conn is None or mongo_db is None:
            print("Failed to connect to databases")
            return
        
        try:
            # Test MongoDB connection
            mongo_db.command('ping')
        except Exception as e:
            print(f"MongoDB connection test failed: {str(e)}")
            return
            
        # Set up PostgreSQL database
        setup_postgres_database(pg_conn)
        
        # Collect and store data
        print("\nCollecting and storing education data...")
        collect_and_store_education_data(pg_conn, mongo_db)
        

Successfully connected to PostgreSQL
Successfully connected to MongoDB


SyntaxError: 'return' outside function (748532696.py, line 7)

In [5]:

        # Analyze metrics for EU countries
        print("\nAnalyzing education metrics...")
        eu_countries = ['DE', 'FR', 'IT', 'ES', 'NL']  # Example EU countries
        year_range = (2010, 2023)
        
        for country in eu_countries:
            print(f"\nAnalyzing data for {country}")
            
            # Get metrics analysis
            metrics = analyze_education_metrics(mongo_db, country, year_range)
            if metrics:
                print(f"\nMetrics Analysis for {country}:")
                for metric, stats in metrics.items():
                    print(f"\n{metric.upper()}:")
                    print(f"Mean: {stats['mean']:.2f}")
                    print(f"Median: {stats['median']:.2f}")
                    if 'trend' in stats:
                        print(f"Trend slope: {stats['trend']['slope']:.4f}")
            
            # Generate forecasts
            print(f"\nGenerating forecasts for {country}")
            for metric in ['education_investment', 'student_teacher_ratio', 'completion_rate']:
                forecast = generate_forecasts(mongo_db, metric, country)
                if forecast:
                    print(f"\n{metric.upper()} Forecast:")
                    for year, value in zip(forecast['years'], forecast['values']):
                        print(f"{year}: {value:.2f}")
        


Analyzing education metrics...

Analyzing data for DE

Metrics Analysis for DE:

EDUCATION_INVESTMENT:
Mean: 8962.50
Median: 8962.50

STUDENT_TEACHER_RATIO:
Mean: 13.55
Median: 13.55
Trend slope: 3.7000

Generating forecasts for DE


  warn('Too few observations to estimate starting parameters%s.'
  np.inner(score_obs, score_obs) /



Analyzing data for FR

Metrics Analysis for FR:

EDUCATION_INVESTMENT:
Mean: 10107.90
Median: 10107.90

STUDENT_TEACHER_RATIO:
Mean: 14.20
Median: 14.20
Trend slope: 10.8000

Generating forecasts for FR


  warn('Too few observations to estimate starting parameters%s.'
  np.inner(score_obs, score_obs) /



Analyzing data for IT

Metrics Analysis for IT:

EDUCATION_INVESTMENT:
Mean: 5990.40
Median: 5990.40

STUDENT_TEACHER_RATIO:
Mean: 16.15
Median: 16.15
Trend slope: -5.7000

Generating forecasts for IT


  warn('Too few observations to estimate starting parameters%s.'
  np.inner(score_obs, score_obs) /



Analyzing data for ES

Metrics Analysis for ES:

EDUCATION_INVESTMENT:
Mean: 4937.90
Median: 4937.90

STUDENT_TEACHER_RATIO:
Mean: 12.90
Median: 12.90
Trend slope: 1.2000

Generating forecasts for ES


  warn('Too few observations to estimate starting parameters%s.'
  np.inner(score_obs, score_obs) /



Analyzing data for NL

Metrics Analysis for NL:

EDUCATION_INVESTMENT:
Mean: 8616.80
Median: 8616.80

STUDENT_TEACHER_RATIO:
Mean: 18.20
Median: 18.20
Trend slope: -3.8000

Generating forecasts for NL


  warn('Too few observations to estimate starting parameters%s.'
  np.inner(score_obs, score_obs) /


In [6]:

        # Compare countries
        print("\nComparing countries...")
        for metric in ['education_investment', 'student_teacher_ratio', 'completion_rate']:
            comparison = compare_countries(mongo_db, eu_countries, metric, year_range)
            if comparison:
                print(f"\n{metric.upper()} Comparison:")
                for country, stats in comparison.items():
                    print(f"\n{country}:")
                    print(f"Mean: {stats['mean']:.2f}")
                    print(f"Latest Value: {stats['latest_value']:.2f}")
                    print(f"Trend: {stats['trend']:.4f}")


Comparing countries...


  comparison = compare_countries(mongo_db, eu_countries, metric, year_range)
  comparison = compare_countries(mongo_db, eu_countries, metric, year_range)
  comparison = compare_countries(mongo_db, eu_countries, metric, year_range)
  comparison = compare_countries(mongo_db, eu_countries, metric, year_range)
  comparison = compare_countries(mongo_db, eu_countries, metric, year_range)



EDUCATION_INVESTMENT Comparison:

DE:
Mean: 8962.50
Latest Value: 8962.50
Trend: 2.2273

FR:
Mean: 10107.90
Latest Value: 10107.90
Trend: 2.5119

IT:
Mean: 5990.40
Latest Value: 5990.40
Trend: 1.4887

ES:
Mean: 4937.90
Latest Value: 4937.90
Trend: 1.2271

NL:
Mean: 8616.80
Latest Value: 8616.80
Trend: 2.1414

STUDENT_TEACHER_RATIO Comparison:

DE:
Mean: 13.55
Latest Value: 15.40
Trend: 3.7000

FR:
Mean: 14.20
Latest Value: 19.60
Trend: 10.8000

IT:
Mean: 16.15
Latest Value: 13.30
Trend: -5.7000

ES:
Mean: 12.90
Latest Value: 13.50
Trend: 1.2000

NL:
Mean: 18.20
Latest Value: 16.30
Trend: -3.8000
