# Education Data Analysis Project

## Overview
This notebook presents a comprehensive analysis of education data across different countries. The project combines data from various sources to analyze education investment, quality, outcomes, and resource allocation patterns.

### Project Objectives
1. Analyze education investment trends
2. Evaluate education quality indicators
3. Assess resource allocation efficiency
4. Predict future education metrics

### Analysis Components
- Data collection from Eurostat
- Data storage in PostgreSQL and MongoDB
- Statistical analysis
- Time series forecasting
- Interactive visualizations

## 1. Environment Setup

First, let's install and import all necessary packages.

In [1]:
!pip install pandas numpy plotly scikit-learn statsmodels pymongo psycopg2-binary python-dotenv eurostat tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import pymongo
import psycopg2
from datetime import datetime
import os
from dotenv import load_dotenv
import eurostat
from tqdm import tqdm

## 2. Database Configuration

Set up connections to PostgreSQL and MongoDB databases.

In [10]:
# Load environment variables
load_dotenv()

# PostgreSQL configuration
POSTGRES_CONFIG = {
    'dbname': os.getenv('POSTGRES_DB', 'education_db'),
    'user': os.getenv('POSTGRES_USER', 'postgres'),
    'password': os.getenv('POSTGRES_PASSWORD', 'postgrespassword'),
    'host': os.getenv('POSTGRES_HOST', 'localhost'),
    'port': os.getenv('POSTGRES_PORT', '5432')
}

# MongoDB configuration
MONGODB_CONFIG = {
    'host': os.getenv('MONGODB_HOST', 'localhost'),
    'port': int(os.getenv('MONGODB_PORT', '27017')),
    'db': os.getenv('MONGODB_DB', 'education_db')
}

# Create database connections
def get_postgres_connection():
    return psycopg2.connect(**POSTGRES_CONFIG)

def get_mongodb_connection():
    client = pymongo.MongoClient(host=MONGODB_CONFIG['host'], port=MONGODB_CONFIG['port'])
    return client[MONGODB_CONFIG['db']]

## 3. Data Collection

Collect education data from Eurostat.

In [11]:
def collect_education_data():
    # Eurostat dataset codes
    datasets = {
        'education_investment': 'educ_uoe_fine09',
        'student_teacher_ratio': 'educ_uoe_perp04',
        'completion_rate': 'edat_lfse_03',
        'literacy_rate': 'edat_lfse_01'
    }
    
    data = {}
    for metric, code in datasets.items():
        try:
            df = eurostat.get_data_df(code)
            data[metric] = df
        except Exception as e:
            print(f"Error collecting {metric} data: {str(e)}")
    
    return data

# Collect data
education_data = collect_education_data()

## 4. Data Preprocessing

Clean and prepare the collected data for analysis.

In [13]:
# Data Collection
def collect_education_data():
    # Eurostat dataset codes for different education metrics
    datasets = {
        'education_investment': 'educ_uoe_fine09',
        'student_teacher_ratio': 'educ_uoe_perp04',
        'completion_rate': 'edat_lfse_03',
        'literacy_rate': 'edat_lfse_01'
    }
    
    data = {}
    for metric, code in datasets.items():
        try:
            print(f"Collecting {metric} data...")
            df = eurostat.get_data_df(code)
            # Print data structure to understand column names
            print(f"\nColumns for {metric}:")
            print(df.columns.tolist())
            data[metric] = df
        except Exception as e:
            print(f"Error collecting {metric} data: {str(e)}")
    
    return data

# Data Preprocessing
def preprocess_data(data):
    processed_data = pd.DataFrame()
    
    for metric, df in data.items():
        print(f"\nProcessing {metric} data...")
        
        # Reset index to keep original index as columns
        df = df.reset_index()
        
        # Print column names to understand actual data structure
        print(f"Columns in {metric} dataset:")
        print(df.columns.tolist())
        
        # Select appropriate id_vars based on actual column names
        id_cols = [col for col in df.columns if col in ['geo', 'time', 'country', 'year']]
        if not id_cols:
            print(f"Warning: No identifying columns found in {metric} dataset")
            continue
            
        # Reshape data
        try:
            melted = df.melt(id_vars=id_cols,
                           var_name='indicator',
                           value_name=metric)
            
            # Select required columns
            melted = melted[id_cols + [metric]]
            
            # Standardize column names
            column_mapping = {
                'geo': 'country',
                'time': 'year'
            }
            melted = melted.rename(columns=column_mapping)
            
            # Merge datasets
            if processed_data.empty:
                processed_data = melted
            else:
                merge_cols = ['country', 'year'] if 'country' in melted.columns else ['geo', 'time']
                processed_data = processed_data.merge(melted, on=merge_cols, how='outer')
                
        except Exception as e:
            print(f"Error processing {metric} data: {str(e)}")
            continue
    
    # Clean data by removing rows with missing values
    processed_data = processed_data.dropna()
    
    # Ensure consistent column naming
    if 'geo' in processed_data.columns:
        processed_data = processed_data.rename(columns={'geo': 'country'})
    if 'time' in processed_data.columns:
        processed_data = processed_data.rename(columns={'time': 'year'})
    
    return processed_data

# Execute data collection and preprocessing
print("Collecting education data...")
education_data = collect_education_data()

print("\nPreprocessing data...")
processed_education_data = preprocess_data(education_data)

# Display processed data structure
print("\nProcessed data structure:")
print(processed_education_data.info())
print("\nFirst few rows of processed data:")
print(processed_education_data.head())

Collecting education data...
Collecting education_investment data...

Columns for education_investment:
['freq', 'unit', 'isced11', 'geo\\TIME_PERIOD', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']
Collecting student_teacher_ratio data...

Columns for student_teacher_ratio:
['freq', 'unit', 'isced11', 'geo\\TIME_PERIOD', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']
Collecting completion_rate data...

Columns for completion_rate:
['freq', 'sex', 'age', 'unit', 'isced11', 'geo\\TIME_PERIOD', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
Collecting literacy_rate data...

Columns for literacy_rate:
['freq', 'sex', 'wstatus', 'citizen', 'age', 'unit', 'geo\\TIME_PERIOD', '2004', '2005', '2006', '2007', '2008', '2009'

## 5. Basic Data Analysis

Perform initial analysis of education metrics.

In [16]:
def analyze_investment_trends(df):
    """
    Analyze education investment trends over time
    
    Parameters:
    df (DataFrame): Processed education data
    
    Returns:
    DataFrame: Trends analysis results
    """
    print("Data shape:", df.shape)
    print("Available columns:", df.columns.tolist())
    
    # Ensure data types are correct
    if 'time' in df.columns:
        df['year'] = pd.to_numeric(df['time'], errors='coerce')
    
    # Check if education investment column exists
    investment_cols = [col for col in df.columns if 'educ' in col.lower() or 'invest' in col.lower()]
    if investment_cols:
        investment_col = investment_cols[0]
        print(f"Using column '{investment_col}' for investment analysis")
    else:
        raise ValueError("No investment-related column found in the data")
    
    # Calculate investment trends
    trends = df.groupby('year')[investment_col].agg(['mean', 'min', 'max']).round(2)
    
    # Create trend visualization
    fig = px.line(trends.reset_index(), 
                  x='year',
                  y=['mean', 'min', 'max'],
                  title='Education Investment Trends',
                  labels={'value': 'Investment Value',
                         'year': 'Year',
                         'variable': 'Metric'})
    
    fig.update_layout(
        xaxis_title="Year",
        yaxis_title="Investment (%)",
        legend_title="Statistics"
    )
    
    fig.show()
    
    return trends

# Analyze investment trends with error handling
try:
    investment_trends = analyze_investment_trends(processed_education_data)
except Exception as e:
    print(f"Error in analysis: {str(e)}")
    print("\nDetailed data information:")
    print(processed_education_data.info())

Data shape: (0, 0)
Available columns: []
Error in analysis: No investment-related column found in the data

Detailed data information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
None


## 6. Advanced Analysis

Perform more sophisticated analyses including quality assessment and forecasting.

In [6]:
def analyze_education_quality(df):
    # Normalize metrics
    scaler = StandardScaler()
    metrics = ['student_teacher_ratio', 'completion_rate', 'literacy_rate']
    normalized_data = pd.DataFrame(scaler.fit_transform(df[metrics]), columns=metrics)
    
    # Calculate quality scores
    weights = {'student_teacher_ratio': -0.3, 'completion_rate': 0.4, 'literacy_rate': 0.3}
    quality_scores = sum(normalized_data[metric] * weight for metric, weight in weights.items())
    
    # Create visualization
    fig = px.box(quality_scores, title='Education Quality Score Distribution')
    fig.show()
    
    return quality_scores

# Analyze education quality
quality_scores = analyze_education_quality(processed_education_data)

NameError: name 'processed_education_data' is not defined

## 7. Time Series Forecasting

Predict future education metrics using time series analysis.

In [7]:
def forecast_metrics(df, metric, periods=5):
    # Prepare time series data
    ts_data = df.groupby('year')[metric].mean()
    
    # Fit model
    model = ExponentialSmoothing(ts_data, seasonal_periods=4, trend='add', seasonal='add')
    fitted_model = model.fit()
    
    # Generate forecast
    forecast = fitted_model.forecast(periods)
    
    # Create visualization
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=ts_data.index, y=ts_data.values, name='Historical'))
    fig.add_trace(go.Scatter(x=forecast.index, y=forecast.values, name='Forecast'))
    fig.update_layout(title=f'{metric} Forecast')
    fig.show()
    
    return forecast

# Forecast education investment
investment_forecast = forecast_metrics(processed_education_data, 'education_investment')

NameError: name 'processed_education_data' is not defined

## 8. Conclusions and Recommendations

Summarize key findings and provide recommendations.

In [8]:
def generate_recommendations(df):
    recommendations = []
    
    # Analyze investment efficiency
    inv_corr = df['education_investment'].corr(df['completion_rate'])
    if inv_corr > 0.5:
        recommendations.append("Increase education investment to improve completion rates")
    
    # Analyze quality metrics
    quality_trend = df.groupby('year')['completion_rate'].mean().pct_change().mean()
    if quality_trend < 0:
        recommendations.append("Focus on improving education quality metrics")
    
    return recommendations

# Generate recommendations
recommendations = generate_recommendations(processed_education_data)
print("\nRecommendations:")
for rec in recommendations:
    print(f"- {rec}")

NameError: name 'processed_education_data' is not defined