# Education Data Analysis Project

## Overview
This project presents a comprehensive analysis of education data across different countries. The analysis combines data from multiple sources to examine education investment, quality, outcomes, and resource allocation patterns.

### Project Objectives
1. Analyze education investment trends
2. Evaluate education quality indicators
3. Assess resource allocation efficiency
4. Predict future education metrics

### Analysis Components
- Data collection from Eurostat
- Statistical analysis
- Time series forecasting
- Interactive visualizations

## 1. Environment Setup

First, let's import all necessary packages and set up configurations.

### Environment Installation

Before starting the analysis, install the required packages by running the cell below in a notebook. To install from a terminal instead, run the same `pip install` command without the leading `!`.

In [3]:
# Install required packages
!pip install pandas numpy plotly scikit-learn statsmodels pymongo psycopg2-binary python-dotenv eurostat tqdm

# Verify installations
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API (which emits a DeprecationWarning on import).
import importlib.metadata

# Distribution names exactly as published on PyPI (not import names).
required_packages = [
    'pandas',
    'numpy',
    'plotly',
    'scikit-learn',
    'statsmodels',
    'pymongo',
    'psycopg2-binary',
    'python-dotenv',
    'eurostat',
    'tqdm',
]

# Check installed versions; report a missing package instead of aborting the
# whole cell on the first one that cannot be found.
for package in required_packages:
    try:
        version = importlib.metadata.version(package)
    except importlib.metadata.PackageNotFoundError:
        print(f"{package}: NOT INSTALLED")
    else:
        print(f"{package}: v{version}")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
pandas: v2.1.3
numpy: v1.24.3
plotly: v5.24.1
scikit-learn: v1.3.2
statsmodels: v0.14.4
pymongo: v4.6.0
psycopg2-binary: v2.9.10
python-dotenv: v1.0.0
eurostat: v1.0.4
tqdm: v4.66.1


  import pkg_resources


In [4]:
# Import required packages
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX
import eurostat
import logging
from datetime import datetime
import os
from dotenv import load_dotenv

# Configure the root logger: records at INFO level and above go to a log
# file, each line carrying a timestamp, the severity and the message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='education_data_collection.log',
)

## 2. Data Collection

This section collects education-related data from Eurostat, including:
- Education investment
- Student-teacher ratio
- Completion rates
- Literacy rates

In [6]:
def collect_education_data(datasets=None, fetcher=None):
    """
    Collect education data from Eurostat and reshape it to long format.

    Every dataset is fetched in a wide layout (one column per year) and
    converted to one row per original row and year. A dataset that fails to
    download or transform is reported, logged and skipped, so the result may
    contain fewer entries than requested.

    Args:
        datasets: Optional mapping of metric name -> dataset code. When
            omitted, the four education datasets of this project are used.
        fetcher: Optional callable that takes a dataset code and returns the
            wide DataFrame. When omitted, ``eurostat.get_data_df`` is used.

    Returns:
        Dictionary mapping each successfully collected metric name to its
        long-format DataFrame. The frame holds the non-year columns of the
        source, a 'year' column (the original year column label), a numeric
        column named after the metric, and - when the source has a
        'geo\\TIME_PERIOD' column - a trailing 'geo' column replacing it.
    """
    import traceback

    if datasets is None:
        datasets = {
            'education_investment': 'educ_uoe_fine09',  # Education investment
            'student_teacher_ratio': 'educ_uoe_perp04', # Student-teacher ratio
            'completion_rate': 'edat_lfse_03',          # Completion rate
            'literacy_rate': 'edat_lfse_01'            # Literacy rate
        }
    if fetcher is None:
        fetcher = eurostat.get_data_df

    data = {}
    for metric, code in datasets.items():
        try:
            logging.info("Starting collection of %s data...", metric)
            print(f"Collecting {metric} data...")

            # Get raw data (wide format: one column per year)
            df = fetcher(code)
            if df is None or df.empty:
                raise ValueError(f"No data returned for dataset '{code}'")
            print(f"\nColumns in {metric} dataset:")
            print(df.columns.tolist())

            # Year columns are those whose label consists of digits only;
            # str() keeps this working when labels are not strings.
            time_cols = [col for col in df.columns if str(col).isdigit()]
            non_time_cols = [col for col in df.columns if not str(col).isdigit()]
            if not time_cols:
                raise ValueError(f"No year columns found in dataset '{code}'")

            # Wide -> long in one vectorized step instead of copying a Series
            # per (row, year) pair.
            long_df = df.melt(
                id_vars=non_time_cols,
                value_vars=time_cols,
                var_name='year',
                value_name=metric,
                ignore_index=False,
            )

            # melt() stacks the data year by year; reorder positionally so
            # that all years of one source row stay together, in column order.
            n_rows = len(df)
            row_major = [
                row + year_pos * n_rows
                for row in range(n_rows)
                for year_pos in range(len(time_cols))
            ]
            # copy() so the column assignments below act on an independent frame
            transformed_df = long_df.iloc[row_major].copy()

            # Clean the data: expose the country code under a plain 'geo' name
            if 'geo\\TIME_PERIOD' in transformed_df.columns:
                transformed_df['geo'] = transformed_df['geo\\TIME_PERIOD'].str.split('\\').str[0]
                transformed_df = transformed_df.drop('geo\\TIME_PERIOD', axis=1)

            # Convert metric values to numeric; unparseable values become NaN
            transformed_df[metric] = pd.to_numeric(transformed_df[metric], errors='coerce')

            # Store the transformed DataFrame
            data[metric] = transformed_df

            logging.info("Collected %s data with shape %s", metric, transformed_df.shape)
            print(f"Successfully collected and transformed {metric} data")
            print(f"Shape of {metric} dataset: {transformed_df.shape}")
            print(f"Columns in transformed dataset: {transformed_df.columns.tolist()}\n")

        except Exception as e:
            # Best-effort collection: one failing dataset must not prevent the
            # others from being collected. Record the failure in the log file
            # as well as on screen.
            logging.exception("Error collecting %s data", metric)
            print(f"Error collecting {metric} data: {str(e)}")
            print(traceback.format_exc())

    return data

# Execute data collection with more detailed output
print("Starting education data collection...")
education_data = collect_education_data()

# Display the structure of collected data
for metric, df in education_data.items():
    print(f"\nStructure of {metric} dataset:")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    df.info()
    print("\nFirst few rows:")
    print(df.head())
    print("\n" + "="*50)

Starting education data collection...
Collecting education_investment data...

Columns in education_investment dataset:
['freq', 'unit', 'isced11', 'geo\\TIME_PERIOD', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']
Successfully collected and transformed education_investment data
Shape of education_investment dataset: (25850, 6)
Columns in transformed dataset: ['freq', 'unit', 'isced11', 'year', 'education_investment', 'geo']

Collecting student_teacher_ratio data...

Columns in student_teacher_ratio dataset:
['freq', 'unit', 'isced11', 'geo\\TIME_PERIOD', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']
Successfully collected and transformed student_teacher_ratio data
Shape of student_teacher_ratio dataset: (5960, 6)
Columns in transformed dataset: ['freq', 'unit', 'isced11', 'year', 'student_teacher_ratio', 'geo']

Collecting completion_rate data...

Columns in completion_rate dataset:
['freq', 'sex', 'age', 'unit', 'isce

## 3. Data Preprocessing

Clean and standardize the collected data, including:
- Merging different datasets
- Handling missing values
- Standardizing column names
- Converting data types

In [None]:
def preprocess_data(data):
    """
    Preprocess and merge all education data.

    Each dataset is first reduced to one value per merge key (the mean of the
    metric over all rows sharing that key) and only then merged. Merging the
    raw rows on the key columns alone would be a many-to-many join whenever a
    dataset has several rows per key, multiplying the row count.

    NOTE(review): the mean is taken over every remaining dimension of a
    dataset (whatever other columns it has). If those rows are not directly
    comparable, filter the dataset to the desired slice before calling this
    function - confirm what is intended for each metric.

    Args:
        data: Dictionary mapping metric name -> DataFrame. Each DataFrame is
            expected to have a numeric column named after the metric and at
            least one of the key columns 'geo', 'time', 'year'.

    Returns:
        DataFrame with one row per key combination for which every metric is
        available, 'geo' renamed to 'country', 'year' converted to numeric,
        sorted by year with a fresh 0..n-1 index. An empty DataFrame is
        returned when no dataset could be used.
    """
    processed_data = None

    for metric, df in data.items():
        print(f"Processing {metric} data...")

        # Determine merge columns; once a merged frame exists, only keys
        # present on both sides can be used for the join.
        common_cols = ['geo', 'time', 'year']
        merge_cols = [col for col in common_cols if col in df.columns]
        if processed_data is not None:
            merge_cols = [col for col in merge_cols if col in processed_data.columns]

        if not merge_cols:
            print(f"Warning: No merge columns found in {metric} dataset")
            continue

        if metric not in df.columns:
            print(f"Warning: No '{metric}' column found in {metric} dataset")
            continue

        # Prepare current dataset: exactly one row per merge key
        current_df = df.groupby(merge_cols, as_index=False)[metric].mean()

        # Merge data
        if processed_data is None:
            processed_data = current_df
        else:
            processed_data = processed_data.merge(
                current_df,
                on=merge_cols,
                how='outer'
            )

    if processed_data is not None:
        # Clean and standardize data; dropping incomplete rows keeps only the
        # key combinations covered by every metric.
        processed_data = processed_data.dropna()
        if 'year' in processed_data.columns:
            processed_data = processed_data.assign(
                year=pd.to_numeric(processed_data['year'])
            )
            processed_data = processed_data.sort_values('year', kind='stable')
        processed_data = processed_data.rename(columns={'geo': 'country'})
        processed_data = processed_data.reset_index(drop=True)

        print("Data preprocessing completed")
    else:
        print("Warning: No valid data after preprocessing")
        processed_data = pd.DataFrame()

    return processed_data

# Execute data preprocessing
print("\nStarting data preprocessing...")
processed_education_data = preprocess_data(education_data)

# Display processed data structure
print("\nProcessed data structure:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None".
processed_education_data.info()
print("\nData preview:")
print(processed_education_data.head())


Starting data preprocessing...
Processing education_investment data...
Processing student_teacher_ratio data...
Processing completion_rate data...
Processing literacy_rate data...


## 4. Data Analysis

Perform multi-dimensional analysis on preprocessed data:
- Basic statistical analysis
- Trend analysis
- Country comparisons

In [None]:
def analyze_education_metrics(df):
    """
    Analyze education indicators.

    For every numeric column other than 'year' this computes per-year
    statistics, plots the yearly mean as a line chart, and plots a bar chart
    of the ten highest values in the most recent year. Every figure is also
    displayed via ``fig.show()``.

    Args:
        df: DataFrame with a 'year' column, a 'country' column and one or
            more numeric metric columns.

    Returns:
        Dictionary with, per metric column ``<col>``:
        ``<col>_stats`` (DataFrame of mean/std/min/max per year),
        ``<col>_trend`` (line figure) and ``<col>_comparison`` (bar figure).
        An empty dictionary is returned for an empty input frame.
    """
    results = {}

    # An empty frame (e.g. nothing survived preprocessing) has no 'year'
    # column to group by; report it instead of failing with a KeyError.
    if df.empty:
        print("Warning: No data available for analysis")
        return results

    # Metric columns and the per-year grouping are shared by all steps below.
    metric_columns = [
        column
        for column in df.select_dtypes(include=[np.number]).columns
        if column != 'year'
    ]
    by_year = df.groupby('year')

    # Basic statistical analysis
    print("Performing basic statistical analysis...")
    for column in metric_columns:
        stats = by_year[column].agg(['mean', 'std', 'min', 'max'])
        results[f'{column}_stats'] = stats
        print(f"\nBasic statistics for {column}:")
        print(stats)

    # Trend analysis
    print("\nPerforming trend analysis...")
    for column in metric_columns:
        fig = px.line(by_year[column].mean().reset_index(),
                     x='year', y=column,
                     title=f'Trend of {column} Over Time')
        results[f'{column}_trend'] = fig
        fig.show()

    # Country comparison (restricted to the most recent year in the data)
    print("\nPerforming country comparison...")
    latest_year = df['year'].max()
    latest_rows = df[df['year'] == latest_year]
    for column in metric_columns:
        latest_data = latest_rows.sort_values(column, ascending=False)
        fig = px.bar(latest_data.head(10),
                    x='country', y=column,
                    title=f'Top 10 Countries - {column} ({latest_year})')
        results[f'{column}_comparison'] = fig
        fig.show()

    return results

# Execute data analysis
# Runs analyze_education_metrics on the preprocessed data and stores the
# returned results dictionary in analysis_results.
print("Starting data analysis...")
analysis_results = analyze_education_metrics(processed_education_data)

## 5. Forecasting Analysis

Perform time series forecasting on education metrics:
- Using SARIMA models
- Generate 5-year predictions
- Visualize forecast results