# Education Investment Analysis

This notebook analyzes education investment data across EU countries, including economic indicators and policy impacts.

In [None]:
# Install required packages
!pip install pandas numpy matplotlib seaborn plotly psycopg2-binary pymongo python-dotenv eurostat statsmodels

In [None]:
# Import required libraries
import sys
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv
import matplotlib
import logging

# Configure logging
# Root logger: emit INFO and above, each record prefixed with a timestamp
# and its severity level.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

In [None]:
# Set up project paths
# The project root is taken to be two directory levels above the directory
# the notebook is run from; data/, data/cache/ and src/ hang off that root.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent.parent
data_dir = project_root.joinpath('data')
cache_dir = data_dir.joinpath('cache')
src_dir = project_root.joinpath('src')

# Create directories if they don't exist
# (data_dir comes before cache_dir because cache_dir lives inside it)
for directory in (data_dir, cache_dir, src_dir):
    directory.mkdir(exist_ok=True)

# Add project root and src to Python path, skipping entries already present
for location in map(str, (project_root, src_dir)):
    if location not in sys.path:
        sys.path.append(location)

In [None]:
# Load environment variables
# The settings live in <project_root>/.env. When that file is absent it is
# seeded with local-development defaults before being loaded.
env_path = project_root / '.env'
default_env = """
POSTGRES_USER=postgres
POSTGRES_PASSWORD=postgres
POSTGRES_DB=education_db
POSTGRES_HOST=localhost
POSTGRES_PORT=5432

MONGODB_URI=mongodb://localhost:27017/
MONGODB_DB=education_db
"""
if not env_path.exists():
    # Create default .env file if it doesn't exist
    env_path.write_text(default_env)

load_dotenv(env_path)

In [None]:
# Import project modules
# If the project package cannot be imported, scaffold minimal fallback
# modules under src/ and then retry the import, so that DatabaseManager,
# DataCleaner and EurostatCollector are bound when this cell finishes (or a
# clear ImportError is raised here rather than a NameError in a later cell).
try:
    from src.data_processing.db_manager import DatabaseManager
    from src.data_processing.data_cleaner import DataCleaner
    from src.data_collection.eurostat_collector import EurostatCollector
except ImportError as e:
    import importlib

    logging.error(f"Failed to import project modules: {e}")
    logging.info("Creating necessary module files...")

    # Create src directory structure
    modules = ['data_processing', 'data_collection', 'visualization']
    for module in modules:
        module_dir = src_dir / module
        module_dir.mkdir(parents=True, exist_ok=True)
        init_file = module_dir / '__init__.py'
        init_file.touch()

    # Fallback source for each module file, keyed by destination path
    fallback_sources = {
        src_dir / 'data_processing' / 'db_manager.py': """
import os
import logging
import psycopg2
from pymongo import MongoClient

class DatabaseManager:
    def __init__(self):
        self.pg_conn = None
        self.mongo_client = None
    
    def init_postgres_connection(self):
        try:
            self.pg_conn = psycopg2.connect(
                dbname=os.getenv('POSTGRES_DB'),
                user=os.getenv('POSTGRES_USER'),
                password=os.getenv('POSTGRES_PASSWORD'),
                host=os.getenv('POSTGRES_HOST'),
                port=os.getenv('POSTGRES_PORT')
            )
            logging.info('Successfully connected to PostgreSQL')
            return self.pg_conn
        except Exception as e:
            logging.error(f'Failed to connect to PostgreSQL: {e}')
            return None
    
    def init_mongodb_connection(self):
        try:
            self.mongo_client = MongoClient(os.getenv('MONGODB_URI'))
            db = self.mongo_client[os.getenv('MONGODB_DB')]
            logging.info('Successfully connected to MongoDB')
            return db
        except Exception as e:
            logging.error(f'Failed to connect to MongoDB: {e}')
            return None
""",
        src_dir / 'data_processing' / 'data_cleaner.py': """
import pandas as pd
import numpy as np
import logging

class DataCleaner:
    def clean_education_data(self, df):
        try:
            # Remove rows with all NaN values
            df = df.dropna(how='all')
            
            # Fill missing values with appropriate methods
            df = df.ffill()
            
            # Remove duplicates
            df = df.drop_duplicates()
            
            logging.info(f'Successfully cleaned data: {len(df)} rows remaining')
            return df
        except Exception as e:
            logging.error(f'Error cleaning data: {e}')
            return None
""",
        src_dir / 'data_collection' / 'eurostat_collector.py': """
import pandas as pd
import eurostat
import logging
from pathlib import Path

class EurostatCollector:
    def __init__(self):
        self.cache_dir = Path.cwd().parent.parent / 'data' / 'cache'
        self.cache_dir.mkdir(parents=True, exist_ok=True)
    
    def get_education_data(self):
        try:
            # Try to load from cache first
            cache_file = self.cache_dir / 'education_investment.csv'
            if cache_file.exists():
                df = pd.read_csv(cache_file)
                logging.info(f'Loaded data from cache: {len(df)} records')
                return df
            
            # If not in cache, fetch from Eurostat
            df = eurostat.get_data_df('educ_uoe_fine06')
            
            # Save to cache
            df.to_csv(cache_file, index=False)
            logging.info(f'Saved data to cache: {cache_file.name}')
            
            return df
        except Exception as e:
            logging.error(f'Error getting education data: {e}')
            return None
""",
    }

    # Create basic module files
    for module_path, module_source in fallback_sources.items():
        # Never overwrite a module that is already on disk: the ImportError
        # may have been raised by a missing third-party dependency inside a
        # real project module, and clobbering that file would destroy it.
        if module_path.exists():
            logging.info(f"Keeping existing module file: {module_path}")
            continue
        module_path.write_text(module_source)

    logging.info("Created necessary module files")

    # Retry the import now that the files exist. The import system caches
    # directory listings, so those caches are invalidated first. If this
    # still fails (for example a third-party package is not installed), the
    # ImportError propagates and points at the real cause.
    importlib.invalidate_caches()
    from src.data_processing.db_manager import DatabaseManager
    from src.data_processing.data_cleaner import DataCleaner
    from src.data_collection.eurostat_collector import EurostatCollector

In [None]:
# Set plotting style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so use whichever name this
# installation actually provides instead of hard-coding 'seaborn'.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_theme()
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['font.size'] = 12

## Step 1: Data Collection and Storage

In [None]:
# Initialize components
collector = EurostatCollector()
db_manager = DatabaseManager()
cleaner = DataCleaner()

# Collect education data, clean it, then persist the cleaned frame to
# PostgreSQL and MongoDB. Each stage reports its own failure and the two
# database writes are independent of each other.
print("Collecting education investment data...")
education_data = collector.get_education_data()

if education_data is not None:
    print(f"Collected {len(education_data)} education investment records")

    # Clean the data
    print("\nCleaning data...")
    education_data_cleaned = cleaner.clean_education_data(education_data)

    if education_data_cleaned is not None:
        print(f"Cleaned data shape: {education_data_cleaned.shape}")

        # Save to databases
        print("\nSaving data to PostgreSQL...")
        pg_conn = db_manager.init_postgres_connection()
        # Compare with None explicitly rather than relying on the truthiness
        # of a connection object.
        if pg_conn is not None:
            try:
                # NOTE(review): pandas documents to_sql as taking a SQLAlchemy
                # connectable or a sqlite3 connection; if init_postgres_connection
                # returns a raw psycopg2 connection this call is expected to
                # fail - confirm and switch to a SQLAlchemy engine if so.
                education_data_cleaned.to_sql('education_investment',
                                             pg_conn,
                                             if_exists='replace',
                                             index=False)
                print("Successfully saved to PostgreSQL")
            except Exception as e:
                print(f"Error saving to PostgreSQL: {e}")

        print("\nSaving data to MongoDB...")
        mongo_db = db_manager.init_mongodb_connection()
        # PyMongo Database objects do not implement truth-value testing
        # (bool(db) raises NotImplementedError), so the check must be an
        # explicit comparison with None.
        if mongo_db is not None:
            try:
                collection = mongo_db.education_investment
                records = education_data_cleaned.to_dict('records')
                if records:
                    collection.insert_many(records)
                    print("Successfully saved to MongoDB")
                else:
                    # insert_many rejects an empty document list
                    print("No records to save to MongoDB")
            except Exception as e:
                print(f"Error saving to MongoDB: {e}")
    else:
        print("Failed to clean education data")
else:
    print("Failed to collect education data")