# 数据收集测试

本notebook用于测试从Eurostat获取教育和经济数据的功能。

In [None]:
import sys
import os
import pandas as pd
from dotenv import load_dotenv

# Add the project root (the parent of the current working directory) to the
# Python path so the `src` package can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    # Guard so that re-running this cell does not pile up duplicate entries.
    sys.path.append(project_root)

# Import the data collector
from src.data_collection.eurostat_collector import EurostatCollector

# Load environment variables
load_dotenv()

## 1. 初始化数据收集器

In [None]:
collector = EurostatCollector()

# List every dataset the collector reports, one "code: description" per line
print("Available datasets:")
available_datasets = collector.get_available_datasets()
for code, description in available_datasets.items():
    print(f"- {code}: {description}")

## 2. 收集教育数据

In [None]:
# Collect the education data
education_data = collector.collect_education_data()

# Overview: dimensions, column names and the first few rows
education_shape = education_data.shape
print("\nEducation Data Shape:", education_shape)
education_columns = education_data.columns.tolist()
print("\nEducation Data Columns:", education_columns)
print("\nSample of Education Data:")
education_preview = education_data.head()
print(education_preview)

# Basic descriptive statistics
print("\nEducation Data Statistics:")
education_stats = education_data.describe()
print(education_stats)

## 3. 收集经济数据

In [None]:
# Collect the economic data
economic_data = collector.collect_economic_data()

# Overview: dimensions, column names and the first few rows
economic_shape = economic_data.shape
print("\nEconomic Data Shape:", economic_shape)
economic_columns = economic_data.columns.tolist()
print("\nEconomic Data Columns:", economic_columns)
print("\nSample of Economic Data:")
economic_preview = economic_data.head()
print(economic_preview)

# Basic descriptive statistics
print("\nEconomic Data Statistics:")
economic_stats = economic_data.describe()
print(economic_stats)

## 4. 数据质量检查

In [None]:
def check_data_quality(df: pd.DataFrame, name: str) -> None:
    """Print a basic data-quality report for a DataFrame.

    The report covers three things: columns that contain missing values,
    the number of fully duplicated rows, and the min/max of every numeric
    column.

    Args:
        df: The DataFrame to inspect.
        name: Label used in the report header (e.g. "education").

    Returns:
        None. The report is written to standard output.
    """
    print(f"\nChecking {name} data quality:")

    # Missing values: only columns with at least one missing entry are shown
    missing = df.isnull().sum()
    print("\nMissing values:")
    print(missing[missing > 0])

    # Fully duplicated rows
    duplicates = df.duplicated().sum()
    print(f"\nNumber of duplicate rows: {duplicates}")

    # Value ranges. Select on the generic 'number' dtype so that columns
    # stored as int32, float32, unsigned or nullable numeric types are
    # reported too, not just float64/int64.
    numeric_cols = df.select_dtypes(include='number').columns
    print("\nNumeric columns range:")
    for col in numeric_cols:
        print(f"{col}:")
        print(f"  Min: {df[col].min()}")
        print(f"  Max: {df[col].max()}")

# Run the quality report on the education data first, then the economic data
for dataset, label in (
    (education_data, "education"),
    (economic_data, "economic"),
):
    check_data_quality(dataset, label)

## 5. 保存数据

将收集到的数据保存到CSV文件中，以便后续分析使用。

In [None]:
# Make sure the raw-data directory exists and work out both output paths
data_dir = os.path.join(project_root, 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)
education_file = os.path.join(data_dir, 'education_data.csv')
economic_file = os.path.join(data_dir, 'economic_data.csv')

# Write the education data (without the index column)
education_data.to_csv(education_file, index=False)
print(f"Education data saved to {education_file}")

# Write the economic data (without the index column)
economic_data.to_csv(economic_file, index=False)
print(f"Economic data saved to {economic_file}")