# 教育数据采集与存储演示

本notebook演示如何:
1. 从Eurostat获取教育数据
2. 从World Bank获取经济数据
3. 收集教育政策文档
4. 将数据存储到PostgreSQL和MongoDB

In [1]:
import sys
import os
import pandas as pd
from dotenv import load_dotenv

# 添加项目根目录到Python路径
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

# 导入自定义模块
from src.data_collection.eurostat_collector import EurostatCollector
from src.data_processing.db_manager import DatabaseManager

# 加载环境变量
load_dotenv()

# 初始化收集器和数据库管理器
collector = EurostatCollector()
db_manager = DatabaseManager()

## 1. 收集教育数据

In [2]:
# 获取教育投资数据
education_data = collector.get_education_investment_data()
print("收集到的教育数据形状:", education_data.shape)
education_data.head()

2024-12-14 22:28:08,837 - INFO - Getting education investment data...
2024-12-14 22:28:09,364 - INFO - Successfully got education investment data: 2585 records


收集到的教育数据形状: (2585, 17)


Unnamed: 0,index,freq,unit,isced11,geo\TIME_PERIOD,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,collected_at,source
0,0,A,EUR,ED0,AT,6313.5,6579.9,6893.8,6959.6,7267.0,7535.6,7617.3,5442.4,8271.8,8492.8,2024-12-14 22:28:09.363820,Eurostat
1,1,A,EUR,ED0,BG,1567.2,1840.1,1868.3,1832.4,1994.6,2212.2,2392.6,2628.5,2756.0,3377.5,2024-12-14 22:28:09.363820,Eurostat
2,2,A,EUR,ED0,CH,6311.8,5884.9,7746.0,14350.9,14296.3,14348.9,14101.3,,16058.9,16328.8,2024-12-14 22:28:09.363820,Eurostat
3,3,A,EUR,ED0,CY,2993.2,2803.2,,2185.1,2081.6,2056.2,2059.3,2029.9,2093.2,2168.0,2024-12-14 22:28:09.363820,Eurostat
4,4,A,EUR,ED0,CZ,2397.8,2382.1,2316.1,2358.9,2337.9,2770.7,3170.2,3541.8,3653.7,4191.3,2024-12-14 22:28:09.363820,Eurostat


## 2. 收集经济数据

In [3]:
# Get economic indicators data
try:
    economic_data = collector.get_economic_indicators()
    if not economic_data.empty:
        print("Economic data shape:", economic_data.shape)
        display(economic_data.head())
    else:
        print("No economic data was retrieved")
except Exception as e:
    print(f"Error getting economic data: {str(e)}")

2024-12-14 22:28:09,375 - INFO - Getting economic indicators data...
2024-12-14 22:28:09,376 - INFO - Fetching data for indicator: gdp_growth (NY.GDP.MKTP.KD.ZG)
2024-12-14 22:28:11,012 - INFO - Successfully processed gdp_growth data with 378 records
2024-12-14 22:28:11,012 - INFO - Fetching data for indicator: employment_rate (SL.EMP.TOTL.SP.ZS)
2024-12-14 22:28:11,322 - INFO - Successfully processed employment_rate data with 378 records
2024-12-14 22:28:11,322 - INFO - Fetching data for indicator: gdp_per_capita (NY.GDP.PCAP.CD)
2024-12-14 22:28:11,590 - INFO - Successfully processed gdp_per_capita data with 378 records
2024-12-14 22:28:11,591 - INFO - Fetching data for indicator: industry_value (NV.IND.TOTL.ZS)
2024-12-14 22:28:11,857 - INFO - Successfully processed industry_value data with 378 records
2024-12-14 22:28:11,857 - INFO - Merging dataframe 1 with shape (378, 3)
2024-12-14 22:28:11,858 - INFO - Merging dataframe 2 with shape (378, 3)
2024-12-14 22:28:11,860 - INFO - Merg

Economic data shape: (378, 8)


Unnamed: 0,country_code,year,gdp_growth,employment_rate,gdp_per_capita,industry_value,collected_at,source
0,AUT,2010,1.837094,57.348,46903.761585,25.534994,2024-12-14 22:28:11.861363,World Bank
1,BEL,2010,2.864293,49.588,44184.946354,20.89458,2024-12-14 22:28:11.861363,World Bank
2,BGR,2010,1.55536,47.902,6863.667068,23.754177,2024-12-14 22:28:11.861363,World Bank
3,CYP,2010,2.283545,60.248,31105.027344,14.479115,2024-12-14 22:28:11.861363,World Bank
4,CZE,2010,2.434902,54.186,19960.068487,33.170752,2024-12-14 22:28:11.861363,World Bank


## 3. 收集政策文档

In [7]:
# Get education policy documents
policy_docs = collector.get_education_policies()
print(f"Collected {len(policy_docs)} policy documents\n")
print("Example document:")
print(policy_docs[0] if policy_docs else "No documents found")


2024-12-14 22:36:39,707 - INFO - Getting education policy documents...
2024-12-14 22:36:40,009 - INFO - Successfully got education policy documents: 1 documents


Collected 1 policy documents

Example document:
{'title': None, 'content': 'Education levelsEuropean Union Member States organise their education and training systems according to common levels.Discover what the EU is doing to achieve the EEA at each level and type of education.Slide', 'url': None, 'collected_at': '2024-12-14T22:36:40.009500', 'source': 'https://education.ec.europa.eu/education-levels'}


## 4. 存储数据

In [6]:
# Connect to databases
db_manager.connect_postgres()
db_manager.connect_mongo()

# Reset table structure
db_manager.drop_tables()
db_manager.setup_postgres_tables()

# Save structured data to PostgreSQL
db_manager.save_to_postgres(education_data, 'education_data')
db_manager.save_to_postgres(economic_data, 'economic_data')

# Save unstructured data to MongoDB
db_manager.save_to_mongo('education_policies', policy_docs)

print("Data storage completed!")

2024-12-14 22:36:18,240 - INFO - Successfully connected to PostgreSQL
2024-12-14 22:36:18,249 - INFO - Successfully connected to MongoDB


AttributeError: 'DatabaseManager' object has no attribute 'drop_tables'

## 5. 验证数据

In [None]:
# 验证PostgreSQL数据
education_count = db_manager.query_postgres("""
    SELECT COUNT(*) as count 
    FROM education_data
""")
print(f"教育数据表中的记录数: {education_count['count'].iloc[0]}")

economic_count = db_manager.query_postgres("""
    SELECT COUNT(*) as count 
    FROM economic_data
""")
print(f"经济数据表中的记录数: {economic_count['count'].iloc[0]}")

# 验证MongoDB数据
policy_count = len(db_manager.query_mongo('education_policies', {}))
print(f"政策文档集合中的文档数: {policy_count}")

# 关闭数据库连接
db_manager.close_connections()