In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact, Dropdown
from IPython.display import display
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error, mean_absolute_error
from joblib import Parallel, delayed
import warnings
import joblib

# 1. Load data
# The first two CSV columns form the row MultiIndex; the second level is
# matched against the indicator label 'Economics: GDP' just below.
data_full = pd.read_csv('data_imputation_full.csv', index_col=[0, 1])

# Keep only the GDP indicator rows (this drops index level 1).
gdp_data = data_full.xs('Economics: GDP', level=1)

# gdp_data.head()

# Reshape wide -> long: one row per (Country, Year) with the value in 'GDP'.
# The remaining columns are presumably year labels; they are cast to int.
gdp_long = (
    gdp_data
    .reset_index()
    .melt(id_vars=['Country'], var_name='Year', value_name='GDP')
    .assign(Year=lambda frame: frame['Year'].astype(int))
)

# gdp_long.head(50)

In [None]:
# Silence all warnings to keep the notebook output tidy
warnings.filterwarnings('ignore')

# Augmented Dickey-Fuller (ADF) stationarity report
def adf_test(series, country):
    """Run an ADF test on ``series`` and print a short report for ``country``.

    The report contains the test statistic, the p-value, every critical
    value returned by ``adfuller`` and a verdict that uses 0.05 as the
    p-value threshold. Nothing is returned.
    """
    adf_output = adfuller(series, autolag='AIC')
    statistic = adf_output[0]
    p_value = adf_output[1]
    critical_values = adf_output[4]

    print(f'--- {country} ---')
    print(f'ADF Statistic: {statistic:.4f}')
    print(f'p-value: {p_value:.4f}')
    for level, threshold in critical_values.items():
        print(f'Critical Value {level}: {threshold:.4f}')

    # p < 0.05 -> report the series as stationary, otherwise as needing differencing.
    verdict = "结论：序列是平稳的\n" if p_value < 0.05 else "结论：序列不是平稳的，需要差分处理\n"
    print(verdict)

# Differencing helper
def difference_series(series, order=1):
    """Return the ``order``-th difference of ``series`` as a NumPy array.

    The first difference is applied ``order`` times, so ``order=2`` yields
    ``x[t] - 2*x[t-1] + x[t-2]``. (``Series.diff(order)`` would instead
    compute the lag-``order`` difference ``x[t] - x[t-order]``.)

    Parameters
    ----------
    series : array-like
        Observations in chronological order.
    order : int, default 1
        Number of times to difference; ``0`` returns the series unchanged
        (apart from dropped NaNs).

    Returns
    -------
    numpy.ndarray
        The differenced values with NaN entries removed.

    Raises
    ------
    ValueError
        If ``order`` is negative.
    """
    if order < 0:
        raise ValueError(f"order must be non-negative, got {order}")

    differenced = pd.Series(series)
    for _ in range(order):
        differenced = differenced.diff()
    return differenced.dropna().values

# Train one AR model per country
def train_ar_model(country, data, max_lag=None):
    """Fit an autoregressive model to one country's differenced GDP series.

    The lag order is chosen by ``statsmodels.tsa.ar_model.ar_select_order``
    (BIC). Passing ``lags=None`` to ``AutoReg`` does *not* select a lag order;
    it fits a model without any autoregressive terms.

    Parameters
    ----------
    country : hashable
        Label looked up in the first index level of ``data``.
    data : pandas.DataFrame
        Frame indexed by (Country, Year) that contains a 'GDP_diff' column.
    max_lag : int, optional
        Largest lag offered to the order selection. When omitted it is
        derived from the series length (a quarter of the observations,
        clipped to the range 1..8) so short annual series are not over-fitted.

    Returns
    -------
    tuple
        ``(country, fitted_results, gdp_diff)`` on success, or
        ``(country, None, None)`` when the series has fewer than 10
        observations or fitting fails. Failures are printed, not raised.
    """
    try:
        # Slice the country and make sure observations are in year order.
        country_data = data.loc[country].sort_index(level='Year')

        # Differenced GDP without the leading NaN produced by diff().
        gdp_diff = country_data['GDP_diff'].dropna().values

        # Too few points to estimate anything meaningful.
        if len(gdp_diff) < 10:
            print(f"{country}: 数据点不足（{len(gdp_diff)}），跳过AR模型训练。\n")
            return (country, None, None)

        # Imported here so this function stays self-contained when it is
        # shipped to joblib worker processes.
        from statsmodels.tsa.ar_model import ar_select_order

        if max_lag is None:
            max_lag = max(1, min(8, len(gdp_diff) // 4))

        # Automatic lag selection, then fit the selected specification.
        selection = ar_select_order(gdp_diff, maxlag=max_lag, ic='bic')
        model = selection.model.fit()
        print(f"{country}: AR模型已训练。滞后阶数 = {selection.ar_lags}\n")

        return (country, model, gdp_diff)

    except Exception as e:
        # Best effort: one failing country must not abort the whole batch.
        print(f"{country}: AR模型训练失败。错误信息: {e}\n")
        return (country, None, None)

# Hold-out evaluation of a fitted AR model
def evaluate_ar_model(country, model, gdp_diff):
    """Evaluate an AR specification on the last 20% of ``gdp_diff``.

    The lag set of ``model`` is re-fitted on the first 80% of the series and
    used to forecast the remaining observations. All three metrics are
    computed on that hold-out segment.

    Parameters
    ----------
    country : hashable
        Country label, used for messages and as the first element of the result.
    model : fitted AutoReg results
        Only ``model.model.ar_lags`` is read from it.
    gdp_diff : numpy.ndarray
        Differenced GDP series the model was trained on.

    Returns
    -------
    tuple
        ``(country, {'MSE': ..., 'MAE': ..., 'R2': ...})``; every metric is
        ``None`` when evaluation fails. Failures are printed, not raised.
    """
    try:
        # Local import: r2_score is not among the notebook's top-level imports.
        from sklearn.metrics import r2_score

        # 80/20 chronological split.
        train_size = int(len(gdp_diff) * 0.8)
        train, test = gdp_diff[:train_size], gdp_diff[train_size:]

        # AutoReg documents the fitted lag set as `ar_lags`.
        lags = model.model.ar_lags
        model_train = AutoReg(train, lags=lags, old_names=False).fit()

        # Forecast exactly the positions covered by the test segment.
        predictions = model_train.predict(start=train_size, end=len(gdp_diff) - 1, dynamic=False)

        mse = mean_squared_error(test, predictions)
        mae = mean_absolute_error(test, predictions)
        # AutoReg results do not document an `rsquared` attribute, and an
        # in-sample figure would not be comparable with the hold-out MSE/MAE,
        # so R² is computed on the same test segment.
        r2 = r2_score(test, predictions)

        print(f"{country} - 评估指标: MSE = {mse:.4f}, MAE = {mae:.4f}, R² = {r2:.4f}\n")

        return (country, {'MSE': mse, 'MAE': mae, 'R2': r2})

    except Exception as e:
        # Best effort: report and keep the row so the country still appears in the output.
        print(f"{country}: 模型评估失败。错误信息: {e}\n")
        return (country, {'MSE': None, 'MAE': None, 'R2': None})

# Out-of-sample forecast of the differenced series
def forecast_ar_model(country, model, gdp_diff, forecast_steps=5):
    """Forecast ``forecast_steps`` values beyond the end of ``gdp_diff``.

    Calls ``model.predict`` for the positions ``len(gdp_diff)`` through
    ``len(gdp_diff) + forecast_steps - 1`` and prints the result.

    Returns ``(country, forecast)``, or ``(country, None)`` if the prediction
    raises; the error is printed rather than propagated.
    """
    try:
        first_step = len(gdp_diff)
        last_step = first_step + forecast_steps - 1
        forecast = model.predict(start=first_step, end=last_step, dynamic=False)
        print(f"{country} - 未来{forecast_steps}年GDP差分预测:\n{forecast}\n")
    except Exception as e:
        print(f"{country}: 预测失败。错误信息: {e}\n")
        return (country, None)

    return (country, forecast)

# Turn forecast differences back into GDP levels
def revert_diff(country, forecast_diff, data):
    """Integrate forecast differences starting from the last known GDP.

    The starting level is the final 'GDP' value of ``country`` after sorting
    its rows by year; each forecast difference is added to the running level.

    Returns ``(country, levels)`` where ``levels`` is a list with one entry
    per element of ``forecast_diff``, or ``(country, None)`` if anything
    raises; the error is printed rather than propagated.
    """
    try:
        history = data.loc[country].sort_index(level='Year')
        last_gdp = history['GDP'].iloc[-1]

        forecast_gdp = []
        for step_change in forecast_diff:
            # Continue from the previous forecast level, or from the last
            # observed GDP for the first step.
            previous_level = forecast_gdp[-1] if forecast_gdp else last_gdp
            forecast_gdp.append(previous_level + step_change)

        return (country, forecast_gdp)

    except Exception as e:
        print(f"{country}: 差分预测转回原始GDP失败。错误信息: {e}\n")
        return (country, None)


# Add the first-difference column.
# The diff is taken on a Year-sorted view and assigned back by index, so it is
# chronological regardless of the column order of the source CSV while the row
# order of gdp_long itself stays untouched.
gdp_long['GDP_diff'] = (
    gdp_long.sort_values('Year', kind='stable').groupby('Country')['GDP'].diff()
)

# Index by (Country, Year). Guarded so that re-running this cell does not fail
# with a KeyError once Country/Year have already been moved into the index.
if not isinstance(gdp_long.index, pd.MultiIndex):
    gdp_long = gdp_long.set_index(['Country', 'Year'])

# All countries, in order of first appearance.
countries = gdp_long.index.get_level_values('Country').unique()

# Train the AR models (one task per country).
results = Parallel(n_jobs=-1)(
    delayed(train_ar_model)(country, gdp_long) for country in countries
)

# Keep only the countries whose model was fitted successfully.
ar_models = {}
gdp_diff_dict = {}

for country, model, gdp_diff in results:
    if model is not None:
        ar_models[country] = model
        gdp_diff_dict[country] = gdp_diff

# Evaluate the AR models.
evaluation_results = Parallel(n_jobs=-1)(
    delayed(evaluate_ar_model)(country, ar_models[country], gdp_diff_dict[country]) for country in ar_models.keys()
)

# Collect the metrics per country.
evaluation_dict = dict(evaluation_results)

# Convert to a DataFrame (one row per country) and save.
evaluation_df = pd.DataFrame.from_dict(evaluation_dict, orient='index')
evaluation_df.to_csv('AR_model_evaluation.csv')
print("\n模型评估结果已保存到 'AR_model_evaluation.csv'。")

# Forecast the differenced series.
forecast_results = Parallel(n_jobs=-1)(
    delayed(forecast_ar_model)(country, ar_models[country], gdp_diff_dict[country], forecast_steps=5) for country in ar_models.keys()
)

# Drop countries whose forecast failed.
forecast_dict = {country: forecast for country, forecast in forecast_results if forecast is not None}

# Convert the forecast differences back to GDP levels.
reverted_forecast_results = Parallel(n_jobs=-1)(
    delayed(revert_diff)(country, forecast_dict[country], gdp_long) for country in forecast_dict.keys()
)

# Drop countries whose conversion failed.
reverted_forecast_dict = {country: gdp for country, gdp in reverted_forecast_results if gdp is not None}

# Convert to a DataFrame and save; transposed so each country is a row.
reverted_forecast_df = pd.DataFrame(reverted_forecast_dict).T
reverted_forecast_df.columns = [f'Forecast_GDP_{i+1}' for i in range(reverted_forecast_df.shape[1])]
reverted_forecast_df.to_csv('AR_model_forecasts_original_gdp.csv')
print("转回原始GDP的预测结果已保存到 'AR_model_forecasts_original_gdp.csv'。")

# Save a text summary per model.
# NOTE(review): the country label is used verbatim in the file name; labels
# containing path separators would need sanitising.
for country, model in ar_models.items():
    summary = model.summary()
    with open(f'AR_model_summary_{country}.txt', 'w', encoding='utf-8') as f:
        f.write(str(summary))
    print(f"{country}: 模型摘要已保存为 'AR_model_summary_{country}.txt'。")

# Persist the fitted models.
for country, model in ar_models.items():
    joblib.dump(model, f'AR_model_{country}.pkl')
    print(f"{country}: 模型已保存为 'AR_model_{country}.pkl'。")

# Enhanced plotting callback used by the country drop-down
def plot_gdp_enhanced(country):
    """Plot the GDP series of ``country`` with a linear trend line.

    Reads the module-level ``gdp_long`` frame, draws the observed GDP and a
    regression line on one figure, then prints the last year in the index
    together with its GDP. Any error is printed rather than raised.
    """
    try:
        # Rows of the selected country in index order.
        country_data = gdp_long.loc[country].sort_index()
        years = country_data.index

        _, ax = plt.subplots(figsize=(12, 6))

        # Observed series.
        sns.lineplot(x=years, y='GDP', data=country_data, marker='o', label='实际GDP', ax=ax)

        # Linear-regression trend line (no scatter points).
        sns.regplot(x=years, y='GDP', data=country_data, scatter=False, label='趋势线', color='red', ax=ax)

        ax.set_title(f'{country} GDP 时间序列')
        ax.set_xlabel('Year')
        ax.set_ylabel('GDP')
        ax.legend()
        ax.grid(True)
        plt.show()

        # Report the most recent observation.
        last_year = years.max()
        last_gdp = country_data.loc[last_year, 'GDP']
        print(f"最后已知年份：{last_year}，GDP：{last_gdp:.2f}")

    except Exception as e:
        print(f"{country}: 绘图失败。错误信息: {e}\n")

# Build the interactive drop-down: selecting a country calls plot_gdp_enhanced with it
interact(plot_gdp_enhanced, country=Dropdown(options=sorted(countries), description='国家:', disabled=False))

In [None]:
# # 2. visualize GDP data
# warnings.filterwarnings('ignore')

# gdp_long['GDP_diff'] = gdp_long.groupby('Country')['GDP'].transform(lambda x: x.diff())

# gdp_long.set_index(['Country', 'Year'], inplace=True)

# # countries = gdp_long['Country'].unique()

# def plot_gdp(country):
    
#     country_data = gdp_long.loc[country].sort_index()
    
#     plt.figure(figsize=(12, 6))
#     sns.lineplot(x=country_data.index, y='GDP', data=country_data, marker='o')
#     plt.title(f'{country} GDP Data')
#     plt.xlabel('Year')
#     plt.ylabel('GDP')
#     plt.grid(True)
#     plt.show()

# countries = gdp_long.index.get_level_values('Country').unique()

# interact(plot_gdp, country=Dropdown(options=sorted(countries), description='Country:', disabled=False))