In [41]:
import pandas as pd

# Step 1: load the inputs.
# The first two CSV columns become the two-level row index of the full table.
data_full = pd.read_csv('data_imputation_full.csv', index_col=[0, 1])

# Table of the features selected for each country.
selected_features_df = pd.read_csv('selected_features_per_country_elastic_net.csv')

# Distinct values of the 'Country' column.
countries = selected_features_df['Country'].unique()

# Long format: one (Country, Feature) pair per row, with empty slots removed.
selected_features_long = (
    selected_features_df
    .melt(id_vars='Country', value_name='Feature')
    .dropna()
)

# Step 2: build the modelling table for a single country.
def prepare_country_data(country):
    """
    Assemble the variables of one country for VAR model training.

    Reads two module-level objects: ``selected_features_long`` (columns
    'Country' and 'Feature') and ``data_full`` (two-level row index; level 1 is
    looked up by feature name, the remaining level is compared with
    ``country``; the column labels become the integer year index).

    Parameters:
    - country: country name as it appears in ``selected_features_long``.

    Returns:
    - data: DataFrame with one column per selected feature plus
      'Economics: GDP', indexed by integer year in ascending order.

    Raises:
    - KeyError: a feature is not present in level 1 of ``data_full``.
    - ValueError: a feature does not have exactly one row for ``country``, or
      the column labels cannot be converted to integers.
    """
    selected = selected_features_long.loc[
        selected_features_long['Country'] == country, 'Feature'
    ].tolist()

    # Drop repeated feature names, keeping first-seen order. A repeated name
    # would otherwise produce two identically named columns in the result.
    features = list(dict.fromkeys(selected))

    # The forecasting target must always be part of the table.
    if 'Economics: GDP' not in features:
        features.append('Economics: GDP')

    per_feature_frames = []
    for feature in features:
        # All rows for this feature, indexed by the remaining index level.
        feature_data = data_full.xs(feature, level=1)
        country_rows = feature_data.loc[feature_data.index == country]

        # Fail with an explicit message instead of a length-mismatch error
        # from the column rename below.
        if len(country_rows) != 1:
            raise ValueError(
                f"expected exactly one row for feature '{feature}' and "
                f"country '{country}', found {len(country_rows)}"
            )

        # Transpose so the original column labels (years) become the index,
        # leaving a single column named after the feature.
        country_feature_data = country_rows.T
        country_feature_data.columns = [feature]
        per_feature_frames.append(country_feature_data)

    # Align all features on their shared year labels in a single step.
    data = pd.concat(per_feature_frames, axis=1, join='inner')

    # Year labels arrive as CSV header text; use integers and sort ascending.
    data.index = data.index.astype(int)
    return data.sort_index()

# Step 3: prepare every country and keep the successful results in a dict
# keyed by country name. Failures are reported and the country is left out.
country_data_dict = {}

for country in countries:
    try:
        data = prepare_country_data(country)
    except Exception as e:
        print(f"Error preparing data for {country}: {e}")
    else:
        country_data_dict[country] = data

In [42]:
import pandas as pd
import numpy as np
from statsmodels.tsa.api import VAR
from sklearn.metrics import mean_squared_error
import warnings

# Blanket filter: every warning raised from here on is suppressed.
warnings.filterwarnings(action='ignore')

# Country names that have a prepared data frame.
countries = list(country_data_dict)

# Lag-order selection for one country's VAR model.
def get_best_p(country, max_p=10):
    """
    Pick the VAR lag order with the lowest hold-out MSE on GDP.

    The country's frame from ``country_data_dict`` is split in row order: the
    first 70% of rows are used for fitting, the remainder for evaluation. For
    each candidate lag order a VAR is fitted on the training rows, the whole
    test horizon is forecast in one call, and the mean squared error of the
    GDP column is recorded.

    Parameters:
    - country: key into ``country_data_dict``.
    - max_p: largest lag order to try; the search is also capped at
      ``train_size - 1``.

    Returns:
    - dict with keys 'Country', 'Best_p' and 'MSE'. 'Best_p' and 'MSE' are
      None when the GDP column is missing, when no lag order could be fitted
      and scored, or when any other error occurs (the error is printed, not
      raised).
    """
    no_result = {'Country': country, 'Best_p': None, 'MSE': None}

    try:
        data = country_data_dict[country]

        # Resolve the target column once, before any model is fitted; the
        # answer does not depend on the lag order.
        if 'Economics: GDP' in data.columns:
            target_col = 'Economics: GDP'
        elif 'GDP' in data.columns:
            target_col = 'GDP'
        else:
            print(f"{country}: 数据中未找到目标变量 'GDP'。")
            return no_result

        # Row-order split: 70% for fitting, the rest for evaluation.
        train_size = int(len(data) * 0.7)
        train_data = data.iloc[:train_size]
        test_data = data.iloc[train_size:]
        y_true = test_data[target_col]

        mse_dict = {}
        for p in range(1, min(max_p, train_size - 1) + 1):
            try:
                results = VAR(train_data).fit(p)

                # Seed the forecast with the last k_ar training rows and
                # predict the whole test horizon.
                lag_order = results.k_ar
                forecast = results.forecast(
                    y=train_data.values[-lag_order:], steps=len(test_data)
                )
                forecast_df = pd.DataFrame(
                    forecast, index=test_data.index, columns=test_data.columns
                )

                mse_dict[p] = mean_squared_error(y_true, forecast_df[target_col])
            except Exception:
                # Deliberate best effort: a lag order that cannot be fitted or
                # scored is simply not a candidate.
                continue

        if not mse_dict:
            print(f"{country}: 未找到有效的 p 值。")
            return no_result

        # Lag order with the smallest test MSE.
        best_p = min(mse_dict, key=mse_dict.get)
        return {'Country': country, 'Best_p': best_p, 'MSE': mse_dict[best_p]}
    except Exception as e:
        print(f"{country}: 处理时出错：{e}")
        return no_result

# Run the lag-order search for every country (up to 10 lags each).
best_p_results = [get_best_p(country, max_p=10) for country in countries]

# One row per country: Country, Best_p, MSE.
best_p_df = pd.DataFrame(best_p_results)

# Show the chosen lag order per country.
print("\n各个国家的最佳滞后阶数（best_p）：")
print(best_p_df)

# Persist the table so later cells can reload it.
best_p_df.to_csv('VAR_best_p_per_country.csv', index=False)


各个国家的最佳滞后阶数（best_p）：
                 Country  Best_p           MSE
0                Albania       6  3.296556e+18
1                Algeria       1  2.213920e+22
2                 Angola       7  1.006569e+22
3    Antigua and Barbuda       1  4.624386e+16
4              Argentina       9  4.605023e+21
..                   ...     ...           ...
165              Vietnam       1  1.711473e+21
166       Western Sahara       5  4.414643e+23
167                Yemen       2  7.107409e+20
168               Zambia       1  4.257011e+20
169             Zimbabwe       1  1.420355e+20

[170 rows x 3 columns]


In [36]:
import pandas as pd
import numpy as np
from statsmodels.tsa.api import VAR
import warnings

# Blanket filter: every warning raised from here on is suppressed.
warnings.filterwarnings(action='ignore')

# Prerequisites, produced by earlier cells:
# - country_data_dict: country name -> DataFrame of that country's variables
# - best_p_df: best lag order per country, columns 'Country' and 'Best_p'
#
# If best_p_df is not in memory, reload it from the saved CSV:
# best_p_df = pd.read_csv('VAR_best_p_per_country.csv')

# Countries to forecast, in table order.
countries = best_p_df['Country'].to_list()

# Accumulator for the per-country forecast frames.
future_forecasts = list()

# Per-country future GDP forecast.
def process_country_forecast(country, steps_ahead=10):
    """
    Fit a VAR on all available rows of one country and forecast GDP forward.

    Uses the module-level ``country_data_dict`` (country -> DataFrame indexed
    by year) and ``best_p_df`` (columns 'Country' and 'Best_p'). The frame
    stored in ``country_data_dict`` is not modified.

    Parameters:
    - country: key into ``country_data_dict``.
    - steps_ahead: number of yearly steps to forecast (default 10).

    Returns:
    - DataFrame with columns 'Country', 'Year' (integer) and 'GDP_Forecast',
      one row per forecast year; or None when the country has no lag order,
      has no GDP column, or any error occurs (the error is printed, not
      raised).
    """
    try:
        # Work on a copy: assigning a new index to the stored frame would
        # change country_data_dict for every other cell that reads it.
        data = country_data_dict[country].copy()

        # Convert the year index to a DatetimeIndex.
        if not isinstance(data.index, pd.DatetimeIndex):
            data.index = pd.to_datetime(data.index.astype(str), format='%Y')

        # Look up the lag order selected for this country.
        best_p_row = best_p_df[best_p_df['Country'] == country]
        if best_p_row.empty or pd.isnull(best_p_row['Best_p'].values[0]):
            print(f"{country}: 没有找到最佳滞后阶数，跳过。")
            return None
        best_p = int(best_p_row['Best_p'].values[0])

        # Resolve the target column name.
        if 'Economics: GDP' in data.columns:
            target_col = 'Economics: GDP'
        elif 'GDP' in data.columns:
            target_col = 'GDP'
        else:
            print(f"{country}: 数据中未找到目标变量 'GDP'。")
            return None

        # Fit on the full sample.
        model_fitted = VAR(data).fit(best_p)

        # Seed the forecast with the last k_ar observed rows.
        lag_order = model_fitted.k_ar
        forecast_input = data.values[-lag_order:]
        future_forecast = model_fitted.forecast(y=forecast_input, steps=steps_ahead)

        # Forecast years start the year after the last observation.
        last_year = data.index[-1].year
        future_years = pd.date_range(
            start=pd.Timestamp(last_year + 1, 1, 1), periods=steps_ahead, freq='YS'
        )
        future_forecast_df = pd.DataFrame(
            future_forecast, index=future_years, columns=data.columns
        )

        # Keep only the GDP column and shape it as (Country, Year, GDP_Forecast).
        gdp_forecast = future_forecast_df[[target_col]].reset_index()
        gdp_forecast.columns = ['Year', 'GDP_Forecast']
        gdp_forecast['Country'] = country
        gdp_forecast['Year'] = gdp_forecast['Year'].dt.year

        return gdp_forecast[['Country', 'Year', 'GDP_Forecast']]

    except Exception as e:
        print(f"{country}: 处理时出错：{e}")
        return None

# Forecast every country, keeping only the calls that returned a frame
# (process_country_forecast returns None when it skips or fails).
future_forecasts = [
    forecast
    for forecast in map(process_country_forecast, countries)
    if forecast is not None
]

# Stack the per-country frames into a single table.
future_forecasts_df = pd.concat(future_forecasts, ignore_index=True)

# Write the forecast GDP values to CSV.
future_forecasts_df.to_csv('VAR_future_gdp_forecasts.csv', index=False)

In [49]:
import pandas as pd
import numpy as np
from statsmodels.tsa.api import VAR
import matplotlib.pyplot as plt
from ipywidgets import interact, Dropdown, IntSlider
import warnings

# Blanket filter: every warning raised from here on is suppressed.
warnings.filterwarnings(action='ignore')

# Prerequisites, produced by earlier cells:
# - country_data_dict: country name -> DataFrame of that country's variables
# - best_p_df: best lag order per country, columns 'Country' and 'Best_p'
#
# If best_p_df is not in memory, reload it from the saved CSV:
# best_p_df = pd.read_csv('VAR_best_p_per_country.csv')

# Countries to model, in table order.
countries = best_p_df['Country'].to_list()

# Accumulator for the per-country model results.
results = list()

# Per-country full-sample VAR fit.
def process_country(country):
    """
    Fit a VAR on all available rows of one country with its selected lag order.

    Uses the module-level ``country_data_dict`` (country -> DataFrame indexed
    by year) and ``best_p_df`` (columns 'Country' and 'Best_p'). The frame
    stored in ``country_data_dict`` is not modified; the returned 'Data' entry
    is a copy carrying a DatetimeIndex.

    Parameters:
    - country: key into ``country_data_dict``.

    Returns:
    - dict with keys 'Country', 'Best_p', 'Model' (the fitted VAR results),
      'Data' (the frame the model was fitted on) and 'Target_Col'; or None
      when the country has no lag order, has no GDP column, or any error
      occurs (the error is printed, not raised).
    """
    try:
        # Work on a copy: assigning a new index to the stored frame would
        # change country_data_dict for every other cell that reads it.
        data = country_data_dict[country].copy()

        # Convert the year index to a DatetimeIndex.
        if not isinstance(data.index, pd.DatetimeIndex):
            data.index = pd.to_datetime(data.index.astype(str), format='%Y')

        # Look up the lag order selected for this country.
        best_p_row = best_p_df[best_p_df['Country'] == country]
        if best_p_row.empty or pd.isnull(best_p_row['Best_p'].values[0]):
            print(f"{country}: 没有找到最佳滞后阶数，跳过。")
            return None
        best_p = int(best_p_row['Best_p'].values[0])

        # Resolve the target column name.
        if 'Economics: GDP' in data.columns:
            target_col = 'Economics: GDP'
        elif 'GDP' in data.columns:
            target_col = 'GDP'
        else:
            print(f"{country}: 数据中未找到目标变量 'GDP'。")
            return None

        # Fit on the full sample.
        model_fitted = VAR(data).fit(best_p)

        # Bundle everything later steps need: model, data and target column.
        return {
            'Country': country,
            'Best_p': best_p,
            'Model': model_fitted,
            'Data': data,
            'Target_Col': target_col
        }

    except Exception as e:
        print(f"{country}: 处理时出错：{e}")
        return None

# Fit every country, keeping only the calls that returned a result
# (process_country returns None when it skips or fails).
results = [
    res
    for res in map(process_country, countries)
    if res is not None
]

# Plotting helper: back-test and future forecast for one country.
def plot_var_forecast(country, test_steps=10, forecast_steps=10):
    """
    Plot a back-test and a future GDP forecast for one country.

    Looks the country up in the module-level ``results`` list (dicts with keys
    'Country', 'Best_p', 'Model', 'Data', 'Target_Col'). The last
    ``test_steps`` rows are held out; a VAR refitted on the remaining rows
    forecasts that hold-out window. The future forecast comes from the stored
    full-sample model ('Model'), the same model used for the CSV export, so
    the plotted future values do not depend on ``test_steps``.

    Two figures are drawn: actual vs. forecast GDP, and a bar chart of the
    hold-out errors (actual minus forecast).

    Parameters:
    - country: country name to plot.
    - test_steps: number of trailing rows used as the hold-out window.
    - forecast_steps: number of yearly steps to forecast beyond the data.

    Returns:
    - None. Problems are printed, not raised.
    """
    try:
        # Find the stored result for this country.
        country_result = next((res for res in results if res['Country'] == country), None)
        if country_result is None:
            print(f"{country}: 没有找到模型结果。")
            return

        data = country_result['Data']
        target_col = country_result['Target_Col']
        best_p = country_result['Best_p']
        full_model = country_result['Model']

        # Ensure a DatetimeIndex, converting on a copy so the frame stored in
        # `results` is left untouched.
        if not isinstance(data.index, pd.DatetimeIndex):
            data = data.copy()
            data.index = pd.to_datetime(data.index.astype(str), format='%Y')

        # Need at least best_p rows left after removing the hold-out window.
        if len(data) < (best_p + test_steps):
            print(f"{country}: 数据不足，无法进行绘图。")
            return

        # Hold out the last test_steps rows.
        train_size = len(data) - test_steps
        train_data = data.iloc[:train_size]
        test_data = data.iloc[train_size:]

        # Back-test: refit on the training rows only and forecast the window.
        backtest_model = VAR(train_data).fit(best_p)
        lag_order = backtest_model.k_ar
        forecast = backtest_model.forecast(
            y=train_data.values[-lag_order:], steps=len(test_data)
        )
        forecast_df = pd.DataFrame(forecast, index=test_data.index, columns=test_data.columns)

        # Future: use the full-sample model, seeded with the latest rows.
        future_forecast = full_model.forecast(
            y=data.values[-full_model.k_ar:], steps=forecast_steps
        )
        last_year = data.index[-1].year
        future_years = pd.date_range(
            start=pd.Timestamp(last_year + 1, 1, 1), periods=forecast_steps, freq='YS'
        )
        future_forecast_df = pd.DataFrame(future_forecast, index=future_years, columns=data.columns)

        # Series to plot: the whole observed history, then back-test + future.
        gdp_actual = pd.concat([train_data[target_col], test_data[target_col]])
        gdp_forecast = pd.concat([forecast_df[target_col], future_forecast_df[target_col]])

        # Figure 1: actual vs. forecast GDP.
        plt.figure(figsize=(12, 6))
        plt.plot(gdp_actual.index.year, gdp_actual.values, marker='o', label='Actual GDP')
        plt.plot(gdp_forecast.index.year, gdp_forecast.values, marker='o', linestyle='--', label='Forecasted GDP')
        plt.title(f'{country} GDP with VAR Model Predictions and Forecast')
        plt.xlabel('Year')
        plt.ylabel('GDP')
        plt.legend()
        plt.grid(True)
        plt.show()

        # Hold-out errors: actual minus forecast.
        errors = test_data[target_col].values - forecast_df[target_col].values

        # Figure 2: error per hold-out year.
        plt.figure(figsize=(12, 6))
        plt.bar(test_data.index.year, errors)
        plt.title(f'{country} Prediction Errors (Test Data - Predictions)')
        plt.xlabel('Year')
        plt.ylabel('Error')
        plt.grid(True)
        plt.show()

    except Exception as e:
        print(f"{country}: 绘图失败。错误信息：{e}\n")

# Interactive controls: a country picker plus two step-count sliders, all
# sharing the same label width and overall width.
country_dropdown = Dropdown(
    description='Country:',
    options=sorted(countries),
    disabled=False,
    style=dict(description_width='80px'),
    layout=dict(width='300px'),
)

# Size of the hold-out window used for the back-test (1 to 20, default 10).
test_slider = IntSlider(
    description='Test Steps:',
    value=10,
    min=1,
    max=20,
    step=1,
    style=dict(description_width='80px'),
    layout=dict(width='300px'),
)

# Number of steps to forecast beyond the data (1 to 20, default 10).
forecast_slider = IntSlider(
    description='Forecast Steps:',
    value=10,
    min=1,
    max=20,
    step=1,
    style=dict(description_width='80px'),
    layout=dict(width='300px'),
)

# Redraw the plots whenever one of the controls changes.
interact(
    plot_var_forecast,
    country=country_dropdown,
    test_steps=test_slider,
    forecast_steps=forecast_slider,
)

# Export the future GDP forecasts to CSV, reusing the full-sample models
# already stored in `results`.

# One forecast frame per country; concatenated after the loop.
future_forecasts = []

for res in results:
    country, target_col = res['Country'], res['Target_Col']
    model_fitted, data = res['Model'], res['Data']
    best_p = res['Best_p']

    # Forecast horizon: 10 yearly steps.
    steps_ahead = 10
    lag_order = model_fitted.k_ar
    # Seed the forecast with the last `lag_order` observed rows.
    forecast_input = data.values[-lag_order:]
    future_forecast = model_fitted.forecast(y=forecast_input, steps=steps_ahead)

    # Forecast years start the year after the last observation.
    last_year = data.index[-1].year
    future_years = pd.date_range(
        start=pd.Timestamp(last_year + 1, 1, 1), periods=steps_ahead, freq='YS'
    )
    future_forecast_df = pd.DataFrame(future_forecast, index=future_years, columns=data.columns)

    # Keep only GDP, label the columns, convert the dates to integer years
    # and order the columns as (Country, Year, GDP_Forecast).
    gdp_forecast = (
        future_forecast_df[[target_col]]
        .reset_index()
        .set_axis(['Year', 'GDP_Forecast'], axis=1)
        .assign(Country=country, Year=lambda frame: frame['Year'].dt.year)
        [['Country', 'Year', 'GDP_Forecast']]
    )

    future_forecasts.append(gdp_forecast)

# Stack the per-country frames into a single table.
future_forecasts_df = pd.concat(future_forecasts, ignore_index=True)

# Write the forecast GDP values to CSV.
future_forecasts_df.to_csv('VAR_future_gdp_forecasts.csv', index=False)

# Completion message.
print("已完成所有国家未来 10 年 GDP 的预测，结果已保存到 'VAR_future_gdp_forecasts.csv'。")

interactive(children=(Dropdown(description='Country:', layout=Layout(width='300px'), options=('Albania', 'Alge…

已完成所有国家未来 10 年 GDP 的预测，结果已保存到 'VAR_future_gdp_forecasts.csv'。
