In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact, Dropdown
from IPython.display import display
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error, mean_absolute_error
from joblib import Parallel, delayed
import warnings
import joblib

# 1. Load data
data_full = pd.read_csv('data_imputation_full.csv', index_col=[0, 1])

gdp_data = data_full.xs('Economics: GDP', level=1)

# gdp_data.head()

# reshape data
gdp_long = gdp_data.reset_index().melt(id_vars=['Country'], var_name='Year', value_name='GDP')
gdp_long['Year'] = gdp_long['Year'].astype(int)

# gdp_long.head(50)

In [11]:
# ignore warnings
warnings.filterwarnings('ignore')

# define ADF test function
def adf_test(series, country):
    result = adfuller(series, autolag='AIC')
    print(f'--- {country} ---')
    print(f'ADF Statistic: {result[0]:.4f}')
    print(f'p-value: {result[1]:.4f}')
    for key, value in result[4].items():
        print(f'Critical Value {key}: {value:.4f}')
    if result[1] < 0.05:
        print("Conclusion: The series is stationary\n")
    else:
        print("Conclusion: The series is non-stationary\n")

# define function to difference series
def difference_series(series, order=1):
    return pd.Series(series).diff(order).dropna().values

# define the AR model training function
def train_ar_model(country, data):
    try:
        country_data = data.loc[country].sort_index(level='Year')
        
        # get GDP differences
        gdp_diff = country_data['GDP_diff'].dropna().values
        
        # set the maximum number of lags
        model = AutoReg(gdp_diff, lags=None, old_names=False).fit()
        print(f"{country}: AR model trained successfully with lags = {model.model.lags}\n")
        
        # return country, model, and GDP differences
        return (country, model, gdp_diff)
    
    except Exception as e:
        print(f"{country}: AR model training failed. Error: {e}\n")
        return (country, None, None)

# define the AR model evaluation function
def evaluate_ar_model(country, model, gdp_diff):
    try:
        # define train and test sets
        train_size = int(len(gdp_diff) * 0.8)
        train, test = gdp_diff[:train_size], gdp_diff[train_size:]
        
        # retrain the model
        model_train = AutoReg(train, lags=model.model.model.lags, old_names=False).fit()
        
        # make predictions
        predictions = model_train.predict(start=train_size, end=len(gdp_diff)-1, dynamic=False)
        
        # evaluate the model
        mse = mean_squared_error(test, predictions)
        mae = mean_absolute_error(test, predictions)
        r2 = model_train.rsquared
        
        print(f"{country} - evaluation results: MSE = {mse:.4f}, MAE = {mae:.4f}, R² = {r2:.4f}\n")
        
        # return evaluation results
        return (country, {'MSE': mse, 'MAE': mae, 'R2': r2})
    
    except Exception as e:
        print(f"{country}: Evaluation failed. Error: {e}\n")
        return (country, {'MSE': None, 'MAE': None, 'R2': None})

# define the AR model forecasting function
def forecast_ar_model(country, model, gdp_diff, forecast_steps=5):
    try:
        # make predictions
        forecast = model.predict(start=len(gdp_diff), end=len(gdp_diff) + forecast_steps - 1, dynamic=False)
        print(f"{country} - The next {forecast_steps} years forecast: {forecast}\n")
        
        # return country and forecast
        return (country, forecast)
    
    except Exception as e:
        print(f"{country}: prediction failed. Error: {e}\n")
        return (country, None)

# define the function to revert the differenced forecast to original GDP values
def revert_diff(country, forecast_diff, data):
    try:
        # get the last known GDP value
        country_data = data.loc[country].sort_index(level='Year')
        last_gdp = country_data['GDP'].iloc[-1]
        
        # add the differences to the last known GDP value
        forecast_gdp = []
        current_gdp = last_gdp
        for diff in forecast_diff:
            current_gdp += diff
            forecast_gdp.append(current_gdp)
        
        return (country, forecast_gdp)
    
    except Exception as e:
        print(f"{country}: difference reversion failed. Error: {e}\n")
        return (country, None)


# add GDP differences to the DataFrame
gdp_long['GDP_diff'] = gdp_long.groupby('Country')['GDP'].transform(lambda x: x.diff())

# set the index
gdp_long.set_index(['Country', 'Year'], inplace=True)

# get a list of countries
countries = gdp_long.index.get_level_values('Country').unique()

# train AR models
results = Parallel(n_jobs=-1)(
    delayed(train_ar_model)(country, gdp_long) for country in countries
)

# get AR models and GDP differences
ar_models = {}
gdp_diff_dict = {}

for result in results:
    country, model, gdp_diff = result
    if model is not None:
        ar_models[country] = model
        gdp_diff_dict[country] = gdp_diff

# evaluate AR models
evaluation_results = Parallel(n_jobs=-1)(
    delayed(evaluate_ar_model)(country, ar_models[country], gdp_diff_dict[country]) for country in ar_models.keys()
)

# save evaluation results
evaluation_dict = {country: metrics for country, metrics in evaluation_results}

# transform evaluation results to DataFrame
evaluation_df = pd.DataFrame.from_dict(evaluation_dict, orient='index')
evaluation_df.to_csv('AR_model_evaluation.csv')
print("\nSave as 'AR_model_evaluation.csv'.")

# predict the next 5 years
forecast_results = Parallel(n_jobs=-1)(
    delayed(forecast_ar_model)(country, ar_models[country], gdp_diff_dict[country], forecast_steps=5) for country in ar_models.keys()
)

# save the forecasts
forecast_dict = {country: forecast for country, forecast in forecast_results if forecast is not None}

# revert the differenced forecasts to original GDP values
reverted_forecast_results = Parallel(n_jobs=-1)(
    delayed(revert_diff)(country, forecast_dict[country], gdp_long) for country in forecast_dict.keys()
)

# save the reverted forecasts
reverted_forecast_dict = {country: gdp for country, gdp in reverted_forecast_results if gdp is not None}

# transform the reverted forecasts to DataFrame
reverted_forecast_df = pd.DataFrame(reverted_forecast_dict).T
reverted_forecast_df.columns = [f'Forecast_GDP_{i+1}' for i in range(reverted_forecast_df.shape[1])]
reverted_forecast_df.to_csv('AR_model_forecasts_original_gdp.csv')
print("\nSave as 'AR_model_forecasts_original_gdp.csv'.")

# save the AR model summary
for country, model in ar_models.items():
    summary = model.summary()
    with open(f'AR_model_summary_{country}.txt', 'w') as f:
        f.write(str(summary))
    print(f"{country}: 模型摘要已保存为 'AR_model_summary_{country}.txt'。")

# save the AR models
for country, model in ar_models.items():
    joblib.dump(model, f'AR_model_{country}.pkl')
    print(f"{country}: model saved as 'AR_model_{country}.pkl'.")

# define the function to plot GDP time series
def plot_gdp_enhanced(country):
    try:

        country_data = gdp_long.loc[country].sort_index()
        
        plt.figure(figsize=(12, 6))
        sns.lineplot(x=country_data.index, y='GDP', data=country_data, marker='o', label='GDP')
        
        # add the trend line
        sns.regplot(x=country_data.index, y='GDP', data=country_data, scatter=False, label='Trend Line', color='red')
        
        plt.title(f'{country} GDP Data')
        plt.xlabel('Year')
        plt.ylabel('GDP')
        plt.legend()
        plt.grid(True)
        plt.show()
        
        last_year = country_data.index.max()
        last_gdp = country_data.loc[last_year, 'GDP']
        print(f"The last known GDP value for {country} is {last_gdp:.2f} in {last_year}.\n")
    
    except Exception as e:
        print(f"{country}: plot failed. Error: {e}\n")

# interact with the plot function
interact(plot_gdp_enhanced, country=Dropdown(options=sorted(countries), description='国家:', disabled=False))

Albania: AR model training failed. Error: 'AutoReg' object has no attribute 'lags'

Algeria: AR model training failed. Error: 'AutoReg' object has no attribute 'lags'

Angola: AR model training failed. Error: 'AutoReg' object has no attribute 'lags'

Antigua and Barbuda: AR model training failed. Error: 'AutoReg' object has no attribute 'lags'

Argentina: AR model training failed. Error: 'AutoReg' object has no attribute 'lags'

Armenia: AR model training failed. Error: 'AutoReg' object has no attribute 'lags'

Aruba: AR model training failed. Error: 'AutoReg' object has no attribute 'lags'

Australia: AR model training failed. Error: 'AutoReg' object has no attribute 'lags'

Austria: AR model training failed. Error: 'AutoReg' object has no attribute 'lags'

Azerbaijan: AR model training failed. Error: 'AutoReg' object has no attribute 'lags'

Bahamas: AR model training failed. Error: 'AutoReg' object has no attribute 'lags'

Bahrain: AR model training failed. Error: 'AutoReg' object h

interactive(children=(Dropdown(description='国家:', options=('Albania', 'Algeria', 'Angola', 'Antigua and Barbud…

<function __main__.plot_gdp_enhanced(country)>

In [12]:
# # 2. visualize GDP data
# warnings.filterwarnings('ignore')

# gdp_long['GDP_diff'] = gdp_long.groupby('Country')['GDP'].transform(lambda x: x.diff())

# gdp_long.set_index(['Country', 'Year'], inplace=True)

# # countries = gdp_long['Country'].unique()

# def plot_gdp(country):
    
#     country_data = gdp_long.loc[country].sort_index()
    
#     plt.figure(figsize=(12, 6))
#     sns.lineplot(x=country_data.index, y='GDP', data=country_data, marker='o')
#     plt.title(f'{country} GDP Data')
#     plt.xlabel('Year')
#     plt.ylabel('GDP')
#     plt.grid(True)
#     plt.show()

# countries = gdp_long.index.get_level_values('Country').unique()

# interact(plot_gdp, country=Dropdown(options=sorted(countries), description='Country:', disabled=False))

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from joblib import Parallel, delayed
import joblib
import warnings
from ipywidgets import interact, Dropdown
from IPython.display import display
import os
import shutil

# 忽略警告以保持输出整洁
warnings.filterwarnings('ignore')

# 1. Load data
data_full = pd.read_csv('data_imputation_full.csv', index_col=[0, 1])

# 提取 'Economics: GDP' 相关的数据
gdp_data = data_full.xs('Economics: GDP', level=1)

# Reshape data from wide to long format
gdp_long = gdp_data.reset_index().melt(id_vars=['Country'], var_name='Year', value_name='GDP')
gdp_long['Year'] = gdp_long['Year'].astype(int)

# 2. Define AR model functions
def fit_ar_p(data, p):
    """
    拟合AR(p)模型并返回模型参数。

    参数:
    data (array-like): GDP时间序列数据。
    p (int): 滞后阶数。

    返回:
    beta (numpy.ndarray): 回归系数（包括截距）。
    A (numpy.ndarray): 特征矩阵。
    b (numpy.ndarray): 目标向量。
    """
    N = len(data)
    if N <= p:
        raise ValueError("数据长度必须大于滞后阶数 p")
    
    # 构建特征矩阵 A 和目标向量 b
    A = np.ones((N - p, p + 1))  # 第一列为截距项
    for i in range(p):
        A[:, i + 1] = data[p - i - 1:N - i - 1]
    b = data[p:]
    
    # 使用最小二乘法计算回归系数
    beta = np.linalg.inv(A.T @ A) @ A.T @ b
    
    print("Shape of A (lag matrix):", A.shape)
    print("Shape of b (target vector):", b.shape)
    
    return beta, A, b

def predict_ar_p(history, beta, p):
    """
    使用AR(p)模型进行预测。

    参数:
    history (list or array-like): 最近的p个GDP值。
    beta (numpy.ndarray): 回归系数（包括截距）。
    p (int): 滞后阶数。

    返回:
    prediction (float): 预测的GDP值。
    """
    if len(history) < p:
        raise ValueError("历史数据长度必须大于或等于滞后阶数 p")
    
    # 构建特征向量（包括截距项）
    feature = np.ones(p + 1)
    feature[1:] = history[-1:-p-1:-1]  # 最近的p个值，倒序
    prediction = np.dot(beta, feature)
    return prediction

# 3. Train AR models and select best p
def train_and_select_p(country, df, p_values, train_ratio=0.7):
    """
    为指定国家训练AR模型，选择最佳的滞后阶数 p。

    参数:
    country (str): 国家名称。
    df (DataFrame): 包含所有国家GDP数据的数据框。
    p_values (list): 要尝试的滞后阶数列表。
    train_ratio (float): 训练集比例。

    返回:
    dict: 包含训练结果和最佳 p 值的字典。
    """
    try:
        # 提取该国家的数据并按年份排序
        country_df = df[df['Country'] == country].sort_values('Year')
        gdp = country_df['GDP'].values
        
        # 确保有足够的数据点
        if len(gdp) < max(p_values) + 1:
            print(f"{country}: 数据点不足（{len(gdp)}），跳过AR模型训练。\n")
            return {
                'Country': country,
                'Best_p': None,
                'MSE': None,
                'MAE': None,
                'R2': None,
                'Beta': None,
                'Train': None,
                'Test': None,
                'Predictions': None
            }
        
        # 划分训练集和测试集
        train_size = int(len(gdp) * train_ratio)
        train, test = gdp[:train_size], gdp[train_size:]
        
        best_p = None
        best_mse = float('inf')
        best_beta = None
        best_predictions = None
        
        for p in p_values:
            if len(train) <= p:
                print(f"{country}: p={p} 不适用，数据点不足。\n")
                continue
            try:
                # 拟合AR(p)模型
                beta, A, b = fit_ar_p(train, p)
                
                # 进行预测
                predictions = []
                history = list(train)
                for _ in range(len(test)):
                    pred = predict_ar_p(history, beta, p)
                    predictions.append(pred)
                    history.append(pred)
                
                # 计算MSE
                mse = mean_squared_error(test, predictions)
                
                # 选择最佳 p
                if mse < best_mse:
                    best_mse = mse
                    best_p = p
                    best_beta = beta
                    best_predictions = predictions
            except Exception as e:
                print(f"{country}: p={p} 模型训练或预测失败。错误信息: {e}\n")
                continue
        
        # 计算其他评估指标
        if best_p is not None:
            mae = mean_absolute_error(test, best_predictions)
            r2 = r2_score(test, best_predictions)
        else:
            mae = None
            r2 = None
        
        return {
            'Country': country,
            'Best_p': best_p,
            'MSE': best_mse if best_p is not None else None,
            'MAE': mae,
            'R2': r2,
            'Beta': best_beta,
            'Train': train,
            'Test': test,
            'Predictions': best_predictions
        }
    
    except Exception as e:
        print(f"{country}: AR模型训练失败。错误信息: {e}\n")
        return {
            'Country': country,
            'Best_p': None,
            'MSE': None,
            'MAE': None,
            'R2': None,
            'Beta': None,
            'Train': None,
            'Test': None,
            'Predictions': None
        }

# 定义要尝试的滞后阶数
p_values = list(range(1, 19))  # 尝试 p=1 到 p=60

# 获取所有国家的列表
countries = gdp_long['Country'].unique()

# 并行训练AR模型
results = Parallel(n_jobs=-1)(
    delayed(train_and_select_p)(country, gdp_long, p_values) for country in countries
)

# 将结果转换为 DataFrame
results_df = pd.DataFrame(results)

# 查看评估指标
print(results_df[['Country', 'Best_p', 'MSE', 'MAE', 'R2']].head())

# 过滤成功训练的模型
evaluation_df = results_df.dropna(subset=['Beta', 'Best_p']).reset_index(drop=True)

# 查看统计描述
print(evaluation_df[['MSE', 'MAE', 'R2']].describe())

# 保存评估结果
evaluation_df.to_csv('Manual_AR_model_evaluation.csv', index=False)
print("模型评估结果已保存到 'Manual_AR_model_evaluation.csv'。")

Shape of A (lag matrix): (20, 2)
Shape of b (target vector): (20,)
Shape of A (lag matrix): (19, 3)
Shape of b (target vector): (19,)
Shape of A (lag matrix): (18, 4)
Shape of b (target vector): (18,)
Shape of A (lag matrix): (17, 5)
Shape of b (target vector): (17,)
Shape of A (lag matrix): (16, 6)
Shape of b (target vector): (16,)
Shape of A (lag matrix): Shape of A (lag matrix): (20, 2)(15, 7)

Shape of b (target vector):Shape of b (target vector):  (20,)(15,)

Shape of A (lag matrix): (14, 8)
Shape of b (target vector): (14,)
Shape of A (lag matrix): (19, 3)
Shape of b (target vector): (19,)
Shape of A (lag matrix): (13, 9)
Shape of b (target vector): (13,)
Shape of A (lag matrix): (18, 4)
Shape of b (target vector): (18,)
Shape of A (lag matrix): (12, 10)
Shape of b (target vector): (12,)
Shape of A (lag matrix): (17, 5)
Shape of b (target vector): (17,)
Shape of A (lag matrix): (11, 11)
Shape of b (target vector): (11,)
Shape of A (lag matrix): (16, 6)
Shape of b (target vector):

  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)
  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)
  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)
  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)
  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)


In [33]:
def forecast_future_gdp_manual(country, df, beta, p, steps=5):
    """
    使用训练好的AR(p)模型预测未来的GDP。

    参数:
    country (str): 国家名称。
    df (DataFrame): 包含该国家的GDP数据的数据框。
    beta (numpy.ndarray): AR模型参数（包括截距）。
    p (int): 滞后阶数。
    steps (int): 预测的未来年份数。

    返回:
    list: 预测的GDP值。
    """
    try:
        # 提取该国家的数据并排序
        country_df = df[df['Country'] == country].sort_values('Year')
        gdp = country_df['GDP'].values.tolist()
        
        predictions = []
        for _ in range(steps):
            # 使用最近的 p 个GDP值进行预测
            history = gdp[-p:]
            pred = predict_ar_p(history, beta, p)
            predictions.append(pred)
            gdp.append(pred)  # 将预测值加入历史数据
        
        return predictions
    
    except Exception as e:
        print(f"{country}: GDP预测失败。错误信息: {e}")
        return [None]*steps

def perform_forecast(row, df, steps=5):
    """
    为指定国家进行GDP预测。

    参数:
    row (Series): 包含国家训练结果的数据行。
    df (DataFrame): 包含所有国家GDP数据的数据框。
    steps (int): 预测的未来年份数。

    返回:
    dict: 包含国家名称和预测的GDP列表的字典。
    """
    country = row['Country']
    beta = row['Beta']
    p = row['Best_p']
    
    if beta is None or p is None:
        return {
            'Country': country,
            'Forecast_GDP': [None]*steps
        }
    
    forecast = forecast_future_gdp_manual(country, df, beta, p, steps)
    return {
        'Country': country,
        'Forecast_GDP': forecast
    }

# 并行进行预测
forecast_steps = 5

forecast_results = Parallel(n_jobs=-1)(
    delayed(perform_forecast)(row, gdp_long, steps=forecast_steps) for index, row in evaluation_df.iterrows()
)

# 将预测结果转换为 DataFrame
forecast_df = pd.DataFrame(forecast_results)

# 展开预测列表为多个列
forecast_expanded = pd.DataFrame(forecast_df['Forecast_GDP'].tolist(),
                                 columns=[f'Forecast_GDP_{i+1}' for i in range(forecast_steps)])

forecast_final = pd.concat([forecast_df['Country'], forecast_expanded], axis=1)

# 查看预测结果
print(forecast_final.head())

# 保存预测结果
forecast_final.to_csv('Manual_AR_model_forecasts.csv', index=False)
print("预测结果已保存到 'Manual_AR_model_forecasts.csv'。")

               Country  Forecast_GDP_1  Forecast_GDP_2  Forecast_GDP_3  \
0              Albania    1.521374e+10    1.558044e+10    1.594588e+10   
1              Algeria    1.633721e+11    1.517289e+11    1.345880e+11   
2               Angola    6.807423e+10    7.901673e+10    9.136311e+10   
3  Antigua and Barbuda    1.431683e+09    1.575690e+09    1.653545e+09   
4            Argentina    3.917863e+11    3.966993e+11    4.019011e+11   

   Forecast_GDP_4  Forecast_GDP_5  
0    1.630407e+10    1.665429e+10  
1    1.577327e+11    1.726633e+11  
2    1.052935e+11    1.210110e+11  
3    1.648470e+09    1.582196e+09  
4    4.068982e+11    4.115259e+11  
预测结果已保存到 'Manual_AR_model_forecasts.csv'。


In [34]:
import os
import shutil

# 创建模型保存文件夹
model_dir = 'Manual_AR_models'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# 移动所有模型文件到模型文件夹
for country in evaluation_df['Country']:
    model_filename = f'AR_model_{country}.pkl'
    original_path = f'AR_model_{country}.pkl'
    new_path = os.path.join(model_dir, model_filename)
    if os.path.exists(original_path):
        shutil.move(original_path, new_path)

In [35]:
from ipywidgets import interact, Dropdown

def plot_gdp_with_forecast_manual(country):
    try:
        # 提取历史GDP数据
        country_history = gdp_long[gdp_long['Country'] == country].sort_values('Year')
        years_history = country_history['Year']
        gdp_history = country_history['GDP']
        
        # 提取预测GDP数据
        country_forecast = forecast_final[forecast_final['Country'] == country]
        if country_forecast.empty:
            print(f"{country} 没有预测数据。")
            return
        forecast_years = list(range(years_history.max() + 1, years_history.max() + 1 + forecast_steps))
        gdp_forecast = country_forecast.iloc[0][1:].values
        
        # 创建图表
        plt.figure(figsize=(12, 6))
        sns.lineplot(x=years_history, y=gdp_history, marker='o', label='历史GDP')
        sns.lineplot(x=forecast_years, y=gdp_forecast, marker='o', linestyle='--', label='预测GDP')
        plt.title(f'{country} GDP 历史与预测')
        plt.xlabel('Year')
        plt.ylabel('GDP')
        plt.legend()
        plt.grid(True)
        plt.show()
        
        # 显示最后一个已知GDP值
        last_year = years_history.max()
        last_gdp = gdp_history.iloc[-1]
        print(f"最后已知年份：{last_year}，GDP：{last_gdp:.2f}")
        
        # 显示预测GDP值
        for i, year in enumerate(forecast_years):
            print(f"预测年份：{year}，GDP：{gdp_forecast[i]:.2f}")
    
    except Exception as e:
        print(f"{country}: 绘图失败。错误信息: {e}\n")

# 创建交互式下拉菜单
interact(plot_gdp_with_forecast_manual, country=Dropdown(options=sorted(countries), description='国家:', disabled=False))

interactive(children=(Dropdown(description='国家:', options=('Albania', 'Algeria', 'Angola', 'Antigua and Barbud…

<function __main__.plot_gdp_with_forecast_manual(country)>