
<div style="width: 100%; max-width: 100%; background-color: #fefefe; border: 1px solid #333; border-radius: 10px; padding: 20px; font-family: Arial, sans-serif; box-sizing: border-box;">
  <h3 style="color: #2c3e50; text-align: center;">LASSO Model</h3>
  
  <p style="color: #34495e; line-height: 1.6;">
    Beyond the AR model, I also want to explore other approaches. Given the overfitting problems encountered with the VAR model, I plan to try the Lasso model. Its objective minimizes the RSS plus a penalty on the size of the coefficients, so goodness of fit and regularization are handled in a single optimization. I believe this might be an effective way to address overfitting.
  </p>
  <p style="color: #34495e; line-height: 1.6;">
    Compared to manually adjusting feature thresholds for 170 countries, letting the L1 penalty automatically shrink the coefficients of less significant features (often to exactly zero) is a more efficient approach.
  </p>
  
</div>

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import reduce
from ipywidgets import interact, Dropdown, IntSlider, Layout
import warnings
import os

# Silence every warning for the rest of the run
warnings.filterwarnings('ignore')

# Input tables: the full data set is read with its first two columns as a
# two-level index; the feature-selection table has one row per country.
data_full = pd.read_csv('data_imputation_full.csv', index_col=[0, 1])
selected_features_df = pd.read_csv('selected_features_per_country_elastic_net.csv')
countries = selected_features_df['Country'].unique()

# Reshape the selection table to long format (one row per Country/Feature
# pair) and discard the empty slots
selected_features_long = pd.melt(
    selected_features_df,
    id_vars='Country',
    value_name='Feature',
).dropna()

# Prepare data for each country
def prepare_country_data(country):
    """Assemble a year-indexed table of the selected features for one country.

    Reads the module-level ``selected_features_long`` and ``data_full``.
    'Economics: GDP' is always added to the requested features. Features that
    cannot be extracted are skipped silently.

    Returns:
        A DataFrame with one column per extracted feature and a yearly
        DatetimeIndex sorted ascending, or None when nothing could be
        extracted or any later step fails.
    """
    try:
        is_country = selected_features_long['Country'] == country
        wanted = selected_features_long.loc[is_country, 'Feature'].tolist()
        if 'Economics: GDP' not in wanted:
            wanted.append('Economics: GDP')

        per_feature = []
        for name in wanted:
            try:
                # Cross-section on index level 1, then keep this country's row
                by_country = data_full.xs(name, level=1)
                column = by_country.loc[by_country.index == country].T
                column.columns = [name]
            except Exception:
                # Best effort: a feature that cannot be extracted is left out
                continue
            per_feature.append(column)

        if not per_feature:
            return None

        # Outer-join all single-feature frames on their index
        merged = per_feature[0]
        for column in per_feature[1:]:
            merged = pd.merge(merged, column, left_index=True, right_index=True, how='outer')

        # Index labels are years: sort them numerically, then convert to dates
        merged.index = merged.index.astype(int)
        merged = merged.sort_index()
        merged.index = pd.to_datetime(merged.index.astype(str), format='%Y')
        return merged
    except Exception:
        return None

# Create lagged features
def create_lagged_features(data, max_lag):
    """Build a lagged design matrix and its aligned targets.

    Args:
        data: DataFrame whose rows are consecutive periods.
        max_lag: Largest lag to include; lags 1..max_lag are generated.

    Returns:
        (X, y) where y holds the original columns and X holds the shifted
        copies named '<col>_lag<k>', ordered as all columns at lag 1, then all
        columns at lag 2, and so on. Rows containing any NaN are dropped from
        both.
    """
    base = data.copy()
    names = base.columns
    width = len(names)

    # One shifted copy of the whole frame per lag, with suffixed column names
    lag_blocks = [
        base.shift(k).set_axis([f'{col}_lag{k}' for col in names], axis=1)
        for k in range(1, max_lag + 1)
    ]
    all_lags = pd.concat(lag_blocks, axis=1)

    # Targets first, lags after; dropna removes the warm-up rows
    stacked = pd.concat([base, all_lags], axis=1).dropna()
    y = stacked.iloc[:, :width]
    X = stacked.iloc[:, width:]
    return X, y

# Process each country using MultiTaskLassoCV
def process_country(country):
    """Fit a multi-output Lasso on lagged features for one country and forecast GDP.

    The country's table from ``prepare_country_data`` is turned into a lagged
    design matrix (5 lags), standardized, and fitted with
    ``MultiTaskLassoCV(cv=5)`` on every available row. A 10-step recursive
    forecast is then produced by feeding each prediction back in as the newest
    lag.

    Args:
        country: Country name used to select the data.

    Returns:
        A dict with keys 'Country', 'Model', 'Scaler', 'Lag_Order', 'Data',
        'Target_Col' and 'GDP_Forecast' (a DataFrame with columns Country,
        Year, GDP_Forecast), or None when no data is available or any step
        fails (the error is printed).
    """
    try:
        data = prepare_country_data(country)
        if data is None:
            return None

        max_lag = 5
        X, y = create_lagged_features(data, max_lag)

        # Standardize features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Train MultiTaskLassoCV model
        model = MultiTaskLassoCV(cv=5)
        model.fit(X_scaled, y)

        # Generate future forecasts
        forecast_steps = 10
        n_targets = y.shape[1]

        # X is laid out as [lag1 block | lag2 block | ... | lagN block]. Its
        # last row holds the lags that explain the last observed period, so it
        # cannot be used as-is for the first unobserved period: the newest
        # observation must become lag 1 and the oldest block must drop out.
        lagged_vars = np.concatenate([
            y.iloc[-1].to_numpy(dtype=float),
            X.iloc[-1].to_numpy(dtype=float)[:-n_targets],
        ])

        forecasts = []
        for _ in range(forecast_steps):
            lagged_vars_scaled = scaler.transform(lagged_vars.reshape(1, -1))
            y_pred = model.predict(lagged_vars_scaled)
            forecasts.append(y_pred[0])
            # The prediction becomes the new lag 1; older blocks shift back
            lagged_vars = np.concatenate([y_pred[0], lagged_vars[:-n_targets]])

        forecasts = np.array(forecasts).reshape(-1, n_targets)
        future_years = pd.date_range(start=pd.Timestamp(data.index[-1].year + 1, 1, 1), periods=forecast_steps, freq='YS')
        forecast_df = pd.DataFrame(forecasts, index=future_years, columns=y.columns)

        # Extract GDP forecast
        target_col = 'Economics: GDP' if 'Economics: GDP' in data.columns else 'GDP'
        gdp_forecast = forecast_df[[target_col]].reset_index()
        gdp_forecast.columns = ['Year', 'GDP_Forecast']
        gdp_forecast['Country'] = country
        gdp_forecast['Year'] = gdp_forecast['Year'].dt.year
        gdp_forecast = gdp_forecast[['Country', 'Year', 'GDP_Forecast']]

        return {
            'Country': country,
            'Model': model,
            'Scaler': scaler,
            'Lag_Order': max_lag,
            'Data': data,
            'Target_Col': target_col,
            'GDP_Forecast': gdp_forecast
        }
    except Exception as e:
        print(f"{country}: Error in process_country: {e}")
        return None

# Use parallel processing to handle all countries
def process_all_countries(countries, max_workers=8):
    """Run ``process_country`` for every country on a thread pool.

    Args:
        countries: Iterable of country names.
        max_workers: Size of the thread pool.

    Returns:
        List of the non-None results, in the order the jobs finished.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        jobs = [pool.submit(process_country, name) for name in countries]
        # as_completed yields each job as soon as it is done
        outcomes = [job.result() for job in as_completed(jobs)]
    return [outcome for outcome in outcomes if outcome is not None]

# Process countries in parallel
results = process_all_countries(countries, max_workers=8)

# Combine GDP forecasts. pd.concat raises ValueError on an empty list, so fall
# back to an empty frame with the same columns when no country was processed.
future_forecasts = [res['GDP_Forecast'] for res in results]
if future_forecasts:
    future_forecasts_df = pd.concat(future_forecasts, ignore_index=True)
else:
    future_forecasts_df = pd.DataFrame(columns=['Country', 'Year', 'GDP_Forecast'])
# future_forecasts_df.to_csv('future_GDP_forecasts.csv', index=False)
# print("Saved as 'future_GDP_forecasts.csv'")

# Define a function to plot results
def plot_var_forecast(country, forecast_steps=10):
    """Plot GDP history, late-sample predictions and a recursive forecast.

    Looks the country up in the module-level ``results`` list, rebuilds the
    lagged design matrix with the stored lag order, and draws two figures:
    the target series (first 70% / last 30% of rows, model predictions on the
    last 30%, and the future forecast), followed by a bar chart of the
    prediction errors on the last 30%.

    Args:
        country: Country name as stored under the 'Country' key of a result.
        forecast_steps: Number of yearly steps to forecast past the data.

    Returns:
        None. Failures are reported with print instead of being raised.
    """
    try:
        country_result = next((res for res in results if res['Country'] == country), None)
        if country_result is None:
            print(f"{country}: Not found in results.")
            return

        data = country_result['Data']
        target_col = country_result['Target_Col']
        model = country_result['Model']
        scaler = country_result['Scaler']
        max_lag = country_result['Lag_Order']

        X, y = create_lagged_features(data, max_lag)
        train_size = int(len(X) * 0.7)
        X_test = X.iloc[train_size:]
        y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

        # NOTE(review): the stored model and scaler come from process_country,
        # which fits on every row, so the "test" predictions below are
        # in-sample rather than a true hold-out evaluation.
        y_pred = model.predict(scaler.transform(X_test))
        target_idx = y.columns.get_loc(target_col)

        # Forecast future values. X is laid out as [lag1 | lag2 | ... | lagN];
        # for the first unobserved period the newest observation becomes
        # lag 1 and the oldest block is dropped.
        n_targets = y.shape[1]
        lagged_vars = np.concatenate([
            y.iloc[-1].to_numpy(dtype=float),
            X.iloc[-1].to_numpy(dtype=float)[:-n_targets],
        ])

        forecasts = []
        for _ in range(forecast_steps):
            y_future_pred = model.predict(scaler.transform(lagged_vars.reshape(1, -1)))
            forecasts.append(y_future_pred[0])
            # Feed the prediction back in as lag 1 for the next step
            lagged_vars = np.concatenate([y_future_pred[0], lagged_vars[:-n_targets]])

        # reshape keeps the array two-dimensional even when it is empty
        forecasts = np.array(forecasts).reshape(-1, n_targets)
        future_years = pd.date_range(start=pd.Timestamp(data.index[-1].year + 1, 1, 1), periods=forecast_steps, freq='YS')
        future_forecast_df = pd.DataFrame(forecasts, index=future_years, columns=y.columns)

        # Plot GDP over time
        plt.figure(figsize=(12, 6))
        plt.plot(y_train.index.year, y_train[target_col], marker='o', label='Train Data')
        plt.plot(y_test.index.year, y_test[target_col], marker='o', label='Test Data')
        plt.plot(y_test.index.year, y_pred[:, target_idx], marker='o', linestyle='--', label='Predictions')
        plt.plot(future_forecast_df.index.year, future_forecast_df[target_col], marker='o', linestyle='--', label='Forecast')
        plt.title(f'{country} GDP with LASSO Model Predictions and Forecast')
        plt.xlabel('Year')
        plt.ylabel('GDP')
        plt.legend()
        plt.grid(True)
        plt.show()

        errors = y_test[target_col].values - y_pred[:, target_idx]

        # Plot prediction errors
        plt.figure(figsize=(12, 6))
        plt.bar(y_test.index.year, errors, color='orange')
        plt.title(f'{country} Prediction Errors (Test Data - Predictions)')
        plt.xlabel('Year')
        plt.ylabel('Error')
        plt.grid(True)
        plt.show()

    except Exception as e:
        print(f"{country}: Plot failed. Error: {e}\n")

# Interactive plot
default_country = 'United States'
default_forecast_steps = 5

# Dropdown rejects an initial value that is not one of its options, so fall
# back to the first available country when the preferred default is absent.
country_options = sorted(countries)
if default_country in country_options:
    initial_country = default_country
elif country_options:
    initial_country = country_options[0]
else:
    initial_country = None

country_dropdown = Dropdown(
    options=country_options,
    description='Country:',
    value=initial_country,
    style={'description_width': '120px'},
    layout=Layout(width='400px', margin='0 0 0 100px'),
    disabled=False
)

forecast_slider = IntSlider(
    min=1,
    max=10,
    step=1,
    value=default_forecast_steps,
    description='Forecast Steps:',
    style={'description_width': '120px'},
    layout=Layout(width='400px', margin='0 0 0 100px')
)
interact(plot_var_forecast, country=country_dropdown, forecast_steps=forecast_slider)

# Save all data in the 'Lasso' folder for Streamlit visualization.
# exist_ok=True keeps a re-run from failing when the folder already exists.
output_folder = 'Lasso'
os.makedirs(output_folder, exist_ok=True)

def save_plot_data(country_result, forecast_steps):
    """Write the CSV files behind one country's plots into ``output_folder``.

    Five files named 'data_<Country>_<suffix>.csv' are written: the original
    table, the first 70% and last 30% of the lagged rows (features and
    targets side by side), the model's predictions on the last 30%, and a
    recursive forecast of ``forecast_steps`` yearly steps.

    Args:
        country_result: Dict with the keys 'Country', 'Data', 'Target_Col',
            'Model', 'Scaler' and 'Lag_Order'.
        forecast_steps: Number of yearly steps to forecast past the data.

    Returns:
        None. Failures are reported with print instead of being raised.
    """
    try:
        data = country_result['Data']
        target_col = country_result['Target_Col']
        model = country_result['Model']
        scaler = country_result['Scaler']
        max_lag = country_result['Lag_Order']
        country_name = country_result["Country"]

        def out_path(suffix):
            # All exports share the 'data_<Country>_<suffix>.csv' pattern
            return os.path.join(output_folder, f'data_{country_name}_{suffix}.csv')

        X, y = create_lagged_features(data, max_lag)
        train_size = int(len(X) * 0.7)
        X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
        y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

        # standardize features and predict the later rows
        y_pred = model.predict(scaler.transform(X_test))

        # save original data
        data.to_csv(out_path('original'))

        # save train and test data
        pd.concat([X_train, y_train], axis=1).to_csv(out_path('train'))
        pd.concat([X_test, y_test], axis=1).to_csv(out_path('test'))

        # save predictions
        y_test_pred_df = pd.DataFrame(y_pred, index=y_test.index, columns=y.columns)
        y_test_pred_df.to_csv(out_path('predictions'))

        # generate future forecasts. X is laid out as [lag1 | lag2 | ... |
        # lagN]; for the first unobserved period the newest observation
        # becomes lag 1 and the oldest block is dropped.
        n_targets = y.shape[1]
        lagged_vars = np.concatenate([
            y.iloc[-1].to_numpy(dtype=float),
            X.iloc[-1].to_numpy(dtype=float)[:-n_targets],
        ])

        forecasts = []
        for _ in range(forecast_steps):
            y_future_pred = model.predict(scaler.transform(lagged_vars.reshape(1, -1)))
            forecasts.append(y_future_pred[0])
            # Feed the prediction back in as lag 1 for the next step
            lagged_vars = np.concatenate([y_future_pred[0], lagged_vars[:-n_targets]])

        # reshape keeps the array two-dimensional even when it is empty
        forecasts = np.array(forecasts).reshape(-1, n_targets)
        future_years = pd.date_range(start=pd.Timestamp(data.index[-1].year + 1, 1, 1), periods=forecast_steps, freq='YS')
        future_forecast_df = pd.DataFrame(forecasts, index=future_years, columns=y.columns)
        future_forecast_df.to_csv(out_path('future_forecast'))

        # print(f"All data for {country_result['Country']} saved successfully.")
    except Exception as e:
        print(f"{country_result['Country']}: Error in saving plot data: {e}")

# Export the CSV files for every entry in results, using a 10-step forecast.
for res in results:
    save_plot_data(res, forecast_steps=10)

interactive(children=(Dropdown(description='Country:', index=160, layout=Layout(margin='0 0 0 100px', width='4…