In [17]:
import pandas as pd
import numpy as np
# from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
import warnings

# 1. Load data
# The first two CSV columns become a two-level row index; level 1 is the one
# filtered against the variable names listed below.
url = 'data_imputation_full.csv'
data_full = pd.read_csv(url, index_col=[0, 1])

# 2. Variable selection: the GDP target plus its candidate predictors
gdp_model_vars = [
    # target
    'Economics: GDP',
    # electricity indicators
    'Electricity: Distribution Losses',
    'Electricity: Exports',
    'Electricity: Imports',
    'Electricity: Installed Capacity',
    'Electricity: Net Consumption',
    'Electricity: Net Generation',
    'Electricity: Net Imports',
    # population indicators
    'Population: Area (km²)',
    'Population: Density',
    'Population: Growth Rate',
    'Population: Percentage',
    'Population: Population',
]

# Keep only the rows whose second index level is one of the selected variables
gdp_model_data = data_full.loc[data_full.index.get_level_values(1).isin(gdp_model_vars)]

# Without an explicit 'Year' column the frame is first melted to long form:
# stack() moves the column labels into a third index level and the four
# resulting columns are then named by position.
# NOTE(review): this presumes the remaining columns are years — confirm against the CSV.
if 'Year' not in gdp_model_data.columns:
    gdp_model_data = gdp_model_data.stack().reset_index()
    gdp_model_data.columns = ['Country', 'Variable', 'Year', 'Value']

# Reshape to one row per (Country, Year) with one column per variable
gdp_model_data = gdp_model_data.pivot_table(
    values='Value',
    index=['Country', 'Year'],
    columns='Variable',
).reset_index()

# print(gdp_model_data.head())

In [24]:
# Ignore warnings: installs a blanket 'ignore' filter (no category, message or
# module restriction), so every warning raised after this point is suppressed.
# NOTE(review): this also hides diagnostics that may matter for the results,
# e.g. convergence warnings from the model fits below, if any — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

def remove_highly_correlated_features(X, threshold=0.9):
    """
    Drop features that are strongly correlated with an earlier column.

    The absolute pairwise correlation matrix of X is computed and only the
    cells strictly above the diagonal are inspected. A column is dropped when
    its absolute correlation with any column positioned before it in X
    exceeds the threshold, so the first column of a correlated group is kept.

    Parameters:
    - X (pd.DataFrame):  feature table, one column per feature.
    - threshold (float): absolute-correlation cutoff (strictly greater than).

    Returns:
    - X_reduced (pd.DataFrame): X without the dropped columns; X itself is not modified.
    """
    abs_corr = X.corr().abs()

    # True only for cells strictly above the main diagonal, so each pair is
    # examined once and self-correlations are excluded.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper_triangle = abs_corr.where(above_diagonal)

    # NaN cells (masked out or undefined correlations) compare as False.
    redundant = []
    for name in upper_triangle.columns:
        if any(upper_triangle[name] > threshold):
            redundant.append(name)

    return X.drop(columns=redundant)

def select_features_elastic_net(country, data, target_variable='Economics: GDP', cv=5, random_state=42,
                                min_data_points=None, coef_threshold=0.05):
    """
    Select features for one country using ElasticNet regression.

    The country's rows are extracted, rows with a missing feature or a missing
    target are discarded, highly correlated features are pruned, the remaining
    features are standardized and an ElasticNetCV model is fitted. Features
    whose absolute coefficient exceeds `coef_threshold` are reported.

    Parameters:
    - country (str):           the country name
    - data (pd.DataFrame):     the data including all countries; must contain a
                               'Country' column and the target column
    - target_variable (str):   the target variable name
    - cv (int):                the number of cross-validation folds
    - random_state (int):      the random state for ElasticNetCV
    - min_data_points (int):   the minimum number of complete rows required to run
                               the model; None (default) uses `cv` when it is an
                               integer, otherwise 1
    - coef_threshold (float):  a feature is selected when |coefficient| is strictly
                               greater than this value. The features are standardized
                               but the target is not, so the cutoff is expressed in
                               target units per feature standard deviation.

    Returns:
    - (str, list):             a tuple containing the country name and a list of
                               selected features. The list is empty when there is too
                               little data, when no feature survives pruning, or when
                               any exception occurs (the error is printed, not raised).
    """
    try:
        # get the data for the country
        country_data = data[data['Country'] == country]

        # select features and target variable
        X = country_data.drop(columns=['Country', 'Year', target_variable], errors='ignore')
        y = country_data[target_variable]

        # keep only rows where every feature AND the target are present, so the
        # model never sees NaN and X / y stay aligned
        complete_rows = X.notna().all(axis=1) & y.notna()
        X = X.loc[complete_rows]
        y = y.loc[complete_rows]

        # refuse to fit when there are fewer rows than the cross-validation needs
        if min_data_points is None:
            min_data_points = cv if isinstance(cv, (int, np.integer)) else 1
        if len(X) < min_data_points:
            print(f"{country}: only {len(X)} complete rows, at least {min_data_points} required")
            return (country, [])

        # remove highly correlated features
        X_reduced = remove_highly_correlated_features(X, threshold=0.9)

        # check if all features are removed
        if X_reduced.empty:
            print(f"{country}: all features are removed")
            return (country, [])

        # standardize the features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_reduced)

        # fit ElasticNet regression
        elastic_net = ElasticNetCV(
            cv=cv,
            random_state=random_state,
            # max_iter=20000,  # raise max_iter to avoid convergence warning
            # alphas=np.logspace(-6, 0, 100),  # alpha values to search
            # l1_ratio=[0.1, 0.5, 0.9]  # l1 ratio values to search
        ).fit(X_scaled, y)

        # get selected features
        coef = pd.Series(elastic_net.coef_, index=X_reduced.columns)
        selected_features = coef[coef.abs() > coef_threshold].index.tolist()

        return (country, selected_features)

    except Exception as e:
        # Deliberate best-effort: report the problem and return an empty selection
        # (presumably so one failing country does not abort a batch run).
        print(f"Error processing {country}: {e}")
        return (country, [])
    

# Distinct country names present in the reshaped table
countries = gdp_model_data['Country'].unique()

# Run the per-country selection across all available cores (n_jobs=-1);
# each task returns a (country, selected_features) tuple.
results_elastic_net = Parallel(n_jobs=-1)(
    delayed(select_features_elastic_net)(name, gdp_model_data)
    for name in countries
)

# Map each country to its list of selected features
selected_features_dict_elastic_net = {
    name: features for name, features in results_elastic_net
}

# One row per country, one column per selected-feature slot
selected_features_df_elastic_net = pd.DataFrame.from_dict(
    selected_features_dict_elastic_net,
    orient='index',
)
selected_features_df_elastic_net.index.name = 'Country'
selected_features_df_elastic_net.columns = [
    f'Feature_{position}'
    for position, _ in enumerate(selected_features_df_elastic_net.columns, start=1)
]

# Preview the first rows of the result
print("\neach country's selected features:", selected_features_df_elastic_net.head(), sep='\n')

# Persist the full table to CSV
selected_features_df_elastic_net.to_csv('selected_features_per_country_elastic_net.csv')


each country's selected features:
                                            Feature_1  \
Country                                                 
Albania              Electricity: Distribution Losses   
Algeria              Electricity: Distribution Losses   
Angola               Electricity: Distribution Losses   
Antigua and Barbuda  Electricity: Distribution Losses   
Argentina            Electricity: Distribution Losses   

                                           Feature_2  \
Country                                                
Albania                         Electricity: Exports   
Algeria                         Electricity: Exports   
Angola                       Population: Growth Rate   
Antigua and Barbuda  Electricity: Installed Capacity   
Argentina                       Electricity: Exports   

                                        Feature_3  \
Country                                             
Albania                      Electricity: Imports   
Algeria      