In [1]:
from collections import Counter
from keras.src.optimizers import Adam, SGD, RMSprop, Adagrad, Adadelta
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, GridSearchCV
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
import gc
import matplotlib.pyplot as plt
import numpy as np
warnings.filterwarnings('ignore')
import tensorflow as tf
from tuning import hyperparameter_tuning

# Hyperparameter search space. Each key maps to the candidate values that are
# combined into the tuning grid.
param_grid = {
    'look_back': [1, 2, 3],
    'epochs': [10, 32],
    'learning_rate': [0.001, 0.01, 0.1],
    'optimizer': ['adam', 'rmsprop'],
    'neurons': [32, 64, 128, 256],
    'dropout_rate': [0.1, 0.2, 0.3],
}

# One seed shared by NumPy and TensorFlow so that runs are repeatable.
SEED = 84
np.random.seed(SEED)
tf.random.set_seed(SEED)

# NOTE(review): the thread count equals the seed value, which looks like a
# copy-paste slip -- confirm that 84 intra-op threads is really intended.
tf.config.threading.set_intra_op_parallelism_threads(84)

# Feature columns that are standardised and fed to the model.
scaled_cols = [
    'GDP (current LCU) [NY.GDP.MKTP.CN]',
    'Current health expenditure per capita (current US$) [SH.XPD.CHEX.PC.CD]',
    'Life expectancy at birth, total (years) [SP.DYN.LE00.IN]',
    'Urban population (% of total population) [SP.URB.TOTL.IN.ZS]',
    'Political Stability and Absence of Violence/Terrorism: Estimate [PV.EST]',
    'Government Effectiveness: Estimate [GE.EST]',
    'Control of Corruption: Estimate [CC.EST]',
    'Population density [EN.POP.DNST]',
]

num_features = len(scaled_cols)

def create_dataset(dataset, look_back=2, feature_cols=None,
                   target_col='Population growth (annual %) [SP.POP.GROW]'):
    """Build sliding-window samples for one-step-ahead prediction.

    Each sample consists of ``look_back`` consecutive rows of features; its
    label is the target value of the row immediately after the window.

    Args:
        dataset: DataFrame holding the feature and target columns, one row
            per time step, in chronological order.
        look_back: Number of consecutive rows in each input window.
        feature_cols: Feature column names. Defaults to the module-level
            ``scaled_cols`` list when omitted.
        target_col: Name of the column to predict.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. With at least one window, ``X`` has
        shape ``(n_windows, look_back, n_features)`` and ``y`` has shape
        ``(n_windows,)``. Both are empty when ``dataset`` has no more than
        ``look_back`` rows.
    """
    if feature_cols is None:
        feature_cols = scaled_cols

    # Work on plain arrays: positional slices are half-open, so a window is
    # exactly `look_back` rows. (Label-based `.loc[i:i + look_back]` includes
    # both endpoints, which leaked the target row into its own window.)
    features = dataset[list(feature_cols)].to_numpy()
    targets = dataset[target_col].to_numpy()

    n_windows = len(dataset) - look_back  # non-positive -> no samples
    dataX = [features[i:i + look_back] for i in range(n_windows)]
    dataY = [targets[i + look_back] for i in range(n_windows)]

    return np.array(dataX), np.array(dataY)

def split_data(data, cols_to_scale, test_length):
    """Split ``data`` by year into a training set and two test sets.

    The last ``test_length`` distinct ``Time`` values are held out. The
    earlier ``test_length - test_length // 2`` of them form test set 1 and
    the final ``test_length // 2`` form test set 2. A ``StandardScaler`` is
    fitted on the training rows only and applied to both test sets, which
    are also written to ``test1_scaled.csv`` and ``test2_scaled.csv`` in the
    working directory.

    Args:
        data: DataFrame with a ``Time`` column and the columns to scale.
            Years are taken in order of first appearance, so rows are
            assumed to be ordered chronologically.
        cols_to_scale: Names of the columns to standardise.
        test_length: Number of trailing distinct years to hold out.

    Returns:
        Tuple ``(train_data, test_data1, test_data2)``. The training frame
        is returned unscaled.

    Raises:
        ValueError: If ``test_length`` is negative or leaves no training
            years.
    """
    cols = list(cols_to_scale)
    years = data['Time'].unique()
    n_years = len(years)
    if not 0 <= test_length < n_years:
        raise ValueError(
            f"test_length must be in [0, {n_years - 1}], got {test_length}"
        )

    # Positive indices avoid the `years[-0:]` pitfall: with negative slicing
    # a zero-sized second half selected every year instead of none.
    first_test_start = n_years - test_length
    second_test_start = n_years - test_length // 2

    train_years = years[:first_test_start]
    test_years1 = years[first_test_start:second_test_start]
    test_years2 = years[second_test_start:]

    # Explicit copies so the scaled values are written to independent frames
    # rather than to slices of `data` (chained-assignment hazard).
    train_data = data[data['Time'].isin(train_years)].copy()
    test_data1 = data[data['Time'].isin(test_years1)].copy()
    test_data2 = data[data['Time'].isin(test_years2)].copy()

    # Fit on training rows only so no test statistics leak into the scaler.
    scaler = StandardScaler()
    scaler.fit(train_data[cols])

    # NOTE(review): the training frame is deliberately left unscaled here
    # (its transform was disabled in the original) -- confirm that the
    # downstream tuning code scales it.
    for frame in (test_data1, test_data2):
        if not frame.empty:  # transform rejects zero-row input
            frame[cols] = scaler.transform(frame[cols])

    print("[INFO] Data successfully split into training and testing datasets.")
    test_data1.to_csv("test1_scaled.csv", index=False)
    # Previously written to "test1_scaled.csv", overwriting the first split.
    test_data2.to_csv("test2_scaled.csv", index=False)

    return train_data, test_data1, test_data2


def build_model(learning_rate, num_time_steps, num_features, dropout_rate=0.2, optimizer='adam', neurons=32):
    """Create and compile an LSTM regression network.

    Args:
        learning_rate: Learning rate handed to the optimizer.
        num_time_steps: Length of each input window.
        num_features: Number of features per time step.
        dropout_rate: Dropout fraction applied after the LSTM layer.
        optimizer: One of 'adam', 'sgd', 'rmsprop', 'adagrad', 'adadelta';
            any other value falls back to Adam.
        neurons: Number of units in the LSTM layer.

    Returns:
        A compiled ``Sequential`` model with mean-squared-error loss.
    """
    # Name -> optimizer class; unrecognised names resolve to Adam.
    known_optimizers = (
        ('adam', Adam),
        ('sgd', SGD),
        ('rmsprop', RMSprop),
        ('adagrad', Adagrad),
        ('adadelta', Adadelta),
    )
    optimizer_cls = next(
        (cls for name, cls in known_optimizers if optimizer == name),
        Adam,
    )
    opt = optimizer_cls(learning_rate=learning_rate)

    # LSTM encoder -> dropout -> dense tanh layer -> single linear output.
    layer_stack = [
        LSTM(neurons, input_shape=(num_time_steps, num_features)),
        Dropout(dropout_rate),
        Dense(64, activation='tanh'),
        Dense(1),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer=opt, loss='mse')
    return model


# def hyperparameter_tuning(params):
#     # best_loss = np.inf
#     # best_params = None
#     # best_model = None
#     total_loss = []
#     n_splits = 3
#     train_data = train_scaled
#     look_back, epochs, learning_rate, optimizer, neurons, dropout_rate = params

#     # #Cross-validation
#     for split in range(-n_splits,-1):
#       years = train_data['Time'].unique()

#       #Last year is for validation
#       train_years = years[:split-1]
#       val_years = years[split-look_back:split+1]

#       model = build_model(learning_rate=learning_rate, num_time_steps=look_back,
#                                                   num_features=len(scaled_cols), dropout_rate=dropout_rate,
#                                                   optimizer=optimizer, neurons=neurons)

#       model = fit_model(model, train_data[train_data["Time"].isin(train_years)], look_back=look_back, epochs=epochs, split="train")[0]
#       val_loss = fit_model(model, train_data[train_data["Time"].isin(val_years)], look_back=look_back, epochs=epochs, split="val")[3]
#       total_loss.append(val_loss)

#       print('One iteration', flush=True)

#     result = pd.DataFrame([{'look_back': look_back, 'epochs': epochs,
#                             'learning_rate': learning_rate, 'optimizer': optimizer,
#                             'neurons': neurons, 'dropout_rate': dropout_rate,
#                             'validation_loss': np.mean(total_loss)}])
#     return result

    #   for look_back in param_grid['look_back']:

    #       val_years = years[split-look_back:split+1] #Take two years from train years to make the prediction

    #       for epochs in param_grid['epochs']:
    #           for learning_rate in param_grid['learning_rate']:
    #               for optimizer in param_grid['optimizer']:
    #                   for neurons in param_grid['neurons']:
    #                       for dropout_rate in param_grid['dropout_rate']:
    #                           print(f"Training with: look_back={look_back}, epochs={epochs}, learning_rate={learning_rate}, optimizer={optimizer}, neurons={neurons}, dropout_rate={dropout_rate}")
    #                           model = build_model(learning_rate=learning_rate, num_time_steps=look_back,
    #                                               num_features=len(scaled_cols), dropout_rate=dropout_rate,
    #                                               optimizer=optimizer, neurons=neurons)

    #                           model = fit_model(model, train_data[train_data["Time"].isin(train_years)], look_back=look_back, epochs=epochs, split="train")[0]
    #                           val_loss = fit_model(model, train_data[train_data["Time"].isin(val_years)], look_back=look_back, epochs=epochs, split="val")[3]

    #                           result = pd.DataFrame([{'val_year' : years[split], 'look_back': look_back, 'epochs': epochs,
    #                                                   'learning_rate': learning_rate, 'optimizer': optimizer,
    #                                                   'neurons': neurons, 'dropout_rate': dropout_rate,
    #                                                   'validation_loss': val_loss}])

    #                           result.to_csv("results_file.csv", mode='a', header=False, index=False)

    #                           if val_loss < best_loss:
    #                               best_loss = val_loss
    #                               best_params = {
    #                                   'look_back': look_back,
    #                                   'epochs': epochs,
    #                                   'learning_rate': learning_rate,
    #                                   'optimizer': optimizer,
    #                                   'neurons': neurons,
    #                                   'dropout_rate': dropout_rate
    #                               }
    #                               best_model = model

    #                           print(f"Trial completed with validation loss={val_loss}")

    #   print(f"Best parameters: {best_params} with loss={best_loss}")
    # return best_params, best_model


# def fit_model(model, data, look_back = 2, epochs=1, split="train"):
#     FourD_dataX, FourD_dataY = [], []
#     countries = data['Country Name'].unique()
#     np.random.shuffle(countries)  # Shuffle countries at each epoch
#     total_loss = 0
#     losses = {}
#     all_actuals = []
#     all_predictions = []
#     for country in countries:
#       X_country, y_country = create_dataset(data[data['Country Name'] == country].reset_index(drop=True),
#                                             look_back)
#       FourD_dataX.append(X_country)
#       FourD_dataY.append(y_country)

#       if len(X_country) == 0:
#           print(f"Skipping country {country} due to insufficient data.")
#           continue
#       if split == "train":
#         model.fit(X_country, y_country, epochs=epochs, verbose=0)
#       if split == "val" or split == "test":
#         country_predict = model.predict(X_country, verbose=0)
#         total_loss += (y_country - country_predict)**2
#         if split == "test":
#           losses[country] = (y_country - country_predict)**2
#           all_actuals.extend(y_country)
#           all_predictions.extend(country_predict.flatten())

#     stacked_dataX = np.stack(np.array(FourD_dataX), axis=0)
#     stacked_dataY = np.stack(np.array(FourD_dataY), axis=0)

#     total_loss = total_loss/len(countries)

#     return model, stacked_dataX, stacked_dataY, total_loss, losses, all_actuals, all_predictions

def evaluate_model(model, test_data, look_back):
    """Evaluate ``model`` on ``test_data`` and report per-country losses.

    Plots actual against predicted values, draws bar charts of the 30
    largest and the 30 smallest per-country losses, writes the loss table
    (sorted from largest to smallest) to ``losses.csv`` and prints the
    average, minimum and maximum test loss.

    NOTE(review): ``fit_model`` is not defined in the visible part of this
    file (only a commented-out draft exists); it must be in scope before
    this function is called.

    Args:
        model: Trained model, passed through to ``fit_model``.
        test_data: Test DataFrame, passed through to ``fit_model``.
        look_back: Window length, passed through to ``fit_model``.
    """
    # Only the trailing four items of fit_model's result are used here.
    avg_loss, losses, all_actuals, all_predictions = fit_model(
        model=model, data=test_data, look_back=look_back, epochs=1, split="test"
    )[3:]

    plt.figure(figsize=(14, 6))
    plt.plot(all_actuals, label='Actual', color='blue')
    plt.plot(all_predictions, label='Predicted', color='red', linestyle='--')
    plt.title('Actual vs. Predicted Population Growth')
    plt.xlabel('Time Step Across All Countries')
    plt.ylabel('Population Growth (annual %)')
    plt.legend()
    plt.show()

    # Keep the full table. The top/bottom views are derived from it, so the
    # "smallest" chart and the min/max figures cover every country instead
    # of only the 30 worst ones.
    losses_df = pd.DataFrame(list(losses.items()), columns=['Country', 'Loss'])
    losses_df = losses_df.sort_values('Loss', ascending=False)

    largest_losses_df = losses_df.head(30)
    largest_losses_df.plot.bar(x='Country', y='Loss', figsize=(10, 6))
    plt.title('Test Loss for Each Country')
    plt.xlabel('Country')
    plt.ylabel('Loss')
    plt.show()

    smallest_losses_df = losses_df.sort_values('Loss').head(30)
    smallest_losses_df.plot.bar(x='Country', y='Loss', figsize=(10, 6))
    plt.title('Smallest 30 Losses for Each Country')
    plt.xlabel('Country')
    plt.ylabel('Loss')
    plt.show()

    min_test_loss = losses_df['Loss'].min()
    max_test_loss = losses_df['Loss'].max()
    losses_df.to_csv("losses.csv", index=False)
    print("Average test loss: ", avg_loss)
    print("Minimum test loss: ", min_test_loss)
    print("Maximum test loss: ", max_test_loss)
    print("[INFO] Model evaluated successfully")




In [None]:
# Notebook cell: load the data, split it, and run the hyperparameter grid
# search in parallel worker processes.
from tqdm import tqdm
from multiprocess import Pool
# Re-seed so this cell is repeatable when run on its own.
np.random.seed(84)
tf.random.set_seed(84)

# Load and prepare data
df_filled = pd.read_csv('df_filled.csv')

# Split data: hold out one fifth (integer division) of the distinct years.
# train_scaled, test_scaled1, test_scaled2 = split_data(data=df_filled, cols_to_scale=scaled_cols, test_length=int(len(df_filled["Time"].unique())//5))
train, test_scaled1, test_scaled2 = split_data(data=df_filled, cols_to_scale=scaled_cols, test_length=int(len(df_filled["Time"].unique())//5))

# One task per grid combination (3 * 2 * 3 * 2 * 4 * 3 = 432). Each tuple also
# carries the training frame because the worker receives a single argument.
params_list = [(look_back, epochs, learning_rate, optimizer, neurons, dropout_rate, train) for look_back in param_grid['look_back'] for epochs in param_grid['epochs']
                for learning_rate in param_grid['learning_rate'] for optimizer in param_grid['optimizer'] for neurons in param_grid['neurons']
                for dropout_rate in param_grid['dropout_rate']]


# Evaluate the grid with 15 worker processes; tqdm wraps the result iterator
# to show progress over the known number of tasks.
with Pool(15) as p:
    pool_outputs = list(
        tqdm(
            p.imap(hyperparameter_tuning, params_list),
            total=len(params_list)
        )
    )

print(pool_outputs)
# NOTE(review): dict() assumes every result is a (key, value) pair -- confirm
# against the return value of tuning.hyperparameter_tuning.
new_dict = dict(pool_outputs)
print("dict:", new_dict)

# Disabled follow-up: rebuild the best model, fit it, and evaluate on both
# test sets.
# NOTE(review): pd.concat expects a list of frames as its first argument; the
# two concat lines below pass the frames as separate positional arguments.
# params = hyperparameter_tuning(train_scaled, param_grid)
# model_tuned = build_model(look_back=params["look_back"], epochs=params["epochs"], dropout_rate=params["dropout_rate"],
#                          optimizer=params["optimizer"], neurons = params["neurons"], learning_rate = params["learning_rate"])
# fitted_model = fit_model(model=model_tuned, data=train_scaled, look_back=params["look_back"], epochs=params["epochs"], split="train")[0]
# test_scaled1 = pd.concat(train_scaled[train_scaled["Time"].isin(train_scaled["Time"][-params["look_back"]+1:])],test_scaled1) #add look_back years from train data to test data 1
# test_scaled2 = pd.concat(test_scaled1[test_scaled1["Time"].isin(test_scaled1["Time"][-params["look_back"]+1:])],test_scaled2) #add look_back years from test data 1 to test data 2
# evaluate_model(model=fitted_model, test_data=test_scaled1, look_back=params["look_back"])
# evaluate_model(model=fitted_model, test_data=test_scaled2, look_back=params["look_back"])
# fitted_model = fit_model(model=fitted_model, data=test_scaled1, look_back=params["look_back"], epochs=params["epochs"], split="train")[0]
# evaluate_model(model=fitted_model, test_data=test_scaled2, look_back=params["look_back"])

[INFO] Data successfully split into training and testing datasets.


  3%|██                                                                             | 11/432 [09:51<4:31:50, 38.74s/it]