# 1. Prepare datasets

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Create the output folder for the per-country CSV files.
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of a separate os.path.exists() test.
folder_name = "datasets"
os.makedirs(folder_name, exist_ok=True)

# Load the combined data set; the first CSV column becomes a parsed datetime index.
df = pd.read_csv("top_5_countries.csv", index_col=0, parse_dates=True)

# Turn the index back into an ordinary column (the data loader expects it as a
# column) and drop the last six columns.
# NOTE(review): the next cell selects a column called "date", so the first CSV
# column is presumably headed "date" - confirm.
# NOTE(review): the reason for dropping exactly six trailing columns is not
# visible in this notebook - confirm against the CSV layout.
df = df.reset_index().iloc[:, :-6]

In [3]:
top_5_countries = ['DE', 'GB', 'ES', 'FR', 'IT']

# Write one CSV per country: the "date" column followed by that country's columns.
for country_prefix in top_5_countries:
    # "date" comes first, then every column whose name begins with the country code
    country_columns = ["date"] + [
        col for col in df.columns if col.startswith(country_prefix)
    ]
    country_df = df.loc[:, country_columns]

    # Persist the subset without the positional row index
    file_name = f"datasets/{country_prefix}_data.csv"
    country_df.to_csv(file_name, index=False)


# 2. Informer

In [None]:
# https://colab.research.google.com/drive/1rv2rKwQqgoHDNjXtRoAEWZ2ATz0gGAKu?usp=sharing#scrollTo=yu6zzic9t_Cz
# Popen: https://colab.research.google.com/github/aviadr1/learn-python/blob/master/content/13_multiprocessing/notebooks/os_system_subprocess.ipynb
import subprocess
import os
# Location of the TSLibrary entry script that run_output() executes.
# The default is the original machine-specific path; set the TSLIB_RUN_FILE
# environment variable to point elsewhere without editing the notebook.
path_to_run_file = os.environ.get(
    "TSLIB_RUN_FILE",
    "/Users/valentyna/Documents/Master_thesis_new/TSLibrary/run.py",
)

def run_output(path_to_run_file, model_arguments):
    """Run a Python script in a child process and return what it printed.

    Args:
        path_to_run_file: Path of the script to execute.
        model_arguments: List of command-line argument strings for the script.

    Returns:
        The script's stdout and stderr, merged into one text string. When the
        script exits with a non-zero status, the output captured up to that
        point (including the error text) is returned instead of raising.
    """
    import sys  # local import so the cell does not depend on another import cell

    # sys.executable is the interpreter running this kernel, so the child sees
    # the same installed packages; a bare "python" is resolved via PATH and may
    # be a different interpreter (or missing).
    command = [sys.executable, "-u", path_to_run_file] + model_arguments
    try:
        # stderr is folded into stdout; otherwise a failing script's traceback
        # is not part of the captured output at all.
        output = subprocess.check_output(
            command, stderr=subprocess.STDOUT, universal_newlines=True
        )
    except subprocess.CalledProcessError as e:
        # Non-zero exit: hand back whatever was printed before the failure
        output = e.output

    return output

In [None]:
# Disabled earlier variants of run_output, kept inside a bare string so they never execute.
# They point at a Colab location (script path under /content) and rely on
# IPython-only syntax (`!` shell capture and the `%run` magic), which is not valid plain Python.
"""
import subprocess
import os
# parent_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
path_to_run_file = "/content/my_work/TSLibrary/run.py"

def run_output(path_to_run_file, model_arguments):
    try:
        # Define command and options wanted
        command = "python"
        options = "-u"
        # Run the shell command directly in Colab
        output = !{command} {options} {path_to_run_file} {model_arguments}

    except subprocess.CalledProcessError as e:
        output = e.output

    return output

def run_output(path_to_run_file, model_arguments):
    try:
        # Execute the script using the %run magic command
        output = %run -i {path_to_run_file} {model_arguments}

    except Exception as e:
        output = str(e)

    return output
"""

In [None]:
# Seems to work in colab
import subprocess

def run_output(path_to_run_file, model_arguments):
    """Run a Python script in a child process and return its output as text.

    Args:
        path_to_run_file: Path of the script to execute.
        model_arguments: Iterable of command-line arguments for the script;
            each item is converted with str() before being passed on.

    Returns:
        On success (exit status 0): the script's stdout.
        On failure (non-zero exit status): stdout followed by stderr, so the
        log printed before the failure is kept next to the error text.
        If the process cannot be started at all: the exception message.
    """
    import sys  # local import so the cell does not depend on another import cell

    try:
        # sys.executable is the interpreter running this kernel, so the child
        # sees the same installed packages; a bare "python" is resolved via
        # PATH and may be a different interpreter (or missing).
        command = [sys.executable, "-u", path_to_run_file, *map(str, model_arguments)]
        # errors="replace" keeps a stray non-UTF-8 byte from turning the whole
        # captured log into a decode-error message.
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding="utf-8",
            errors="replace",
        )
    except Exception as e:
        # Keep the return-a-string contract: report launch problems as text
        return str(e)

    if completed.returncode != 0:
        return completed.stdout + completed.stderr
    return completed.stdout

In [None]:
import time

# Train and evaluate Informer once per country data set, timing each run.
start = time.time()

datasets = ['DE_data.csv', 'GB_data.csv', 'ES_data.csv', 'FR_data.csv', 'IT_data.csv']
# One entry per data set, passed as encoder input, decoder input and output size.
# NOTE(review): presumably the number of non-date columns in each CSV - confirm.
num_cols = ["5", "5", "3", "3", "3"]
pred_len = "24"

# The two lists are paired by position; fail fast if they drift apart
assert len(datasets) == len(num_cols), "datasets and num_cols must have the same length"

for dataset, n_cols in zip(datasets, num_cols):
    country = dataset[:2]  # two-letter prefix of the file name
    model_id = f"_{pred_len}_{country}"  # Create the model_id
    model_arguments = [
        "--task_name", "long_term_forecast",
        "--is_training", "1",  # True
        "--root_path", "datasets/",
        "--data_path", dataset,
        # "--train_epochs", "1",
        "--model_id", model_id,
        "--model", "Informer",
        "--data", "custom",  # Use a custom dataloader (same data preparation as in ARIMA)
        "--features", "M",  # Multivariate
        "--seq_len", "96",
        "--label_len", "48",
        "--pred_len", pred_len,
        "--e_layers", "2",
        "--d_layers", "5",
        "--factor", "5",
        "--enc_in", n_cols,
        "--dec_in", n_cols,
        "--c_out", n_cols,
        "--des", "Exp",
        "--itr", "2",
    ]

    int_start = time.time()
    model_output = run_output(path_to_run_file, model_arguments)
    int_end = time.time()

    print(model_output)
    print(f"Time intermediate for {country} dataset:", (int_end - int_start)/60, "min.")

# Taken once after the loop, so it is defined even when `datasets` is empty
end = time.time()
print("Total time:", (end - start)/60, "min.")

# 3. Test

In [19]:
# Saved evaluation metrics of one Informer run (displayed as the cell output).
# NOTE(review): presumably ordered [mae, mse, rmse, mape, mspe] as written by
# Time-Series-Library - confirm against the library's test routine.
# NOTE(review): the folder name contains sl10/ll5/pl10, which does not match the
# seq_len=96 / label_len=48 / pred_len=24 used in the training cell of this
# notebook, so these results appear to come from a different run - confirm.
np.load(
    "/Users/valentyna/Documents/Master_thesis_new/results/"
    "long_term_forecast_1_Informer_custom_ftM_sl10_ll5_pl10_dm512_nh8_el3_dl2_df2048_fc3_ebtimeF_dtTrue_Exp_0/"
    "metrics.npy"
)

array([  0.86847895,   1.1777946 ,   1.0852624 ,   3.2318454 ,
       769.5208    ], dtype=float32)

In [21]:
# Shape of the saved predictions from the same results folder as the metrics.
# NOTE(review): presumably (number of test windows, prediction length, number
# of series) - confirm against the library's output format.
np.load(
    "/Users/valentyna/Documents/Master_thesis_new/results/"
    "long_term_forecast_1_Informer_custom_ftM_sl10_ll5_pl10_dm512_nh8_el3_dl2_df2048_fc3_ebtimeF_dtTrue_Exp_0/"
    "pred.npy"
).shape

(39, 10, 5)