# 2024-01-20 20 - Train-based constraint level estimation.ipynb
Copy of NB 13: calculates constraint levels from the training MSE of ERM runs, for January's batch of experiments.

## Fetching runs

In [1]:
import wandb
from math import isnan 
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import wandb
from tqdm import tqdm
# from cycler import cycler
import matplotlib as mpl
from IPython.display import Markdown, display
    
# Where the experiments live on W&B.
workspace = "alelab"
project = "Autoformer"
api = wandb.Api()

# Experiment tags whose runs should be pulled.
experiment_tags = [
    "e11_erm_weather_dlfix",
    "e12_erm_electricity_dlfix",
    "e13_erm_exchange_dlfix",
]

# Keep only runs that satisfy all three conditions:
#   1. carry at least one of the experiment tags,
#   2. were configured with dual_lr == 0 (i.e. ERM runs),
#   3. have finished.
run_filters = {
    "$and": [
        {"tags": {"$in": experiment_tags}},
        {"config.dual_lr": 0},
        {"state": "finished"},
    ]
}
runs = api.runs(f"{workspace}/{project}", run_filters)

def tag_experiment(run, tags=None):
    """Return the first known experiment tag attached to ``run``.

    Args:
        run: Object exposing a ``tags`` collection (e.g. a W&B run).
        tags: Candidate tags, checked in order. When omitted, the
            module-level ``experiment_tags`` list is used.

    Returns:
        The first candidate found in ``run.tags``, or ``''`` when none of
        the candidates is present.
    """
    candidates = experiment_tags if tags is None else tags
    # Candidate order (not run.tags order) decides which tag wins.
    return next((tag for tag in candidates if tag in run.tags), '')

def runs_into_df(runs):
    """Flatten W&B runs into a long-format DataFrame.

    One record is produced per (run, split, forecast step). Every record
    starts as a copy of the run's config and is extended with the per-step
    MSE read from the run summary plus bookkeeping columns: ``step``,
    ``epoch``, ``infeasible_rate``, ``infeasibles``, ``multiplier`` (train
    split only, NaN otherwise), ``split``, ``run_id``, ``Algorithm``,
    ``sweep_id``, ``experiment_tag`` and ``type``.

    Config and summary values are read with plain indexing, so a missing key
    is not defaulted and the lookup error propagates to the caller.

    Args:
        runs: Iterable of run objects exposing ``config``, ``summary``,
            ``tags``, ``name``, ``id`` and ``sweep``.

    Returns:
        pandas.DataFrame with one row per (run, split, step); an empty frame
        when ``runs`` yields nothing.
    """
    records = []
    run_counter = 0
    for run in tqdm(runs):
        run_counter += 1

        # These values depend only on the run, so compute them once instead
        # of once per (split, step).
        config = run.config
        summary = run.summary
        pred_len = config["pred_len"]
        # Get either Constrained/ or ERM/ from the run name, then append model name.
        algorithm = f"{run.name.split('/')[0]} {config['model']}"
        sweep_id = run.sweep.id if run.sweep else ''
        experiment_tag = tag_experiment(run)
        # To better plot constrained vs ERM.
        # TODO this is a hack while I consolidate the tags.
        run_type = "ERM" if config['dual_lr'] == 0 else "Constrained"

        for split in ["train", "test", "val"]:
            is_train = split == "train"
            for metric in ["mse"]:
                for i in range(pred_len):
                    # Key insertion order is deliberate: it determines the
                    # column order of the resulting DataFrame.
                    record = {**config}
                    record[metric] = summary[f"{metric}/{split}/{i}"]
                    record["step"] = i
                    record["epoch"] = summary["epoch"]
                    record["infeasible_rate"] = summary[f"infeasible_rate/{split}"]
                    record["infeasibles"] = summary[f"infeasibles/{split}"]
                    # Multipliers are only read for the train split.
                    record["multiplier"] = summary[f"multiplier/{i}"] if is_train else np.nan
                    record["split"] = split
                    record["run_id"] = run.id
                    record["Algorithm"] = algorithm
                    record["sweep_id"] = sweep_id
                    record["experiment_tag"] = experiment_tag
                    record["type"] = run_type
                    records.append(record)

    print(f"Fetched {run_counter} runs")
    df = pd.DataFrame(records)
    print(f"Total records: {(df.shape)}")
    # An empty frame has no run_id column; report zero instead of raising.
    total_runs = df["run_id"].nunique() if "run_id" in df.columns else 0
    print(f"Total runs: {total_runs}")
    return df
# Build the long-format table (one row per run, split and forecast step) from the fetched runs.
df = runs_into_df(runs)

100%|██████████| 120/120 [00:10<00:00, 11.86it/s]


Fetched 120 runs
Total records: (120960, 76)
Total runs: 120


In [2]:
# One line per distinct run/config combination, ordered by model, horizon and constraint level.
run_overview_cols = [
    'run_id', "sweep_id", 'Algorithm', "type",
    'model', 'data_path', 'constraint_level', 'pred_len',
]
(
    df[run_overview_cols]
    .drop_duplicates()
    .sort_values(['model', "pred_len", "constraint_level"])
)

Unnamed: 0,run_id,sweep_id,Algorithm,type,model,data_path,constraint_level,pred_len
23616,yl4lsmf1,7z00icod,Weather-StatInformed-ERM-10e Autoformer,ERM,Autoformer,weather.csv,-1,96
23904,txo8m3o0,j115hcxz,Weather-StatInformed-ERM-10e Autoformer,ERM,Autoformer,weather.csv,-1,96
36000,9sqnju3c,ro3nwbin,Exchange-StatInformed-ERM-10e Autoformer,ERM,Autoformer,exchange_rate.csv,-1,96
48096,m0dkle2c,jsh6vre8,Electricity-StatInformed-ERM-10e Autoformer,ERM,Autoformer,electricity.csv,-1,96
60192,uchtvkhp,kqk88y4g,Weather-StatInformed-ERM-10e Autoformer,ERM,Autoformer,weather.csv,-1,96
...,...,...,...,...,...,...,...,...
64512,srxqrvth,pzuksd9e,Exchange-StatInformed-ERM-10e Reformer,ERM,Reformer,exchange_rate.csv,-1,720
76608,k1woe3lg,2fvhx9ou,Electricity-StatInformed-ERM-10e Reformer,ERM,Reformer,electricity.csv,-1,720
88704,014gnz8z,80y93xe0,Weather-StatInformed-ERM-10e Reformer,ERM,Reformer,weather.csv,-1,720
100800,g02aylnp,8xqpivas,Exchange-StatInformed-ERM-10e Reformer,ERM,Reformer,exchange_rate.csv,-1,720


# Get IQR for all models, prediction lengths, and datasets
Note that the IQRs are vastly different for Autoformer and Reformer.

In [15]:
# Summary statistics of the per-step training MSE for each (dataset, model, horizon) group.
group_keys = ['data_path', 'model', 'pred_len']
train_records = df[df["split"] == "train"]
stats = train_records.groupby(group_keys)['mse'].describe().reset_index()

# Keep the quartiles, mean and std; order rows by dataset, horizon, then model.
constraint_data = stats[group_keys + ['25%', '50%', '75%', 'mean', 'std']].sort_values(
    ['data_path', 'pred_len', 'model']
)
constraint_data

Unnamed: 0,data_path,model,pred_len,25%,50%,75%,mean,std
0,electricity.csv,Autoformer,96,0.12774,0.130797,0.137069,0.135374,0.013636
4,electricity.csv,Informer,96,0.170214,0.174671,0.182912,0.177596,0.009735
8,electricity.csv,Reformer,96,0.174376,0.176341,0.178398,0.17681,0.003726
1,electricity.csv,Autoformer,192,0.158004,0.163558,0.171457,0.168043,0.017472
5,electricity.csv,Informer,192,0.188372,0.192644,0.197674,0.193602,0.007632
9,electricity.csv,Reformer,192,0.184147,0.185285,0.186903,0.18597,0.003507
2,electricity.csv,Autoformer,336,0.168818,0.17875,0.188562,0.180856,0.018951
6,electricity.csv,Informer,336,0.219842,0.224648,0.232744,0.227809,0.011531
10,electricity.csv,Reformer,336,0.193268,0.193908,0.194859,0.194635,0.004194
3,electricity.csv,Autoformer,720,0.201388,0.209942,0.22379,0.21647,0.02643


# Formatting constraints as YAML to paste into sweep configs

In [25]:
import yaml

# Print one commented YAML snippet per row of constraint_data.
for _, stat_row in constraint_data.iterrows():
    model_name = stat_row['model']

    # The three quartiles, rounded to 3 decimals, become the candidate constraint levels.
    quartile_levels = (
        stat_row[['25%', '50%', '75%']].astype(float).to_numpy().round(3).tolist()
    )

    sweep_params = {
        'model': {'value': str(model_name)},
        'constraint_level': {'values': quartile_levels},
    }

    # yaml.dump writes the list in block style (one "- item" per line).
    # Rewrite it as an inline [a, b, c] array; the closing bracket goes right
    # before the "model" key, which follows "constraint_level" in the dump.
    yaml_text = (
        yaml.dump(sweep_params, default_flow_style=False)
        .replace('values:\n  - ', 'values: [')
        .replace('\n  - ', ', ')
        .replace('\nmodel', ']\nmodel')
    )

    # Comment line identifying which dataset / horizon / model the snippet is for.
    header = f"# {stat_row['data_path']}, pred_len={stat_row['pred_len']}, {model_name}"

    print(header + '\n' + yaml_text)

# electricity.csv, pred_len=96, Autoformer
constraint_level:
  values: [0.128, 0.131, 0.137]
model:
  value: Autoformer

# electricity.csv, pred_len=96, Informer
constraint_level:
  values: [0.17, 0.175, 0.183]
model:
  value: Informer

# electricity.csv, pred_len=96, Reformer
constraint_level:
  values: [0.174, 0.176, 0.178]
model:
  value: Reformer

# electricity.csv, pred_len=192, Autoformer
constraint_level:
  values: [0.158, 0.164, 0.171]
model:
  value: Autoformer

# electricity.csv, pred_len=192, Informer
constraint_level:
  values: [0.188, 0.193, 0.198]
model:
  value: Informer

# electricity.csv, pred_len=192, Reformer
constraint_level:
  values: [0.184, 0.185, 0.187]
model:
  value: Reformer

# electricity.csv, pred_len=336, Autoformer
constraint_level:
  values: [0.169, 0.179, 0.189]
model:
  value: Autoformer

# electricity.csv, pred_len=336, Informer
constraint_level:
  values: [0.22, 0.225, 0.233]
model:
  value: Informer

# electricity.csv, pred_len=336, Reformer
constra