In [1]:
import sys

sys.path.append("./myutils")
from data_generator import (
    DataGeneratorParam,
    TrainDataParam,
    RealDataParam,
    ExpandedDataParam,
    DataGenerator,
)
from train_rhat import train_rhat
from r0_CI import construct_r0_CIs
from true_models import generate_true_models, generate_bias_models
from learner import build_learner
from utils import get_model_directory_uri
import itertools
import numpy as np
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
import warnings
warnings.filterwarnings(
    "ignore",
    message="You are using `torch.load` with `weights_only=False`",
    category=FutureWarning
)

# All parameters

In [3]:
# Problem dimensions: Z is in R^Q, X is in R^P, and Y is in R.
Q, P = 100, 100
# Output directory URI passed to the helper functions used in this notebook.
OUTPUT_DIRECTORY_URI = "./results"

In [4]:
# The three models returned by generate_true_models for dimensions (Q, P).
r0, g0, f0 = generate_true_models(Q, P)

# Bundle the dimensions and those models for the data generator.
DATA_GENERATOR_PARAM = DataGeneratorParam(
    p=P,
    q=Q,
    r0=r0,
    g0=g0,
    f0=f0,
)

# Training-set configuration with n_train = 50,000.
TRAIN_DATA_PARAM = TrainDataParam(n_train=50_000)

In [5]:
# Candidate learners for r0, keyed by the name that labels their saved model
# and their rows in the results table. Insertion order is significant because
# it determines the order of the experiment grid built from this dict.
# NOTE(review): learners are assumed to map Z (R^Q) to X (R^P) -- confirm
# against build_learner / train_rhat.
r0_LEARNERS = {
    # "linear": build_learner(model_type='ols'),
    # Random forests over the grid max_depth in {20, 40} x n_estimators in
    # {20, 50}; keys come out as random_forest_<depth>dep_<n>ests.
    **{
        f"random_forest_{max_depth}dep_{n_estimators}ests": build_learner(
            model_type="rf",
            output_dim=P,
            max_depth=max_depth,
            n_estimators=n_estimators,
            max_features="sqrt",
        )
        for max_depth, n_estimators in itertools.product((20, 40), (20, 50))
    },
    "kernel": build_learner(model_type="krr"),
    "xgboost": build_learner(model_type="xgb", output_dim=P),
    # Dimensions are aligned with the other learners (output_dim=P). Passing
    # input_dim=P / output_dim=Q would only be correct while P == Q.
    "neural_net_128x128_1000_64": build_learner(
        model_type="mlp",
        input_dim=Q,
        output_dim=P,
        hidden_layers=[128, 128],
        epochs=1000,
        batch_size=64,
    ),
}

In [6]:
# Two bias functions from generate_bias_models, plus the grids to sweep over.
nn_bias_1, nn_bias_2 = generate_bias_models(Q, P)
BIAS_FUNCS = [nn_bias_1, nn_bias_2]
BIAS_SCALES = [0, 1, 10]
N_REALS = [100, 1000]
R_EXPANDEDS = [0.001, 0.01]

# One RealDataParam per (bias function, bias scale, real-sample size) triple.
REAL_DATA_PARAMS = list(
    itertools.starmap(
        lambda func, scale, size: RealDataParam(
            bias_func=func, bias_scale=scale, n_real=size
        ),
        itertools.product(BIAS_FUNCS, BIAS_SCALES, N_REALS),
    )
)
print("Number of real data param combos:", len(REAL_DATA_PARAMS))

# One ExpandedDataParam per (real-sample size, r_expanded) pair; n_expanded is
# ten times the real-sample size of that pair.
EXPANDED_DATA_PARAMS = list(
    itertools.starmap(
        lambda size, ratio: ExpandedDataParam(
            n_expanded=10 * size, r_expanded=ratio
        ),
        itertools.product(N_REALS, R_EXPANDEDS),
    )
)
print("Number of expanded data param combos:", len(EXPANDED_DATA_PARAMS))

Number of real data param combos: 12
Number of expanded data param combos: 4


In [7]:
# Full experiment grid: every (learner item, real-data param, expanded-data
# param) triple, with the learner varying slowest and the expanded-data param
# varying fastest.
combinations = [
    (learner_item, real_param, expanded_param)
    for learner_item in r0_LEARNERS.items()
    for real_param in REAL_DATA_PARAMS
    for expanded_param in EXPANDED_DATA_PARAMS
]
print(f"There are {len(combinations)} combinations to run.")

There are 336 combinations to run.


# Retrieve all trained models

In [8]:
def train_single_model(args):
    """Obtain the r0 estimate for a single learner.

    Parameters
    ----------
    args : tuple
        A ``(learner_name, learner)`` pair, e.g. one item of ``r0_LEARNERS``.
        Packed into one argument so the function can be submitted to an
        executor with a single positional value.

    Returns
    -------
    tuple
        ``(model_directory_uri, rhat)``: the directory URI resolved for this
        learner and the value returned by ``train_rhat`` for it.
    """
    name, model = args

    # Settings shared by both helper calls come from notebook-level constants.
    shared = dict(
        data_generator_param=DATA_GENERATOR_PARAM,
        train_data_param=TRAIN_DATA_PARAM,
    )
    model_dir = get_model_directory_uri(
        r0_learner_name=name,
        output_directory_uri=OUTPUT_DIRECTORY_URI,
        **shared,
    )
    # fresh=False presumably reuses a previously saved model when one exists
    # -- TODO confirm against train_rhat.
    fitted = train_rhat(
        model_directory_uri=model_dir,
        learner_name=name,
        learner=model,
        fresh=False,
        **shared,
    )
    return model_dir, fitted

In [9]:
# Maps learner name -> (model_directory_uri, rhat), i.e. the pair returned by
# train_single_model for that learner.
trained_models = {}

# Run train_single_model once per learner in separate worker processes.
# NOTE(review): learner_name, model_directory_uri and rhat stay bound at
# notebook level after the loop below and hold the values of whichever learner
# finished last; per-learner values should be read from trained_models.
with ProcessPoolExecutor() as executor:
    # Key each future by its learner name so its result can be filed under it.
    futures = {
        executor.submit(train_single_model, item): item[0]  # learner_name
        for item in r0_LEARNERS.items()
    }
    # as_completed yields futures in completion order, not submission order.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Training Models"):
        learner_name = futures[future]
        model_directory_uri, rhat = future.result()
        trained_models[learner_name] = (model_directory_uri, rhat)

Training Models:   0%|          | 0/7 [00:00<?, ?it/s]

Reading rhat...
Reading rhat...Reading rhat...

Reading rhat...
Reading rhat...
Reading rhat...
Reading rhat...
kernel training MSE = 6039.233176466542
'train_rhat' executed in 0.970391s


Training Models:  14%|█▍        | 1/7 [00:00<00:05,  1.00it/s]

xgboost training MSE = 2670.9454665857
'train_rhat' executed in 1.956382s


Training Models:  29%|██▊       | 2/7 [00:02<00:07,  1.58s/it]

neural_net_128x128_1000_64 training MSE = 1.8428453699535878
'train_rhat' executed in 5.481387s


Training Models:  43%|████▎     | 3/7 [00:06<00:09,  2.43s/it]

random_forest_20dep_20ests training MSE = 2181.720559704722
'train_rhat' executed in 31.568061s
random_forest_40dep_20ests training MSE = 1853.2292092049986
'train_rhat' executed in 38.245884s


Training Models:  71%|███████▏  | 5/7 [01:09<00:38, 19.00s/it]

random_forest_20dep_50ests training MSE = 1944.3113873115026
'train_rhat' executed in 124.588467s
random_forest_40dep_50ests training MSE = 1613.1390952323227
'train_rhat' executed in 163.584861s


Training Models: 100%|██████████| 7/7 [03:53<00:00, 33.40s/it]


# Construct r0_CIs


In [None]:
def run_single_experiment(
    r0_learner_name,
    r0_learner,
    data_generator_param,
    train_data_param,
    output_directory_uri,
    real_data_param,
    expanded_data_param,
    r0,
    fresh=False,
):
    """Construct r0 confidence intervals for one experiment configuration.

    Parameters
    ----------
    r0_learner_name : str
        Key of the learner in ``trained_models``; selects which trained model
        (directory URI and ``rhat``) is evaluated.
    r0_learner, train_data_param, output_directory_uri
        Accepted for interface compatibility; not used by this function.
    data_generator_param, real_data_param, expanded_data_param, r0
        Forwarded unchanged to ``construct_r0_CIs``.
    fresh : bool, default False
        Forwarded to ``construct_r0_CIs``.

    Returns
    -------
    dict
        One result record with the learner name, the real/expanded data
        settings, and the ``coverage`` and ``avg_me`` values returned by
        ``construct_r0_CIs``.

    Raises
    ------
    KeyError
        If no trained model is registered under ``r0_learner_name``.
    """
    # Resolve the model by learner name. Relying on notebook-level
    # ``model_directory_uri`` / ``rhat`` variables would silently evaluate
    # whichever model happened to finish training last for every experiment.
    try:
        model_directory_uri, rhat = trained_models[r0_learner_name]
    except KeyError:
        raise KeyError(
            f"No trained model found for learner {r0_learner_name!r}; "
            "run the model-training cell first."
        ) from None

    # The intervals themselves (r0_CIs) are not part of the returned record.
    r0_CIs, coverage, avg_me = construct_r0_CIs(
        data_generator_param=data_generator_param,
        real_data_param=real_data_param,
        expanded_data_param=expanded_data_param,
        model_directory_uri=model_directory_uri,
        rhat=rhat,
        r0=r0,
        fresh=fresh,
    )

    return {
        "r0_learner_name": r0_learner_name,
        "bias_func": str(real_data_param.bias_func),
        "bias_scale": real_data_param.bias_scale,
        "n_real": real_data_param.n_real,
        "n_expanded": expanded_data_param.n_expanded,
        "r_expanded": expanded_data_param.r_expanded,
        "coverage": coverage,
        "avg_me": avg_me,
    }

In [None]:
def run_combination(args):
    """Run one experiment from a single packed grid entry.

    ``args`` is one element of ``combinations``:
    ``((r0_learner_name, r0_learner), real_data_param, expanded_data_param)``.
    The settings shared by all experiments are taken from the notebook-level
    constants, so the function takes exactly one argument and can be mapped
    over ``combinations`` by a pool.

    Returns whatever ``run_single_experiment`` returns for that entry.
    """
    learner_entry, real_param, expanded_param = args
    learner_name, learner = learner_entry

    experiment_kwargs = dict(
        r0_learner_name=learner_name,
        r0_learner=learner,
        data_generator_param=DATA_GENERATOR_PARAM,
        train_data_param=TRAIN_DATA_PARAM,
        output_directory_uri=OUTPUT_DIRECTORY_URI,
        real_data_param=real_param,
        expanded_data_param=expanded_param,
        r0=r0,
        fresh=False,
    )
    return run_single_experiment(**experiment_kwargs)

# Evaluate every combination on a thread pool, appending each record as it
# completes (so the order follows completion, not the order of `combinations`).
results = []
with ThreadPool() as pool:
    results.extend(
        tqdm(pool.imap_unordered(run_combination, combinations), total=len(combinations))
    )

  0%|                                                     | 0/192 [00:00<?, ?it/s]

In [None]:
# Turn the collected records into a table, echo it, and save it as
# "results.csv" in the current working directory (without the index column).
results = pd.DataFrame(data=results)
print(results)
results.to_csv(path_or_buf="results.csv", index=False)

In [None]:
from pathlib import Path

from IPython.display import Audio, display

# Optional end-of-run chime: point ALARM_PATH at any audio file on this machine.
ALARM_PATH = Path("/u/home/y/yqg36/alarm.mp3")

# Only play the sound when the file is present. A missing path handed to Audio
# positionally is interpreted as raw audio data and fails with a confusing error.
if ALARM_PATH.is_file():
    display(Audio(filename=str(ALARM_PATH), autoplay=True))
else:
    print(f"Alarm sound not found at {ALARM_PATH}; skipping notification.")