In [1]:
import sys

sys.path.append("./myutils")
from data_generator import (
    DataGeneratorParam,
    TrainDataParam,
    RealDataParam,
    ExpandedDataParam,
    DataGenerator,
)
from train_rhat import train_rhat
from r0_CI import construct_r0_CIs
from true_models import generate_true_models, generate_bias_models
from learner import build_learner
from utils import get_model_directory_uri
import itertools
import numpy as np
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

In [None]:
import warnings
warnings.filterwarnings(
    "ignore",
    message="You are using `torch.load` with `weights_only=False`",
    category=FutureWarning
)

# All parameters

In [2]:
Q = 100  # dimension of Z (Z in R^Q)
P = 100  # dimension of X (X in R^P)
# Y is scalar (Y in R)
# Base output location; passed to get_model_directory_uri when resolving
# where each learner's model lives.
OUTPUT_DIRECTORY_URI = "./results"

In [3]:
# Build the three true model functions for dimensions (Q, P). `r0` is kept as
# a module-level name because it is also handed to the experiments later on.
# NOTE(review): no random seed is set anywhere in the visible cells — confirm
# whether generate_true_models is deterministic or seeds itself.
r0, g0, f0 = generate_true_models(Q, P)
# Bundle the dimensions and true models into the generator configuration.
DATA_GENERATOR_PARAM = DataGeneratorParam(p=P, q=Q, r0=r0, g0=g0, f0=f0)

# Training-set configuration shared by every learner (50,000 training rows).
TRAIN_DATA_PARAM = TrainDataParam(n_train=50000)

In [4]:
# Candidate learners for r0, keyed by the learner name that is later passed to
# get_model_directory_uri and train_rhat.
#
# Every learner that takes an explicit output size uses output_dim=P, so the
# MLP is configured the same way (input_dim=Q, output_dim=P). The previous
# MLP settings had the two swapped, which only worked because P == Q.
# NOTE(review): this assumes r0 maps Z (R^Q) to X (R^P), as the sibling
# learners' output_dim=P implies — confirm against build_learner.
r0_LEARNERS = {
    # "linear": build_learner(model_type='ols'),
    "random_forest": build_learner(
        model_type="rf",
        output_dim=P,
        n_estimators=100,
        #         max_depth=100,
        max_features="sqrt",
        n_jobs=-1,
    ),
    "kernel": build_learner(model_type="krr"),
    "xgboost": build_learner(model_type="xgb", output_dim=P),
    # Name encodes hidden layers (128x128), epochs (1000) and batch size (64).
    "neural_net_128x128_1000_64": build_learner(
        model_type="mlp",
        input_dim=Q,
        output_dim=P,
        hidden_layers=[128, 128],
        epochs=1000,
        batch_size=64,
    ),
}

In [5]:
# Two bias models for dimensions (Q, P); each is crossed with every bias
# scale and every real-sample size below.
nn_bias_1, nn_bias_2 = generate_bias_models(Q, P)
BIAS_FUNCS = [nn_bias_1, nn_bias_2]
BIAS_SCALES = [0, 1, 10]
N_REALS = [100, 1000]
R_EXPANDEDS = [0.001, 0.01]

# Full grid: bias function x bias scale x real-sample size (the last factor
# varies fastest).
REAL_DATA_PARAMS = [
    RealDataParam(bias_func=func, bias_scale=scale, n_real=size)
    for func in BIAS_FUNCS
    for scale in BIAS_SCALES
    for size in N_REALS
]
print("Number of real data param combos:", len(REAL_DATA_PARAMS))

# Expanded-sample size is 10x each entry of N_REALS, crossed with r_expanded.
# NOTE(review): this grid is built independently of REAL_DATA_PARAMS, so when
# the two grids are crossed later, n_expanded is not tied to the n_real of
# the paired real-data config (e.g. n_real=100 with n_expanded=10000) —
# confirm that this is intended.
EXPANDED_DATA_PARAMS = [
    ExpandedDataParam(n_expanded=10 * size, r_expanded=ratio)
    for size in N_REALS
    for ratio in R_EXPANDEDS
]
print("Number of expanded data param combos:", len(EXPANDED_DATA_PARAMS))

Number of real data param combos: 12
Number of expanded data param combos: 4


In [6]:
# Every (learner item, real-data config, expanded-data config) triple, in the
# same order a cartesian product would produce (last factor varies fastest).
# Each learner item is a (name, learner) pair from r0_LEARNERS.
combinations = [
    (learner_item, real_data_param, expanded_data_param)
    for learner_item in r0_LEARNERS.items()
    for real_data_param in REAL_DATA_PARAMS
    for expanded_data_param in EXPANDED_DATA_PARAMS
]
print(f"There are {len(combinations)} combinations to run.")

There are 192 combinations to run.


# Retrieve all trained models

In [None]:
def train_single_model(args):
    """Resolve the model directory for one learner and obtain its rhat.

    Args:
        args: A ``(learner_name, learner)`` pair, i.e. one item of
            ``r0_LEARNERS.items()``. Packed into a single argument so the
            function can be submitted directly to an executor.

    Returns:
        A ``(model_directory_uri, rhat)`` tuple, where ``rhat`` is whatever
        ``train_rhat`` returns for this learner.
    """
    name, model = args

    # The directory depends on the shared notebook configuration plus the
    # learner name.
    uri = get_model_directory_uri(
        output_directory_uri=OUTPUT_DIRECTORY_URI,
        r0_learner_name=name,
        train_data_param=TRAIN_DATA_PARAM,
        data_generator_param=DATA_GENERATOR_PARAM,
    )

    # fresh=False: presumably lets train_rhat reuse a previously saved model
    # instead of retraining — confirm in train_rhat.
    fitted = train_rhat(
        learner=model,
        learner_name=name,
        model_directory_uri=uri,
        train_data_param=TRAIN_DATA_PARAM,
        data_generator_param=DATA_GENERATOR_PARAM,
        fresh=False,
    )
    return uri, fitted

# Maps each model directory URI to the rhat returned for that learner.
trained_models = {}

# Train every learner in its own worker process.
# NOTE(review): results come back from worker processes, so each rhat has to
# be picklable — confirm this holds for every learner type.
with ProcessPoolExecutor() as executor:
    # Future -> the (learner_name, learner) item it was submitted with.
    futures = {executor.submit(train_single_model, item): item for item in r0_LEARNERS.items()}
    # as_completed yields futures as they finish, so the progress bar advances
    # per finished learner rather than in submission order.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Training Models"):
        # result() re-raises any exception raised inside the worker.
        model_directory_uri, rhat = future.result()
        trained_models[model_directory_uri] = rhat

Training Models:   0%|          | 0/4 [00:00<?, ?it/s]

kernel training MSE = 6039.233176466542
'train_rhat' executed in 1.393837s


Training Models:  25%|██▌       | 1/4 [22:02<1:06:06, 1322.25s/it]Process ForkProcess-2:
Process ForkProcess-6:
Process ForkProcess-5:
Process ForkProcess-8:
Process ForkProcess-7:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/yqg36/clear/envs/dml-env/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/yqg36/clear/envs/dml-env/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/yqg36/clear/envs/dml-env/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/yqg36/clear/envs/dml-env/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/yqg36/clear/envs/dml-env/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/h

KeyboardInterrupt: 

In [7]:
# Sequential variant of the model-retrieval step: one learner at a time,
# collecting each rhat under its model directory URI.
trained_models = {}
for learner_name, learner in r0_LEARNERS.items():
    # Configuration shared by the directory lookup and the training call.
    common_params = {
        "data_generator_param": DATA_GENERATOR_PARAM,
        "train_data_param": TRAIN_DATA_PARAM,
    }
    model_directory_uri = get_model_directory_uri(
        r0_learner_name=learner_name,
        output_directory_uri=OUTPUT_DIRECTORY_URI,
        **common_params,
    )
    # fresh=False: presumably reuses a saved model when one exists — confirm
    # in train_rhat.
    rhat = train_rhat(
        model_directory_uri=model_directory_uri,
        learner_name=learner_name,
        learner=learner,
        fresh=False,
        **common_params,
    )
    trained_models[model_directory_uri] = rhat

KeyboardInterrupt: 

# Construct r0_CIs


There are 192 combinations to run.


In [None]:
def run_single_experiment(
    r0_learner_name,
    r0_learner,
    data_generator_param,
    train_data_param,
    output_directory_uri,
    real_data_param,
    expanded_data_param,
    r0,
    fresh=False,
):
    """Run one experiment configuration and summarise it as a flat record.

    The model directory and the fitted ``rhat`` are resolved from this call's
    own arguments. Previously the body read ``model_directory_uri`` and
    ``rhat`` from notebook globals, so every experiment silently used
    whichever learner the training loop had processed last.

    Args:
        r0_learner_name: Name of the learner (a key of ``r0_LEARNERS``).
        r0_learner: The learner object; only used if the model has to be
            obtained through ``train_rhat``.
        data_generator_param: Data generator configuration.
        train_data_param: Training data configuration.
        output_directory_uri: Base output location used to resolve the
            model directory.
        real_data_param: Real-data configuration; must expose ``bias_func``,
            ``bias_scale`` and ``n_real``.
        expanded_data_param: Expanded-data configuration; must expose
            ``n_expanded`` and ``r_expanded``.
        r0: The true r0 model, forwarded to ``construct_r0_CIs``.
        fresh: Forwarded to ``construct_r0_CIs`` only; it never forces the
            model itself to be retrained.

    Returns:
        dict with the experiment settings plus the ``coverage`` and
        ``avg_me`` values returned by ``construct_r0_CIs``.
    """
    model_directory_uri = get_model_directory_uri(
        data_generator_param=data_generator_param,
        train_data_param=train_data_param,
        r0_learner_name=r0_learner_name,
        output_directory_uri=output_directory_uri,
    )

    # Prefer the model collected by the training cells. Fall back to
    # train_rhat when the cache is missing (NameError: training cell not run)
    # or has no entry for this learner (KeyError).
    try:
        rhat = trained_models[model_directory_uri]
    except (NameError, KeyError):
        rhat = train_rhat(
            data_generator_param=data_generator_param,
            train_data_param=train_data_param,
            model_directory_uri=model_directory_uri,
            learner_name=r0_learner_name,
            learner=r0_learner,
            fresh=False,
        )

    # The CIs themselves are not part of the returned record.
    _r0_CIs, coverage, avg_me = construct_r0_CIs(
        data_generator_param=data_generator_param,
        real_data_param=real_data_param,
        expanded_data_param=expanded_data_param,
        model_directory_uri=model_directory_uri,
        rhat=rhat,
        r0=r0,
        fresh=fresh,
    )

    return {
        "r0_learner_name": r0_learner_name,
        # bias_func is an object; store its string form so the record is flat.
        "bias_func": str(real_data_param.bias_func),
        "bias_scale": real_data_param.bias_scale,
        "n_real": real_data_param.n_real,
        "n_expanded": expanded_data_param.n_expanded,
        "r_expanded": expanded_data_param.r_expanded,
        "coverage": coverage,
        "avg_me": avg_me,
    }

In [None]:
# Adapter for pool.imap-style APIs: takes one packed combination and supplies
# the notebook-wide fixed arguments itself.
def run_combination(args):
    """Run one experiment for a single entry of ``combinations``.

    Args:
        args: A ``((r0_learner_name, r0_learner), real_data_param,
            expanded_data_param)`` tuple.

    Returns:
        The record returned by ``run_single_experiment``.
    """
    learner_item, real_data_param, expanded_data_param = args
    r0_learner_name, r0_learner = learner_item

    # Settings that are identical for every combination in this notebook.
    fixed_settings = {
        "data_generator_param": DATA_GENERATOR_PARAM,
        "train_data_param": TRAIN_DATA_PARAM,
        "output_directory_uri": OUTPUT_DIRECTORY_URI,
        "r0": r0,
        "fresh": False,
    }
    return run_single_experiment(
        r0_learner_name=r0_learner_name,
        r0_learner=r0_learner,
        real_data_param=real_data_param,
        expanded_data_param=expanded_data_param,
        **fixed_settings,
    )

# Run every combination on a thread pool, collecting records as they finish.
# Appending one at a time keeps the partial results if the run is interrupted.
results = []
with ThreadPool() as pool:
    # imap_unordered yields in completion order, so `results` is not in the
    # same order as `combinations`.
    finished_records = pool.imap_unordered(run_combination, combinations)
    for result in tqdm(finished_records, total=len(combinations)):
        results.append(result)

  0%|                                                     | 0/192 [00:00<?, ?it/s]

In [None]:
# Turn the list of per-experiment records into a table (one row per record).
# Note that this rebinds `results` from a list to a DataFrame.
results = pd.DataFrame(results)
print(results)
# Written relative to the current working directory, not OUTPUT_DIRECTORY_URI.
results.to_csv("results.csv", index=False)

In [None]:
from pathlib import Path

from IPython.display import Audio, display

# Optional "run finished" notification sound. The path is specific to one
# machine, so playback is skipped when the file is absent; that way the
# notebook still runs to completion elsewhere.
ALARM_PATH = Path("/u/home/y/yqg36/alarm.mp3")

if ALARM_PATH.is_file():
    display(Audio(filename=str(ALARM_PATH), autoplay=True))
else:
    print(f"Alarm sound not found at {ALARM_PATH}; skipping audio notification.")