In [None]:
import os, sys
import pandas as pd
import torch
import optuna
from sklearn.model_selection import KFold
import json

# 检测运行环境
def in_notebook():
    return 'IPKernelApp' in getattr(globals().get('get_ipython', lambda: None)(), 'config', {})

if in_notebook():
    from IPython.display import clear_output, display
    notebook_dir = os.getcwd()
    src_path = os.path.abspath(os.path.join(notebook_dir, '..'))
    RUN_MODE = 'eval'
else:
    src_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
sys.path.append(src_path) if src_path not in sys.path else None

from src.utils import *
from src.model_utils import *
from src.setup import *
from ite_setup import *
from ganite_mod import Ganite, GaniteRegressor
from ganite_mod.utils.metrics import *

In [None]:
df = pd.read_csv(f'{DATA}/imputed/EXIT_SEP_clean_imputed.tsv.gz', sep='\t', index_col='ID')
features, _, _, treatment, outcomes = get_ite_features()
current_outcome = outcomes[0] # 设置预测目标

df_train = df.sample(frac=0.7, random_state=19960816)
df_test = df[~df.index.isin(df_train.index)].copy()
X, W, y = load_data(df)

X = np.array(X)
W = np.array(W)
y = np.array(y)

# 随机搜索

In [None]:
def objective(trial):
    # 定义需要调优的超参数范围
    dim_hidden = trial.suggest_categorical("dim_hidden", [50, 75, 100, 200, 300, 400])
    alpha = trial.suggest_float("alpha", 0.1, 1.0, step=0.05)
    beta = trial.suggest_float("beta", 0.0, 1.0, step=0.05)
    depth = trial.suggest_int("depth", 0, 5, step=1)
    num_iterations = trial.suggest_int("num_iterations", 1000, 2500, step=500)
    num_discr_iterations = trial.suggest_categorical("num_discr_iterations", [1, 2, 3])

    # 初始化模型
    model = GaniteRegressor(
        dim_in=X.shape[1],
        binary_y=True,
        dim_hidden=dim_hidden,
        alpha=alpha,
        beta=beta,
        depth=depth,
        num_iterations=num_iterations,
        num_discr_iterations=num_discr_iterations,
    )

    # 实现交叉验证
    kf = KFold(n_splits=10, shuffle=True, random_state=19960816)
    scores = []

    for train_index, val_index in kf.split(X):
        # 划分训练集和验证集
        X_train, X_val = X[train_index], X[val_index]
        T_train, T_val = W[train_index], W[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        # 训练模型
        model.fit((X_train, T_train), y_train)
        
        # 验证模型并记录分数
        score = model.score((X_val, T_val), y_val)  # 默认负均方误差
        scores.append(score)

    # 返回平均交叉验证分数（负均方误差）
    return np.mean(scores)

# 日志功能：设置 Optuna 的日志级别
optuna.logging.set_verbosity(optuna.logging.INFO)
# 实时打印当前最佳结果
def trial_callback(study, trial):
    if in_notebook():
        clear_output(wait=True)  # 清除之前的输出
        df_trials = study.trials_dataframe()  # 获取当前的试验数据
        display(df_trials)  # 动态显示最新的 dataframe
        print(f"Current best value: {study.best_value}")
        print(f"Current best parameters: {study.best_params}")
    else:
        print(f"Trial {trial.number} finished with value: {trial.value} and parameters: {trial.params}")
        # print(f"Current best value is {study.best_value} with parameters: {study.best_params}")

# 使用 Optuna 优化
study = optuna.create_study(direction="maximize")  # 或 "minimize"，取决于评分标准
study.optimize(objective, n_trials=20, callbacks=[trial_callback])

# 获取最佳超参数, 保存最佳超参数到 JSON 文件
best_params = study.best_params
print("Best Parameters:", best_params)
with open(f"{MODELS}/GANITE_best_hyperparams_optuna.json", "w") as f:
    json.dump(study.best_params, f)

# 保存完整调参历史为 CSV 文件
df_trials = study.trials_dataframe()
df_trials.to_csv(f"{MODELS}/GANITE_optuna_tuning_history.csv", index=False)

# 使用最佳参数重新初始化模型
best_model = GaniteRegressor(
    dim_in=X.shape[1],
    binary_y=True,
    dim_hidden=best_params["dim_hidden"],
    alpha=best_params["alpha"],
    beta=best_params["beta"],
    depth=best_params["depth"],
    num_iterations=best_params["num_iterations"],
    num_discr_iterations=best_params["num_discr_iterations"],
)

# 训练最佳模型
best_model.fit((X, W), y)

# 保存最佳模型
torch.save(best_model.state_dict(), f"{MODELS}/GANITE_best_weights_optuna.pth")

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_beta,params_depth,params_dim_hidden,params_num_discr_iterations,params_num_iterations,state
0,0,-0.121017,2024-11-28 02:37:05.392571,2024-11-28 02:50:18.446940,0 days 00:13:13.054369,0.85,0.3,5,50,3,2000,COMPLETE
1,1,-0.106148,2024-11-28 02:50:18.465538,2024-11-28 03:06:42.984015,0 days 00:16:24.518477,0.7,1.0,5,300,2,2500,COMPLETE
2,2,-0.093175,2024-11-28 03:06:43.001173,2024-11-28 03:10:34.156088,0 days 00:03:51.154915,0.35,0.75,3,300,1,1000,COMPLETE
3,3,-0.125874,2024-11-28 03:10:34.180995,2024-11-28 03:17:34.454741,0 days 00:07:00.273746,0.6,0.8,1,300,3,1500,COMPLETE
4,4,-0.13083,2024-11-28 03:17:34.465250,2024-11-28 03:22:10.614930,0 days 00:04:36.149680,0.75,0.25,1,400,1,1500,COMPLETE
5,5,-0.10014,2024-11-28 03:22:10.641594,2024-11-28 03:34:48.961695,0 days 00:12:38.320101,0.8,0.6,2,100,3,2500,COMPLETE
6,6,-0.158163,2024-11-28 03:34:48.976476,2024-11-28 03:42:41.638851,0 days 00:07:52.662375,0.65,0.35,0,200,2,2500,COMPLETE
7,7,-0.10929,2024-11-28 03:42:41.654045,2024-11-28 03:53:58.451029,0 days 00:11:16.796984,0.35,0.6,5,50,2,2000,COMPLETE
8,8,-0.1193,2024-11-28 03:53:58.469223,2024-11-28 04:02:27.544087,0 days 00:08:29.074864,0.5,0.25,5,75,2,1500,COMPLETE
9,9,-0.133864,2024-11-28 04:02:27.559463,2024-11-28 04:06:58.591303,0 days 00:04:31.031840,0.25,0.4,1,50,1,1500,COMPLETE


Current best value: -0.08110309005254665
Current best parameters: {'dim_hidden': 100, 'alpha': 1.0, 'beta': 1.0, 'depth': 4, 'num_iterations': 1000, 'num_discr_iterations': 1}
Best Parameters: {'dim_hidden': 100, 'alpha': 1.0, 'beta': 1.0, 'depth': 4, 'num_iterations': 1000, 'num_discr_iterations': 1}


# 手动调参

In [4]:
# X_train, W_train, y_train = load_data(df_train)
# X_test, W_test, y_test = load_data(df_test)

# # modified GANITE
# model = Ganite(dim_in=X.shape[1],
#                binary_y=True,
#                dim_hidden=300,
#                alpha = 0.3,
#                beta = 0.3,
#                depth = 3,
#                minibatch_size = 200,
#                num_iterations=2500,
#                num_discr_iterations=3,
#                )

# if RUN_MODE == 'train':
#     model = model.fit(X_train, W_train, y_train)
#     torch.save(model.state_dict(), f"{MODELS}/GANITE.pth")
# else:
#     model.load_state_dict(torch.load(f"{MODELS}/GANITE_best_weights_manual.pth", weights_only=True))
#     model.eval()  # 切换到评估模式（重要！）
#     print("模型参数已加载！")

# # 测试集测试
# Y_1_test, Y_0_test, ITE_test = model(X_test)
# df_test['potential_y1'] = Y_1_test.cpu()
# df_test['potential_y0'] = Y_0_test.cpu()
# df_test['ITE'] = ITE_test.cpu()
# df_test['y_pred_observed'] = df_test.apply(lambda row: row['potential_y1'] if row[treatment]==1 else row['potential_y0'], axis=1)

# ATE_test = RCT_ATE(df_test[treatment], df_test[current_outcome])
# ATE_pred_ob = RCT_ATE(df_test[treatment], df_test['y_pred_observed'])
# ATE_pred = df_test['ITE'].mean()

# print(f'实际ATE: {ATE_test:.4f}, 预测实际ATE: {ATE_pred_ob:.4f}, ATE误差: {ATE_test - ATE_pred_ob:.4f}, 预测组间ATE: {ATE_pred:.4f}')