In [1]:
import pickle
import random
from functools import partial

import numpy as np
import pandas as pd
# Imported explicitly: the notebook uses `torch` directly and previously only
# reached it through whatever the wildcard imports below happen to re-export.
import torch

from esn_tarnet import *
from feature_select import *
from s_learner import *
from t_learner import *
from tarnet import *

def set_seed(seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the default generator plus all CUDA devices), then puts cuDNN into
    deterministic mode with its benchmark autotuner turned off.

    Args:
        seed: Seed value passed unchanged to each library.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN settings recommended by the PyTorch reproducibility notes.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)



In [2]:
# Country/market code interpolated into the data and checkpoint file paths
# used by the later cells ('ID' presumably denotes Indonesia — TODO confirm).
country_code = 'ID'

In [3]:
# Read the pickled feature lists for the selected country and print the
# length of each one right after it is loaded.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
def _read_pickle(path):
    """Return the object unpickled from the binary file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# All selected model features.
feature_list = _read_pickle(f'/mlx_devbox/users/wangyuxin.huoshan/playground/bonus_train_data/{country_code}_feature_list_aftersale_gmv_selected.pkl')
print(len(feature_list))

# Features later passed to the model as discrete columns.
feature_list_discrete = _read_pickle(f'/mlx_devbox/users/wangyuxin.huoshan/playground/bonus_train_data/{country_code}_feature_list_aftersale_gmv_selected_discrete.pkl')
print(len(feature_list_discrete))

# Presumably one size (cardinality) entry per discrete feature — TODO confirm.
discrete_size_cols = _read_pickle(f'/mlx_devbox/users/wangyuxin.huoshan/playground/bonus_train_data/{country_code}_feature_list_aftersale_gmv_selected_discrete_size.pkl')
print(len(discrete_size_cols))


400
133
133


In [4]:
# Load the train/validation parquet for the selected country, keeping only the
# model features plus the treatment flag and the two outcome columns; missing
# values are replaced with 0.
df = pd.read_parquet(
    f'/mlx_devbox/users/wangyuxin.huoshan/playground/bonus_train_data/aftersale_train_valid_{country_code}',
    columns=feature_list + ['is_treatment', 'gmv', 'settle_gmv_rate'],
).fillna(0)

# Cast every column that is not already float64 to float64.
for column in df.columns:
    if df[column].dtype != 'float':
        df[column] = df[column].astype('float')

# Binary outcome: 1 when the row has positive GMV, otherwise 0.
# Vectorised comparison instead of a per-row Python lambda; same int64 result.
df['gmv_label'] = (df['gmv'] > 0).astype('int64')

# Per-treatment-group summary, displayed as the cell output.
df.groupby('is_treatment').agg(
    gmv_label_pos_rate=('gmv_label', 'mean'),  # share of positives (mean of a 0/1 column)
    gmv_mean=('gmv', 'mean'),                  # mean GMV
    count=('gmv_label', 'size'),               # number of rows (optional)
).reset_index()

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [None]:
# Build the Tarnet model and move it onto the selected device.
# NOTE(review): the meaning of each argument is defined by the project's
# `Tarnet` class, which is not visible in this notebook — confirm there.
model = Tarnet(
    input_dim=len(feature_list),
    discrete_size_cols=discrete_size_cols,
    embedding_dim=3,
    share_dim=64,
    share_hidden_dims=[512, 256, 256, 128],
    base_hidden_dims=[64, 32, 32, 16],
    output_activation_base=None,
    share_hidden_func=torch.nn.ELU(),
    base_hidden_func=torch.nn.ELU(),
    task='regression',
    classi_nums=2,
    treatment_label_list=[0, 1],
    model_type='Tarnet',
    device=device,
).to(device)

# `partial` with no bound arguments: calling loss_f forwards every argument
# straight to tarnet_loss.
loss_f = partial(tarnet_loss)

In [None]:
# Train the model on `df`. The recorded run below logged an estimated 13047
# steps per epoch and roughly 400 s per epoch.
# NOTE(review): argument semantics are defined by the project's `fit`
# implementation, which is not visible here — the notes below are assumptions.
# NOTE(review): the recorded output shows `uplift_loss: nan` on every epoch
# together with NumPy mean warnings — worth checking how that metric is computed.
model.fit(
    df=df,
    feature_list=feature_list,
    discrete_cols=feature_list_discrete,
    epochs=300,
    batch_size=256,
    learning_rate=1e-5,
    loss_f=loss_f,
    tensorboard=False,
    # NOTE(review): presumably the number of data-loading worker processes —
    # confirm the host has enough CPU cores for 40.
    num_workers=40,
    pin_memory=True,
    device=device,
    # NOTE(review): the name suggests a validation fraction but a boolean is
    # passed — confirm what `fit` expects here.
    valid_perc=True,
    label_y='gmv',
    label_treatment='is_treatment',
    task='regression',
    loss_type='huberloss',
    treatment_label_list=[0, 1],
    checkpoint_path=f'/mlx_devbox/users/wangyuxin.huoshan/playground/bonus_train_data/{country_code}_aftersale_gmv_tarnet_checkpoint_256.pth',
    # NOTE(review): presumably 0 means "train from scratch" rather than resume
    # from the checkpoint — confirm.
    if_continued_train = 0
)

[16.560626 22.251774  0.       ...  0.       86.133461  0.      ]
预计单epoch训练步数: 13047


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


--epoch: 0 train_loss: 74.2683  valid_loss: 74.3893 uplift_loss: nan 
epoch: 0 time: 405.4796s
--epoch: 1 train_loss: 73.8589  valid_loss: 74.0665 uplift_loss: nan 
epoch: 1 time: 402.7189s
--epoch: 2 train_loss: 73.6515  valid_loss: 73.9199 uplift_loss: nan 
epoch: 2 time: 402.0542s
--epoch: 3 train_loss: 73.4729  valid_loss: 73.7446 uplift_loss: nan 
epoch: 3 time: 405.8036s
