In [1]:
import random
import numpy as np
import pandas as pd
import pickle

from functools import partial
from esn_tarnet import *
from feature_select import *
from s_learner import *
from t_learner import *
from tarnet import *
from dragonnet import *

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch's
    CPU and CUDA generators with the same value, then switches cuDNN to
    deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed applied to all generators.

    Note:
        ``torch`` is not imported explicitly in this notebook; it presumably
        arrives through one of the star imports above -- TODO confirm.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Fixed cuDNN kernel choice: deterministic on, benchmark auto-tuner off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
# Seed all RNGs once, before any model or data loader is created.
set_seed(seed=42)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [2]:
# Country code interpolated into every per-country file name below.
country_code = 'ID'

# Single root directory for all inputs of this experiment, so relocating the
# data only requires editing one line.
data_dir = '/mlx_devbox/users/wangyuxin.huoshan/playground/bonus_train_data'

# Pickled feature metadata (loaded with pickle in the next cell).
feature_list_path = f'{data_dir}/{country_code}_feature_list_aftersale_gmv_selected.pkl'
feature_list_discrete_path = f'{data_dir}/{country_code}_feature_list_aftersale_gmv_selected_discrete.pkl'
discrete_size_cols_path = f'{data_dir}/{country_code}_feature_list_aftersale_gmv_selected_discrete_size.pkl'

# Training/validation data location (read with pd.read_parquet further down).
train_data_path = f'{data_dir}/aftersale_train_valid_{country_code}'
# Backward-compatible alias: later cells refer to the original misspelled name.
train_data_apth = train_data_path

In [3]:
# Load the feature lists.
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading,
    so only point this at files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Each load is followed by a length printout as a quick sanity check.
feature_list = _load_pickle(feature_list_path)
print(len(feature_list))

feature_list_discrete = _load_pickle(feature_list_discrete_path)
print(len(feature_list_discrete))

discrete_size_cols = _load_pickle(discrete_size_cols_path)
print(len(discrete_size_cols))

# Target / loss configuration shared by the model constructor and model.fit().
label_y = 'gmv'                 # outcome column passed to fit() as label_y
task = 'regression'
treatment_label_list = [0, 1]   # treatment values passed to the model and to fit()
loss_type = 'huberloss'

# Checkpoint file handed to fit() via checkpoint_path.
checkpoint_path = f'/mlx_devbox/users/wangyuxin.huoshan/playground/bonus_train_data/{country_code}_aftersale_gmv_tarnet_checkpoint_256.pth'

# Build the Tarnet model. The class comes from the project's star imports
# (presumably tarnet.py -- TODO confirm); the meaning of each hyperparameter is
# defined there, not in this notebook.
model = Tarnet(
    input_dim=len(feature_list),
    discrete_size_cols=discrete_size_cols,
    embedding_dim=3,
    share_dim=64,
    share_hidden_dims=[256, 128, 128, 64],
    base_hidden_dims=[64, 32, 32, 16],
    output_activation_base=None,
    share_hidden_func=torch.nn.ELU(),
    base_hidden_func=torch.nn.ELU(),
    task=task,
    classi_nums=2,
    treatment_label_list=treatment_label_list,
    model_type='Tarnet',
    device=device,
)
# Move the model onto the selected device.
model = model.to(device)

# Loss callable handed to fit(); partial() is applied without bound arguments.
loss_f = partial(tarnet_loss)

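# Alternative (disabled): Dragonnet configured with task='classification'. To use it, swap it in for the Tarnet model and loss_f above.
# model = Dragonnet(input_dim=len(feature_list),discrete_size_cols=discrete_size_cols,embedding_dim=3,share_dim=64,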
#                  share_hidden_dims =[512,256,256,128], share_hidden_func = torch.nn.ELU(),
#                  base_hidden_dims=[64,32,32,16],output_activation_base=torch.nn.Sigmoid(),base_hidden_func = torch.nn.ELU(),
#                  ipw_hidden_dims=[64,32,32,16],output_activation_ipw=torch.nn.Sigmoid(),ipw_hidden_func = torch.nn.ELU(),
#                  epsilons_hidden_dims=[64,32,32,16],output_activation_epsilons=torch.nn.Sigmoid(),epsilons_hidden_func = torch.nn.ELU(),
#                  task = 'classification',classi_nums=2, treatment_label_list=[0,1],model_type='Dragonnet',device=device
# ).to(device)
# loss_f = partial(dragonnet_loss)


200
60
60


In [4]:
# Load only the selected features plus the treatment flag and the outcome;
# missing values are replaced with 0.
df = pd.read_parquet(
    train_data_apth,
    columns=feature_list + ['is_treatment', 'gmv'],
).fillna(0)

# Cast every column that is not already float64. Columns that already are
# float64 are left untouched, which avoids copying the whole frame.
for column in df.columns:
    if df[column].dtype != 'float':
        df[column] = df[column].astype('float')

# Binary label: 1 when gmv is positive, else 0. A vectorised comparison replaces
# the previous row-by-row Python lambda and yields the same int64 values.
df['gmv_label'] = (df['gmv'] > 0).astype('int64')

# Per-arm summary, displayed as the cell output.
df.groupby('is_treatment').agg(
    gmv_label_pos_rate=('gmv_label', 'mean'),  # positive rate = mean, since the label is 0/1
    gmv_mean=('gmv', 'mean'),                  # mean gmv
    count=('gmv_label', 'size'),               # sample count (optional)
).reset_index()



Unnamed: 0,is_treatment,gmv_label_pos_rate,gmv_mean,count
0,0.0,0.477848,37.374539,2783298
1,1.0,0.478017,37.578845,2783677


In [5]:
# All training settings gathered in one mapping so they can be inspected or
# logged before the run starts.
# NOTE(review): valid_perc is given a bool although the name suggests a
#   fraction -- confirm how fit() interprets it.
# NOTE(review): the recorded output of this cell shows `uplift_loss: nan` on
#   every epoch together with NumPy mean warnings -- worth checking the uplift
#   metric computation inside fit().
# NOTE(review): the recorded run takes roughly 290-295 s per epoch, so the full
#   300 epochs is on the order of a day.
fit_kwargs = dict(
    df=df,
    feature_list=feature_list,
    discrete_cols=feature_list_discrete,
    epochs=300,
    batch_size=256,
    learning_rate=1e-5,
    loss_f=loss_f,
    tensorboard=False,
    num_workers=40,
    pin_memory=True,
    device=device,
    valid_perc=True,
    label_y=label_y,
    label_treatment='is_treatment',
    task=task,
    loss_type=loss_type,
    treatment_label_list=treatment_label_list,
    checkpoint_path=checkpoint_path,
    if_continued_train=0,
)
model.fit(**fit_kwargs)


[16.560626 22.251774  0.       ...  0.       86.133461  0.      ]
预计单epoch训练步数: 13047
--epoch: 0 train_loss: 74.2378  valid_loss: 74.4042 uplift_loss: nan 
epoch: 0 time: 294.4381s


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


--epoch: 1 train_loss: 73.8390  valid_loss: 74.1460 uplift_loss: nan 
epoch: 1 time: 294.4726s
--epoch: 2 train_loss: 73.6423  valid_loss: 73.9751 uplift_loss: nan 
epoch: 2 time: 291.4906s
--epoch: 3 train_loss: 73.4656  valid_loss: 73.7878 uplift_loss: nan 
epoch: 3 time: 293.9627s
--epoch: 4 train_loss: 73.2966  valid_loss: 73.6177 uplift_loss: nan 
epoch: 4 time: 294.2649s
--epoch: 5 train_loss: 73.1352  valid_loss: 73.4666 uplift_loss: nan 
epoch: 5 time: 292.3166s
--epoch: 6 train_loss: 72.9805  valid_loss: 73.2900 uplift_loss: nan 
epoch: 6 time: 294.7252s
--epoch: 7 train_loss: 72.8310  valid_loss: 73.1222 uplift_loss: nan 
epoch: 7 time: 294.7894s
--epoch: 8 train_loss: 72.6872  valid_loss: 72.9943 uplift_loss: nan 
epoch: 8 time: 293.4840s
--epoch: 9 train_loss: 72.5505  valid_loss: 72.8409 uplift_loss: nan 
epoch: 9 time: 291.2804s
--epoch: 10 train_loss: 72.4185  valid_loss: 72.7242 uplift_loss: nan 
epoch: 10 time: 292.7400s
--epoch: 11 train_loss: 72.2911  valid_loss: 72.