In [1]:
# Move the kernel's working directory up one level; the recorded output shows the
# resulting project directory. Presumably this is what lets the project-local
# imports below (utils, topk_mmoe, trainer_and_evaluator) resolve - confirm.
# NOTE(review): the path is relative, so re-running this cell in the same kernel
# moves up one more level each time.
%cd ../
import pandas as pd
from torch.utils.data import Dataset,DataLoader
import torch
import numpy as np
import ast
import torch.nn as nn
import torch.optim as optim
import random
from matplotlib import pyplot as plt
import os
import shutil
from datetime import datetime
import wandb
from utils import set_all_seed, load_exp1_dataset
from topk_mmoe import TopkMMoE
from trainer_and_evaluator import train_TopkMMoE, eval_TopkMMoE

C:\jupyter file\CMU_3\distributed ML\project



In [2]:
# Fix the random seed (42) for this run.
# NOTE(review): set_all_seed comes from utils; presumably it seeds every RNG in use - confirm.
set_all_seed(42)
# load_exp1_dataset (from utils) returns three objects, unpacked here as the
# train / validation / test splits. The recorded output shows it also prints a
# Pearson correlation coefficient and p-value while loading.
train_dataset, val_dataset, test_dataset = load_exp1_dataset()

Absolute Pearson correlation coefficient: 0.17716269850240274
P-value: 0.0


In [3]:
# Hyperparameters, the wandb run name, and the checkpoint path for this experiment.
train_params = {
    "batch_size": 256,
    "N_epochs": 50,
    "lr": 0.0001
}

# Only dropout is enabled in this run: every auxiliary-loss coefficient is 0 and
# expert_dropout is the only non-zero dropout rate.
model_params = {
    "feature_dim": 114,
    "expert_dim": 32,
    "n_expert": 16,
    "n_activated_expert": 4,
    "n_task": 2,
    "sparse_load_balancing_loss_coef": 0,
    "olmo_load_balancing_loss_coef": 0,
    "router_z_loss_coef": 0,
    "gate_dropout": 0,
    "tower_dropout": 0,
    "expert_dropout": 0.2
}

model_name = "exp1_topk_mmoe_dropout"
model_dir = "model/" + model_name
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(model_dir, exist_ok=True)

# "key=value" pairs joined with "_" give a human-readable run description.
train_params_str = "_".join(f"{key}={value}" for key, value in train_params.items())
model_params_str = "_".join(f"{key}={value}" for key, value in model_params.items())
# Values only (no keys), in dict order, to keep the checkpoint path short.
short_model_params_str = "_".join(f"{value}" for value in model_params.values())

wandb_name = model_name + ":" + train_params_str + "_" + model_params_str

# short_model_params_str is used here because Windows limits paths to 260
# characters by default. Despite the "_dir" suffix, this value is later passed
# to torch.load as the checkpoint file path.
bestmodel_save_dir = model_dir + "/" + train_params_str + "_" + short_model_params_str

print(wandb_name)
print(bestmodel_save_dir)

# Start a Weights & Biases run in the "mmoe" project, named after the full
# hyperparameter string so runs can be told apart by name.
wandb.init(project="mmoe", name=wandb_name)

exp1_topk_mmoe_dropout:batch_size=256_N_epochs=50_lr=0.0001_feature_dim=114_expert_dim=32_n_expert=16_n_activated_expert=4_n_task=2_sparse_load_balancing_loss_coef=0_olmo_load_balancing_loss_coef=0_router_z_loss_coef=0_gate_dropout=0_tower_dropout=0_expert_dropout=0.2
model/exp1_topk_mmoe_dropout/batch_size=256_N_epochs=50_lr=0.0001_114_32_16_4_2_0_0_0_0_0_0.2


[34m[1mwandb[0m: Currently logged in as: [33myuntaozh[0m ([33mzhengyuntao[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Build the model from the hyperparameter dict defined above.
mymodel = TopkMMoE(**model_params)

# Total number of scalar parameters across the whole model.
nParams = sum(p.nelement() for p in mymodel.parameters())
print('Number of parameters: %d' % nParams)

# Parameters whose qualified name starts with "sparse_mmoe".
nParams_in_mmoe = sum(
    p.nelement()
    for name, p in mymodel.named_parameters()
    if name.startswith("sparse_mmoe")
)
print('Number of parameters in SparseMMoE: %d' % nParams_in_mmoe)
# Compared with MMoE, w_noises and b_noises are added.
# The activated count scales the SparseMMoE total by n_activated_expert / n_expert.
print(f'Number of activated parameters in SparseMMoE: {int(nParams_in_mmoe*model_params["n_activated_expert"]/model_params["n_expert"])}')

Number of parameters: 45586
Number of parameters in SparseMMoE: 43584
Number of activated parameters in SparseMMoE: 10896


In [5]:
# Train on the training split and validate on the validation split.
# train_params supplies batch_size, N_epochs and lr as keyword arguments.
# The recorded log shows the model being saved whenever an epoch is the best so far;
# presumably it is written to bestmodel_save_dir - confirm in trainer_and_evaluator.
losses, val_losses, adam_batch_loss = train_TopkMMoE(
    mymodel,
    train_dataset,
    val_dataset,
    bestmodel_save_dir,
    **train_params,
)

Currently using device:cuda
Epoch=0,train_loss=0.8511374592781067,val_loss=0.535780131816864
current epoch is the best so far. Saving model...
Epoch=1,train_loss=0.5531395077705383,val_loss=0.4611344039440155
current epoch is the best so far. Saving model...
Epoch=2,train_loss=0.49470704793930054,val_loss=0.4260278642177582
current epoch is the best so far. Saving model...
Epoch=3,train_loss=0.4503813683986664,val_loss=0.3904240131378174
current epoch is the best so far. Saving model...
Epoch=4,train_loss=0.4059414863586426,val_loss=0.3733872175216675
current epoch is the best so far. Saving model...
Epoch=5,train_loss=0.37316104769706726,val_loss=0.3385964035987854
current epoch is the best so far. Saving model...
Epoch=6,train_loss=0.3437843322753906,val_loss=0.3096078932285309
current epoch is the best so far. Saving model...
Epoch=7,train_loss=0.3277091383934021,val_loss=0.29977530241012573
current epoch is the best so far. Saving model...
Epoch=8,train_loss=0.31449761986732483,val

In [6]:
# Load the best model (chosen on validation) into a freshly constructed network.
mybestmodel = TopkMMoE(**model_params)
# Put the model in evaluation mode before it is used for testing: expert_dropout
# is 0.2, and dropout must be disabled at inference time.
# NOTE(review): harmless if eval_TopkMMoE already calls .eval() itself.
mybestmodel.eval()
# Training ran on CUDA, so map the checkpoint to CPU: the new model lives on CPU,
# and this also lets the checkpoint load on a machine without a GPU.
mybestmodel.load_state_dict(torch.load(bestmodel_save_dir, map_location="cpu"))

<All keys matched successfully>

In [7]:
# Evaluate the restored best model on the held-out test split.
# eval_TopkMMoE returns two values and the recorded output prints two AUC lines;
# presumably one AUC per task (n_task is 2) - confirm in trainer_and_evaluator.
auc1, auc2=eval_TopkMMoE(mybestmodel, test_dataset)

The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.
AUC: 0.9398881470859197
AUC: 0.9896809098137342


In [8]:
# Close the Weights & Biases run opened by wandb.init earlier in the notebook;
# the recorded output shows the final upload progress and the run summary.
wandb.finish()

VBox(children=(Label(value='1.266 MB of 1.292 MB uploaded\r'), FloatProgress(value=0.9804389443305424, max=1.0…

0,1
load_balancing_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
router_z_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
task1_loss,██▅▃▃▄▃▃▃▃▃▂▃▃▂▂▃▂▃▂▂▃▂▂▂▂▂▄▂▃▁▂▃▂▁▃▁▂▃▂
task2_loss,█▇▅▃▄▃▃▂▃▂▂▃▂▂▂▂▂▃▂▂▁▃▃▂▂▁▂▃▂▁▂▁▂▂▂▂▂▂▁▃
task_0/expert_0_weight,██▃▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
task_0/expert_10_weight,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
task_0/expert_11_weight,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
task_0/expert_12_weight,▂▃█▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
task_0/expert_13_weight,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
task_0/expert_14_weight,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█

0,1
load_balancing_loss,0.0
router_z_loss,0.0
task1_loss,0.1508
task2_loss,0.1223
task_0/expert_0_weight,0.0
task_0/expert_10_weight,0.0
task_0/expert_11_weight,0.0
task_0/expert_12_weight,0.0
task_0/expert_13_weight,0.0
task_0/expert_14_weight,2e-05
