# Wide&Deep

In [1]:
# 加载依赖
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

from utils.criteo_dataset import CriteoDataset
from utils.metric import CTRMetric
from utils.trainer import Trainer
from utils.utils import weight_init

In [2]:
# 使用的超参数
from configparser import ConfigParser
config = ConfigParser()
config.read('./config/wide&deep.ini', encoding='utf-8')
config = config._sections

In [3]:
eval(config['DATA']['num_workers'])

2

In [4]:
# 使用的数据集为Criteo数据集
train_pth = '../dataset/criteo-100k-train.txt'
valid_pth = '../dataset/criteo-100k-valid.txt'
test_pth = '../dataset/criteo-100k-test.txt'
train_set = CriteoDataset(train_pth, mode='train')
valid_set = CriteoDataset(valid_pth, mode='valid', encoders=train_set.encoders)
test_set = CriteoDataset(valid_pth, mode='test', encoders=train_set.encoders)



In [5]:
train_loader = DataLoader(
    dataset=train_set,
    batch_size=eval(config['DATA']['train_batch_size']),
    shuffle=True,
    num_workers=eval(config['DATA']['num_workers']),
)

valid_loader = DataLoader(
    dataset=valid_set,
    batch_size=eval(config['DATA']['valid_batch_size']),
    shuffle=False,
    num_workers=eval(config['DATA']['num_workers']),
)

test_loader = DataLoader(
    dataset=test_set,
    batch_size=eval(config['DATA']['test_batch_size']),
    shuffle=False,
    num_workers=eval(config['DATA']['num_workers']),
)

# for e in range(2):
#     for step, (batch_y, batch_X) in enumerate(tqdm(train_loader)):
#         print(f'epoch: {e}\tstep: {step}\tbatch_X: {batch_X}\tbatch_y: {batch_y}')
#         if step >= 10:
#             break

In [6]:
for data in test_loader:
    print(data)
    print(data[1][0][:32].shape)
    break

[tensor([1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1.,
        1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
        1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
        0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0.]), tensor([[ 3.7698e+00,  2.6380e+03,  1.2000e+01,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  2.0000e+00,  2.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  1.0000e+00,  2.9000e+01,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [ 2.0000e+00, -1.0000e+00,  4.0117e+01,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+0

In [7]:
eval(config['MODEL']['feature_dims'])

[512, 256, 128]

In [8]:
# 编写模型，简单起见，把数据集中的连续型特征（包括目标编码后的特征）放到deep里（0：32），剩下（32：111）的放到wide里
class WideAndDeep(nn.Module):
    def __init__(self, config:ConfigParser):
        self.config = config
        super(WideAndDeep, self).__init__()
        self.wide_features_num = eval(config['MODEL']['wide_features_num'])
        self.deep_features_num = eval(config['MODEL']['deep_features_num'])
        self.one_hot_features_num = eval(config['MODEL']['one_hot_features_num'])
        self.embed_dim = eval(config['MODEL']['embed_dim'])
        self.feature_dims = eval(config['MODEL']['feature_dims'])
        self.wide_features_range = eval(config['MODEL']['wide_features_range'])
        self.deep_features_range = eval(config['MODEL']['deep_features_range'])

        self.wide_bn = nn.BatchNorm1d(self.wide_features_num)
        self.wide = nn.Linear(self.wide_features_num, 1)

        self.deep_embed = nn.Embedding(num_embeddings=self.deep_features_num, embedding_dim=self.embed_dim)
        deep_input_dim = self.one_hot_features_num * self.embed_dim
        self.feature_dims.insert(0, deep_input_dim)
        self.feature_dims.append(1)

        self.deep_bn = nn.BatchNorm1d(deep_input_dim)

        self.deep = nn.Sequential()
        for i in range(len(self.feature_dims)):
            self.deep.add_module(f'linear{i}', nn.Linear(self.feature_dims[i], self.feature_dims[i + 1]))
            if i == len(self.feature_dims) - 2:
                break
            self.deep.add_module(f'relu{i}',nn.LeakyReLU())


    def forward(self, batch_x):
        bsz = batch_x.size()[0]
        # ========wide=======
        # bsz, self.wide_features_num
        wide_features = batch_x[:, self.wide_features_range[0]: self.wide_features_range[1]]
        # print(wide_features.shape)
        # bsz, 1
        wide_features = self.wide(self.wide_bn(wide_features))


        # ========deep=======
        # bsz, self.deep_features_num
        deep_features = batch_x[:, self.deep_features_range[0]: self.deep_features_range[1]]
        # print(deep_features.shape)
        # 这里利用了onehot向量组合的特性，每一行1的个数等于原始数据中被onehot处理的特征个数，也等于deep feature行求和的结果
        k = int(torch.sum(deep_features[0]).item())
        # print(f'k={k}')
        _, idx = torch.topk(deep_features, k)
        # embed: bsz, self.deep_features_num, self.embed_dim
        # after view: bsz, self.deep_features_num * self.embed_dim
        deep_features = self.deep_embed(idx).view(bsz, -1)
        # print(deep_features.shape)
        # bsz, 1

        deep_features = self.deep(self.deep_bn(deep_features))
        # raise KeyboardInterrupt
        return deep_features + wide_features

In [9]:
# 初始化模型和参数
model = WideAndDeep(config)
print(model.wide.bias.data)
model.apply(weight_init)
print(model.wide.bias.data)

tensor([-0.0887])
tensor([0.])


In [10]:
optimizer = optim.Adam(lr=eval(config['OPTIM']['learning_rate']), params=model.parameters(), weight_decay=eval(config['OPTIM']['weight_decay']))
loss_func = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(eval(config['LOSS']['pos_weight'])))
metric = CTRMetric()

trainer = Trainer(
    model=model,
    loss_func=loss_func,
    optimizer=optimizer,
    metric=metric,
    train_loader=train_loader,
    valid_loader=valid_loader,
    test_loader=test_loader,
    config=config,
)

TRAIN: 
	epoch: 30
	device: mps
OPTIM: 
	learning_rate: 1e-3
	weight_decay: 1e-1
LOSS: 
	pos_weight: 5.0
MODEL: 
	embed_dim: 256
	one_hot_features_num: 7
	wide_features_num: 32
	wide_features_range: [0, 32]
	deep_features_num: 79
	deep_features_range: [32, 111]
	feature_dims: [512, 256, 128]
	task: classification
	user_num: 611
	item_num: 193610
DATA: 
	train_batch_size: 256
	valid_batch_size: 128
	test_batch_size: 128
	dataset_ratio: [0.8, 0.1, 0.1]
	num_workers: 2
	num_feature: 111


In [11]:
if __name__ == '__main__':
    trainer.train()
    trainer.test()



100%|██████████| 313/313 [00:04<00:00, 62.63it/s] 


Train Epoch: 1
Loss: 1.2085383024078589


100%|██████████| 79/79 [00:02<00:00, 30.34it/s]


Valid Epoch: 1
loss: 1.277462159531026
accuracy: 0.714399
precision: 0.414783
recall: 0.550818
F1: 0.469138
AUC: 0.713258



100%|██████████| 313/313 [00:04<00:00, 65.14it/s] 


Train Epoch: 2
Loss: 1.0086587915024436


100%|██████████| 79/79 [00:02<00:00, 30.66it/s]


Valid Epoch: 2
loss: 1.2050770589067965
accuracy: 0.748121
precision: 0.451978
recall: 0.467982
F1: 0.456204
AUC: 0.727486



100%|██████████| 313/313 [00:04<00:00, 63.54it/s] 


Train Epoch: 3
Loss: 0.9475117238184896


100%|██████████| 79/79 [00:02<00:00, 29.73it/s]


Valid Epoch: 3
loss: 1.1701276196709163
accuracy: 0.748912
precision: 0.453450
recall: 0.465058
F1: 0.456236
AUC: 0.730480



100%|██████████| 313/313 [00:04<00:00, 62.87it/s] 


Train Epoch: 4
Loss: 0.938360391143031


100%|██████████| 79/79 [00:02<00:00, 29.92it/s]


Valid Epoch: 4
loss: 1.1564572015895118
accuracy: 0.751088
precision: 0.457934
recall: 0.469742
F1: 0.460716
AUC: 0.729842



100%|██████████| 313/313 [00:04<00:00, 63.22it/s] 


Train Epoch: 5
Loss: 0.949598237538871


100%|██████████| 79/79 [00:02<00:00, 30.61it/s]


Valid Epoch: 5
loss: 1.156926397281357
accuracy: 0.739320
precision: 0.438072
recall: 0.478126
F1: 0.454424
AUC: 0.723160



100%|██████████| 313/313 [00:04<00:00, 64.28it/s] 


Train Epoch: 6
Loss: 0.9620775784166476


100%|██████████| 79/79 [00:02<00:00, 29.77it/s]


Valid Epoch: 6
loss: 1.167322069783754
accuracy: 0.746835
precision: 0.449094
recall: 0.445234
F1: 0.444014
AUC: 0.720431



100%|██████████| 313/313 [00:04<00:00, 64.50it/s] 


Train Epoch: 7
Loss: 0.9683396835296679


100%|██████████| 79/79 [00:02<00:00, 30.29it/s]


Valid Epoch: 7
loss: 1.1616799771031248
accuracy: 0.737441
precision: 0.434934
recall: 0.472422
F1: 0.449897
AUC: 0.719739



100%|██████████| 313/313 [00:04<00:00, 63.66it/s] 


Train Epoch: 8
Loss: 0.9724025029343919


100%|██████████| 79/79 [00:02<00:00, 30.13it/s]


Valid Epoch: 8
loss: 1.1582144544094424
accuracy: 0.733782
precision: 0.429514
recall: 0.477529
F1: 0.449318
AUC: 0.720537



100%|██████████| 313/313 [00:04<00:00, 63.95it/s] 


Train Epoch: 9
Loss: 0.9725071376504989


100%|██████████| 79/79 [00:02<00:00, 29.53it/s]


Valid Epoch: 9
loss: 1.162882379338711
accuracy: 0.744660
precision: 0.445369
recall: 0.447593
F1: 0.443468
AUC: 0.722220



  0%|          | 0/313 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/xansar/.conda/envs/RecommenderSystem/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/xansar/.conda/envs/RecommenderSystem/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
  File "/Users/xansar/PycharmProjects/RecommenderSystem/Recommender-System-Pytorch/MyImplement/utils/__init__.py", line 1, in <module>
    from . import criteo_dataset, movielens_dataset, metric, trainer
  File "/Users/xansar/PycharmProjects/RecommenderSystem/Recommender-System-Pytorch/MyImplement/utils/criteo_dataset.py", line 3, in <module>
    import pandas_profiling as pp
  File "/Users/xansar/.conda/envs/RecommenderSystem/lib/python3.8/site-packages/pandas_profiling/__init__.py", line 6, in <module>
    from pandas_profiling.controller import pandas_decorato

KeyboardInterrupt: 