# Logistic Regression

In [1]:
# 依赖，使用torch构建模型
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

from tqdm import tqdm

from utils.utils import CriteoDataset, CTRMetric

# 1. 数据集处理

In [2]:
# 使用的超参数
config = {
    'TRAIN_BATCH_SIZE': 128,
    'VALID_BATCH_SIZE': 128,
    'TEST_BATCH_SIZE': 128,
    'DEVICE': 'mps',
    'NUM_WORKERS': 6,
    'EPOCH': 30,
    'NUM_FEATURE': 111,
    'POS_WEIGHT': 4,
    'LEARNING_RATE': 1e-4,
}


In [3]:
# 使用的数据集为Criteo数据集
train_pth = '../dataset/criteo-100k-train.txt'
valid_pth = '../dataset/criteo-100k-valid.txt'
test_pth = '../dataset/criteo-100k-test.txt'
train_set = CriteoDataset(train_pth, mode='train')
valid_set = CriteoDataset(valid_pth, mode='valid', encoders=train_set.encoders)
test_set = CriteoDataset(valid_pth, mode='test', encoders=train_set.encoders)



In [4]:
train_loader = DataLoader(
    dataset=train_set,
    batch_size=config['TRAIN_BATCH_SIZE'],
    shuffle=True,
    num_workers=config['NUM_WORKERS']
)

valid_loader = DataLoader(
    dataset=valid_set,
    batch_size=config['VALID_BATCH_SIZE'],
    shuffle=False,
    num_workers=config['NUM_WORKERS']
)

test_loader = DataLoader(
    dataset=test_set,
    batch_size=config['TEST_BATCH_SIZE'],
    shuffle=False,
    num_workers=config['NUM_WORKERS']
)

# for e in range(2):
#     for step, (batch_y, batch_X) in enumerate(tqdm(train_loader)):
#         print(f'epoch: {e}\tstep: {step}\tbatch_X: {batch_X}\tbatch_y: {batch_y}')
#         if step >= 10:
#             break

In [5]:
# 构建模型
class LogisticRegressionModel(nn.Module):
    def __init__(self, num_features):
        super(LogisticRegressionModel, self).__init__()
        self.bn = nn.BatchNorm1d(num_features=num_features)
        self.linar_layer = nn.Linear(num_features, 1)

    def forward(self, batch_X):
        return self.linar_layer(self.bn(batch_X))

In [6]:
# 构建trianer
class Trainer:
    def __init__(self, model, loss_func, optimizer, metric, train_loader, valid_loader, test_loader, config):
        self.config = config
        print('='*10 + "Config" + '='*10)
        for k, v in self.config.items():
            print(f'{k}: {v}')
        print('='*25)
        self.model: nn.Module = model
        self.loss_func: nn.Module = loss_func
        self.optimizer: optim.Optimizer = optimizer
        self.metric = metric
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.test_loader = test_loader

        self.to(self.config['DEVICE'])

    def to(self, device=None):
        if device is None:
            self.model = self.model.to(self.config['DEVICE'])
            self.loss_func = self.loss_func.to(self.config['DEVICE'])
        else:
            self.model = self.model.to(device)
            self.loss_func = self.loss_func.to(self.config['DEVICE'])
            self.config['DEVICE'] = device

    def step(self, batch_y, batch_x, mode='train'):
        device = self.config['DEVICE']
        if mode == 'train':
            self.model.train()
            self.optimizer.zero_grad()

            batch_y = batch_y.to(device)
            batch_x = batch_x.to(device)

            pred = self.model(batch_x)
            loss = self.loss_func(pred, batch_y)
            loss.backward()
            self.optimizer.step()
            return loss.item(), pred
        elif mode == 'evaluate':
            with torch.no_grad():
                self.model.eval()

                batch_y = batch_y.to(device)
                batch_x = batch_x.to(device)

                pred = self.model(batch_x)
                loss = self.loss_func(pred, batch_y)
                return loss.item(), pred
        else:
            raise ValueError("Wrong Mode")
    def _compute_metric(self, metric_str):
        self.metric.get_batch_metric()
        for k, v in self.metric.metric_dict.items():
            if k == 'cnt':
                continue
            metric_str += f'{k}: {v:4f}\n'
        self.metric.init_metric()
        return metric_str
    def train(self):
        print("="*10 + "TRAIN BEGIN" + "="*10)
        epoch = self.config['EPOCH']
        for e in range(1, epoch + 1):
            all_loss = 0.0
            for s, (batch_y, batch_x) in enumerate(tqdm(train_loader)):
                loss, pred = self.step(batch_y.unsqueeze(1), batch_x, mode='train')
                all_loss += loss

            all_loss /= s + 1
            print(f'Train Epoch: {e}\nLoss: {all_loss}')
            if e % 1 == 0:
                all_loss = 0.0
                self.metric.init_metric()
                for s, (batch_y, batch_x) in enumerate(tqdm(valid_loader)):
                    loss, pred = self.step(batch_y.unsqueeze(1), batch_x, mode='evaluate')
                    all_loss += loss

                    self.metric.compute_metric(pred, batch_y)
                all_loss /= s + 1
                metric_str = f'loss: {all_loss}\n'
                metric_str = self._compute_metric(metric_str)
                print(f'Valid Epoch: {e}\n' + metric_str)
        print("="*10 + "TRAIN END" + "="*10)

    def test(self):
        all_loss = 0.0
        self.metric.init_metric()
        for s, (batch_y, batch_x) in enumerate(tqdm(test_loader)):
            loss, pred = self.step(batch_y.unsqueeze(1), batch_x, mode='evaluate')
            all_loss += loss

        all_loss /= s + 1
        metric_str = f'loss: {all_loss}\n'
        metric_str = self._compute_metric(metric_str)
        print(f'Test Loss: {all_loss}\n' + metric_str)

In [7]:
model = LogisticRegressionModel(num_features=config['NUM_FEATURE'])
weight = torch.tensor(config['POS_WEIGHT'])
loss_func = nn.BCEWithLogitsLoss(pos_weight=weight)
optimizer = optim.Adam(lr=config['LEARNING_RATE'], params=model.parameters())
metric = CTRMetric()

trainer = Trainer(
    model=model,
    loss_func=loss_func,
    optimizer=optimizer,
    metric=metric,
    train_loader=train_loader,
    valid_loader=valid_loader,
    test_loader=test_loader,
    config=config
)

TRAIN_BATCH_SIZE: 128
VALID_BATCH_SIZE: 128
TEST_BATCH_SIZE: 128
DEVICE: mps
NUM_WORKERS: 6
EPOCH: 30
NUM_FEATURE: 111
POS_WEIGHT: [4]
LEARNING_RATE: 0.0001


In [8]:
if __name__ == '__main__':
    trainer.train()
    trainer.test()



100%|██████████| 625/625 [00:09<00:00, 67.62it/s] 


Train Epoch: 1
Loss: 1.0577327836036683


100%|██████████| 79/79 [00:07<00:00, 11.21it/s]


Valid Epoch: 1
loss: 1.0555117107644867
accuracy: 0.737342
precision: 0.429355
recall: 0.428049
F1: 0.424967
AUC: 0.710751



100%|██████████| 625/625 [00:09<00:00, 68.97it/s] 


Train Epoch: 2
Loss: 0.9186187308311462


100%|██████████| 79/79 [00:07<00:00, 11.09it/s]


Valid Epoch: 2
loss: 1.0296182790889015
accuracy: 0.738924
precision: 0.440759
recall: 0.506901
F1: 0.468177
AUC: 0.736729



100%|██████████| 625/625 [00:09<00:00, 68.40it/s] 


Train Epoch: 3
Loss: 0.866689479637146


100%|██████████| 79/79 [00:07<00:00, 11.22it/s]


Valid Epoch: 3
loss: 1.012877725347688
accuracy: 0.737144
precision: 0.439886
recall: 0.537942
F1: 0.480869
AUC: 0.746732



100%|██████████| 625/625 [00:08<00:00, 70.41it/s] 


Train Epoch: 4
Loss: 0.8407413570404053


100%|██████████| 79/79 [00:06<00:00, 11.50it/s]


Valid Epoch: 4
loss: 1.0091328809532938
accuracy: 0.739122
precision: 0.443644
recall: 0.543889
F1: 0.485548
AUC: 0.749602



100%|██████████| 625/625 [00:08<00:00, 71.45it/s] 


Train Epoch: 5
Loss: 0.8233562371253967


100%|██████████| 79/79 [00:06<00:00, 11.50it/s]


Valid Epoch: 5
loss: 1.0172273063961463
accuracy: 0.743968
precision: 0.449669
recall: 0.528974
F1: 0.483197
AUC: 0.750255



100%|██████████| 625/625 [00:08<00:00, 71.57it/s] 


Train Epoch: 6
Loss: 0.8124627101898193


100%|██████████| 79/79 [00:06<00:00, 11.50it/s]


Valid Epoch: 6
loss: 1.0250733879548084
accuracy: 0.749901
precision: 0.458684
recall: 0.516927
F1: 0.483254
AUC: 0.749421



100%|██████████| 625/625 [00:08<00:00, 70.95it/s] 


Train Epoch: 7
Loss: 0.8038832220077514


100%|██████████| 79/79 [00:06<00:00, 11.29it/s]


Valid Epoch: 7
loss: 1.0275592494614516
accuracy: 0.745154
precision: 0.451820
recall: 0.534962
F1: 0.486703
AUC: 0.748990



100%|██████████| 625/625 [00:08<00:00, 70.86it/s] 


Train Epoch: 8
Loss: 0.7969004862785339


100%|██████████| 79/79 [00:06<00:00, 11.48it/s]


Valid Epoch: 8
loss: 1.0252509833891181
accuracy: 0.750297
precision: 0.459529
recall: 0.517545
F1: 0.483591
AUC: 0.749141



100%|██████████| 625/625 [00:08<00:00, 71.48it/s] 


Train Epoch: 9
Loss: 0.7916098605155945


100%|██████████| 79/79 [00:06<00:00, 11.51it/s]


Valid Epoch: 9
loss: 1.0363187269319463
accuracy: 0.746440
precision: 0.453645
recall: 0.530120
F1: 0.485554
AUC: 0.746968



100%|██████████| 625/625 [00:08<00:00, 71.44it/s] 


Train Epoch: 10
Loss: 0.7872112084388733


100%|██████████| 79/79 [00:06<00:00, 11.47it/s]


Valid Epoch: 10
loss: 1.038079438330252
accuracy: 0.752176
precision: 0.461570
recall: 0.515112
F1: 0.483594
AUC: 0.749421



100%|██████████| 625/625 [00:08<00:00, 71.44it/s] 


Train Epoch: 11
Loss: 0.7843875405311584


100%|██████████| 79/79 [00:06<00:00, 11.44it/s]


Valid Epoch: 11
loss: 1.0421081600309927
accuracy: 0.752373
precision: 0.462973
recall: 0.504397
F1: 0.479516
AUC: 0.747742



100%|██████████| 625/625 [00:08<00:00, 69.59it/s] 


Train Epoch: 12
Loss: 0.7821734395980835


100%|██████████| 79/79 [00:07<00:00, 11.25it/s]


Valid Epoch: 12
loss: 1.0572454868992673
accuracy: 0.747132
precision: 0.453794
recall: 0.513601
F1: 0.478608
AUC: 0.746095



100%|██████████| 625/625 [00:08<00:00, 69.68it/s] 


Train Epoch: 13
Loss: 0.7817610609054565


100%|██████████| 79/79 [00:06<00:00, 11.42it/s]


Valid Epoch: 13
loss: 1.0468581492387796
accuracy: 0.754450
precision: 0.466657
recall: 0.496185
F1: 0.477626
AUC: 0.748067



100%|██████████| 625/625 [00:08<00:00, 70.60it/s] 


Train Epoch: 14
Loss: 0.7790017313957215


100%|██████████| 79/79 [00:06<00:00, 11.45it/s]


Valid Epoch: 14
loss: 1.0554074577138395
accuracy: 0.751483
precision: 0.461252
recall: 0.502280
F1: 0.477563
AUC: 0.747561



100%|██████████| 625/625 [00:08<00:00, 70.96it/s] 


Train Epoch: 15
Loss: 0.7783238967895508


100%|██████████| 79/79 [00:07<00:00, 11.25it/s]


Valid Epoch: 15
loss: 1.0596637891817697
accuracy: 0.747627
precision: 0.452311
recall: 0.506893
F1: 0.474969
AUC: 0.744651



100%|██████████| 625/625 [00:08<00:00, 69.96it/s] 


Train Epoch: 16
Loss: 0.7768620523452758


100%|██████████| 79/79 [00:06<00:00, 11.35it/s]


Valid Epoch: 16
loss: 1.0657930962647064
accuracy: 0.759889
precision: 0.472401
recall: 0.467350
F1: 0.466899
AUC: 0.745698



100%|██████████| 625/625 [00:08<00:00, 69.96it/s] 


Train Epoch: 17
Loss: 0.777294733428955


100%|██████████| 79/79 [00:07<00:00, 11.07it/s]


Valid Epoch: 17
loss: 1.057172275042232
accuracy: 0.748418
precision: 0.454095
recall: 0.506926
F1: 0.475441
AUC: 0.746172



100%|██████████| 625/625 [00:08<00:00, 69.61it/s] 


Train Epoch: 18
Loss: 0.7768491014480591


100%|██████████| 79/79 [00:06<00:00, 11.36it/s]


Valid Epoch: 18
loss: 1.0593930060350443
accuracy: 0.753362
precision: 0.461143
recall: 0.489052
F1: 0.471610
AUC: 0.746387



100%|██████████| 625/625 [00:08<00:00, 70.99it/s] 


Train Epoch: 19
Loss: 0.7746786587715149


100%|██████████| 79/79 [00:06<00:00, 11.35it/s]


Valid Epoch: 19
loss: 1.0596915762635726
accuracy: 0.758604
precision: 0.471331
recall: 0.480927
F1: 0.472825
AUC: 0.747887



100%|██████████| 625/625 [00:08<00:00, 70.59it/s] 


Train Epoch: 20
Loss: 0.7741651511192322


100%|██████████| 79/79 [00:06<00:00, 11.45it/s]


Valid Epoch: 20
loss: 1.0574604280387299
accuracy: 0.748022
precision: 0.453297
recall: 0.510963
F1: 0.476885
AUC: 0.745680



100%|██████████| 625/625 [00:08<00:00, 70.61it/s] 


Train Epoch: 21
Loss: 0.7739366086006164


100%|██████████| 79/79 [00:06<00:00, 11.29it/s]


Valid Epoch: 21
loss: 1.0641404322431058
accuracy: 0.748912
precision: 0.452502
recall: 0.491283
F1: 0.467891
AUC: 0.743575



100%|██████████| 625/625 [00:08<00:00, 70.91it/s] 


Train Epoch: 22
Loss: 0.7727512150764465


100%|██████████| 79/79 [00:06<00:00, 11.36it/s]


Valid Epoch: 22
loss: 1.0627609619611427
accuracy: 0.750297
precision: 0.455396
recall: 0.492948
F1: 0.470192
AUC: 0.744255



100%|██████████| 625/625 [00:08<00:00, 70.72it/s] 


Train Epoch: 23
Loss: 0.7746822437286377


  0%|          | 0/79 [00:03<?, ?it/s]Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/xansar/.conda/envs/RecommenderSystem/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/xansar/.conda/envs/RecommenderSystem/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
  File "/Users/xansar/PycharmProjects/RecommenderSystem/Recommender-System-Pytorch/MyImplement/utils/utils.py", line 3, in <module>
    import pandas_profiling as pp
  File "/Users/xansar/.conda/envs/RecommenderSystem/lib/python3.8/site-packages/pandas_profiling/__init__.py", line 6, in <module>
    from pandas_profiling.controller import pandas_decorator
  File "/Users/xansar/.conda/envs/RecommenderSystem/lib/python3.8/site-packages/pandas_profiling/controller/pandas_decorator.py", line 4, in <module>
    from pandas_profiling.profile_report import ProfileReport
 

KeyboardInterrupt: 