In [1]:
import os
import sys
import pickle
import torch
import pandas as pd
from torch import nn as nn
from torch.utils.data import DataLoader, Dataset

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import BertModel, BertTokenizer

from typing import NewType, List, Tuple, Dict, Any

# Project-local modules (the repository root is appended to sys.path below)
sys.path.append('/root/projects/nlp-code-examples')
from src.easy_bert.TextClassifiy import models
from src.easy_bert.TextClassifiy import tools
from src.easy_bert.TextClassifiy import utils

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
class Config:
    """Settings shared by the cells of this notebook (device, paths, hyper-parameters)."""

    # ---- device ----
    use_gpu = True  # request CUDA; the device cell falls back to CPU if it is unavailable
    gpu_id = 0      # CUDA device index handed to torch.device

    # ---- data / pretrained weights ----
    preprocess = True
    bert_path = '/root/pretrained/bert-base-chinese'
    data_path = '/root/data/public/ChnSentiCorp'

    # Location where the preprocessed files are stored (joined onto cfg.cwd when loading).
    out_path = 'data/out'

    # ---- model / batching ----
    max_len = 256
    batch_size = 32   # batch size given to every DataLoader
    dropout = 0.3
    num_hidden = 768
    num_classes = 1

    # ---- training schedule ----
    train_batch_size = 32  # used when estimating the number of scheduler steps
    epoch = 1              # number of passes of the training loop
    seed = 1234            # base seed; the loop adds the epoch index to it
    early_stopping_patience = 6  # epochs without valid-loss improvement before the ES point is recorded

    # ---- bookkeeping / logging ----
    model_name = 'bert'
    train_log = 10
    log_interval = 10


cfg = Config()
# Remember the working directory so that out_path can be resolved to an absolute location.
cfg.cwd = os.getcwd()

In [3]:
# Pick the configured CUDA device only when it is both requested and actually
# available; in every other case run on the CPU.
device = (
    torch.device('cuda', cfg.gpu_id)
    if cfg.use_gpu and torch.cuda.is_available()
    else torch.device('cpu')
)

In [4]:
# Data preprocessing.
# Honour the `preprocess` switch declared on Config: with the flag off, the
# step is skipped and previously generated files are reused.
# NOTE(review): presumably this writes the train/valid/test pickles under
# cfg.out_path that the following cell loads — confirm in tools.preprocess.
if cfg.preprocess:
    tools.preprocess(cfg)

In [5]:
# Absolute locations of the three pickled splits: <cwd>/<out_path>/<split file>.
train_data_path, valid_data_path, test_data_path = (
    os.path.join(cfg.cwd, cfg.out_path, split_file)
    for split_file in ('train.pkl', 'valid.pkl', 'test.pkl')
)

# Wrap each split in the project's dataset class (built in train, valid, test order).
train_dataset, valid_dataset, test_dataset = map(
    tools.CustomDataset,
    (train_data_path, valid_data_path, test_data_path),
)

In [6]:
# Batch the three splits. Only the training loader shuffles: reshuffling each
# epoch matters for SGD, whereas validation/test metrics should be computed in
# a fixed, reproducible order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=cfg.batch_size,
    shuffle=True,
    collate_fn=tools.collate_fn(cfg),
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=cfg.batch_size,
    shuffle=False,
    collate_fn=tools.collate_fn(cfg),
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=cfg.batch_size,
    shuffle=False,
    collate_fn=tools.collate_fn(cfg),
)

Some weights of the model checkpoint at /root/pretrained/bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# ---- optimizer parameter groups ----
# Parameters whose name contains one of the `no_decay` fragments (biases and
# LayerNorm weights/biases) are excluded from weight decay; everything else
# gets a small decay.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

# ---- schedule length ----
cfg.dataset_len = len(train_dataset)
# Count optimizer steps from the loader itself: it is built with cfg.batch_size
# and keeps the last partial batch, so len(train_dataloader) is the true number
# of batches per epoch (the previous estimate used cfg.train_batch_size and
# rounded the fractional batch down).
# NOTE(review): assumes tools.train steps the scheduler once per batch — confirm.
num_train_steps = len(train_dataloader) * cfg.epoch

optimizer = AdamW(optimizer_parameters, lr=3e-5)
# Linear decay of the learning rate to zero over the whole run, no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
)

# Binary cross-entropy on raw logits; cfg.num_classes is 1.
# NOTE(review): presumably the model emits a single logit per sample — confirm in models.
criterion = nn.BCEWithLogitsLoss()

# ---- training bookkeeping ----
# Best validation F1 over all epochs, and the epoch it occurred in.
best_f1, best_epoch = -1, 0
# Running early-stopping state, keyed on the lowest validation loss so far.
es_loss, es_f1, es_epoch, es_patience, es_path = 1e8, -1, 0, 0, ''
# Snapshot taken when the patience budget is exhausted.
best_es_epoch, best_es_f1, best_es_path = 0, -1, ''
# Per-epoch loss history.
train_losses, valid_losses = [], []

In [18]:
# No summary writer is used in this notebook; None is passed through to tools.train.
writer = None

for epoch in range(1, cfg.epoch + 1):
    # Re-seed every epoch with a different but deterministic value.
    utils.manual_seed(cfg.seed + epoch)

    train_loss = tools.train(epoch, model, train_dataloader, optimizer, scheduler, criterion, device, writer, cfg)
    valid_f1, valid_loss = tools.validate(epoch, model, valid_dataloader, criterion, device, cfg)
    model_path = model.save(epoch, cfg)

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    # Track the best validation F1 seen over the whole run.
    if best_f1 < valid_f1:
        best_f1 = valid_f1
        best_epoch = epoch

    # Use the validation loss as the early-stopping criterion.
    if es_loss > valid_loss:
        # New lowest validation loss: remember this checkpoint and reset patience.
        es_loss = valid_loss
        es_f1 = valid_f1
        es_epoch = epoch
        es_patience = 0
        es_path = model_path
    else:
        es_patience += 1
        if es_patience >= cfg.early_stopping_patience:
            # Patience exhausted: record the checkpoint early stopping would select.
            # NOTE(review): training is not interrupted here; the loop only
            # records the early-stopping point and keeps running all epochs.
            best_es_epoch = es_epoch
            best_es_f1 = es_f1
            best_es_path = es_path

# Patience was never exhausted: fall back to the lowest-valid-loss checkpoint.
# Epoch and F1 are filled in together with the path so the three values always
# describe the same checkpoint (previously only the path was set, leaving the
# epoch at 0 and the F1 at -1).
if best_es_path == '':
    best_es_epoch, best_es_f1, best_es_path = es_epoch, es_f1, es_path

# Final evaluation on the test split with the model as it stands after the last epoch.
# The F1 is bound to a real name rather than `_`, which IPython reserves for
# the previous cell output.
test_f1, test_loss = tools.validate(-1, model, test_dataloader, criterion, device, cfg)