In [104]:
import os
import random
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score, classification_report, accuracy_score
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, SequentialSampler
from tqdm import tqdm
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), then switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning selects kernels by timing them, so the choice can differ
    # between runs; disable it to go with the deterministic flag above.
    torch.backends.cudnn.benchmark = False

In [73]:
# logger
import datetime
import logging
import time


def custom_logger(name: str, log_level: str):
    """Create (or reconfigure) a stream logger that stamps records in UTC+8.

    Calling this again with the same ``name`` replaces the previous handler
    instead of stacking a second one, so re-running the cell does not
    duplicate log lines.

    Args:
        name: Logger name, shown in every record.
        log_level: Level name or number accepted by ``Logger.setLevel``.

    Returns:
        The configured ``logging.Logger`` with exactly one ``StreamHandler``.
    """
    logger = logging.getLogger(name)
    logger.setLevel(log_level)
    logger.propagate = False
    # Detach and close handlers left over from earlier calls.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # Custom log format.
    # formatter = logging.Formatter('[%(asctime)s]-[%(name)s:%(levelname)s]-[%(process)d-%(thread)d]:%(message)s')
    formatter = logging.Formatter('[%(asctime)s]-[%(name)s:%(levelname)s]: %(message)s')

    # Render timestamps in UTC+8 regardless of the machine's local timezone.
    utc8 = datetime.timezone(datetime.timedelta(hours=8))

    def _utc8_aera(timestamp):
        # Timezone-aware conversion; replaces the deprecated utcfromtimestamp().
        return datetime.datetime.fromtimestamp(timestamp, tz=utc8).timetuple()

    formatter.converter = _utc8_aera
    custom_handler = logging.StreamHandler()
    custom_handler.setFormatter(formatter)

    logger.addHandler(custom_handler)

    return logger

In [88]:
# tokenize
def tokenize_corpus(text_list: List[str], tokenizer: BertTokenizer) -> List[List[int]]:
    """Convert each text into a list of vocabulary ids prefixed with [CLS].

    Args:
        text_list: Raw texts to tokenize.
        tokenizer: Tokenizer providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        One id list per input text, in input order. No [SEP] token, padding or
        truncation is applied here.
    """
    def _encode(sentence):
        pieces = ["[CLS]"]
        pieces.extend(tokenizer.tokenize(sentence))
        return tokenizer.convert_tokens_to_ids(pieces)

    # tqdm wraps the iterable so progress is reported per text.
    return [_encode(sentence) for sentence in tqdm(text_list)]


def mask_token_ids(token_ids: List[List[int]], padded_size, pad_token_id: int = 0):
    """Pad or truncate every id sequence to ``padded_size`` and build model inputs.

    Args:
        token_ids: One list of vocabulary ids per sample.
        padded_size: Exact length of every returned sequence.
        pad_token_id: Id written into padding positions (defaults to 0, the
            value that was previously hard-coded).

    Returns:
        A tuple ``(input_ids, input_types, input_masks)`` of equally sized lists:
        padded ids, token type ids (all 0, i.e. a single segment), and
        attention masks (1 for real tokens, 0 for padding).
    """
    input_ids = []
    input_types = []
    input_masks = []

    for token in tqdm(token_ids):
        token = token[:padded_size]
        real_len = len(token)
        pad_len = padded_size - real_len

        ids = token + [pad_token_id] * pad_len
        # Every sample is a single segment, so padding keeps type 0 as well;
        # the previous value of 1 labelled padding as a second segment.
        types = [0] * padded_size
        masks = [1] * real_len + [0] * pad_len

        assert len(ids) == len(masks) == len(types) == padded_size

        input_ids.append(ids)
        input_types.append(types)
        input_masks.append(masks)

    return input_ids, input_types, input_masks

def split_train_dataset(input_ids: List[List[int]], input_types: List[List[int]],
                        input_masks: List[List[int]], labels: List[List[int]], batch_size: int, ratio: float) -> (
        DataLoader, DataLoader):
    """Randomly split the samples into train/validation loaders.

    The sample order is shuffled with NumPy's global generator (seed it
    beforehand for a reproducible split); the first ``int(n * ratio)`` shuffled
    samples form the training set and the rest the validation set.

    Args:
        input_ids: Padded token ids, one list per sample.
        input_types: Token type ids aligned with ``input_ids``.
        input_masks: Attention masks aligned with ``input_ids``.
        labels: One label per sample.
        batch_size: Batch size used by both loaders.
        ratio: Fraction of samples assigned to the training set.

    Returns:
        ``(train_loader, valid_loader)``. The training loader samples randomly
        and drops the last incomplete batch; the validation loader is
        sequential and keeps every sample so metrics cover the whole split.
    """
    random_order = list(range(len(input_ids)))
    np.random.shuffle(random_order)
    split_at = int(len(input_ids) * ratio)

    def _build_dataset(indices):
        # Gather the selected rows of each column and convert them to int64 tensors.
        columns = (input_ids, input_types, input_masks, labels)
        tensors = [
            torch.as_tensor(np.array([column[i] for i in indices]), dtype=torch.long)
            for column in columns
        ]
        return TensorDataset(*tensors)

    train_data = _build_dataset(random_order[:split_at])
    train_sampler = RandomSampler(train_data)
    train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, drop_last=True)

    valid_data = _build_dataset(random_order[split_at:])
    valid_sampler = SequentialSampler(valid_data)
    # Keep the final partial batch: dropping it silently excluded up to
    # batch_size - 1 samples from every evaluation.
    valid_loader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size, drop_last=False)
    return train_loader, valid_loader


In [75]:
# Shared notebook logger; records are emitted at INFO level and above.
logger = custom_logger(name='train', log_level='INFO')

In [71]:
# load data
def get_data(file_path, padded_size):
    """Load texts and sentiment labels from the training CSV.

    Rows are kept only when both the text column and the label column are
    present and the label parses to -1, 0 or 1. Parsing is numeric, so the
    filter works whether pandas read the label column as strings (when it
    contains junk values) or as numbers. Label -1 is remapped to class 2 so
    that classes are 0, 1, 2.

    Args:
        file_path: Path of the CSV file.
        padded_size: Maximum number of characters kept per text.

    Returns:
        ``(corpus, labels)``: stripped and truncated texts, and integer class
        ids, in file order.
    """
    text_col, label_col = "微博中文内容", "情感倾向"
    df = pd.read_csv(file_path)
    # Only the two columns actually used need to be non-null.
    df = df.dropna(subset=[text_col, label_col])

    numeric_labels = pd.to_numeric(df[label_col], errors="coerce")
    keep = numeric_labels.isin([-1, 0, 1])

    texts = df.loc[keep, text_col].astype(str).str.strip()
    labels = numeric_labels[keep].astype(int).replace(-1, 2).tolist()
    corpus = [text[0:padded_size] for text in texts]
    assert len(corpus) == len(labels)
    return corpus, labels

In [5]:
# Read the test CSV and display it for a quick visual check.
test_path = os.path.join('data', 'nCov_10k_test.csv')
test_df = pd.read_csv(filepath_or_buffer=test_path)
test_df

Unnamed: 0,微博id,微博发布时间,发布人账号,微博中文内容,微博图片,微博视频
0,4456068992182160,01月01日 23:38,-精緻的豬豬女戰士-,#你好2020#新年第一天元气满满的早起出门买早饭结果高估了自己抗冻能力回家成功冻发烧（大概...,['https://ww2.sinaimg.cn/thumb150/745aa591ly1g...,[]
1,4456424178427250,01月02日 23:09,liujunyi88,大宝又感冒鼻塞咳嗽了，还有发烧。队友加班几天不回。感觉自己的情绪在家已然是随时引爆的状态。情...,[],[]
2,4456797466940200,01月03日 23:53,ablsa,还要去输两天液，这天也太容易感冒发烧了，一定要多喝热水啊?,['https://ww3.sinaimg.cn/orj360/006fTidCly1gaj...,[]
3,4456791021108920,01月03日 23:27,喵吃鱼干Lynn,我太难了别人怎么发烧都没事就我一检查甲型流感?,[],[]
4,4457086404997440,01月04日 19:01,我的发小今年必脱单,果然是要病一场的喽回来第三天开始感冒今儿还发烧了喉咙眼睛都难受的一匹怎么样能不经意让我的毕设...,[],[]
...,...,...,...,...,...,...
9995,4464179518243680,01月24日 08:46,一隻敦敦仔,「2020的黑天鹅事件」>2019-nCov?,[],[]
9996,4464274073923100,01月24日 15:02,艺哥的明信片,心灵鸡汤#武汉加油#我们所有人，和我们这个国家一起，正在经历着一场这个星球上史无前例的考验...,['https://ww1.sinaimg.cn/orj360/b633842dgy1gb7...,[]
9997,4464289160945130,01月24日 16:02,金融柑仔店,武大人民医院：发热咳嗽并非新冠肺炎的唯一首发症状(来自@界面新闻)武汉大学人民医院研究组1月...,['https://ww1.sinaimg.cn/orj360/72c6e287ly1gb7...,[]
9998,4465347950314820,01月27日 14:09,G顾曲周郎Z,闭关第二天发现一根白发2019-nCoV?,['https://ww1.sinaimg.cn/orj360/65eccabcly1gbb...,[]


In [100]:
# some arguments
epochs = 20
padded_size = 512
batch_size = 8
learning_rate = 1e-5
ratio = 0.8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [72]:
# Load the labelled training texts and tokenize them.
train_path = os.path.join('data', 'nCoV_100k_train.labled.csv')
# Use the configured padded_size rather than a second hard-coded 512 so the
# character truncation and the token padding length cannot drift apart.
corpus, labels = get_data(train_path, padded_size)
tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
tokenized_train_data = tokenize_corpus(corpus, tokenizer)
logger.info('tokenized')

NameError: name 'tokenize_corpus' is not defined

In [89]:
# Repeats the tokenizer load and corpus tokenization of the cell above
# (that cell's recorded run failed with a NameError).
tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
tokenized_train_data = tokenize_corpus(text_list=corpus, tokenizer=tokenizer)
logger.info('tokenized')

100%|██████████| 99560/99560 [01:38<00:00, 1008.56it/s]
[2022-08-04 16:30:13,458]-[train:INFO]: tokenized


In [94]:
# Pad / truncate every tokenized sample and build type ids and attention masks.
input_ids, input_types, input_masks = mask_token_ids(
    token_ids=tokenized_train_data,
    padded_size=padded_size,
)
logger.info('masked')

100%|██████████| 99560/99560 [00:06<00:00, 15036.67it/s]
[2022-08-04 16:36:32,119]-[train:INFO]: masked


In [96]:
# Shuffle the samples and wrap the train / validation splits in DataLoaders.
train_loader, valid_loader = split_train_dataset(
    input_ids=input_ids,
    input_types=input_types,
    input_masks=input_masks,
    labels=labels,
    batch_size=batch_size,
    ratio=ratio,
)

In [98]:
# model definition
class BertClassifier(nn.Module):
    """BERT encoder followed by a small MLP classification head.

    Args:
        hidden_size: Width of the encoder's pooled output.
        mid_size: Width of the hidden layer in the classification head.
        num_classes: Number of output classes.
        pretrained_name: Checkpoint name or path given to
            ``BertModel.from_pretrained`` (for example
            "nghuyong/ernie-gram-zh" to try another encoder).
    """

    def __init__(self, hidden_size=768, mid_size=256, num_classes=3,
                 pretrained_name="hfl/chinese-roberta-wwm-ext"):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, mid_size),
            nn.BatchNorm1d(mid_size),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(mid_size, num_classes),
        )

    def forward(self, x):
        """Return unnormalised class scores (logits) for a batch.

        Args:
            x: Sequence ``[input_ids, token_type_ids, attention_mask]`` of
                tensors passed on to the encoder.

        Returns:
            The classification head's output for the dropout-regularised
            pooled encoder output. No softmax is applied here because
            ``nn.CrossEntropyLoss`` (used by the train / validation steps)
            applies log-softmax itself; use ``F.softmax(logits, dim=1)`` when
            probabilities are needed. ``argmax`` predictions are the same
            either way.
        """
        input_ids, token_type_ids, attention_mask = x[0], x[1], x[2]

        pooled = self.bert(input_ids, token_type_ids=token_type_ids,
                           attention_mask=attention_mask).pooler_output
        context_embedding = self.dropout(pooled)

        return self.classifier(context_embedding)

In [99]:
# Build the classifier with its default layer sizes spelled out.
model = BertClassifier(hidden_size=768, mid_size=256)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [102]:
# Move all model parameters and buffers to the selected device.
model = model.to(device)
logger.info(f"+++ model init on {device} +++")

[2022-08-04 16:43:41,734]-[train:INFO]: +++ model init on cuda +++


In [105]:
# optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_group_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = torch.optim.AdamW(optimizer_group_parameters,
                     lr=learning_rate,
                     eps=1e-8)
logger.info("+++ optimizer init +++")
num_training_steps = int(len(train_input_ids) * ratio) * epochs
warmup = 0.1
num_warmup_steps = num_training_steps * warmup
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps-num_warmup_steps)

[2022-08-04 16:46:22,794]-[train:INFO]: +++ optimizer init +++


In [106]:
def train_step(model, device, train_loader, optimizer, epoch, scheduler):
    """Run one training epoch over ``train_loader``.

    For every batch: forward pass, cross-entropy loss, backward pass, gradient
    clipping to a max norm of 1.0, then one optimizer step and one scheduler
    step. Progress is logged every 500 batches.

    Args:
        model: Module called with ``[ids, types, masks]``.
        device: Device the batch tensors are moved to.
        train_loader: Yields ``(ids, types, masks, labels)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        epoch: Epoch number, used only in the log message.
        scheduler: Learning-rate scheduler stepped once per batch.
    """
    log_every = 500
    model.train()
    loss_fn = nn.CrossEntropyLoss()

    for step, batch in enumerate(train_loader):
        ids, types, masks, targets = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        loss = loss_fn(model([ids, types, masks]), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        if (step + 1) % log_every != 0:
            continue
        seen = (step + 1) * len(ids)
        total = len(train_loader.dataset)
        percent = 100. * step / len(train_loader)
        logger.info('Train Epoch: {} [{}/{} ({:.2f}%)]\tLoss: {:.6f}'.format(
            epoch, seen, total, percent, loss.item()))


def valid_step(model, device, valid_loader):
    """Evaluate ``model`` on ``valid_loader`` and log loss, accuracy and macro F1.

    Args:
        model: Module called with ``[ids, types, masks]``.
        device: Device the batch tensors are moved to.
        valid_loader: Yields ``(ids, types, masks, labels)`` batches.

    Returns:
        ``(accuracy, macro_f1, mean_loss)`` where ``mean_loss`` is the summed
        batch loss divided by the number of batches.
    """
    model.eval()
    loss_fn = nn.CrossEntropyLoss()
    valid_loss = 0.0
    valid_true, valid_pred = [], []

    for batch in valid_loader:
        ids, types, masks, targets = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model([ids, types, masks])

        valid_loss += loss_fn(outputs, targets)
        # The predicted class is the index of the largest score in each row.
        valid_pred.extend(outputs.detach().cpu().numpy().argmax(axis=1))
        valid_true.extend(np.array(targets.cpu()))

    valid_loss /= len(valid_loader)
    logger.info('Test set: Average loss: {:.4f}'.format(valid_loss))

    valid_true = np.array(valid_true)
    valid_pred = np.array(valid_pred)
    avg_acc = accuracy_score(valid_true, valid_pred)
    avg_f1s = f1_score(valid_true, valid_pred, average='macro')

    logger.info('Average: Accuracy: {:.3f}%, F1Score: {:.3f}'.format(100 * avg_acc, 100 * avg_f1s))
    logger.info(classification_report(valid_true, valid_pred))

    return avg_acc, avg_f1s, valid_loss


In [110]:
# main train
best_acc = 0.0
best_epoch = 0
for epoch in range(1, epochs + 1):
    train_step(model, device, train_loader, optimizer, epoch, scheduler)
    acc, fis, loss = valid_step(model, device, valid_loader)
    if best_acc < acc:
        best_acc = acc
        best_epoch = epoch
    bert_classifier_path = os.path.join(output_path, 'bert_classifier_epoch{}.pth'.format(epoch))
logger.info(f"+++ bert train done, best epoch: {best_epoch} +++")

RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 10.76 GiB total capacity; 9.46 GiB already allocated; 95.44 MiB free; 9.51 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF