# **使用BERT进行情感分类**

In [1]:
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
from transformers import BertTokenizer, BertModel

import sys

In [2]:
sys.path.append("../utils")
import d2lzh as d2l

In [3]:
torch.cuda.set_device(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
DATA_ROOT = "../datasets/IMDB"

## **数据读取**

我们使用了IMDB的情感数据集，这个数据集分为训练和测试用的数据集两个数据集，分别包含25000电影评论，正反例数据量相同

### **读取数据**

In [5]:
fname = os.path.join(DATA_ROOT, 'aclImdb_v1.tar.gz')
if not os.path.exists(os.path.join(DATA_ROOT, 'aclImdb')):
    print("从压缩包解压.....")
    with tarfile.open(fname, 'r') as f:
        f.extractall(DATA_ROOT)

In [6]:
from tqdm import tqdm
def read_imdb(folder='train', data_root=r'C:\D\ProgramFile\jupyter\learning\torch_learn\dive_to_dp\datasets\IMDB\aclImdb'):
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
                data.append([review, 1 if label == 'pos' else 0])
    random.shuffle(data)
    return data

train_data, test_data = read_imdb('train'), read_imdb('test')

100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:07<00:00, 1613.62it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:07<00:00, 1606.57it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:08<00:00, 1562.28it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:07<00:00, 1568.74it/s]


In [7]:
len(train_data)

25000

### **预处理数据**

预处理包括:
- 分词
- 截断和补0

In [8]:
vocab_file = 'C:\D\ProgramFile\jupyter\Playing\sentiment\pretrained_model\multi_cased_L-12_H-768_A-12/vocab.txt'
tokenizer = BertTokenizer.from_pretrained(vocab_file)

In [9]:
print(tokenizer.tokenize('I am talented'), tokenizer.encode('I am talented'))

['i', 'am', 'talent', '##ed'] [101, 177, 10392, 27411, 10336, 102]


In [10]:
def tokenize(data):
    return [tokenizer.encode(review) for review, _ in data]

In [11]:
# 截断和补零
def preprocess_imdb(data):
    max_l = 200
    
    def pad(x):
        return x[:max_l] if max_l < len(x) else x + [0] * (max_l - len(x))
    def seg(x):
        return [0] * max_l
    def mask(x):
        return [1] * max_l if len(x) >= max_l else [1] * len(x) + [0] * (max_l - len(x))
    features = torch.tensor([pad(st) for st in tokenize(data)])
    segments = torch.tensor([seg(st) for st in tokenize(data)])
    masks = torch.tensor([mask(st) for st in tokenize(data)])
    labels = torch.tensor([score for _, score in data])
    return features, segments, masks, labels

In [12]:
preprocess_imdb(train_data)

(tensor([[  101, 10464, 10108,  ...,     0,     0,     0],
         [  101,   177, 10529,  ...,     0,     0,     0],
         [  101, 63923, 10112,  ...,   102,     0,     0],
         ...,
         [  101, 11371, 11155,  ...,     0,     0,     0],
         [  101, 10458, 10944,  ..., 12888, 11206,   118],
         [  101, 10105, 23200,  ..., 14918, 86119, 23011]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]),
 tensor([0, 1, 1,  ..., 1, 1, 1]))

### **创建迭代器**

In [13]:
batch_size = 8
train_set = Data.TensorDataset(*preprocess_imdb(train_data))
test_set = Data.TensorDataset(*preprocess_imdb(test_data))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

In [15]:
for X, seg, mask, y in train_iter:
    print('X', X.shape, 'seg', seg.shape,'mask', mask.shape, 'y', y.shape)
    break
'#batches:', len(train_iter)

X torch.Size([32, 250]) seg torch.Size([32, 250]) mask torch.Size([32, 250]) y torch.Size([32])


('#batches:', 782)

## **模型**

In [20]:
from transformers import BertConfig, BertForPreTraining

In [30]:
class BertClassifier(nn.Module):
    def __init__(self, num_tokens):
        super(BertClassifier, self).__init__()
        model_dir = r'C:\D\ProgramFile\jupyter\Playing\sentiment\pretrained_model\multi_cased_L-12_H-768_A-12'
        config = BertConfig.from_json_file(os.path.join(model_dir, 'bert.config.json'))
        self.encoder = BertForPreTraining.from_pretrained(os.path.join(model_dir, 'bert.model.ckpt.index'),
                                                 from_tf = True,
                                                 config = config).bert
        # self.encoder.resize_token_embeddings(num_tokens)
        self.decoder = nn.Linear(768, 2) # bert的输出是768维的
        
    def forward(self, inputs, segments, masks ):
        inputs= inputs.long()
        outputs, _ = self.encoder(inputs, token_type_ids=segments, attention_mask=masks)
        encoding = outputs[0][0]
        # outs = self.decoder(encoding.view(-1, 768))
        return encoding

In [31]:
net = BertClassifier(len(tokenizer))

In [32]:
net(torch.tensor([101, 23, 45, 53, 102]).unsqueeze(0), torch.tensor([0, 0, 0, 0, 0]).unsqueeze(0), torch.tensor([1, 1, 1, 1, 1]).unsqueeze(0)).shape

torch.Size([768])

## **训练**

In [34]:
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, seg, mask, y in data_iter:
        if isinstance(net, torch.nn.Module):
            net.eval() # 开启评估模式
            acc_sum += (net(X, seg, mask).argmax(dim=1)==y).float().sum().item()
            net.train() # 开启训练模式
        else:
            if ('is_training' in net.__code__.co_varnames):
                acc_sum += (net(X, is_training=False).argmax(dim=1)==y).float().sum().item()
            else:
                acc_sum += (net(X).argmax(dim=1)==y).float().sum().item()
        n += y.shape[0]
    return acc_sum/n

In [40]:
import time
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, seg, mask, y in train_iter:
            X = X.to(device)
            seg = seg.to(device)
            mask = mask.to(device)
            y = y.to(device)
            y_hat = net(X, seg, mask)
            l = loss(y_hat, y) 
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

In [41]:
lr, num_epochs = 0.01, 5
# 要过滤掉不计算梯度的embedding参数
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda


RuntimeError: CUDA out of memory. Tried to allocate 94.00 MiB (GPU 0; 6.00 GiB total capacity; 4.47 GiB already allocated; 75.14 MiB free; 77.94 MiB cached)