In [10]:
import os
import json

import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel


In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

In [9]:
token_output = tokenizer("你好世界", padding="max_length", truncation=True, max_length=16, return_tensors="pt")
input_ids = token_output["input_ids"]
token_type_ids = token_output["token_type_ids"]
attention_mask = token_output["attention_mask"]
print(input_ids.shape)

torch.Size([1, 16])


In [12]:
model = BertModel.from_pretrained("bert-base-chinese")
model

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [31]:
outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, output_hidden_states=True)
print(len(outputs))
print(outputs[0].shape)
print(outputs[1].shape)
# 有 13 个输出, 第一个是 embeddings, 后面 12 个是每一层的输出, bert-base-chinese 有 12 层
print(len(outputs[2]))
print(outputs[2][0].shape)
print(outputs[2][1].shape)
# 最后一层等价于 outputs[0]
assert torch.equal(outputs[0], outputs[2][12])

3
torch.Size([1, 16, 768])
torch.Size([1, 768])
13
torch.Size([1, 16, 768])
torch.Size([1, 16, 768])


True

In [24]:
outputs[0]

tensor([[[-0.3580, -0.3439,  0.1678,  ...,  0.6644,  0.3729,  0.3799],
         [ 0.2059, -0.3735, -0.2777,  ..., -0.6587, -0.2550,  0.1424],
         [ 0.4845,  0.0801, -0.2680,  ...,  0.9598,  0.0976,  0.1302],
         ...,
         [-0.0041,  0.0122, -0.2613,  ...,  0.4369, -0.1302,  0.1112],
         [-0.0025, -0.0252, -0.2783,  ...,  0.5084, -0.0796,  0.1456],
         [-0.0063, -0.1832, -0.2105,  ...,  0.1964, -0.3326, -0.0889]]],
       grad_fn=<NativeLayerNormBackward0>)

In [28]:
outputs[2][12]

tensor([[[-0.3580, -0.3439,  0.1678,  ...,  0.6644,  0.3729,  0.3799],
         [ 0.2059, -0.3735, -0.2777,  ..., -0.6587, -0.2550,  0.1424],
         [ 0.4845,  0.0801, -0.2680,  ...,  0.9598,  0.0976,  0.1302],
         ...,
         [-0.0041,  0.0122, -0.2613,  ...,  0.4369, -0.1302,  0.1112],
         [-0.0025, -0.0252, -0.2783,  ...,  0.5084, -0.0796,  0.1456],
         [-0.0063, -0.1832, -0.2105,  ...,  0.1964, -0.3326, -0.0889]]],
       grad_fn=<NativeLayerNormBackward0>)

In [19]:
torch.tensor(1).shape

torch.Size([])

In [5]:
from data import *

In [6]:
train_file = r"D:\code\py_nlp_classify\data\train.csv"
label_file = r"D:\code\py_nlp_classify\data\label.json"
train_df = load_dataframe(train_file)
label2id, _ = load_label(label_file)
train_dataset = CustomDataset(train_df, label2id, tokenizer)

加载数据, 从 D:\code\py_nlp_classify\data\train.csv 文件中, 原始大小是 (53360, 2)
加载数据, 从 D:\code\py_nlp_classify\data\train.csv 文件中, 清理后大小是 (53360, 2)


In [15]:
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=2)

In [16]:
next(iter(train_dataloader))[-1].shape

torch.Size([2, 1])