In [4]:
import os
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

from data_utils import MyDataset, my_collate, load_data

In [5]:
# Single source of truth for the checkpoint, so the tokenizer's vocabulary and the
# encoder's embedding table cannot drift apart when the model is swapped.
MODEL_NAME = 'bert-base-chinese'

# Load the tokenizer and the bare encoder from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Build the train and test loaders via the project helper.
# NOTE(review): the positional values after `tokenizer` (2, 2, "bert", "dualcl") are
# presumably the train/test batch sizes, the backbone name and the training method —
# confirm against data_utils.load_data before relying on this reading.
train_dataloader, test_dataloader = load_data(
    "tnews",
    "./data",
    tokenizer,
    2,
    2,
    "bert",
    "dualcl",
    workers=0,
)

In [13]:
# Pull one batch out of the training loader so its structure can be inspected.
batch_iterator = iter(train_dataloader)
example = next(batch_iterator)
example

[{'input_ids': tensor([[ 101, 3125, 3152, 2031,  860, 6568, 2791, 6756, 3136, 4906, 1092, 3180,
            686, 5500, 1093, 3952,  102, 4905, 2094, 1762,  784,  720, 3340,  816,
            678, 3291, 2159, 3211, 1355, 5715, 8043,  102,    0,    0,    0,    0,
              0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
              0],
          [ 101, 3125, 3152, 2031,  860, 6568, 2791, 6756, 3136, 4906, 1092, 3180,
            686, 5500, 1093, 3952,  102,  697, 1920, 6381, 2497, 8013, 3419, 3360,
           2768, 1235, 1894, 1325, 1380, 5018,  671,  782, 8024,  924, 5384, 7439,
           1927, 6428, 2130, 2768, 5468, 4673,  125,  121, 2399,  671, 1896,  715,
            102]]),
  'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   

In [19]:
# Split the batch into its two parts. The first is the dict of model inputs (it is
# indexed by "input_ids", "position_ids", etc. below); the second is presumably the
# matching labels — confirm against the collate function.
inputs, labels = example

In [21]:
# Show the token-id tensor of the batch: one row per sample, with shorter rows
# padded on the right with id 0 ([PAD]).
inputs["input_ids"]

tensor([[ 101, 3125, 3152, 2031,  860, 6568, 2791, 6756, 3136, 4906, 1092, 3180,
          686, 5500, 1093, 3952,  102, 4905, 2094, 1762,  784,  720, 3340,  816,
          678, 3291, 2159, 3211, 1355, 5715, 8043,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0],
        [ 101, 3125, 3152, 2031,  860, 6568, 2791, 6756, 3136, 4906, 1092, 3180,
          686, 5500, 1093, 3952,  102,  697, 1920, 6381, 2497, 8013, 3419, 3360,
         2768, 1235, 1894, 1325, 1380, 5018,  671,  782, 8024,  924, 5384, 7439,
         1927, 6428, 2130, 2768, 5468, 4673,  125,  121, 2399,  671, 1896,  715,
          102]])

In [27]:
# Decode the first sample of the batch back into tokens to eyeball how the input
# sequence was assembled.
# `.tolist()` turns the 1-D id tensor into plain Python ints, which is the input type
# `convert_ids_to_tokens` documents (an int or a list of ints); passing the tensor
# directly only works because the implementation happens to call int() on each element.
first_sample_ids = inputs["input_ids"][0].tolist()
token_list = tokenizer.convert_ids_to_tokens(first_sample_ids)
print(" ".join(token_list))

[CLS] 故 文 娱 体 财 房 车 教 科 军 旅 世 股 农 游 [SEP] 种 子 在 什 么 条 件 下 更 容 易 发 芽 ？ [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [31]:
# Line up every token of the first sample with its position id, segment id and
# attention-mask entry, one token per printed row.
# In the recorded run the leading label tokens all share position 0 and the sentence
# tokens count up from there, i.e. the position ids are not the default 0..n-1 range.
first_sample_rows = zip(
    token_list,
    inputs["position_ids"][0],
    inputs["token_type_ids"][0],
    inputs["attention_mask"][0],
)
for token, position_id, token_type_id, mask_bit in first_sample_rows:
    print(token, position_id, token_type_id, mask_bit)

[CLS] tensor(0) tensor(0) tensor(1)
故 tensor(0) tensor(0) tensor(1)
文 tensor(0) tensor(0) tensor(1)
娱 tensor(0) tensor(0) tensor(1)
体 tensor(0) tensor(0) tensor(1)
财 tensor(0) tensor(0) tensor(1)
房 tensor(0) tensor(0) tensor(1)
车 tensor(0) tensor(0) tensor(1)
教 tensor(0) tensor(0) tensor(1)
科 tensor(0) tensor(0) tensor(1)
军 tensor(0) tensor(0) tensor(1)
旅 tensor(0) tensor(0) tensor(1)
世 tensor(0) tensor(0) tensor(1)
股 tensor(0) tensor(0) tensor(1)
农 tensor(0) tensor(0) tensor(1)
游 tensor(0) tensor(0) tensor(1)
[SEP] tensor(1) tensor(0) tensor(1)
种 tensor(2) tensor(0) tensor(1)
子 tensor(3) tensor(0) tensor(1)
在 tensor(4) tensor(0) tensor(1)
什 tensor(5) tensor(0) tensor(1)
么 tensor(6) tensor(0) tensor(1)
条 tensor(7) tensor(0) tensor(1)
件 tensor(8) tensor(0) tensor(1)
下 tensor(9) tensor(0) tensor(1)
更 tensor(10) tensor(0) tensor(1)
容 tensor(11) tensor(0) tensor(1)
易 tensor(12) tensor(0) tensor(1)
发 tensor(13) tensor(0) tensor(1)
芽 tensor(14) tensor(0) tensor(1)
？ tensor(15) tensor(0) tens