## 1. 加载和使用预训练模型

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModel

# Load the pretrained model by its model id.
model = AutoModel.from_pretrained("google-bert/bert-base-chinese")

In [8]:
# Inspect which concrete class AutoModel instantiated.
type(model)

transformers.models.bert.modeling_bert.BertModel

In [9]:
# Load the model again, this time from a local path instead of a model id.
model = AutoModel.from_pretrained("./pretrained/bert-base-chinese")

In [10]:
# Inspect the class of the model loaded from the local path.
type(model)

transformers.models.bert.modeling_bert.BertModel

In [11]:
# Display the module tree of the loaded model.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [14]:
# Load the checkpoint through the sequence-classification Auto class.
from transformers import AutoModelForSequenceClassification

# NOTE(review): num_labels=3 presumably sizes the classifier for three classes — confirm.
# The classifier weights are not in the checkpoint; see the warning in the cell output.
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-chinese", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Display the module tree of the classification model.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:
# Inspect which concrete class AutoModelForSequenceClassification instantiated.
type(model)

transformers.models.bert.modeling_bert.BertForSequenceClassification

## 2. 加载和使用Tokenizer

### 2.1 加载

In [2]:
from transformers import AutoTokenizer

# Load the tokenizer by its model id.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [3]:
# Load the tokenizer from a local path instead of a model id.
tokenizer = AutoTokenizer.from_pretrained("./pretrained/bert-base-chinese")

In [4]:
# Inspect which concrete tokenizer class AutoTokenizer instantiated.
type(tokenizer)

transformers.models.bert.tokenization_bert_fast.BertTokenizerFast

### 2.2 使用

In [5]:
# 1. Tokenize: split the text into a list of token strings.
tokens = tokenizer.tokenize("我爱自然语言处理")
tokens

['我', '爱', '自', '然', '语', '言', '处', '理']

In [6]:
# 2. token -> id: convert the token strings to their integer ids.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2769, 4263, 5632, 4197, 6427, 6241, 1905, 4415]

In [7]:
# 3. id -> token: convert the integer ids back to token strings.
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['我', '爱', '自', '然', '语', '言', '处', '理']

In [11]:
# 4. encode: text -> ids in one call, padded to a fixed length of 20.
# The output additionally contains ids 101 and 102 around the text ids and 0 as padding.
ids = tokenizer.encode("我爱自然语言处理", padding='max_length', max_length=20)
ids

[101,
 2769,
 4263,
 5632,
 4197,
 6427,
 6241,
 1905,
 4415,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [14]:
# Decode the ids back to text, dropping special tokens.
text = tokenizer.decode(ids, skip_special_tokens=True)
# Remove spaces from the decoded string (presumably inserted between tokens by decode()).
text.replace(" ", "")

'我爱自然语言处理'

In [17]:
# 5. __call__: calling the tokenizer directly returns input_ids, token_type_ids and
# attention_mask; return_tensors='pt' yields them as tensors (see the output).
inputs = tokenizer("我爱自然语言处理", return_tensors='pt')
inputs

{'input_ids': tensor([[ 101, 2769, 4263, 5632, 4197, 6427, 6241, 1905, 4415,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [58]:
# Batch tokenization: pass a list of texts.
texts = ["我爱自然语言处理", "我爱人工智能", "我们一起学习"]
# padding=True pads the shorter sequences (with 0) so all rows have equal length;
# attention_mask is 0 at the padded positions (see the output).
inputs = tokenizer(texts, padding=True)
inputs

{'input_ids': [[101, 2769, 4263, 5632, 4197, 6427, 6241, 1905, 4415, 102], [101, 2769, 4263, 782, 2339, 3255, 5543, 102, 0, 0], [101, 2769, 812, 671, 6629, 2110, 739, 102, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]}

### 2.3 分词器与模型配合使用

In [22]:
from transformers import AutoModel, AutoTokenizer
import torch

# Sample batch of three sentences.
texts = ["我爱自然语言处理", "我爱人工智能", "我们一起学习"]

# Load the model and its tokenizer from the same checkpoint id.
model = AutoModel.from_pretrained("google-bert/bert-base-chinese")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")

# Tokenize the batch into padded tensors whose keys match the model's keyword arguments.
inputs = tokenizer(texts, padding=True, return_tensors='pt')

# Inference only: disable gradient tracking for the forward pass.
with torch.no_grad():
    output = model(**inputs)

# Printed shapes are [3, 10, 768] and [3, 768] (see the output).
print(output.last_hidden_state.shape)
print(output.pooler_output.shape)

torch.Size([3, 10, 768])
torch.Size([3, 768])


In [25]:
# AutoTokenizer is used below, so import it here rather than relying on an earlier
# cell having imported it.
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer
import torch

# Sample batch of three sentences.
texts = ["我爱自然语言处理", "我爱人工智能", "我们一起学习"]

# No num_labels is passed here; the logits printed below have 2 columns.
# The classifier weights are newly initialized (see the warning in the output), so the
# logits are only useful for checking shapes until the model is fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-chinese")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")

# Tokenize the batch into padded tensors whose keys match the model's keyword arguments.
inputs = tokenizer(texts, padding=True, return_tensors='pt')

# Inference only: disable gradient tracking for the forward pass.
with torch.no_grad():
    output = model(**inputs)

print(output.logits.shape)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([3, 2])


## 3. Datasets

### 3.1 加载数据集

In [47]:
from datasets import load_dataset

# Load CSV files into a DatasetDict; the keys of data_files become the split names.
dataset_dict = load_dataset("csv", data_files={'train': 'data/raw/train.csv', 'test': 'data/raw/test.csv'})
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['cat', 'label', 'review'],
        num_rows: 62774
    })
    test: Dataset({
        features: ['cat', 'label', 'review'],
        num_rows: 62774
    })
})

In [30]:
# With a single path, the result is still a DatasetDict, holding one 'train' split (see the output).
dataset_dict = load_dataset("csv", data_files='data/raw/train.csv')
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['cat', 'label', 'review'],
        num_rows: 62774
    })
})

### 3.2 查看数据集

In [48]:
# 1. Get a single Dataset out of the DatasetDict by split name.
dataset = dataset_dict['train']
dataset

Dataset({
    features: ['cat', 'label', 'review'],
    num_rows: 62774
})

In [32]:
# Access a row: an integer index returns one record as a dict of column -> value.
dataset[0]

{'cat': '书籍',
 'label': 1,
 'review': '做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持一颗年轻的心。我想，这是他能很好的和孩子沟通的一个重要因素。读刘墉的文章，总能让我看到一个快乐的平易近人的父亲，他始终站在和孩子同样的高度，给孩子创造着一个充满爱和自由的生活环境。很喜欢刘墉在字里行间流露出的做父母的那种小狡黠，让人总是忍俊不禁，父母和子女之间有时候也是一种战斗，武力争斗过于低级了，智力较量才更有趣味。所以，做父母的得加把劲了，老思想老观念注定会一败涂地，生命不息，学习不止。家庭教育，真的是乐在其中。'}

In [49]:
# A slice returns a dict of column -> list of values for the selected rows.
dataset[0:3]

{'cat': ['书籍', '书籍', '书籍'],
 'label': [1, 1, 1],
 'review': ['做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持一颗年轻的心。我想，这是他能很好的和孩子沟通的一个重要因素。读刘墉的文章，总能让我看到一个快乐的平易近人的父亲，他始终站在和孩子同样的高度，给孩子创造着一个充满爱和自由的生活环境。很喜欢刘墉在字里行间流露出的做父母的那种小狡黠，让人总是忍俊不禁，父母和子女之间有时候也是一种战斗，武力争斗过于低级了，智力较量才更有趣味。所以，做父母的得加把劲了，老思想老观念注定会一败涂地，生命不息，学习不止。家庭教育，真的是乐在其中。',
  '作者真有英国人严谨的风格，提出观点、进行论述论证，尽管本人对物理学了解不深，但是仍然能感受到真理的火花。整本书的结构颇有特点，从当时（本书写于八十年代）流行的计算机话题引入，再用数学、物理学、宇宙学做必要的铺垫——这些内容占据了大部分篇幅，最后回到关键问题：电脑能不能代替人脑。和现在流行的观点相反，作者认为人的某种“洞察”是不能被算法模拟的。也许作者想说，人的灵魂是无可取代的。',
  '作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点。为什么荷兰曾经县有欧洲最高的生产率？为什么在文化上有着深刻纽带关系的中国和日本却在经济发展上有着极大的差异？为什么英国的北美殖民地造就了经济强大的美国，而西班牙的北美殖民却造就了范后的墨西哥？……很有价值，但不包括【中国近代史专业】。']}

In [50]:
# Access a column value: the 'review' field of the first row.
dataset[0]['review']

'做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持一颗年轻的心。我想，这是他能很好的和孩子沟通的一个重要因素。读刘墉的文章，总能让我看到一个快乐的平易近人的父亲，他始终站在和孩子同样的高度，给孩子创造着一个充满爱和自由的生活环境。很喜欢刘墉在字里行间流露出的做父母的那种小狡黠，让人总是忍俊不禁，父母和子女之间有时候也是一种战斗，武力争斗过于低级了，智力较量才更有趣味。所以，做父母的得加把劲了，老思想老观念注定会一败涂地，生命不息，学习不止。家庭教育，真的是乐在其中。'

### 3.3 预处理数据集

In [51]:
# 1. Drop the 'cat' column.
dataset = dataset.remove_columns(['cat'])

In [52]:
# Check the remaining columns and their types.
dataset.features

{'label': Value('int64'), 'review': Value('string')}

In [53]:
# 2. Filter rows: keep only the records whose 'review' is not None.
dataset = dataset.filter(lambda x: x['review'] is not None)

Filter:   0%|          | 0/62774 [00:00<?, ? examples/s]

In [54]:
# 3. Split the dataset: 20% of the rows go to 'test', the rest to 'train'.
# A fixed seed makes the random split (and everything saved from it) reproducible
# across kernel restarts.
dataset_dict = dataset.train_test_split(test_size=0.2, seed=42)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 50218
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 12555
    })
})

In [55]:
# Look at the 'train' split produced by the split step.
dataset_dict['train']

Dataset({
    features: ['label', 'review'],
    num_rows: 50218
})

In [61]:
# 4. map (one example at a time)
train_dataset = dataset_dict['train']
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")


def encode(example):
    """Tokenize a single record and attach its label.

    Args:
        example: One row as a dict, e.g. {'label': 1, 'review': '...'}.

    Returns:
        The tokenizer output for example['review'], padded/truncated to 128 tokens,
        with an added 'labels' entry holding example['label'].
    """
    inputs = tokenizer(example['review'], padding='max_length', max_length=128, truncation=True)
    inputs['labels'] = example['label']
    return inputs


# `batched` is left at its default so that `encode` really receives one record per
# call, as this step is meant to demonstrate; the batched variant is a separate step.
train_dataset = train_dataset.map(encode, remove_columns=['label', 'review'])
train_dataset.features

Map:   0%|          | 0/50218 [00:00<?, ? examples/s]

{'input_ids': List(Value('int32')),
 'token_type_ids': List(Value('int8')),
 'attention_mask': List(Value('int8')),
 'labels': Value('int64')}

In [60]:
# 5. map (batched)
test_dataset = dataset_dict['test']
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")


def batch_encode(batch):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        batch: Column-oriented dict covering several rows, e.g.
            {'label': [1, 0, 1, 0], 'review': ['...', '...', '...', '...']}.

    Returns:
        The tokenizer output for batch['review'], padded/truncated to 128 tokens,
        with an added 'labels' entry holding batch['label'].
    """
    inputs = tokenizer(batch['review'], padding='max_length', max_length=128, truncation=True)
    inputs['labels'] = batch['label']
    return inputs


test_dataset = test_dataset.map(batch_encode, batched=True, remove_columns=['label', 'review'])
# End on the dataset itself so the cell displays it; an assignment alone shows nothing.
test_dataset

Map:   0%|          | 0/12555 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 12555
})

In [63]:
# 6. map called on the DatasetDict itself rather than on a single split
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")


def batch_encode(batch):
    """Tokenize a batch of reviews and attach their labels.

    `batch` is a column-oriented dict such as
    {'label': [1, 0, 1, 0], 'review': ['...', '...', '...', '...']}.
    Returns the tokenizer output (padded/truncated to 128 tokens) with an added
    'labels' entry holding batch['label'].
    """
    encoded = tokenizer(
        batch['review'],
        truncation=True,
        max_length=128,
        padding='max_length',
    )
    encoded['labels'] = batch['label']
    return encoded


dataset_dict = dataset_dict.map(
    batch_encode,
    remove_columns=['label', 'review'],
    batched=True,
)

Map:   0%|          | 0/50218 [00:00<?, ? examples/s]

### 3.4 保存数据集

In [64]:
# 1. Arrow: save the whole DatasetDict under data/processed.
dataset_dict.save_to_disk('data/processed')

Saving the dataset (0/1 shards):   0%|          | 0/50218 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12555 [00:00<?, ? examples/s]

In [67]:
from datasets import load_from_disk

# Load a single split back from its sub-directory; the result is a Dataset (see the output).
dataset = load_from_disk('data/processed/train')
type(dataset)

datasets.arrow_dataset.Dataset

In [68]:
# 2. JSON: export the train split to a .jsonl file.
# The integer shown as cell output is to_json's return value (presumably bytes written — confirm).
train_dataset.to_json('data/processed/json/train.jsonl')

Creating json from Arrow format:   0%|          | 0/51 [00:00<?, ?ba/s]

48627392

In [69]:
# 3. CSV: export the train split to a .csv file.
# The integer shown as cell output is to_csv's return value (presumably bytes written — confirm).
train_dataset.to_csv('data/processed/csv/train.csv')

Creating CSV from Arrow format:   0%|          | 0/51 [00:00<?, ?ba/s]

59536773

### 3.5 集成Dataloader

In [71]:
from torch.utils.data import DataLoader

# Reload the processed train split from disk.
train_dataset = load_from_disk('data/processed/train')
# Have the dataset return torch tensors; the batches printed afterwards are tensors.
train_dataset.set_format(type='torch')
# Wrap the dataset in a DataLoader: batches of 32, in shuffled order.
dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [72]:
# Print a single batch from the DataLoader to inspect its structure.
for batch in dataloader:
    print(batch)
    break  # only the first batch is needed

{'input_ids': tensor([[ 101, 7231, 1166,  ...,    0,    0,    0],
        [ 101, 6432, 1962,  ...,    0,    0,    0],
        [ 101, 1778,  782,  ...,    0,    0,    0],
        ...,
        [ 101, 2140, 6564,  ...,    0,    0,    0],
        [ 101, 2523, 1962,  ...,    0,    0,    0],
        [ 101, 3221, 3633,  ..., 3322,  117,  102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
        0, 1, 1, 1, 0, 1, 1, 1])}
