In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 4.7 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 96.4 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 96.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 99.8 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 95.6 MB/s 
Installing collected package

In [None]:
import datasets
import warnings
import random
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score
from random import shuffle

In [None]:
dataset = datasets.load_dataset('polyglot_ner', 'zh')

Downloading builder script:   0%|          | 0.00/6.01k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Downloading and preparing dataset polyglot_ner/zh to /root/.cache/huggingface/datasets/polyglot_ner/zh/1.0.0/bb2e45c90cd345c87dfd757c8e2b808b78b0094543b511ac49bc0129699609c1...


Downloading data:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1570853 [00:00<?, ? examples/s]

Dataset polyglot_ner downloaded and prepared to /root/.cache/huggingface/datasets/polyglot_ner/zh/1.0.0/bb2e45c90cd345c87dfd757c8e2b808b78b0094543b511ac49bc0129699609c1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
print(len(dataset['train']))

1570853


In [None]:
# choose a random number to pick 7000 samples
random.seed(10)
rand=random.randrange(0,10000)
print(rand)
# get the dataset
data_zh = datasets.load_dataset('polyglot_ner', 'zh', split='train[9361:16361]')
print(data_zh)

9361




Dataset({
    features: ['id', 'lang', 'words', 'ner'],
    num_rows: 7000
})


In [None]:
#extract labels
elements = []
for r in data_zh['ner']:
  for i in r:
    elements.append(i)
labels = set(elements)

label_to_id = {l : i for i,l in enumerate(labels)}
id_to_label = {i : l for i, l in enumerate(labels)}
print(id_to_label)

df_labels = [[label_to_id[l] for l in row] for row in data_zh['ner']] 

{0: 'PER', 1: 'LOC', 2: 'ORG', 3: 'O'}


# tokenize the dataset

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/269k [00:00<?, ?B/s]

In [None]:
encoded_data_zh = [tokenizer(item['words'], padding = 'max_length' , truncation = True, return_offsets_mapping= True, is_split_into_words=True, max_length=128) for item in data_zh]

In [None]:
# change the encoded data type in order to fit the model
encoded_data_zh_pd = pd.DataFrame(columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
for i in range(len(encoded_data_zh)):
  labels = -100*np.ones(128)
  for j in range(len(df_labels[i])):
    labels[j+1] = df_labels[i][j]
  encoded_data_zh_pd.loc[i] = [encoded_data_zh[i]['input_ids'],encoded_data_zh[i]['token_type_ids'],encoded_data_zh[i]['attention_mask'],labels]

encoded_dataset = []
cols = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
for i in range(len(encoded_data_zh_pd)):
    row = {}
    for col in cols:
        if col == 'labels':
            row[col] = torch.LongTensor([encoded_data_zh_pd[col][i]])
        else:
            row[col] = torch.tensor([encoded_data_zh_pd[col][i]])
    encoded_dataset.append(row)

for item in encoded_dataset:
  for key in item:
    item[key] = torch.squeeze(item[key])

  row[col] = torch.LongTensor([encoded_data_zh_pd[col][i]])
  row[col] = torch.LongTensor([encoded_data_zh_pd[col][i]])


#Train the model

In [None]:
#set the model
model = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=5)
    

#shuffle and then split the dataset
shuffle(encoded_dataset)
train_set1000 = encoded_dataset[:1000]
train_set3000=encoded_dataset[1000:4000]
eva_set = encoded_dataset[4050:6050]

Downloading:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-c

In [None]:
#flatten the labels
def flatten(preds):
    true_label = preds.label_ids 
    pred_label = preds.predictions.argmax(-1) 
    t = []
    p = []
    for i in range(len(true_label)):
        for j in range(len(true_label[i])):
            if true_label[i][j] != -100:
                t.append(true_label[i][j])
                p.append(pred_label[i][j])
    return t, p

## Fine-tuned with 1,000 sentences

In [None]:
torch.backends.cudnn.enable = True
torch.backends.cudnn.benchmark = True

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
model.to(device)
training_args = TrainingArguments(
    num_train_epochs=5,
    per_device_train_batch_size=16,# If using colab or no cuda, please reduce epoches and set a smaller batch
    per_device_eval_batch_size=16, # don't make batch = 4 since the model lose the ability of converge
    output_dir='results',
    logging_dir='logs',
    no_cuda=False,  # defaults to false anyway, just to be explicit
    )

#train
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_set1000,
    )
trainer.train()


cuda


***** Running training *****
  Num examples = 1000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 315
  Number of trainable parameters = 101680901
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=315, training_loss=0.07923921857561383, metrics={'train_runtime': 56.7808, 'train_samples_per_second': 88.058, 'train_steps_per_second': 5.548, 'total_flos': 326629804800000.0, 'train_loss': 0.07923921857561383, 'epoch': 5.0})

In [None]:
preds = trainer.predict(eva_set)
t, p = flatten(preds)
print('f1 micro score is', f1_score(t, p,average='micro'))
print('f1 macro score is', f1_score(t, p,average='macro'))


***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


f1 micro score is 0.9625651555836361
f1 macro score is 0.6616059820302794


## Fine-tuned with 3,000 sentences

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
model.to(device)
training_args = TrainingArguments(
    num_train_epochs=5,
    per_device_train_batch_size=16,# If using colab or no cuda, please reduce epoches and set a smaller batch
    per_device_eval_batch_size=16, # don't make batch = 4 since the model lose the ability of converge
    output_dir='results',
    logging_dir='logs',
    no_cuda=False,  # defaults to false anyway, just to be explicit
    )

#train
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_set3000,
    )
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 3000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 940
  Number of trainable parameters = 85057540


cuda


Step,Training Loss
500,0.0185


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-500/tokenizer_config.json
Special tokens file saved in results/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=940, training_loss=0.01261544608055277, metrics={'train_runtime': 160.526, 'train_samples_per_second': 93.443, 'train_steps_per_second': 5.856, 'total_flos': 979880555520000.0, 'train_loss': 0.01261544608055277, 'epoch': 5.0})

In [None]:
preds = trainer.predict(eva_set)
t, p = flatten(preds)
print('f1 micro score is', f1_score(t, p, average='micro'))
print('f1 macro score is', f1_score(t, p, average='macro'))

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


f1 micro score is 0.9661190965092402
f1 macro score is 0.7525435630225409


## Fine-tuned with 3,000 sentences and frozen embeddings

In [None]:
# frozen embeddings

for param in model.bert.embeddings.parameters():
  param.requires_grad = False

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
model.to(device)
training_args = TrainingArguments(
    num_train_epochs=5,
    per_device_train_batch_size=16,# If using colab or no cuda, please reduce epoches and set a smaller batch
    per_device_eval_batch_size=16, # don't make batch = 4 since the model lose the ability of converge
    output_dir='results',
    logging_dir='logs',
    no_cuda=False,  # defaults to false anyway, just to be explicit
    )

#train
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_set3000,
    )
trainer.train()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-chinese/snapshots/84b432f646e4047ce1b5db001d43a348cd3f6bd0/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "posi

cuda


Step,Training Loss
500,0.1075


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-500/tokenizer_config.json
Special tokens file saved in results/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=940, training_loss=0.0645549865479165, metrics={'train_runtime': 159.8027, 'train_samples_per_second': 93.866, 'train_steps_per_second': 5.882, 'total_flos': 979880555520000.0, 'train_loss': 0.0645549865479165, 'epoch': 5.0})

In [None]:
preds = trainer.predict(eva_set)
t, p = flatten(preds)
print('f1 micro score is', f1_score(t, p, average='micro'))
print('f1 macro score is', f1_score(t, p, average='macro'))

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


f1 micro score is 0.9667509082293476
f1 macro score is 0.7441267239196769
