In [1]:
#!pip install transformers
#!pip install datasets
#!pip install evaluate

from transformers import AutoTokenizer, DistilBertModel, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, AutoModelForMaskedLM
import evaluate
import numpy as np

from datasets import Dataset as ds
from datasets import DatasetDict, load_dataset
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.utils.data import random_split


device = torch.device('cuda:2')

In [2]:
def read_chinese_data(inputfilename):
    with open(inputfilename, "r") as inputfile:
        sentences = []
        collection_words = []
        collection_labels = []
        for line in inputfile:
            if line[0] == '#':
                continue
            columns = line.split()
            #print(words)
            if columns == []:
                sentences.append((''.join(collection_words), collection_labels))
                collection_words = []
                collection_labels = []
                continue
            collection_words.append(columns[1])
            collection_labels += [1] + ([0] * (len(columns[1]) - 1))

    return sentences

In [3]:
train_sentences = read_chinese_data('zh_gsd-ud-train.conllu')
test_sentences = read_chinese_data('zh_gsd-ud-test.conllu')

In [4]:
train_sentences[:5]

[('看似簡單，只是二選一做決擇，但其實他們代表的是你周遭的親朋好友，試著給你不同的意見，但追根究底，最後決定的還是自己。',
  [1,
   0,
   1,
   0,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   0,
   1,
   1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   1,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   1,
   1,
   1,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   1,
   1,
   0,
   0,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1]),
 ('其便當都是買來的，就算加熱也是由媽媽負責（後來揭曉其實是避免帶來厄運），父親則在電視台上班。',
  [1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   1,
   1,
   0,
   1,
   1,
   1,
   0,
   1,
   1,
   0,
   1]),
 ('這次遊行最大的特色，在於越來越多年輕人上街遊行，而且當中不乏行動激烈的躁少年。',
  [1,
   1,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   1,
   1,
   1,
   0,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
 

In [5]:
def index_chars(sentences):
    megasentence = ''.join(sentences)
    char_list = set()
    for c in megasentence:
        char_list.add(c)
    char_list = list(char_list)
    return char_list, {char_list[x]:x for x in range(len(char_list))}

In [6]:
int_index, char_index = index_chars([x[0] for x in train_sentences])

In [7]:
int_index[40], char_index[int_index[40]]

('玖', 40)

In [8]:
class Chinese_Dataset(Dataset):
    def __init__(self, sequences, tokenizer):
        self.sequences = [x[0] for x in sequences]
        self.ner_tags = [x[1] for x in sequences]
        int_index, char_index = index_chars([x for x in self.sequences])
        self.int_indices = int_index
        self.char_indices = char_index
        self.tokenizer = tokenizer

    def __len__(self):
         return len(self.sequences)

    def __getitem__(self, idx):
        encoding = self.tokenizer(self.sequences[idx], padding='max_length', truncation=True, max_length=182, return_tensors='pt')

        max_seq_length = len(encoding['input_ids'][0])
        labels = self.ner_tags[idx]
        padded_labels = labels + [0] * (max_seq_length - len(labels))

        item = {
            'tokens': self.sequences[idx],
            'input_ids': encoding['input_ids'].squeeze(0),
            #'attention_mask': encoding['attention_mask'],
            'labels': padded_labels,
        }
        #item = {"id": idx, "tokens": self.sequences[idx], "tags": self.ner_tags[idx]}

        return item

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

In [10]:
data = Chinese_Dataset(train_sentences, tokenizer)

In [11]:
#checking if everything is working
print("Dataset Length:", len(data))
sample_item = data[40]
print("Sample Item:", sample_item)

Dataset Length: 3997
Sample Item: {'tokens': '現存的三棟屋村位於三棟屋路近和宜合交匯處一帶。', 'input_ids': tensor([ 101, 4412, 2100, 4638,  676, 3477, 2238, 3333,  855, 3176,  676, 3477,
        2238, 6662, 6818, 1469, 2139, 1394,  769, 1274, 5993,  671, 2380,  511,
         102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    

In [12]:
data.int_indices[40], data.char_indices[int_index[40]]

('玖', 40)

In [13]:
example = data[40]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=False)
# Convert token IDs to tokens (words)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 '現',
 '存',
 '的',
 '三',
 '棟',
 '屋',
 '村',
 '位',
 '於',
 '三',
 '棟',
 '屋',
 '路',
 '近',
 '和',
 '宜',
 '合',
 '交',
 '匯',
 '處',
 '一',
 '帶',
 '。',
 '[SEP]']

In [14]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [15]:
def assign_labels(sentences):
    labels_list = []
    for i in range(len(sentences)):
        labels = []
        tags = sentences[i]["labels"]
        for tag in tags:
            sentence_labels = []
            if tag == 1:
                labels.append("beginning_of_word")
            else:
                labels.append("continuation_of_word")
        labels_list.append(labels)
    return labels_list

In [16]:
id2labels = assign_labels(data)

In [17]:
#!pip install seqeval
seqeval = evaluate.load("seqeval")

labels = [id2labels[i] for i in example[f"labels"]]

In [18]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [19]:
id2label = {
    1: "beginning_of_word",
    0: "continuation_of_word",
}

In [20]:
label2id = {
    "beginning_of_word": 1,
    "continuation_of_word": 0,
}

In [21]:
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)
chinese_model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
#chinese_model.to(device)
#model.to(device)

In [23]:
training_steps = (2 * len(data))/16

In [24]:
#!pip install transformers[torch]
training_args = TrainingArguments(
    output_dir="BERT_and_chinese",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="no",
    load_best_model_at_end=False,
    push_to_hub=False,
    #measure to avoid cuda errors with Trainer when training chinese BERT
    eval_accumulation_steps=1,
)

In [25]:
train_dataset = int(0.8*len(data))
validation_dataset = len(data) - train_dataset

train, validation = random_split(data, (train_dataset, validation_dataset))
print(len(train), len(validation), len(data))

3197 800 3997


In [26]:
trainer = Trainer(
    model=chinese_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train,
#     eval_dataset=validation,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics
# )

In [27]:
trainer.train()
trainer.save_model()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


In [28]:
#fine tuning

from transformers import create_optimizer, pipeline
from transformers.keras_callbacks import KerasMetricCallback
import tensorflow as tf

In [29]:
batch_size = 16
num_train_epochs = 3
num_train_steps = (len(data)// batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [30]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)
chinese_model = TFAutoModelForTokenClassification.from_pretrained("bert-base-chinese", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [31]:
def transform_dataset(dataset):
  data = {}
  for i in range(len(dataset)):
    item = dataset[i]
    input_ids = item["input_ids"]
    labels = item["labels"]
    data[input_ids] = labels
  format = {"input_ids" : list(data.keys()), "labels" : list(data.values())}
  dataset = ds.from_dict(format)
  return dataset

In [32]:
train_dataset = transform_dataset(train)
validation_dataset = transform_dataset(validation)

In [33]:
train_dataset

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 3197
})

In [34]:
chinese_dataset = DatasetDict({"train":train_dataset, "test":validation_dataset})

In [35]:
chinese_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 3197
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 800
    })
})

In [36]:
tf_train_set = model.prepare_tf_dataset(
    chinese_dataset["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    chinese_dataset["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [37]:
chinese_model.compile(optimizer=optimizer)
#model.compile(optimizer=optimizer)

In [38]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [39]:
chinese_model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=metric_callback)
#model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=metric_callback)

Epoch 1/3



Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7d36a068fb50>

In [40]:
text = test_sentences[0]

In [41]:
text

('然而，這樣的處理也衍生了一些問題。', [1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1])

In [42]:
from transformers import AutoTokenizer

In [43]:
tokenizer = AutoTokenizer.from_pretrained("BERT_and_chinese")
inputs = tokenizer(text[0], return_tensors="pt")

In [44]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("BERT_and_chinese")
with torch.no_grad():
    logits = model(**inputs).logits

Some weights of BertForTokenClassification were not initialized from the model checkpoint at BERT_and_chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
#Gets the class with the highest probability, and uses the model’s id2label mapping to convert it to a text label:

predictions = torch.argmax(logits, dim=2)
predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
predicted_token_class

['continuation_of_word',
 'continuation_of_word',
 'continuation_of_word',
 'continuation_of_word',
 'continuation_of_word',
 'continuation_of_word',
 'continuation_of_word',
 'continuation_of_word',
 'continuation_of_word',
 'beginning_of_word',
 'continuation_of_word',
 'continuation_of_word',
 'beginning_of_word',
 'continuation_of_word',
 'beginning_of_word',
 'continuation_of_word',
 'continuation_of_word',
 'continuation_of_word',
 'continuation_of_word']