# Fine-tuned BERT Model for NER on Music Dataset

In [1]:
%run -i "../util/lang_utils.ipynb"

In [26]:
from datasets import (
    load_dataset, Dataset, Features, Value,
    ClassLabel, Sequence, DatasetDict)
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from transformers import DataCollatorForTokenClassification
from transformers import (
    AutoModelForTokenClassification, TrainingArguments, Trainer)
import numpy as np
from sklearn.model_selection import train_test_split
from evaluate import load

In [4]:
music_ner_df = pd.read_csv("../data/music_ner.csv")
def change_label(input_label):
    input_label = input_label.replace("_deduced", "")
    return input_label
music_ner_df["label"] = music_ner_df["label"].apply(change_label)
music_ner_df["text"] = music_ner_df["text"].apply(lambda x: x.replace("|", ", "))
music_ner_df.head()

Unnamed: 0,id,text,start_offset,end_offset,label
0,13434,"i love radioheads kid a something similar , k...",7,17,Artist_known
1,13434,"i love radioheads kid a something similar , k...",61,71,Artist_or_WoA
2,13435,anything similar to i fight dragons,20,35,WoA
3,13436,music similar to ccrs travelin band,17,30,Artist
4,13437,songs similar to blackout by boris,17,25,WoA


In [6]:
# Data preprocessing
ids = list(set(music_ner_df["id"].values))
docs = {}
for id in ids:
    entity_rows = music_ner_df[music_ner_df["id"]==id]
    text = entity_rows.head(1)["text"].values[0]
    doc = small_model(text)
    ents = []
    for _, row in entity_rows.iterrows():
        start = row["start_offset"]
        end = row["end_offset"]
        label = row["label"]
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            ents.append(span)
    doc.ents = ents
    docs[doc.text] = doc

In [None]:
# Parses data into tokens; maps NER tags to integers, reconstructs sentences, 
# Extracts the predicted entity spans
data_file = "../data/music_ner_bio.bio"
tag_mapping = {"O": 0, "B-Artist": 1, "I-Artist": 2, "B-WoA": 3, "I-WoA": 4}
with open(data_file, "r") as f:
    data = f.read()
tokens = [] # word lists per sentence
ner_tags = [] # integer NER tag lists per sentence
spans = [] # extracted entity spans per sentence
sentences = data.split("\n\n")
for sentence in sentences:
    words = [] # words in this sentence
    tags = [] # integer NER tags in this sentence
    this_sentence_spans = [] # NER spans from model output
    word_tag_pairs = sentence.split("\n") # word-tag pairs, separated by <TAB>
    for pair in word_tag_pairs:
        (word, tag) = pair.split("\t")
        words.append(word)
        tags.append(tag_mapping[tag])
    sentences_text = " ".join(words)
    try:
        doc = docs[sentences_text]
    except:
        pass
    ent_dict = {}
    for ent in doc.ents:
        this_sentence_spans.append(f"{ent.label_}: {ent.text}")
    tokens.append(words)
    ner_tags.append(tags)
    spans.append(this_sentence_spans)

In [15]:
# Split data into train and test sets
indices = range(0, len(spans))
train, test = train_test_split(indices, test_size=0.1)
train_tokens = []
train_ner_tags = []
train_spans = []
test_tokens = []
test_ner_tags = []
test_spans = []
for i, (toke, ner_tag, span) in enumerate(zip(tokens, ner_tags, spans)):
    if i in train:
        train_tokens.append(toke)
        train_ner_tags.append(ner_tag)
        train_spans.append(span)
    else:
        test_tokens.append(toke)
        test_ner_tags.append(ner_tag)
        test_spans.append(span)
print(len(train_tokens), len(test_tokens))

539 60


In [16]:
training_df = pd.DataFrame({"tokens":train_tokens,
                            "ner_tags":train_ner_tags,
                            "spans":train_spans})
testing_df = pd.DataFrame({"tokens":test_tokens,
                           "ner_tags":test_ner_tags,
                           "spans":test_spans})
training_df["text"] = training_df["tokens"].apply(lambda x: " ".join(x))
testing_df["text"] = testing_df["tokens"].apply(lambda x: " ".join(x))
training_df.dropna(inplace=True)
testing_df.dropna(inplace=True)
training_df.head()

Unnamed: 0,tokens,ner_tags,spans,text
0,"[anything, similar, to, i, fight, dragons]","[0, 0, 0, 1, 2, 2]",[WoA: i fight dragons],anything similar to i fight dragons
1,"[music, similar, to, ccrs, travelin, band]","[0, 0, 0, 1, 3, 4]",[Artist: ccrs travelin],music similar to ccrs travelin band
2,"[songs, similar, to, blackout, by, boris]","[0, 0, 0, 3, 0, 1]","[WoA: blackout, Artist: boris]",songs similar to blackout by boris
3,"[songs, similar, to, trios, da, da, da]","[0, 0, 0, 1, 3, 4, 4]","[Artist_known: trios, WoA: da da da]",songs similar to trios da da da
4,"[aything, similar, to, radioheads, everything,...","[0, 0, 0, 1, 3, 4, 4, 4, 4]","[Artist_known: radioheads, WoA: everything in ...",aything similar to radioheads everything in it...


In [20]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
features = Features({
    "tokens": Sequence(feature=Value(dtype="string", id=None),
                       length=-1, id=None),
    "ner_tags": Sequence(feature=ClassLabel(names=["O", "B-Artist", "I-Artist", "B-WoA", "I-WoA"]),
                         length=-1, id=None),
    "spans": Sequence(feature=Value(dtype="string", id=None),
                      length=-1, id=None),
    "text": Value(dtype="string", id=None)
})
training_dataset = Dataset.from_pandas(training_df, features=features)
testing_dataset = Dataset.from_pandas(testing_df, features=features)
dataset = DatasetDict({
    "train": training_dataset,
    "test": testing_dataset
})
print(dataset["train"].features)
label_names = dataset["train"].features["ner_tags"].feature.names
print(dataset)

{'tokens': List(Value('string')), 'ner_tags': List(ClassLabel(names=['O', 'B-Artist', 'I-Artist', 'B-WoA', 'I-WoA'])), 'spans': List(Value('string')), 'text': Value('string')}
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'spans', 'text'],
        num_rows: 539
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'spans', 'text'],
        num_rows: 60
    })
})


In [21]:
def tokenize_adjust_labels(all_samples_per_split):
    tokenized_samples = tokenizer.batch_encode_plus(
        all_samples_per_split["text"])
    total_adjusted_labels = []
    for k in range(0, len(tokenized_samples["input_ids"])):
        prev_wid = -1
        word_ids_list = tokenized_samples.word_ids(batch_index=k)
        existing_label_ids = all_samples_per_split["ner_tags"][k]
        i = -1
        adjusted_label_ids = []
        for wid in word_ids_list:
            if wid is None:
                adjusted_label_ids.append(-100)
            elif wid != prev_wid:
                i += 1
                adjusted_label_ids.append(existing_label_ids[i])
                prev_wid = wid
            else:
                label_name = label_names[existing_label_ids[i]]
                adjusted_label_ids.append(existing_label_ids[i])
        total_adjusted_labels.append(adjusted_label_ids)
    tokenized_samples["labels"] = total_adjusted_labels
    return tokenized_samples

In [22]:
tokenized_dataset = dataset.map(
    tokenize_adjust_labels,batched=True)

Map:   0%|          | 0/539 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [23]:
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer)

In [24]:
metric = load("seqeval")
def compute_metrics(data):
    predictions, labels = data
    predictions = np.argmax(predictions, axis=2)
    
    data = zip(predictions, labels)
    data = [
        [(p, l) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in data
    ]
    
    true_predictions = [
        [label_names[p] for (p, l) in data_point]
        for data_point in data
    ]
    true_labels = [
        [label_names[l] for (p, l) in data_point]
        for data_point in data
    ]
    
    results = metric.compute(predictions=true_predictions,
                             references=true_labels)
    flat_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    for k in results.keys():
        if (k not in flat_results.keys()):
            flat_results[k + "_f1"] = results[k]["f1"]
    return flat_results

Downloading builder script: 0.00B [00:00, ?B/s]

In [28]:
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(label_names))
training_args = TrainingArguments(
    output_dir="./fine_tune_bert_output",
    eval_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=7,
    weight_decay=0.01,
    logging_steps=1000,
    run_name="ep_10_tokenized_l1",
    save_strategy="no",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss


TrainOutput(global_step=238, training_loss=0.25610123962915243, metrics={'train_runtime': 376.9644, 'train_samples_per_second': 10.009, 'train_steps_per_second': 0.631, 'total_flos': 50108074210740.0, 'train_loss': 0.25610123962915243, 'epoch': 7.0})

In [29]:
trainer.evaluate()



{'eval_loss': 0.2008223682641983,
 'eval_overall_precision': 0.5444444444444444,
 'eval_overall_recall': 0.6125,
 'eval_overall_f1': 0.5764705882352941,
 'eval_overall_accuracy': 0.9241935483870968,
 'eval_Artist_f1': 0.5739130434782609,
 'eval_WoA_f1': 0.5818181818181819,
 'eval_runtime': 2.7652,
 'eval_samples_per_second': 21.698,
 'eval_steps_per_second': 1.447,
 'epoch': 7.0}

In [30]:
trainer.save_model("../models/bert_fine_tuned")

In [31]:
model = AutoModelForTokenClassification.from_pretrained("../models/bert_fine_tuned")
tokenizer = AutoTokenizer.from_pretrained("../models/bert_fine_tuned")

In [32]:
text = "music similar to morphie robocobra quartet | featuring elements like saxophone prominent bass"
from transformers import pipeline
pipeline = pipeline(
    task="token-classification", model=model, tokenizer=tokenizer,
    aggregation_strategy="simple")
result = pipeline(text)
for entity in result:
    print(entity)

Device set to use cpu


{'entity_group': 'LABEL_0', 'score': np.float32(0.9992376), 'word': 'music similar to', 'start': 0, 'end': 16}
{'entity_group': 'LABEL_1', 'score': np.float32(0.855024), 'word': 'morphie roboco', 'start': 17, 'end': 31}
{'entity_group': 'LABEL_2', 'score': np.float32(0.71867263), 'word': '##bra quartet', 'start': 31, 'end': 42}
{'entity_group': 'LABEL_0', 'score': np.float32(0.99892324), 'word': '| featuring elements like saxophone prominent bass', 'start': 43, 'end': 93}
