In [1]:
# Load datasets with pandas
import pandas as pd

_DATASET_DIR = 'C:/Users/ADMIN/Desktop/DATN/Extract_information/data/split_mave/'

df_train_positives = pd.read_json(_DATASET_DIR + 'train/mave_positives.jsonl', lines=True)
df_train_negatives = pd.read_json(_DATASET_DIR + 'train/mave_negatives.jsonl', lines=True)
df_train = pd.concat([df_train_positives, df_train_negatives])

df_val_positives = pd.read_json(_DATASET_DIR + 'eval/mave_positives.jsonl', lines=True)
df_val_negatives = pd.read_json(_DATASET_DIR + 'eval/mave_negatives.jsonl', lines=True)
df_val = pd.concat([df_val_positives, df_val_negatives])

df_train.head()

Unnamed: 0,id,category,title,attributes
0,B00005LE4P,Laptops,ThinkPad T22 2647 - PIII 900 MHz - RAM 128 MB ...,"[{'key': 'Screen Size', 'evidences': [{'value'..."
1,B00005LE4P,Laptops,ThinkPad T22 2647 - PIII 900 MHz - RAM 128 MB ...,"[{'key': 'Processor Speed', 'evidences': [{'va..."
2,B00005LE4P,Laptops,ThinkPad T22 2647 - PIII 900 MHz - RAM 128 MB ...,"[{'key': 'Resolution', 'evidences': [{'value':..."
3,B00005NBJB,Laptops,"Apple iBook Laptop (500-MHz PowerPC G3, 128 MB...","[{'key': 'Processor Speed', 'evidences': [{'va..."
4,B00005NBIS,Laptops,"Apple iBook Laptop (500-MHz PowerPC G3, 64 MB ...","[{'key': 'Processor Speed', 'evidences': [{'va..."


In [2]:
# Group by product id
df_train_grouped = df_train.groupby(['id']).agg({'attributes': 'sum', 'title': 'min', 'category': 'min'})
df_val_grouped = df_val.groupby(['id']).agg({'attributes': 'sum', 'title': 'min', 'category': 'min'})

In [3]:
from transformers import AutoTokenizer

model_checkpoint = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Determine NER tags relevant - Focus on ner tags contained in the test set
def extract_ner_tags(example):
    new_ner_tags = ['{}_{}'.format('_'.join(example['category'].split(' ')), '_'.join(attribute['key'].split(' '))) for attribute in example['attributes']]
    return new_ner_tags

ner_tags = set()

[ner_tags.update(tags) for tags in df_train_grouped.apply(extract_ner_tags, axis=1).tolist()]

# Create processed ner tags
processed_ner_tags = ['O']
for ner_tag in ner_tags:
    processed_ner_tags.append('B-{}'.format(ner_tag))
    processed_ner_tags.append('I-{}'.format(ner_tag))

# Create dict ner_tags 2 numbers in list
processed_ner_tags_2_number = dict(zip(processed_ner_tags, [i for i in range(0, len(processed_ner_tags))]))
print(processed_ner_tags_2_number)

{'O': 0, 'B-Flash_Memory_Cards_Capacity': 1, 'I-Flash_Memory_Cards_Capacity': 2, 'B-Digital_Cameras_Sensor_Size': 3, 'I-Digital_Cameras_Sensor_Size': 4, 'B-Flash_Memory_Cards_Memory_Stick_Format': 5, 'I-Flash_Memory_Cards_Memory_Stick_Format': 6, 'B-Laptops_Screen_Size': 7, 'I-Laptops_Screen_Size': 8, 'B-Digital_Cameras_Resolution': 9, 'I-Digital_Cameras_Resolution': 10, 'B-Laptops_Processor_Speed': 11, 'I-Laptops_Processor_Speed': 12, 'B-Laptops_Resolution': 13, 'I-Laptops_Resolution': 14, 'B-Laptops_Battery_Life': 15, 'I-Laptops_Battery_Life': 16, 'B-Laptops_Weight': 17, 'I-Laptops_Weight': 18, 'B-Laptops_Number_of_Cores': 19, 'I-Laptops_Number_of_Cores': 20, 'B-Digital_Cameras_Optical_Zoom': 21, 'I-Digital_Cameras_Optical_Zoom': 22, 'B-Flash_Memory_Cards_SD_Format': 23, 'I-Flash_Memory_Cards_SD_Format': 24, 'B-Laptops_Refresh_Rate': 25, 'I-Laptops_Refresh_Rate': 26, 'B-Laptops_Processor_Brand': 27, 'I-Laptops_Processor_Brand': 28, 'B-Digital_Cameras_Sensor_Type': 29, 'I-Digital_Came

In [10]:
def assign_ner_tags(example):
    token_input = tokenizer(example['title'])
    example['tokens'] = tokenizer.convert_ids_to_tokens(token_input['input_ids'])
    
    ner_tags = [0 for token in example['tokens']]
    for attribute in example['attributes']:
        cat_attr = '{}_{}'.format('_'.join(example['category'].split(' ')), '_'.join(attribute['key'].split(' ')))
        if 'B-{}'.format(cat_attr) in processed_ner_tags:
            for evidence in attribute['evidences']:
                if evidence['value'] in example['title']:
                    begin = evidence['begin'] + len(example['tokens'][0]) + 1 #Take care of CLS token
                    end = evidence['end'] + len(example['tokens'][0]) + 1
                    token_position = 0
                    found_beginning = False
                    position_update = {}
                    relevant_tokens = example['tokens'].copy()
                    current_token = relevant_tokens[0]
                    relevant_tokens = relevant_tokens [1:]
                    for position in range(0, len(example['title'])):
                        if not found_beginning:
                            if position == begin:
                                position_update[token_position] = 'B-{}'.format(cat_attr)
                                found_beginning = True 
                            
                        elif position >= begin and position < end and token_position not in position_update:
                            position_update[token_position] = 'I-{}'.format(cat_attr)
                        
                        if position > end  + 1:
                            break
                        
                        # Shorten current token
                        if len(current_token) > 1:
                            current_token = current_token[1:]
                        
                        # Jump to next token
                        elif len(current_token) == 1:
                            current_token = relevant_tokens[0]
                            relevant_tokens = relevant_tokens [1:]
                            token_position += 1
                        else:
                            print(current_token)
                            print('Something went wrong!')
                    
                    # Assign positions
                    for position, found_ner_tag in position_update.items():
                            ner_tags[position] = processed_ner_tags_2_number[found_ner_tag]
    ner_tags[0] = -100
    ner_tags[-1] = -100
    return ner_tags

In [11]:
df_train_grouped['ner_tags'] = df_train_grouped.apply(assign_ner_tags, axis=1)
df_val_grouped['ner_tags'] = df_val_grouped.apply(assign_ner_tags, axis=1)

In [7]:
df_train_grouped_reduced = df_train_grouped[['title', 'category','ner_tags']]
df_val_grouped_reduced = df_val_grouped[['title', 'category','ner_tags']]
df_train_grouped_reduced

Unnamed: 0_level_0,title,category,ner_tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1060220822,Essential 64GB Samsung Galaxy Tab 10.1 Micro S...,Flash Memory Cards,"[-100, 0, 11, 12, 0, 0, 0, 0, 0, 0, 29, 17, 30..."
106022206X,Essential 64GB AT&T F160 Micro SDHC Card is cu...,Flash Memory Cards,"[-100, 0, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1060223929,Essential 64GB HTC EVO 4G LTE Micro SDHC Card ...,Flash Memory Cards,"[-100, 0, 11, 12, 0, 0, 0, 0, 0, 29, 17, 30, 0..."
106023565X,Essential 64GB Sony Xperia miro Micro SDHC Car...,Flash Memory Cards,"[-100, 0, 11, 12, 0, 0, 0, 0, 29, 30, 30, 0, 0..."
1060237482,Essential 64GB Acer Iconia A1-830 Micro SDHC C...,Flash Memory Cards,"[-100, 0, 11, 12, 0, 0, 0, 0, 0, 0, 0, 29, 17,..."
...,...,...,...
B01HHCJEK0,Canon EOS 6D Digital SLR Camera with EF 24-105...,Digital Cameras,"[-100, 0, 13, 0, 0, 13, 14, 14, 0, 0, 0, 0, 0,..."
B01HHR572O,Sigma SD Quattro Digital Camera with 30mm F1.4...,Digital Cameras,"[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
B01HI9NT6M,2019 Samsung 11.6” Thin & Lightweight HD Chrom...,Laptops,"[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
B01HIJIZR0,2016 High Performance Flagship Toshiba FHD IPS...,Laptops,"[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33, 34, 34, ..."


In [30]:
#!pip install datasets
# Convert to huggingface dataset
from datasets import Dataset, DatasetDict, ClassLabel, Features, Value, Sequence

features = Features({'title': Value('string'), 
                     'category' : Value('string'), 
                     'ner_tags': Sequence(feature=ClassLabel(names=processed_ner_tags)), 
                     'id': Value('string')})

train_dataset = Dataset.from_pandas(df_train_grouped_reduced, features=features)
val_dataset = Dataset.from_pandas(df_val_grouped_reduced, features=features)

raw_datasets = DatasetDict({"train":train_dataset, "val": val_dataset})

In [31]:
def tokenize_and_align_sequences(examples):
    tokenized_inputs = tokenizer(
        examples["title"])
    tokenized_inputs["labels"] = examples["ner_tags"]
    return tokenized_inputs

In [32]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_sequences,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/18083 [00:00<?, ? examples/s]

Map:   0%|          | 0/2212 [00:00<?, ? examples/s]

## Start training

In [34]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [35]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,   27,   28,    0,    0,    0,    0,    0,    0,   15,   19,
           16,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0, -100],
        [-100,    0,   27,   28,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         -100, -100]])

In [36]:
#!pip install seqeval
#!pip install evaluate
import evaluate

metric = evaluate.load("seqeval")

In [37]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [38]:
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names

In [39]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [40]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForTokenClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'deberta.embeddings.position_embeddings.weight']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a B

In [41]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="4"
os.environ["TOKENIZERS_PARALLELISM"]="false"

In [42]:
#!pip3 install torch torchvision torchaudio
#!pip install accelerate -U

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="/ceph/alebrink/MAVE/baselines/named_entity_recognition/deberta-v3-large-finetuned-ner-10epochs-V2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    push_to_hub=False,
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    
)
trainer.train()