In [1]:
import torch
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import wandb

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


## 1. Load your dataset


In [2]:
def process_conll_data(example):
    """Convert one raw text record into parallel token / tag lists.

    The record's ``'text'`` value is split on newlines. Each non-blank line is
    stripped and split on whitespace; lines with exactly two fields contribute
    a (token, tag) pair, and any other line is reported on stdout and dropped.

    Args:
        example: Mapping with a ``'text'`` string.

    Returns:
        Dict with ``'tokens'`` and ``'ner_tags'``, two lists of equal length.
    """
    words = []
    labels = []

    for raw_line in example['text'].split('\n'):
        row = raw_line.strip()
        if not row:
            # Blank (or whitespace-only) lines carry no annotation.
            continue

        fields = row.split()
        if len(fields) != 2:
            # Anything that is not "<token> <tag>" is reported and ignored.
            print(f"Skipping line with unexpected format: {row}")
            continue

        word, label = fields
        words.append(word)
        labels.append(label)

    return dict(tokens=words, ner_tags=labels)
    
# Location of the raw annotation files, one per split.
data_files = {
    'train': 'data/train.txt',
    'validation': 'data/val.txt',
    'test': 'data/test.txt'
}

# The 'text' builder emits one example per *line* by default, which would hand
# process_conll_data a single "<token> <tag>" line at a time and turn every
# token into its own one-word sentence (and every blank line into an empty
# example). sample_by="paragraph" makes each blank-line-separated block one
# example, so a whole sentence reaches the parser together.
# NOTE(review): assumes sentences in the files are separated by blank lines
# (the usual CoNLL layout) -- confirm against data/train.txt.
dataset = load_dataset('text', data_files=data_files, sample_by="paragraph")

# Adds 'tokens' and 'ner_tags' list columns next to the raw 'text' column.
processed_datasets = dataset.map(process_conll_data)


Downloading and preparing dataset text/default to /home/shemmati/.cache/huggingface/datasets/text/default-2cdcb03b52ac90d6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files: 100%|██████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 3448.32it/s]
Extracting data files: 100%|█████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 40.07it/s]
                                                                           

Dataset text downloaded and prepared to /home/shemmati/.cache/huggingface/datasets/text/default-2cdcb03b52ac90d6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:13<00:00,  4.34s/it]
                                                                                                                                               

## 2. Load the tokenizer and model


In [6]:
# Collect every distinct tag string seen in the training split; the size of
# that set is the number of classes the classification head has to predict.
unique_ner_tags = {
    tag
    for sentence_tags in processed_datasets["train"]["ner_tags"]
    for tag in sentence_tags
}

num_labels = len(unique_ner_tags)

In [9]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

# Fix a deterministic tag <-> id mapping (a tag's id is its position in the
# sorted list of tag strings) and store it in the model config. Without it the
# config only holds placeholder names (LABEL_0, LABEL_1, ...), so a saved model
# could not translate its predictions back into the dataset's tag strings.
id2label = dict(enumerate(sorted(unique_ner_tags)))
label2id = {tag: idx for idx, tag in id2label.items()}

# The checkpoint's classification head was trained for a different number of
# classes than this dataset has. ignore_mismatched_sizes=True lets the
# pretrained weights load anyway: only the classifier weight and bias are
# discarded and re-initialised at the new size. That is expected here -- the
# new head simply has to be trained before the model is used for predictions.
model = AutoModelForTokenClassification.from_pretrained(
    "dslim/bert-base-NER",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)


# Define a data collator to handle token-level tasks (like NER); it pads the
# inputs (and token labels, when present) of each batch to a common length.
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 3. Tokenize the dataset


In [10]:
# Tag string -> integer class id; a tag's id is its position in the sorted
# list of tag strings found in the training split.
label2id = {tag: idx for idx, tag in enumerate(sorted(unique_ner_tags))}


def tokenize_function(examples):
    """Tokenize a batch of pre-split sentences and attach aligned labels.

    The tokenizer may split one word into several word pieces, so the
    per-word tags cannot be used as targets directly. For every produced
    token this function looks up the word it came from and emits:

    * the word's class id for the first piece of each word,
    * ``-100`` for special tokens and for the remaining pieces of a word
      (``-100`` is the index PyTorch's cross-entropy loss ignores).

    No padding is applied here; ``data_collator`` pads each batch to its own
    longest sequence, which avoids padding every example to the model maximum.

    Args:
        examples: Batch with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of tag-string lists, parallel to the words).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of class ids per sentence.

    Raises:
        KeyError: If a tag is not present in ``label2id`` (i.e. it never
            occurred in the training split).
    """
    encodings = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        previous_word = None
        label_ids = []
        for word_index in encodings.word_ids(batch_index=row):
            if word_index is None or word_index == previous_word:
                # Special token, or a continuation piece of the same word.
                label_ids.append(-100)
            else:
                label_ids.append(label2id[word_tags[word_index]])
            previous_word = word_index
        aligned_labels.append(label_ids)

    encodings["labels"] = aligned_labels
    return encodings

tokenized_datasets = processed_datasets.map(tokenize_function, batched=True, num_proc=4)


                                                                                                                                               

## 4. Train


In [13]:
# Open the wandb run explicitly so the project and run name are under our control.
wandb.init(project='NEDAI',name='try1')

# Everything between init and finish is wrapped in try/finally: if argument
# construction or training raises, the wandb run is still closed instead of
# being left open in the kernel.
try:
    model.to(device)

    training_args = TrainingArguments(
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        evaluation_strategy="epoch",  # evaluate on the validation split after every epoch
        logging_dir="./logs",
        report_to="wandb",  # Log to wandb
        logging_steps=100,
        do_train=True,
        do_eval=True,
        output_dir="./results",
    )

    # Define the Trainer
    trainer = Trainer(
        model = model,
        args = training_args,
        data_collator = data_collator,
        train_dataset = tokenized_datasets["train"],
        eval_dataset = tokenized_datasets["validation"],
        tokenizer = tokenizer,
    )

    # Train the model
    trainer.train()
finally:
    wandb.finish()

# Save the model together with its tokenizer so the directory can be loaded
# on its own. Only reached when training completed without an exception.
model.save_pretrained("./ner_model")
tokenizer.save_pretrained("./ner_model")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`