In [1]:
import torch
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import wandb

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


## 1. Load your dataset


In [2]:
def process_sample(batch):
    """Group ``token tag`` lines into sentences.

    Each non-blank entry of ``batch['text']`` must hold exactly two
    whitespace-separated fields: the token and its tag. A blank (empty or
    whitespace-only) entry marks the end of the current sentence.

    Args:
        batch: mapping with a ``'text'`` key holding an iterable of line strings.

    Returns:
        dict with two parallel lists: ``'tokens'`` (one list of tokens per
        sentence) and ``'tags'`` (one list of tags per sentence). Empty
        sentences are never emitted, so runs of blank lines are harmless.

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.

    Note:
        Sentence state lives only inside one call. If the caller feeds the
        file in several batches, a sentence that straddles two batches is
        returned as two separate sentences.
    """
    tokens_list = []
    tags_list = []
    tokens = []
    tags = []

    for line in batch['text']:
        fields = line.split()  # splits on any run of whitespace

        if not fields:
            # Blank or whitespace-only line: sentence boundary. Only flush
            # when something was collected, so repeated blank lines do not
            # create empty sentences.
            if tokens:
                tokens_list.append(tokens)
                tags_list.append(tags)
                tokens = []
                tags = []
            continue

        if len(fields) != 2:
            raise ValueError(
                "Expected 'token tag' on each non-empty line, got: {!r}".format(line)
            )

        token, tag = fields
        tokens.append(token)
        tags.append(tag)

    # Flush the last sentence when the input does not end with a blank line.
    if tokens:
        tokens_list.append(tokens)
        tags_list.append(tags)

    return {'tokens': tokens_list, 'tags': tags_list}
    

# Local text files for each split; process_sample expects one
# "token tag" pair per line with blank lines between sentences.
data_files = {
    'train': 'data/filtered_train.txt',
    'validation': 'data/filtered_val.txt',
    'test': 'data/filtered_test.txt'
}

# Load the dataset from local files without specifying a script
dataset = load_dataset('text', data_files=data_files)

# batch_size=None passes each split to process_sample as one single batch.
# With the default batch size (1000 lines) a sentence that happens to cross
# a batch boundary would be cut into two sentences, because process_sample
# only tracks sentence state within one call.
pdataset = dataset.map(
    process_sample,
    batched=True,
    batch_size=None,
    remove_columns=['text'],
)


Downloading and preparing dataset text/default to /home/shemmati/.cache/huggingface/datasets/text/default-3810249c42304ce8/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 4554.08it/s]
Extracting data files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 80.81it/s]
                                                                      

Dataset text downloaded and prepared to /home/shemmati/.cache/huggingface/datasets/text/default-3810249c42304ce8/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  1.81it/s]
                                                                                                                                                                   

## 2. Load the tokenizer and model


In [3]:
# Tag inventory used for training; the position in the list is the class id.
id2label = dict(enumerate([
    "O",
    "B-name",
    "I-name",
    "B-redshift",
    "I-redshift",
    "B-RA",
    "I-RA",
    "B-DEC",
    "I-DEC",
    "B-Type",
    "I-Type",
]))
# Inverse lookup (tag string -> class id), derived so the two maps cannot drift.
label2id = {tag: idx for idx, tag in id2label.items()}

In [4]:
from transformers import DataCollatorForTokenClassification

# Tokenizer that matches the pretrained NER checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "dslim/bert-base-NER",
    add_prefix_space=True,
)

# The checkpoint's classifier head has a different number of labels than the
# 11 used here, so the head is re-created with the requested size;
# ignore_mismatched_sizes lets the remaining weights load anyway.
model = AutoModelForTokenClassification.from_pretrained(
    "dslim/bert-base-NER",
    num_labels=11,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Collator for token-level tasks (like NER), built from the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([11]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([11, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 3. Tokenize the dataset


In [5]:
def recursive_label2id_conversion(label, label2id):
    """Translate a tag string, or an arbitrarily nested list of them, to ids.

    Args:
        label: a ``str`` tag or a (possibly nested) ``list`` of tags.
        label2id: mapping from tag string to integer id.

    Returns:
        The id for a string input, or a list with the same nesting for a
        list input.

    Raises:
        ValueError: if ``label`` (or a nested element) is neither ``str`` nor ``list``.
        KeyError: if a tag string is missing from ``label2id``.
    """
    if isinstance(label, list):
        return [recursive_label2id_conversion(item, label2id) for item in label]
    if not isinstance(label, str):
        raise ValueError("Unsupported label type")
    return label2id[label]
        
def tokenize_and_align_labels2(examples, label2id):
    """Tokenize pre-split sentences and align word-level tags to word pieces.

    Uses the module-level ``tokenizer``. For every sentence, the first piece
    of each word receives that word's label id; special tokens (word id
    ``None``) and the remaining pieces of a word receive ``-100``.

    Args:
        examples: batch with ``"tokens"`` (list of word lists) and ``"tags"``
            (list of tag lists, parallel to ``"tokens"``).
        label2id: mapping from tag string to integer id.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        list of label ids per sentence.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    all_label_ids = []
    for sentence_idx, sentence_tags in enumerate(examples["tags"]):
        tag_ids = recursive_label2id_conversion(sentence_tags, label2id)

        aligned = []
        last_word = None
        for word in tokenized_inputs.word_ids(batch_index=sentence_idx):
            # Only the first piece of a real word carries the word's label.
            starts_new_word = word is not None and word != last_word
            aligned.append(tag_ids[word] if starts_new_word else -100)
            last_word = word
        all_label_ids.append(aligned)

    tokenized_inputs["labels"] = all_label_ids
    return tokenized_inputs

# Tokenize every split and attach aligned label ids, using 4 worker processes.
tokenized_datap = pdataset.map(
    tokenize_and_align_labels2,
    batched=True,
    fn_kwargs={"label2id": label2id},
    num_proc=4,
)


                                                                                                                                                                   

## 4. Train


In [None]:
# Start a Weights & Biases run; the Trainer reports to it via report_to="wandb".
wandb.init(project='NEDAI',name='try1')

model.to(device)

training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=30,
    evaluation_strategy="epoch",  # run evaluation once per epoch
    logging_dir="./logs",
    report_to="wandb",  # Log to wandb
    logging_steps=20,
    do_train=True,
    do_eval=True,
    output_dir="./results",
)

# Define the Trainer
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = tokenized_datap["train"],
    eval_dataset = tokenized_datap["validation"],
    tokenizer = tokenizer,
)

# Train the model. The W&B run is closed in `finally` so that a failure or a
# manual interrupt inside trainer.train() does not leave the run open; the
# exception still propagates and the save step below is skipped in that case.
try:
    trainer.train()
finally:
    wandb.finish()

# Save the model
model.save_pretrained("./ner_model")
tokenizer.save_pretrained("./ner_model")

[34m[1mwandb[0m: Currently logged in as: [33mshoubaneh[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/shemmati/.local/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_9259/866231022.py", line 29, in <module>
    trainer.train()
  File "/home/shemmati/.local/lib/python3.7/site-packages/transformers/trainer.py", line 1649, in train
    ignore_keys_for_eval=ignore_keys_for_eval,
  File "/home/shemmati/.local/lib/python3.7/site-packages/transformers/trainer.py", line 1989, in _inner_training_loop
    args.max_grad_norm,
  File "/home/shemmati/.local/lib/python3.7/site-packages/accelerate/accelerator.py", line 1894, in clip_grad_norm_
    return torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type)
  File "/home/shemmati/.local/lib/python3.7/site-packages/torch/nn/utils/clip_grad.py", line 33, in clip_grad_norm_
    grads = [p.grad for p in parameters if p.grad is not None]
  File "/home/shemmati/.local/lib/pyt

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/shemmati/.local/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_9259/866231022.py", line 29, in <module>
    trainer.train()
  File "/home/shemmati/.local/lib/python3.7/site-packages/transformers/trainer.py", line 1649, in train
    ignore_keys_for_eval=ignore_keys_for_eval,
  File "/home/shemmati/.local/lib/python3.7/site-packages/transformers/trainer.py", line 1989, in _inner_training_loop
    args.max_grad_norm,
  File "/home/shemmati/.local/lib/python3.7/site-packages/accelerate/accelerator.py", line 1894, in clip_grad_norm_
    return torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type)
  File "/home/shemmati/.local/lib/python3.7/site-packages/torch/nn/utils/clip_grad.py", line 33, in clip_grad_norm_
    grads = [p.grad for p in parameters if p.grad is not None]
  File "/home/shemmati/.local/lib/pyt

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/shemmati/.local/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_9259/866231022.py", line 29, in <module>
    trainer.train()
  File "/home/shemmati/.local/lib/python3.7/site-packages/transformers/trainer.py", line 1649, in train
    ignore_keys_for_eval=ignore_keys_for_eval,
  File "/home/shemmati/.local/lib/python3.7/site-packages/transformers/trainer.py", line 1989, in _inner_training_loop
    args.max_grad_norm,
  File "/home/shemmati/.local/lib/python3.7/site-packages/accelerate/accelerator.py", line 1894, in clip_grad_norm_
    return torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type)
  File "/home/shemmati/.local/lib/python3.7/site-packages/torch/nn/utils/clip_grad.py", line 33, in clip_grad_norm_
    grads = [p.grad for p in parameters if p.grad is not None]
  File "/home/shemmati/.local/lib/pyt

# Inference

In [1]:
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np

# Tag inventory used at inference time; the position in the list is the class id.
id2label = dict(enumerate([
    "O",
    "B-ap_name1",
    "I-ap_name1",
    "B-vz1",
    "I-vz1",
    "B-coordx1",
    "I-coordx1",
    "B-coordy1",
    "I-coordy1",
    "B-type1",
    "I-type1",
]))
# Inverse lookup (tag string -> class id), derived so the two maps cannot drift.
label2id = {tag: idx for idx, tag in id2label.items()}

def format_pred_for_print(pred, paragraph):
    '''
    Return `paragraph` with every predicted span highlighted in red.

    pred: iterable of prediction dicts from a token-classification pipeline.
          Each dict must provide 'start', 'end' (character offsets into
          `paragraph`), 'score', and a label under either 'entity'
          (per-token output) or 'entity_group' (aggregated output).
          Entries are consumed in the given order, so they are expected to
          be ordered by position.
    paragraph: the original text the predictions were made on.

    Each span is rendered as "[<text> (<label> <score>)]" wrapped in ANSI
    red escape codes, with the score formatted to two decimals; text between
    spans is copied unchanged. An empty `pred` returns `paragraph` as is.
    '''

    RED_START = '\x1b[31m'
    RED_END = '\x1b[0m'

    # Collect the pieces and join once instead of growing a string in the loop.
    pieces = []
    end = 0

    for entry in pred:
        start = entry['start']
        # add what's in between the previous span and this one
        pieces.append(paragraph[end:start])
        end = entry['end']
        # Aggregated pipeline results carry the label under 'entity_group';
        # plain per-token results use 'entity'.
        label = entry['entity_group'] if 'entity_group' in entry else entry['entity']
        score = ' {:.2f}'.format(entry['score'])
        pieces.append(RED_START + '[' + paragraph[start:end] + ' (' + label + score + ')]' + RED_END)

    # trailing text after the last span
    pieces.append(paragraph[end:])
    return ''.join(pieces)
import warnings
# Silence every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')
# NOTE(review): redundant — the blanket filter above already covers UserWarning.
warnings.filterwarnings('ignore', category=UserWarning)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# Reload the tokenizer and model from a saved checkpoint directory under ./results.
tokenizer = AutoTokenizer.from_pretrained("./results/checkpoint-62000/")
model = AutoModelForTokenClassification.from_pretrained("./results/checkpoint-62000/")
# Token-classification pipeline. No aggregation_strategy is passed here.
# NOTE(review): presumably each word piece is reported separately — confirm whether grouped entities are wanted.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)


In [None]:
# Example paragraph for the fully fine-tuned model: run the pipeline and print the highlighted text.
text = "In this work we studied in detail galaxiy M45 which is at z=0.1 and very actively forming stars. We also compared this to a high redshift object SN-2318 which recently exploded nearby and might belong to a different host."
pred = nlp(text)
print(format_pred_for_print(pred,text))

## Inference with PEFT

In [43]:
# Directory holding the saved PEFT adapter checkpoint.
peft_model_id = 'NER-BERT-lora-token-classification/checkpoint-182580//'

# The adapter config records which base model it was trained on.
config = PeftConfig.from_pretrained(peft_model_id)
tokenizerpeft = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Rebuild the base model with an 11-label head; ignore_mismatched_sizes lets
# the load proceed when the base checkpoint's head has a different size.
# NOTE(review): presumably the trained head comes from the adapter checkpoint — confirm.
inference_model = AutoModelForTokenClassification.from_pretrained(
    config.base_model_name_or_path,
    num_labels=11,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Attach the adapter weights to the base model and wrap it in a pipeline.
modelpeft = PeftModel.from_pretrained(inference_model, peft_model_id)
nlpeft = pipeline(
    "ner",
    model=modelpeft,
    tokenizer=tokenizerpeft,
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([11]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([11, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'PeftModelForTokenClassification' is not supported for ner. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClas

In [47]:
# Example paragraph for the PEFT model: run the pipeline and print the highlighted text.
text = "We observed the sky with Keck in J-band filter. It was a clear night. Then we studied in detail galaxiy M31 which is at z=0.22 and very actively forming stars. We also compared this to a high redshift object SN231 which recently exploded nearby and might belong to a different host."
pred = nlpeft(text)

print(format_pred_for_print(pred,text))

We observed the sky with Keck in J-band filter. It was a clear night. Then we studied in detail galaxiy [31m[M (B-ap_name1 0.78)][0m[31m[31 (I-ap_name1 0.50)][0m which is at z=[31m[0 (B-vz1 0.91)][0m.22 and very actively forming stars. We also compared this to a high redshift object [31m[S (B-ap_name1 0.58)][0mN[31m[23 (I-ap_name1 0.56)][0m[31m[1 (I-ap_name1 0.59)][0m which recently exploded nearby and might belong to a different host.


In [45]:
# Short question-style input for the PEFT model.
text = 'How much dust is there in Abell 563.'
pred = nlpeft(text)

print(format_pred_for_print(pred,text))

 How much dust is there in [31m[Abel (B-ap_name1 1.00)][0m[31m[l (I-ap_name1 1.00)][0m [31m[56 (I-ap_name1 1.00)][0m[31m[3 (I-ap_name1 1.00)][0m.
