In [1]:
!pip install datasets
!pip install transformers




In [2]:
!pip install accelerate>=0.21.0

In [3]:
# Now, let's import the necessary modules
import torch
import re
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification


In [4]:
dataset = load_dataset("surrey-nlp/PLOD-CW")

In [5]:
# Assuming 'train' split, adjust accordingly
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

In [6]:
# Define the abbreviation labels
abbreviation_labels = ["B-ABBREV", "I-ABBREV", "O"]

# Initialize the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [7]:
# Preprocess the data
def preprocess_data(data):
    preprocessed_data = []
    for sample in data:
        tokens = sample['tokens']
        text = " ".join(tokens).lower()
        text = re.sub(r'[^\w\s]', '', text)
        preprocessed_data.append({'text': text, 'label': sample['ner_tags']})
    return preprocessed_data

In [8]:
train_data = preprocess_data(train_data)
val_data = preprocess_data(val_data)
test_data = preprocess_data(test_data)

# Tokenize and encode the training, validation, and test datasets
train_encodings = tokenizer([x['text'] for x in train_data], truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer([x['text'] for x in val_data], truncation=True, padding=True, return_tensors="pt")
test_encodings = tokenizer([x['text'] for x in test_data], truncation=True, padding=True, return_tensors="pt")


In [9]:
# Convert the labels to their corresponding numerical values
def convert_labels_to_numerical(labels):
    return [[abbreviation_labels.index(label) for label in sample if label in abbreviation_labels] for sample in labels]

train_labels = convert_labels_to_numerical([x['label'] for x in train_data])
val_labels = convert_labels_to_numerical([x['label'] for x in val_data])
test_labels = convert_labels_to_numerical([x['label'] for x in test_data])

# Create PyTorch Dataset objects for training, validation, and testing
train_dataset = [{"input_ids": train_encodings['input_ids'][i], "attention_mask": train_encodings['attention_mask'][i], "labels": torch.tensor(train_labels[i])} for i in range(len(train_labels))]
val_dataset = [{"input_ids": val_encodings['input_ids'][i], "attention_mask": val_encodings['attention_mask'][i], "labels": torch.tensor(val_labels[i])} for i in range(len(val_labels))]
test_dataset = [{"input_ids": test_encodings['input_ids'][i], "attention_mask": test_encodings['attention_mask'][i], "labels": torch.tensor(test_labels[i])} for i in range(len(test_labels))]


In [10]:
# Initialize the BERT-based token classification model
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(abbreviation_labels))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=DataCollatorForTokenClassification(tokenizer)
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [12]:
# Train the model
trainer.train()

# Evaluate the model on the test set
trainer.evaluate(test_dataset)


  0%|          | 0/402 [00:00<?, ?it/s]

{'loss': 0.0, 'learning_rate': 4.875621890547264e-05, 'epoch': 0.07}
{'loss': 0.0, 'learning_rate': 4.7512437810945275e-05, 'epoch': 0.15}
{'loss': 0.0, 'learning_rate': 4.626865671641791e-05, 'epoch': 0.22}
{'loss': 0.0, 'learning_rate': 4.502487562189055e-05, 'epoch': 0.3}
{'loss': 0.0, 'learning_rate': 4.3781094527363184e-05, 'epoch': 0.37}


  0%|          | 0/16 [00:00<?, ?it/s]

Checkpoint destination directory ./output\checkpoint-50 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': nan, 'eval_runtime': 28.2166, 'eval_samples_per_second': 4.465, 'eval_steps_per_second': 0.567, 'epoch': 0.37}
{'loss': 0.0, 'learning_rate': 4.253731343283582e-05, 'epoch': 0.45}
{'loss': 0.0, 'learning_rate': 4.1293532338308464e-05, 'epoch': 0.52}
{'loss': 0.0, 'learning_rate': 4.00497512437811e-05, 'epoch': 0.6}
{'loss': 0.0, 'learning_rate': 3.8805970149253736e-05, 'epoch': 0.67}
{'loss': 0.0, 'learning_rate': 3.756218905472637e-05, 'epoch': 0.75}


  0%|          | 0/16 [00:00<?, ?it/s]

Checkpoint destination directory ./output\checkpoint-100 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': nan, 'eval_runtime': 29.2682, 'eval_samples_per_second': 4.305, 'eval_steps_per_second': 0.547, 'epoch': 0.75}
{'loss': 0.0, 'learning_rate': 3.631840796019901e-05, 'epoch': 0.82}
{'loss': 0.0, 'learning_rate': 3.5074626865671645e-05, 'epoch': 0.9}
{'loss': 0.0, 'learning_rate': 3.383084577114428e-05, 'epoch': 0.97}
{'loss': 0.0, 'learning_rate': 3.258706467661692e-05, 'epoch': 1.04}
{'loss': 0.0, 'learning_rate': 3.1343283582089554e-05, 'epoch': 1.12}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 30.0084, 'eval_samples_per_second': 4.199, 'eval_steps_per_second': 0.533, 'epoch': 1.12}
{'loss': 0.0, 'learning_rate': 3.009950248756219e-05, 'epoch': 1.19}
{'loss': 0.0, 'learning_rate': 2.885572139303483e-05, 'epoch': 1.27}
{'loss': 0.0, 'learning_rate': 2.7611940298507467e-05, 'epoch': 1.34}
{'loss': 0.0, 'learning_rate': 2.6368159203980103e-05, 'epoch': 1.42}
{'loss': 0.0, 'learning_rate': 2.512437810945274e-05, 'epoch': 1.49}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 44.211, 'eval_samples_per_second': 2.85, 'eval_steps_per_second': 0.362, 'epoch': 1.49}
{'loss': 0.0, 'learning_rate': 2.3880597014925373e-05, 'epoch': 1.57}
{'loss': 0.0, 'learning_rate': 2.2636815920398012e-05, 'epoch': 1.64}
{'loss': 0.0, 'learning_rate': 2.139303482587065e-05, 'epoch': 1.72}
{'loss': 0.0, 'learning_rate': 2.0149253731343285e-05, 'epoch': 1.79}
{'loss': 0.0, 'learning_rate': 1.890547263681592e-05, 'epoch': 1.87}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 45.2551, 'eval_samples_per_second': 2.784, 'eval_steps_per_second': 0.354, 'epoch': 1.87}
{'loss': 0.0, 'learning_rate': 1.7661691542288558e-05, 'epoch': 1.94}
{'loss': 0.0, 'learning_rate': 1.6417910447761194e-05, 'epoch': 2.01}
{'loss': 0.0, 'learning_rate': 1.5174129353233832e-05, 'epoch': 2.09}
{'loss': 0.0, 'learning_rate': 1.3930348258706468e-05, 'epoch': 2.16}
{'loss': 0.0, 'learning_rate': 1.2686567164179105e-05, 'epoch': 2.24}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 45.5454, 'eval_samples_per_second': 2.766, 'eval_steps_per_second': 0.351, 'epoch': 2.24}
{'loss': 0.0, 'learning_rate': 1.1442786069651743e-05, 'epoch': 2.31}
{'loss': 0.0, 'learning_rate': 1.0199004975124378e-05, 'epoch': 2.39}
{'loss': 0.0, 'learning_rate': 8.955223880597016e-06, 'epoch': 2.46}
{'loss': 0.0, 'learning_rate': 7.711442786069652e-06, 'epoch': 2.54}
{'loss': 0.0, 'learning_rate': 6.467661691542288e-06, 'epoch': 2.61}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 34.2682, 'eval_samples_per_second': 3.677, 'eval_steps_per_second': 0.467, 'epoch': 2.61}
{'loss': 0.0, 'learning_rate': 5.2238805970149255e-06, 'epoch': 2.69}
{'loss': 0.0, 'learning_rate': 3.980099502487563e-06, 'epoch': 2.76}
{'loss': 0.0, 'learning_rate': 2.736318407960199e-06, 'epoch': 2.84}
{'loss': 0.0, 'learning_rate': 1.4925373134328358e-06, 'epoch': 2.91}
{'loss': 0.0, 'learning_rate': 2.4875621890547267e-07, 'epoch': 2.99}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 27.1692, 'eval_samples_per_second': 4.638, 'eval_steps_per_second': 0.589, 'epoch': 2.99}
{'train_runtime': 12327.9953, 'train_samples_per_second': 0.261, 'train_steps_per_second': 0.033, 'train_loss': 0.0, 'epoch': 3.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': nan,
 'eval_runtime': 28.6683,
 'eval_samples_per_second': 5.337,
 'eval_steps_per_second': 0.698,
 'epoch': 3.0}

In [24]:
# Print evaluation metrics
print(evaluation_results)

{'eval_loss': nan, 'eval_runtime': 26.6505, 'eval_samples_per_second': 5.741, 'eval_steps_per_second': 0.75, 'epoch': 3.0}


In [25]:
# Save the model
output_dir = "C:\\Users\\Administrator\\Downloads\\Abbreviation detection system"
trainer.save_model(output_dir)

In [27]:
from transformers import BertTokenizerFast, BertForTokenClassification
import torch

# Load the saved model
model = BertForTokenClassification.from_pretrained("C:\\Users\\Administrator\\Downloads\\Abbreviation detection system")

# Initialize the tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Define a function to preprocess new data
def preprocess_text(text):
    # Tokenize the text
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    return inputs

# Example text for testing
text = "A SAMPLE TEST."

# Preprocess the text
inputs = preprocess_text(text)

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)

# Extract predicted labels
predicted_labels = torch.argmax(outputs.logits, dim=2).tolist()[0]

# Convert label IDs back to labels
predicted_labels = [model.config.id2label[label_id] for label_id in predicted_labels]

# Print the predicted labels
print(predicted_labels)


['LABEL_2', 'LABEL_2', 'LABEL_0', 'LABEL_0', 'LABEL_2', 'LABEL_2']
