In [None]:
 
 
import pandas as pd
from datasets import load_dataset

# Load the dataset
def load_conll_dataset(file_path):
    """Read a two-column, tab-separated CoNLL file into parallel token/label lists.

    Each non-blank line must contain exactly ``token<TAB>label``. Blank
    (or whitespace-only) lines separate sentences.

    Args:
        file_path: Path to the UTF-8 encoded CoNLL file.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of token
        lists and ``labels`` is a list of label lists; ``labels[i][j]`` is
        the label of ``sentences[i][j]``.

    Raises:
        ValueError: If a non-blank line does not have exactly two
            tab-separated fields.
    """
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    # Iterate the file lazily instead of materialising every line up front.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, raw_line in enumerate(f, start=1):
            if not raw_line.strip():
                # Sentence boundary; consecutive blank lines are ignored.
                if current_sentence:
                    sentences.append(current_sentence)
                    labels.append(current_labels)
                    current_sentence = []
                    current_labels = []
                continue

            # Drop only the line terminator so the label does not end in
            # '\n' (which would make "O\n" and a final-line "O" two classes).
            fields = raw_line.rstrip('\r\n').split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'token<TAB>label', "
                    f"got {len(fields)} tab-separated field(s)"
                )
            token, label = fields
            current_sentence.append(token)
            current_labels.append(label.strip())

    # Flush the last sentence when the file does not end with a blank line.
    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)

    return sentences, labels

# Parse the labelled CoNLL file into parallel lists of tokens and tags
file_path = 'labeled_data.conll'
sentences, labels = load_conll_dataset(file_path)

# One row per sentence; each cell holds that sentence's token (or label) list
df = pd.DataFrame(dict(tokens=sentences, labels=labels))

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Map labels to integers.
# Enumerate the labels in sorted order: iterating a set of strings directly
# yields an order that can change between interpreter runs (string hashes are
# randomised), which would silently change which id each label gets.
unique_labels = set(label for label_list in labels for label in label_list)
label_to_id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Encode the labels: same nesting as `labels`, with each tag replaced by its id
encoded_labels = [[label_to_id[label] for label in label_list] for label_list in labels]

# Train-test split at sentence level (80/20); the fixed random_state keeps it reproducible
train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences, encoded_labels, test_size=0.2, random_state=42)

# Create datasets with one example per sentence
train_dataset = Dataset.from_dict({'tokens': train_sentences, 'labels': train_labels})
test_dataset = Dataset.from_dict({'tokens': test_sentences, 'labels': test_labels})

In [None]:
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# Load tokenizer and model
model_name = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Store the label names in the model config so the saved checkpoint reports
# the real tag strings instead of generic LABEL_<n> placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label=id_to_label,
    label2id=label_to_id,
)

# Tokenize the datasets
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level labels to sub-word tokens.

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: A batch mapping with ``'tokens'`` (a list of word lists)
            and ``'labels'`` (a list of per-word integer label lists of the
            same shape).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding, for
        each example, one label id per produced token. Special and padding
        tokens, and every sub-word piece after the first piece of a word,
        are set to -100, the value the loss function ignores.
    """
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True, padding=True)
    labels = []
    for i, word_labels in enumerate(examples['labels']):
        # word_ids has one entry per produced token: the index of the source
        # word, or None for special/padding tokens.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id:
                # Special/padding token, or a continuation piece of a word.
                label_ids.append(-100)
            else:
                # First piece of a word carries that word's label.
                label_ids.append(word_labels[word_id])
            previous_word_id = word_id
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize both splits in batched mode, train first and then test
train_tokenized, test_tokenized = [
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, test_dataset)
]

# Expose only the model inputs, as torch tensors, to the Trainer
for tokenized_split in (train_tokenized, test_tokenized):
    tokenized_split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Hyperparameters for fine-tuning; checkpoints go to ./ner_model and logs to ./logs.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./ner_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Pad inputs and labels (with -100) to a common length inside every mini-batch.
# Examples tokenized in different `map` batches can have different padded
# lengths, which the Trainer's default collator cannot stack into one tensor.
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (test_tokenized); being the last
# expression, the returned result is shown as the cell output.
trainer.evaluate()

In [None]:
# Save the fine-tuned model and its tokenizer together in ./ner_model (the
# same directory used as the Trainer's output_dir) so both can be reloaded
# from a single path.
model.save_pretrained("./ner_model")
tokenizer.save_pretrained("./ner_model")