In [1]:
import torch 
import numpy as np
import warnings
warnings.filterwarnings('ignore')


First, we set a random seed for reproducibility.

In [None]:
import random

# Single seed value shared by every random number generator seeded below.
SEED = 42

# Seed Python's stdlib RNG and NumPy's global RNG, then PyTorch.
# torch.manual_seed stays the final expression so the cell output is unchanged.
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)
torch.manual_seed(SEED)

Now we should load the model and tokenizer. We choose BERT Base Turkish Cased as the pretrained backbone.

In [3]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the "dbmdz/bert-base-turkish-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")



Now we will load the dataset from CoNLL-U files using the conllu library.

In [4]:
import conllu
def load_conllu_file(file_path):
    """Read a CoNLL-U file and parse its full contents with ``conllu.parse``.

    Args:
        file_path: Path of the ``.conllu`` file; it is opened as UTF-8 text.

    Returns:
        The parsed sentences exactly as returned by ``conllu.parse``.
    """
    with open(file_path, mode='r', encoding='utf-8') as conllu_file:
        raw_text = conllu_file.read()
    parsed_sentences = conllu.parse(raw_text)
    return parsed_sentences


# Parse the train and test splits (paths are relative to the working directory).
# NOTE(review): the train file name contains "boun_ud" while the test file name
# contains "boun-ud" — confirm both names match the files on disk.
sentences_train = load_conllu_file('ota_boun_ud-train.conllu')
sentences_test = load_conllu_file('ota_boun-ud-test.conllu')

Now we should extract the tokens and their POS labels from the datasets.

In [5]:
def extract_tokens_and_labels(sentences):
    """Extract word forms and UPOS tags from parsed CoNLL-U sentences.

    Only syntactic words are kept. The ``conllu`` parser gives ordinary words
    an integer ``id`` and represents multiword-token ranges (e.g. ``3-4``) and
    empty nodes (e.g. ``3.1``) with tuple ids. Those extra lines carry no real
    POS tag (their UPOS column is ``_``) and a range line repeats the text of
    the words that follow it, so they are skipped here.

    Args:
        sentences: Iterable of sentences; each sentence is an iterable of
            dict-like tokens providing the ``'form'`` and ``'upos'`` keys and,
            normally, an ``'id'`` key. Tokens without an ``'id'`` are kept.

    Returns:
        A ``(tokens_list, labels_list)`` tuple of parallel lists holding, for
        every non-empty sentence, its word forms and its UPOS tags.
    """
    tokens_list = []
    labels_list = []

    for sentence in sentences:
        tokens = []
        labels = []

        for token in sentence:
            token_id = token.get('id')
            # Tuple ids mark multiword-token ranges and empty nodes.
            if token_id is not None and not isinstance(token_id, int):
                continue

            tokens.append(token['form'])
            labels.append(token['upos'])

        if tokens:  # Only add non-empty sentences
            tokens_list.append(tokens)
            labels_list.append(labels)

    return tokens_list, labels_list

# Parallel per-sentence lists of word forms and POS labels for each split.
training_tokens_list, training_labels_list = extract_tokens_and_labels(sentences_train)
test_tokens_list, test_labels_list = extract_tokens_and_labels(sentences_test)

After some initial attempts, we realized that some data augmentation was required, so we decided to add the IMST corpus to our training set.

In [6]:
# Additional training sentences, extracted the same way as the main corpus.
# NOTE(review): the notebook text calls this the IMST corpus, but the file
# loaded here is named 'tr_boun-ud-train.conllu' — confirm which treebank is intended.
sentences_train_imst= load_conllu_file('tr_boun-ud-train.conllu')
training_tokens_list_imst, training_labels_list_imst = extract_tokens_and_labels(sentences_train_imst)

In [None]:
# Combine training data from both sources.
# extend() mutates the existing lists in place, so running this cell more than
# once appends the extra sentences again; re-run the extraction cell first.
training_tokens_list.extend(training_tokens_list_imst)
training_labels_list.extend(training_labels_list_imst)

print(f"Combined training data: {len(training_tokens_list)} sentences")


Now we need to determine the unique POS labels and create the label2id and id2label dictionaries.

In [None]:
# Every distinct tag that occurs in the training labels, sorted so that the
# tag -> id assignment below is deterministic.
unique_labels = sorted(
    {tag for sentence_tags in training_labels_list for tag in sentence_tags}
)

# Forward (tag -> id) and reverse (id -> tag) lookup tables.
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))
print(f"Found {len(unique_labels)} unique POS tags:")
print(unique_labels)


In [None]:
# Show the maximum sequence length reported by the tokenizer (cell output).
tokenizer.model_max_length

## Dataset Creation
 
Now we'll create the dataset in the format required for BERT fine-tuning. This involves:
1. Tokenizing the input text using the BERT tokenizer
2. Aligning the POS labels with the tokenized input (handling subword tokenization)
3. Padding sequences to a fixed length and creating attention masks
4. Converting labels to numerical IDs and handling special tokens with -100 (ignored in loss)


In [None]:
# Create dataset in the required format
def create_dataset(tokens_list, labels_list, tokenizer, label2id, max_length=512,
                   label_all_subwords=True):
    """Tokenize pre-split sentences and align their POS labels to subwords.

    Each sentence is tokenized on its own, padded/truncated to ``max_length``,
    and its word-level labels are projected onto the subword positions via
    the encoding's ``word_ids()``.

    Args:
        tokens_list: List of sentences, each a list of word strings.
        labels_list: List of label lists, parallel to ``tokens_list``.
        tokenizer: Callable tokenizer accepting ``is_split_into_words``,
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``
            and returning an encoding with ``'input_ids'``,
            ``'attention_mask'`` and a ``word_ids()`` method.
        label2id: Mapping from label string to integer id.
        max_length: Fixed length every sequence is padded/truncated to.
        label_all_subwords: When True (default, the original behaviour) every
            subword of a word receives the word's label. When False only the
            first subword is labelled and the remaining ones get -100, so the
            loss and metrics count each word once.

    Returns:
        Dict with the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``, each a list with one 1-D tensor per sentence.

    Raises:
        KeyError: If a label is missing from ``label2id`` (for example a tag
            that appears in the test data but not in the training data).
    """
    input_ids_list = []
    attention_mask_list = []
    labels_list_aligned = []

    for tokens, labels in zip(tokens_list, labels_list):
        encoding = tokenizer(
            tokens,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch dimension of size 1. Drop
        # only that dimension; a bare squeeze() would also collapse the
        # sequence dimension when max_length is 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        aligned_labels = []
        previous_word_id = None
        for word_id in encoding.word_ids():
            if word_id is None:
                # Special and padding tokens get -100 (ignored in loss calculation)
                aligned_labels.append(-100)
            elif not label_all_subwords and word_id == previous_word_id:
                # Continuation subword of a word that is already labelled
                aligned_labels.append(-100)
            else:
                label = labels[word_id]
                if label not in label2id:
                    raise KeyError(
                        f"Label {label!r} is not present in label2id "
                        f"(known labels: {sorted(map(str, label2id))})"
                    )
                aligned_labels.append(label2id[label])
            previous_word_id = word_id

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        labels_list_aligned.append(torch.tensor(aligned_labels))

    return {
        'input_ids': input_ids_list,
        'attention_mask': attention_mask_list,
        'labels': labels_list_aligned
    }

# Create train and test datasets.
# Both splits use the same label2id mapping and create_dataset's default max_length.
train_dataset = create_dataset(training_tokens_list, training_labels_list, tokenizer, label2id)
test_dataset = create_dataset(test_tokens_list, test_labels_list, tokenizer, label2id)

print(f"Training dataset size: {len(train_dataset['input_ids'])}")
print(f"Test dataset size: {len(test_dataset['input_ids'])}")


Here we convert the dataset to a Hugging Face Dataset object.

In [11]:
from datasets import Dataset, DatasetDict
# Wrap the dictionaries produced by create_dataset in Hugging Face Dataset objects.
# The names are rebound from dict to Dataset, so this cell cannot be re-run
# without first re-running the cell that builds the dictionaries.
train_dataset = Dataset.from_dict(train_dataset)
test_dataset = Dataset.from_dict(test_dataset)

# Shuffle the training dataset with the notebook-wide SEED instead of a
# second hard-coded 42, so changing SEED changes every source of randomness.
train_dataset = train_dataset.shuffle(seed=SEED)


dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

We define the compute_metrics function to compute the accuracy, precision, recall, and F1 score.

In [12]:
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(eval_pred):
    """Compute token-level accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            per-token class scores (argmax is taken over axis 2) and
            ``labels`` holds the gold label ids, with -100 marking positions
            that must be ignored.

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``.
    """
    scores, labels = eval_pred
    # Predicted label id for every position
    predictions = np.argmax(scores, axis=2)
    labels = np.asarray(labels)

    # Boolean-mask selection keeps the same row-major order as a nested loop
    # over sentences and positions, without a Python-level loop per token.
    keep = labels != -100  # drop special/padding (and any other ignored) positions
    true_predictions = predictions[keep]
    true_labels = labels[keep]

    accuracy = accuracy_score(true_labels, true_predictions)
    # zero_division=0 yields the same scores as the default "warn" setting
    # (undefined values count as 0) but does not emit a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_predictions, average='weighted', zero_division=0
    )

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

Here we define the embedding processor and sequence tagger.

## 1) Embedding processor:
- We use the BERT model to get the contextualized embeddings of the input tokens.
- We apply dropout to the embeddings to regularize the model.
- We apply token masking to the embeddings to regularize the model.
- We follow "BUILDING FOUNDATIONS FOR NATURAL LANGUAGE
PROCESSING OF HISTORICAL TURKISH: RESOURCES AND
MODELS" paper for the choice of dropout rates. 

## 2) Sequence tagger:
- We use a linear layer to project the output of the embedding processor to the number of POS tags.
- We apply dropout to the output of the embedding processor to regularize the model.

## 3) POS model:
- We use the embedding processor and sequence tagger to create a POS model.
- We use an optional BiLSTM layer on top of the contextualized embeddings of the input tokens, inspired by the STEPS dependency parser architecture.
- We apply dropout to the output of the BiLSTM layer to regularize the model.

## 4) Loss function:
- We use a loss function with label smoothing to regularize the model further.






In [13]:
import torch
import torch.nn as nn
import math
from transformers import AutoModel
from torch.optim.lr_scheduler import LambdaLR

class EmbeddingProcessor(nn.Module):
    """BERT encoder with custom dropout rates and optional input-token masking.

    Args:
        bert_model_name: Checkpoint name or path passed to ``from_pretrained``.
        hidden_dropout: Value for the config's ``hidden_dropout_prob``.
        attention_dropout: Value for the config's ``attention_probs_dropout_prob``.
        output_dropout: Dropout applied to the encoder's last hidden state.
        token_mask_prob: During training, probability of replacing an attended
            token (other than the first and the last attended position) with
            the mask token. 0 disables masking.
        mask_token_id: Id written into masked positions. When None and masking
            is enabled, it is looked up from the tokenizer that belongs to
            ``bert_model_name``.

    Raises:
        ValueError: If masking is enabled and no mask token id can be determined.
    """

    def __init__(self, bert_model_name, hidden_dropout=0.2, attention_dropout=0.2,
                 output_dropout=0.5, token_mask_prob=0.15, mask_token_id=None):
        super().__init__()

        # Keyword arguments that match configuration attributes override the
        # loaded config before the modules are built, so every dropout inside
        # the encoder (embeddings included) is created with these rates and
        # no sub-module has to be patched afterwards.
        self.bert = AutoModel.from_pretrained(
            bert_model_name,
            hidden_dropout_prob=hidden_dropout,
            attention_probs_dropout_prob=attention_dropout,
        )
        self.token_mask_prob = token_mask_prob

        # The mask token id is a property of the tokenizer's vocabulary; it is
        # not vocab_size - 1, which is just the last entry of the vocabulary.
        if mask_token_id is None and token_mask_prob > 0:
            from transformers import AutoTokenizer
            mask_token_id = AutoTokenizer.from_pretrained(bert_model_name).mask_token_id
            if mask_token_id is None:
                raise ValueError(
                    f"The tokenizer of {bert_model_name!r} defines no mask token; "
                    "pass mask_token_id explicitly or set token_mask_prob=0."
                )
        self.mask_token_id = mask_token_id

        # Additional output dropout
        self.output_dropout = nn.Dropout(output_dropout)

    def forward(self, input_ids, attention_mask=None):
        """Encode ``input_ids`` and return the dropped-out last hidden state."""
        # Apply token masking during training only
        if self.training and self.token_mask_prob > 0:
            input_ids = self._apply_token_masking(input_ids, attention_mask)

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.output_dropout(outputs.last_hidden_state)

    def _apply_token_masking(self, input_ids, attention_mask):
        """Return a copy of ``input_ids`` with random tokens replaced by the mask id.

        Positions with ``attention_mask != 1``, position 0 ([CLS]) and the last
        attended position of each row (treated as [SEP], which holds when
        padding is on the right) are never masked. ``input_ids`` is returned
        unchanged when ``attention_mask`` is None.
        """
        if attention_mask is None:
            return input_ids

        # Candidate positions: attended tokens selected with prob. token_mask_prob
        rand = torch.rand(input_ids.shape, device=input_ids.device)
        mask_prob = (rand < self.token_mask_prob) & (attention_mask == 1)

        # Exclude CLS and SEP tokens
        mask_prob[:, 0] = False  # CLS
        last_attended = attention_mask.sum(dim=1).long() - 1
        rows = torch.arange(mask_prob.size(0), device=mask_prob.device)
        # Rows with no attended tokens clamp to index 0, which is already False.
        mask_prob[rows, last_attended.clamp(min=0)] = False  # SEP

        masked_input_ids = input_ids.clone()
        masked_input_ids[mask_prob] = self.mask_token_id

        return masked_input_ids


class SequenceTagger(nn.Module):
    """Token classification head: input dropout followed by one linear layer.

    Args:
        input_size: Size of the last dimension of the incoming features.
        num_labels: Number of output classes per position.
        dropout: Dropout probability applied to the input (default 0.2).
    """

    def __init__(self, input_size, num_labels, dropout=0.2):
        super().__init__()

        self.input_dropout = nn.Dropout(p=dropout)
        self.classifier = nn.Linear(in_features=input_size, out_features=num_labels)

    def forward(self, sequence_output):
        """Return per-position logits for ``sequence_output``."""
        return self.classifier(self.input_dropout(sequence_output))


class POSModel(nn.Module):
    """POS tagger: BERT embedding processor, optional BiLSTM, linear tag head.

    Args:
        bert_model_name: Checkpoint name or path for the encoder.
        num_pos_labels: Number of POS classes.
        lstm_hidden_dim: Hidden size per direction of the BiLSTM.
        use_bilstm: When True the encoder output passes through the BiLSTM
            before tagging; when False it is tagged directly.
        hidden_dropout: Hidden dropout rate passed to the embedding processor.
        attention_dropout: Attention dropout rate passed to the embedding processor.
        output_dropout: Dropout on the encoder output.
        token_mask_prob: Input-token masking probability during training.
        lstm_num_layers: Number of stacked BiLSTM layers.
        lstm_dropout: Dropout between stacked BiLSTM layers.
        tagger_dropout: Input dropout of the tag head.
        label_smoothing: Label smoothing factor of the cross-entropy loss.

    All hyperparameter defaults equal the values that were previously
    hard-coded, so existing calls behave as before.
    """

    def __init__(self, bert_model_name, num_pos_labels, lstm_hidden_dim=256, use_bilstm=False,
                 hidden_dropout=0.2, attention_dropout=0.2, output_dropout=0.5,
                 token_mask_prob=0.15, lstm_num_layers=2, lstm_dropout=0.3,
                 tagger_dropout=0.2, label_smoothing=0.1):
        super().__init__()

        self.num_labels = num_pos_labels
        self.lstm_hidden_dim = lstm_hidden_dim
        self.use_bilstm = use_bilstm

        # Embedding processor with the requested dropout rates
        self.embedding_processor = EmbeddingProcessor(
            bert_model_name=bert_model_name,
            hidden_dropout=hidden_dropout,
            attention_dropout=attention_dropout,
            output_dropout=output_dropout,
            token_mask_prob=token_mask_prob
        )

        bert_hidden_size = self.embedding_processor.bert.config.hidden_size

        # BiLSTM layer.
        # NOTE(review): it is built even when use_bilstm is False, so its
        # weights are part of state_dict() in both modes. Left unchanged so
        # previously saved state dicts keep matching this module.
        self.bilstm = nn.LSTM(
            input_size=bert_hidden_size,
            hidden_size=lstm_hidden_dim,
            num_layers=lstm_num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=lstm_dropout
        )

        # The tag head reads the BiLSTM output (both directions concatenated,
        # hence 2 * lstm_hidden_dim) or, without BiLSTM, the encoder output.
        tagger_input_size = 2 * lstm_hidden_dim if use_bilstm else bert_hidden_size
        self.pos_tagger = SequenceTagger(
            input_size=tagger_input_size,
            num_labels=num_pos_labels,
            dropout=tagger_dropout
        )

        # Loss function with label smoothing; -100 positions are ignored
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100, label_smoothing=label_smoothing)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return ``{'loss': ..., 'logits': ...}``; loss is None without labels."""
        # Contextualized embeddings from the encoder
        sequence_output = self.embedding_processor(input_ids, attention_mask)

        if self.use_bilstm:
            # nn.LSTM returns (output, (h_n, c_n)); only the per-token output is needed
            sequence_output, _ = self.bilstm(sequence_output)

        # Tag the BiLSTM output when enabled, otherwise the encoder output
        logits = self.pos_tagger(sequence_output)

        loss = None
        if labels is not None:
            # Flatten all positions; reshape also accepts non-contiguous tensors
            loss = self.loss_fn(logits.reshape(-1, self.num_labels), labels.reshape(-1))

        return {
            'loss': loss,
            'logits': logits
        }


Here we initialize the model and move it to the GPU. All experiments are done using 1x H100 GPU from Google Cloud.

In [None]:
# Set use_bilstm=True to enable the BiLSTM layer; lstm_hidden_dim can also be changed.
model = POSModel(
    bert_model_name="dbmdz/bert-base-turkish-cased",
    num_pos_labels=len(label2id),
    use_bilstm=False
)
# Use the first CUDA device when one is available and fall back to the CPU
# otherwise, instead of failing on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)

Here we define the training arguments. There are 2 scenarios we can follow:
1) Cold start training:
- We train only the sequence tagger.
- We use a learning rate of 3e-4.
- We use a batch size of 32.
- We use a max sequence length of 512.

2) Fine-tuning:
- We fine-tune the entire model on the train set.
- We use a learning rate of 3e-5.
- We use a batch size of 32.
- We use a max sequence length of 512.
- We use inverse square root learning rate scheduler with 400 warmup steps. (Inspired by paper: "BUILDING FOUNDATIONS FOR NATURAL LANGUAGE
PROCESSING OF HISTORICAL TURKISH: RESOURCES AND
MODELS" )


In [15]:
from transformers import Trainer, TrainingArguments
"""
cold_start_training_args = TrainingArguments(
    output_dir=f"./results",
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    report_to="none",
)

# Custom trainer
cold_start_trainer = Trainer(
    model=model,
    args=cold_start_training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
""" 
# Hyper-parameters for fine-tuning the whole model (might need adjustment for different heads).
# NOTE(review): the notebook text mentions a learning rate of 3e-5 for
# fine-tuning, while this cell sets 4e-5 — confirm which value is intended.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=4e-5,
    lr_scheduler_type="inverse_sqrt",
    warmup_steps=400,
    eval_strategy="epoch",  # evaluate on the test split after every epoch
    save_strategy="no",  # no intermediate checkpoints are written
    load_best_model_at_end=False,
    report_to="none",
)

# Trainer that fine-tunes `model` on the train split and evaluates on the test split.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
)

In [None]:
""" 
### Cold start training for classification layers and fine-tuning the entire model. ###

print("Cold start training for classification layers.")
for param in model.embedding_processor.bert.parameters():
    param.requires_grad = False
cold_start_trainer.train()
print("Step 2: Fine-tuning entire model...")
for param in model.embedding_processor.bert.parameters():
    param.requires_grad = True
trainer.train()
""" 




# Run training with the `trainer` object; the two-stage variant is kept in the string block above.
trainer.train()

In [None]:
# Persist the trained weights (state_dict only, not the full module object).
checkpoint_path = "turkish_pos_model.pth"
torch.save(model.state_dict(), checkpoint_path)
print(f"Model saved as {checkpoint_path}")
