In [13]:
# 1. Define the BIO tag set: "O" followed by a B-/I- pair per address entity type.
label_list = ["O"] + [
    f"{prefix}-{entity}"
    for entity in (
        "IL", "ILCE", "MAHALLE", "CADDE", "SOKAK",
        "NO", "POI", "DAIRE", "POSTA", "ULKE",
    )
    for prefix in ("B", "I")
]

# 2. Lookup tables in both directions (integer id <-> tag string).
id2label_dict = dict(enumerate(label_list))
label2id_dict = {tag: idx for idx, tag in id2label_dict.items()}


In [14]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Pretrained checkpoint that both the tokenizer and the encoder are loaded from.
model_name = "dbmdz/bert-base-turkish-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# One output class per BIO tag. The classifier weights are reported as newly
# initialized when this loads, so the model must be fine-tuned before use.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    label2id=label2id_dict,
    id2label=id2label_dict,
    num_labels=len(label_list),
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from datasets import load_dataset

# Map each split name to its JSON Lines file, then load all splits in one call.
data_files = {
    "train": "converted/train.jsonl",
    "validation": "converted/dev.jsonl",
    "test": "converted/test.jsonl",
}
dataset = load_dataset("json", data_files=data_files)

In [18]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and expand word-level tags to token-level ids.

    Args:
        examples: Batch with a "tokens" entry (one list of words per example)
            and a "labels" entry (one list of BIO tag strings per example,
            parallel to the words).

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        integer label ids per example, aligned with the produced tokens.

    Alignment rules:
        * positions with no source word (special/padding tokens) get -100;
        * the first sub-token of a word keeps the word's tag;
        * later sub-tokens of the same word keep the tag, except that a
          "B-" tag is rewritten to the matching "I-" tag.

    Relies on the module-level `tokenizer` and `label2id_dict`.
    """
    encodings = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,          # fixed sequence length; tune as needed
        padding="max_length",    # every example is padded to max_length
    )

    aligned = []
    for batch_idx, word_labels in enumerate(examples["labels"]):
        token_label_ids = []
        last_word = None
        for current_word in encodings.word_ids(batch_index=batch_idx):
            if current_word is None:
                # No source word: [CLS], [SEP] or [PAD].
                token_label_ids.append(-100)
            else:
                tag = word_labels[current_word]
                # Continuation sub-token: a word may only "begin" once.
                if current_word == last_word and tag.startswith("B-"):
                    tag = "I-" + tag[2:]
                token_label_ids.append(label2id_dict[tag])
            last_word = current_word
        aligned.append(token_label_ids)

    encodings["labels"] = aligned
    return encodings


# Apply tokenization/label alignment to every split, one batch of examples per call.
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/43535 [00:00<?, ? examples/s]

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 43535/43535 [00:05<00:00, 7620.33 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5441/5441 [00:00<00:00, 8061.15 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5443/5443 [00:00<00:00, 8272.33 examples/s]


In [19]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: evaluate and checkpoint once per epoch.
# NOTE: `eval_strategy` and `processing_class` are the current spellings of the
# deprecated `evaluation_strategy` / `tokenizer` arguments and need
# transformers >= 4.46.
training_args = TrainingArguments(
    output_dir="./bert-turkish-ner",
    eval_strategy="epoch",          # was `evaluation_strategy` (deprecated)
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # was `tokenizer=tokenizer`, which emits a FutureWarning on recent versions
    processing_class=tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
from transformers import pipeline

# Quick sanity check: run the in-memory model on one sample Turkish address.
# The sample text must contain real Turkish characters (İ, ğ, ı, ş); the
# previous literal was mis-encoded and fed garbage characters to the tokenizer.
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
print(ner("İzmir Karabağlar Barış Mahallesi 4512 Sokak No:7A"))