In [2]:
from transformers import pipeline
import re

# Build a token-classification pipeline around the punctuation-restoration
# checkpoint; later cells reuse `pipe`.
pipe = pipeline(
    task="token-classification",
    model="uygarkurt/bert-restore-punctuation-turkish",
)

# Unpunctuated input sentence.
sample_text = "Yığın karnıyla düşünür gözüyle öğrenir kalbiyle kızar"

# Each returned item is a dict with the predicted label ('entity'), the
# token text ('word') and its character span in the input ('start', 'end').
out = pipe(sample_text)

print(out)


[{'entity': 'COMMA', 'score': 0.9467491, 'index': 6, 'word': 'düşünür', 'start': 15, 'end': 22}, {'entity': 'COMMA', 'score': 0.93059206, 'index': 8, 'word': 'öğrenir', 'start': 31, 'end': 38}, {'entity': 'PERIOD', 'score': 0.97779375, 'index': 11, 'word': 'kızar', 'start': 48, 'end': 53}]


In [3]:
def restore_punctuation(text, model_output):
    """
    Insert predicted punctuation marks into ``text``.

    Each prediction carries the character span of a (sub)word token. The
    mark is placed at the end of the word that contains the token, so a
    word split into several pieces (e.g. 'bitirdi' + '##m') gets its mark
    after the last piece instead of losing it or having it land mid-word.

    Args:
        text: The unpunctuated string that was passed to the model.
        model_output: Iterable of dicts with at least 'entity' (one of
            'PERIOD', 'QUESTION_MARK', 'COMMA'; other labels are ignored)
            and 'end' (character index just past the token). An optional
            'score' is used to break ties when several pieces of the same
            word carry different labels.

    Returns:
        A new string with the punctuation marks inserted.
    """
    punctuation_by_label = {
        'PERIOD': '.',
        'QUESTION_MARK': '?',
        'COMMA': ','
    }
    text_length = len(text)

    # insertion index -> (score, mark); one mark per word end at most.
    marks_by_position = {}
    for pred in model_output:
        mark = punctuation_by_label.get(pred['entity'])
        if mark is None:
            continue

        # Snap to the end of the word: skip over any remaining letters or
        # digits that belong to continuation pieces of the same word.
        position = pred['end']
        while position < text_length and text[position].isalnum():
            position += 1

        # When several pieces of one word are labelled, trust the most
        # confident prediction (the first one wins on equal scores).
        score = pred.get('score', 0.0)
        current = marks_by_position.get(position)
        if current is None or score > current[0]:
            marks_by_position[position] = (score, mark)

    # Rebuild the string left to right, so no running offset is needed.
    pieces = []
    cursor = 0
    for position in sorted(marks_by_position):
        pieces.append(text[cursor:position])
        pieces.append(marks_by_position[position][1])
        cursor = position
    pieces.append(text[cursor:])

    return ''.join(pieces)


# Try the function on an unpunctuated question
sample_text = "Bunu söylemek için mi geldin"
out = pipe(sample_text)
restored_text = restore_punctuation(text=sample_text, model_output=out)
print(restored_text)

Bunu söylemek için mi geldin?


In [4]:
# Take a punctuated two-sentence example and strip every character that is
# neither a word character nor whitespace before feeding it to the model.
sample_text = re.sub(
    r'[^\w\s]',
    '',
    "işlerimi bitirdim. Bunu söylemek için mi geldin?",
)
out = pipe(sample_text)

# The printed predictions include a '##'-prefixed continuation token.
print(out)

[{'entity': 'PERIOD', 'score': 0.95308465, 'index': 3, 'word': 'bitirdi', 'start': 9, 'end': 16}, {'entity': 'PERIOD', 'score': 0.8951597, 'index': 4, 'word': '##m', 'start': 16, 'end': 17}, {'entity': 'QUESTION_MARK', 'score': 0.97039175, 'index': 9, 'word': 'geldin', 'start': 40, 'end': 46}]
