In [1]:
from transformers import pipeline
import re

pipe = pipeline(task="token-classification", model="uygarkurt/bert-restore-punctuation-turkish", device="cpu")

sample_text = "Çok sıradan bir sebebim var"

out = pipe(sample_text)

print(out)



[{'entity': 'PERIOD', 'score': 0.9830124, 'index': 6, 'word': 'var', 'start': 24, 'end': 27}]


In [4]:
def restore_punctuation(text, model_output):
    """
    Restores punctuation using character-level indexes from model output.
    Handles agglutinative suffixes by checking for following suffix tokens.
    """
    # Sort by start position to process in order
    predictions = sorted(model_output, key=lambda x: x['start'])

    # Convert to list for easier manipulation
    result = list(text)
    offset = 0  # Track how many characters we've inserted

    i = 0
    while i < len(predictions):
        pred = predictions[i]

        # Check if next token is a suffix
        has_suffix = (i + 1 < len(predictions) and
                      predictions[i + 1]['word'].startswith('##') and
                      predictions[i + 1]['start'] == pred['end'])

        # Skip if current token is a suffix
        if pred['word'].startswith('##'):
            i += 1
            continue

        # Only add punctuation if there's no suffix following
        if not has_suffix:
            insert_pos = pred['end'] + offset
            punct = {
                'PERIOD': '.',
                'QUESTION_MARK': '?',
                'COMMA': ','
            }.get(pred['entity'])

            if punct:
                result.insert(insert_pos, punct)
                offset += 1

        i += 1

    return ''.join(result)


# Test the function
sample_text = "38.68 derece olduğunu söylüyor. Yuvarlarsak 38,7 dereceymiş teta"
out = pipe(sample_text)
restored_text = restore_punctuation(sample_text, out)
print(restored_text)

38..68 derece olduğunu söylüyor... Yuvarlarsak 38,,,7 dereceymiş teta


In [5]:
sample_text = "38.68 derece olduğunu söylüyor. Yuvarlarsa.k 38,7 dereceymiş teta."
# Remove punctuation
sample_text = re.sub(r'[^\w\s]', '', sample_text)
import time
start_time = time.time()
out = pipe(sample_text)
end_time = time.time()
print(f"Time taken: {end_time - start_time:.2f} seconds")

print(out)

Time taken: 0.05 seconds
[{'entity': 'PERIOD', 'score': 0.95676047, 'index': 5, 'word': 'söylüyor', 'start': 21, 'end': 29}, {'entity': 'COMMA', 'score': 0.47004408, 'index': 12, 'word': 'derece', 'start': 46, 'end': 52}, {'entity': 'COMMA', 'score': 0.54805917, 'index': 13, 'word': '##ym', 'start': 52, 'end': 54}, {'entity': 'COMMA', 'score': 0.48716187, 'index': 14, 'word': '##iş', 'start': 54, 'end': 56}, {'entity': 'PERIOD', 'score': 0.91442615, 'index': 15, 'word': 'te', 'start': 57, 'end': 59}, {'entity': 'PERIOD', 'score': 0.94705576, 'index': 16, 'word': '##ta', 'start': 59, 'end': 61}]
