In [1]:
from transformers import pipeline

# Build a token-classification pipeline around the Turkish
# punctuation-restoration checkpoint, pinned to the CPU.
pipe = pipeline(
    task="token-classification",
    model="uygarkurt/bert-restore-punctuation-turkish",
    device="cpu",
)

# Unpunctuated sentence used as a quick smoke test for the pipeline.
sample_text = "Çok sıradan bir sebebim var"

# Run the model on the sample and show the raw per-token predictions.
out = pipe(sample_text)
print(out)



[{'entity': 'COMMA', 'score': 0.6682326, 'index': 3, 'word': 'gidiyordu', 'start': 9, 'end': 18}, {'entity': 'COMMA', 'score': 0.5591094, 'index': 4, 'word': '##n', 'start': 18, 'end': 19}, {'entity': 'COMMA', 'score': 0.5128775, 'index': 9, 'word': 'değiliz', 'start': 39, 'end': 46}, {'entity': 'PERIOD', 'score': 0.63483, 'index': 11, 'word': 'bey', 'start': 56, 'end': 59}]


In [4]:
def restore_punctuation(text, model_output, min_score=0.85):
    """
    Restore punctuation in ``text`` using character offsets from model output.

    Each prediction is a dict that must provide the keys ``'start'``,
    ``'end'``, ``'word'``, ``'score'`` and ``'entity'``. Predictions are
    processed in order of ``'start'``. A word is the run made of a head token
    plus any following predictions that are ``'##'`` continuation pieces or
    that begin exactly where the previous prediction ended. The label and the
    score of the head token decide which mark is used; the mark is placed at
    the ``'end'`` offset of the last token in the run.

    Args:
        text: The string the predictions refer to.
        model_output: Iterable of prediction dicts as described above.
        min_score: Minimum ``'score'`` the head token must have for its
            punctuation to be applied. Defaults to 0.85.

    Returns:
        A new string with ``.``, ``?`` or ``,`` inserted (or substituted for
        an existing ``.``/``,``/``?`` at the insertion point).

    Rules applied, in order, for every accepted word:
        * nothing is placed directly next to an apostrophe or double quote;
        * a comma is only placed when the following character is whitespace
          (or the word ends the text);
        * an existing mark at the insertion point is replaced;
        * nothing is inserted directly after an existing mark.

    NOTE(review): predictions whose ``'entity'`` is not PERIOD,
    QUESTION_MARK or COMMA are skipped. If the model output omits some
    sub-tokens of a word, a period or question mark may still be placed
    before the omitted tail -- confirm the pipeline returns every sub-token.
    """
    punct_marks = {'PERIOD': '.', 'QUESTION_MARK': '?', 'COMMA': ','}
    existing_punct = ('.', ',', '?')
    quote_chars = ("'", '"')

    # Sort by start position so insertions happen left to right and a single
    # running offset is enough to translate original offsets.
    predictions = sorted(model_output, key=lambda x: x['start'])

    result = list(text)
    offset = 0  # number of characters inserted so far

    i = 0
    while i < len(predictions):
        current_pred = predictions[i]
        word = current_pred['word']

        # Low-confidence head tokens are ignored.
        if current_pred['score'] < min_score:
            i += 1
            continue

        # Continuation pieces never start a word; quote tokens and the token
        # right after a quote token are left alone.
        if (word.startswith('##') or
                word in quote_chars or
                (i > 0 and predictions[i - 1]['word'] in quote_chars)):
            i += 1
            continue

        # Walk forward to the last prediction that belongs to this word.
        last_pos = i
        while last_pos + 1 < len(predictions):
            next_pred = predictions[last_pos + 1]
            if (next_pred['word'].startswith('##') or
                    next_pred['start'] == predictions[last_pos]['end']):
                last_pos += 1
            else:
                break

        # Whatever happens below, the next iteration resumes after this word.
        i = last_pos + 1

        punct = punct_marks.get(current_pred['entity'])
        if punct is None:
            continue

        # Position right after the complete word, shifted by prior insertions.
        insert_pos = predictions[last_pos]['end'] + offset
        has_char_after = insert_pos < len(result)

        # Don't touch a word that is glued to an apostrophe/quote.
        if has_char_after and (
                result[insert_pos] in quote_chars or
                (insert_pos > 0 and result[insert_pos - 1] in quote_chars)):
            continue

        # A comma must be followed by whitespace. The bound is len(result),
        # not len(result) - 1, so the very last character of the text is
        # checked too and a comma is never pushed into the tail of a word.
        if (punct == ',' and has_char_after and
                not result[insert_pos].isspace()):
            continue

        # An existing mark at the insertion point is overwritten in place,
        # which keeps the offset unchanged.
        if has_char_after and result[insert_pos] in existing_punct:
            result[insert_pos] = punct
            continue

        # Avoid doubling up right after an existing mark.
        if insert_pos > 0 and result[insert_pos - 1] in existing_punct:
            continue

        result.insert(insert_pos, punct)
        offset += 1

    return ''.join(result)

38..68 derece olduğunu söylüyor... Yuvarlarsak 38,,,7 dereceymiş teta


In [5]:
# Imports come first so the cell runs on a fresh kernel; `re` is needed by
# the punctuation-stripping step below.
import re
import time

sample_text = """
Sen kaça gidiyorsun Hiçbirimiz satılık değiliz öğretmen bey
"""
# Remove punctuation: drop every character that is neither a word character
# nor whitespace, so the model sees unpunctuated input.
sample_text = re.sub(r'[^\w\s]', '', sample_text)

# perf_counter is a monotonic, high-resolution clock meant for timing short
# intervals such as a single inference call.
start_time = time.perf_counter()
out = pipe(sample_text)
end_time = time.perf_counter()
print(f"Time taken: {end_time - start_time:.2f} seconds")

# The same string is passed to both the model and the restorer so the
# character offsets in `out` line up with the text being edited.
restored_text = restore_punctuation(sample_text, out)
print(out)

print(restored_text)


Time taken: 0.05 seconds
[{'entity': 'PERIOD', 'score': 0.95676047, 'index': 5, 'word': 'söylüyor', 'start': 21, 'end': 29}, {'entity': 'COMMA', 'score': 0.47004408, 'index': 12, 'word': 'derece', 'start': 46, 'end': 52}, {'entity': 'COMMA', 'score': 0.54805917, 'index': 13, 'word': '##ym', 'start': 52, 'end': 54}, {'entity': 'COMMA', 'score': 0.48716187, 'index': 14, 'word': '##iş', 'start': 54, 'end': 56}, {'entity': 'PERIOD', 'score': 0.91442615, 'index': 15, 'word': 'te', 'start': 57, 'end': 59}, {'entity': 'PERIOD', 'score': 0.94705576, 'index': 16, 'word': '##ta', 'start': 59, 'end': 61}]
