In [1]:
from transformers import pipeline

pipe = pipeline(task="token-classification", model="uygarkurt/bert-restore-punctuation-turkish", device='cpu')

sample_text = "yığın karnıyla düşünür gözüyle öğrenir kalbiyle kızar"

out = pipe(sample_text)

print(out)



[{'entity': 'COMMA', 'score': 0.9069116, 'index': 4, 'word': 'düşünür', 'start': 15, 'end': 22}, {'entity': 'COMMA', 'score': 0.89643276, 'index': 6, 'word': 'öğrenir', 'start': 31, 'end': 38}, {'entity': 'PERIOD', 'score': 0.9108698, 'index': 9, 'word': 'kızar', 'start': 48, 'end': 53}]


In [2]:
def punkt(n):
    if n in ['PERIOD', 'QUESTION_MARK', 'COMMA']:
        return {
            'PERIOD': '.',
        'QUESTION_MARK': '?',
        'COMMA': ','
        }[n]


def restore_punctuation(text, model_output):
    """
    Restores punctuation using character-level indexes from model output.
    Only inserts punctuation when prediction score is above 0.7 (70%).
    Handles agglutinative suffixes, apostrophes, quotes, and Turkish punctuation rules.
    """
    # Sort by start position to process in order
    predictions = sorted(model_output, key=lambda x: x['start'])

    # Convert to list for easier manipulation
    result = list(text)
    print(f"Result: {result}")
    offset = 0  # Track how many characters we've inserted

    i = 0

    while i < len(predictions):
        current_pred = predictions[i]

        # Skip predictions with low confidence
        if current_pred['score'] < 0.5:
            i += 1
            continue

        # Skip if there's a next token that starts with ##
        if (i + 1 < len(predictions) and predictions[i + 1]['word'].startswith('##')):
            i += 1
            continue
        

        if current_pred['word'] == punkt(current_pred['entity']):
            i += 1
            continue

        # Only process punctuation if this is the last token of a word
        if current_pred['entity'] in ['PERIOD', 'QUESTION_MARK', 'COMMA']:
            insert_pos = predictions[i]['end'] + offset
            
            if i < len(predictions) - 1 and predictions[i + 1]['word'] == punkt(current_pred['entity']):
                i += 1
                continue
            
            if i < len(predictions) - 1 and result[insert_pos] not in [' ', '\n']:
                i += 1
                continue

            punct = punkt(current_pred['entity'])

            result.insert(insert_pos, punct)
            offset += 1

            # Don't insert punctuation in the middle of a word with apostrophe/quote
            # if (insert_pos < len(result) and
            #     (result[insert_pos] in ["'", '"'] or
            #      (insert_pos > 0 and result[insert_pos-1] in ["'", '"']))):
            #     i = last_pos + 1
            #     continue

            # For comma, check if there's a space after the insertion point
            # if (current_pred['entity'] == 'COMMA' and
            #     insert_pos < len(result) - 1 and
            #     not result[insert_pos].isspace()):
            #     i = last_pos + 1
            #     continue

            # Check if there's already a punctuation mark
            # if (insert_pos < len(result) and  # TODO
            #     result[insert_pos] in ['.', ',', '?']):
            #     # Replace existing punctuation
            #     punct = {
            #         'PERIOD': '.',
            #         'QUESTION_MARK': '?',
            #         'COMMA': ','
            #     }[current_pred['entity']]
            #     result[insert_pos] = punct
            # else:
            # Check for existing punctuation in surrounding positions
            # has_punct_before = (insert_pos > 0 and
            #                   result[insert_pos - 1] in ['.', ',', '?'])
            # has_punct_after = (insert_pos < len(result) and
            #                  result[insert_pos] in ['.', ',', '?'])
            # has_quote_after = (insert_pos < len(result) and
            #                  result[insert_pos] == '"')

            # # Skip if there's already punctuation nearby or quote after
            # if has_punct_before or has_punct_after or has_quote_after:
            #     i = last_pos + 1
            #     continue

            # # Insert new punctuation if no existing punctuation nearby
            # punct = {
            #     'PERIOD': '.',
            #     'QUESTION_MARK': '?',
            #     'COMMA': ','
            # }[current_pred['entity']]
            # result.insert(insert_pos, punct)
            # offset += 1
        i += 1

    return ''.join(result)

In [5]:
sample_text = "yığın karnıyla düşünür gözüyle öğrenir kalbiyle kızar"
# Remove punctuation
# sample_text = re.sub(r'[^\w\s]', '', sample_text).replace('   ', ' ').replace('  ', ' ')
out = pipe(sample_text)

restored_text = restore_punctuation(sample_text, out)
print(out)

print(restored_text)


Result: ['y', 'ı', 'ğ', 'ı', 'n', ' ', 'k', 'a', 'r', 'n', 'ı', 'y', 'l', 'a', ' ', 'd', 'ü', 'ş', 'ü', 'n', 'ü', 'r', ' ', 'g', 'ö', 'z', 'ü', 'y', 'l', 'e', ' ', 'ö', 'ğ', 'r', 'e', 'n', 'i', 'r', ' ', 'k', 'a', 'l', 'b', 'i', 'y', 'l', 'e', ' ', 'k', 'ı', 'z', 'a', 'r']
[{'entity': 'COMMA', 'score': 0.9069116, 'index': 4, 'word': 'düşünür', 'start': 15, 'end': 22}, {'entity': 'COMMA', 'score': 0.89643276, 'index': 6, 'word': 'öğrenir', 'start': 31, 'end': 38}, {'entity': 'PERIOD', 'score': 0.9108698, 'index': 9, 'word': 'kızar', 'start': 48, 'end': 53}]
yığın karnıyla düşünür, gözüyle öğrenir, kalbiyle kızar.
