In [None]:
from transformers import pipeline
import re

# Checkpoint used for Turkish punctuation restoration (token-classification head).
MODEL_NAME = "uygarkurt/bert-restore-punctuation-turkish"

# Build the pipeline once; later cells reuse `pipe`.
pipe = pipeline(task="token-classification", model=MODEL_NAME)

# Unpunctuated sample: several sentences run together.
sample_text = "Niye doktor oldun o zaman Çok sıradan bir sebebim var annem istedi Ona hayır demek imkansızdır"

# Raw per-token predictions returned by the pipeline call.
out = pipe(sample_text)
print(out)


In [2]:
# Label emitted by the model -> punctuation character written into the text.
_PUNCT_BY_LABEL = {
    'PERIOD': '.',
    'QUESTION_MARK': '?',
    'COMMA': ',',
}
# Characters treated as "already existing punctuation" in the input text.
_PUNCT_CHARS = frozenset(_PUNCT_BY_LABEL.values())


def restore_punctuation(text, model_output):
    """
    Restore punctuation in ``text`` using character offsets from model output.

    Args:
        text: The string that was passed to the token-classification pipeline.
        model_output: Iterable of prediction dicts. Each dict must provide
            ``'word'``, ``'start'`` and ``'end'`` (character offsets into
            ``text``) plus a label under ``'entity'`` or, for aggregated
            pipeline output, ``'entity_group'``. Labels other than PERIOD,
            QUESTION_MARK and COMMA are ignored.

    Returns:
        A new string with the predicted punctuation inserted after the
        labelled words. If a '.', ',' or '?' already follows the word it is
        overwritten instead of duplicated.

    Notes:
        * A word split into sub-tokens ('##' pieces) is punctuated once,
          using the label of its first sub-token, and the mark is placed
          after the end of the whole word -- never inside it.
        * A '##' piece whose head token is not in ``model_output`` is skipped.
          NOTE(review): if the model labels only the last word piece, such
          predictions are dropped here -- confirm against the model's
          labelling scheme.
    """
    # Process left to right so `offset` always reflects earlier insertions.
    predictions = sorted(model_output, key=lambda pred: pred['start'])

    result = list(text)
    offset = 0  # number of characters inserted so far

    i = 0
    while i < len(predictions):
        current_pred = predictions[i]
        word = current_pred['word']
        label = current_pred.get('entity') or current_pred.get('entity_group')
        punct = _PUNCT_BY_LABEL.get(label)  # None for unknown labels (e.g. 'O')

        # The token itself is an existing punctuation mark: overwrite it with
        # the predicted mark. Unknown labels leave the character untouched.
        if word in _PUNCT_CHARS:
            pos = current_pred['start'] + offset
            if punct is not None and pos < len(result):
                result[pos] = punct
            i += 1
            continue

        # A continuation piece reached on its own (its head was not labelled,
        # or it was not contiguous with the previous labelled piece).
        if word.startswith('##'):
            i += 1
            continue

        # Walk over the labelled continuation pieces directly attached to
        # this head token so they are consumed together with it.
        last_pos = i
        while (last_pos + 1 < len(predictions)
               and predictions[last_pos + 1]['word'].startswith('##')
               and predictions[last_pos + 1]['start'] == predictions[last_pos]['end']):
            last_pos += 1

        if punct is not None:
            # Unlabelled word pieces are absent from the predictions, so the
            # last labelled piece may end mid-word. Extend to the end of the
            # alphanumeric run in the original text to avoid splitting a word.
            word_end = predictions[last_pos]['end']
            while word_end < len(text) and text[word_end].isalnum():
                word_end += 1

            insert_pos = word_end + offset
            if insert_pos < len(result) and result[insert_pos] in _PUNCT_CHARS:
                # A mark already follows the word: replace rather than duplicate.
                result[insert_pos] = punct
            else:
                result.insert(insert_pos, punct)
                offset += 1

        # Continue after the last piece consumed for this word.
        i = last_pos + 1

    return ''.join(result)

In [None]:
# Second sample: partly punctuated already ("yapın," and "girin."), so existing
# marks can be encountered alongside positions where marks are missing.
sample_text = "Ve biz sana gelsek Koli taşısak İki kere rum rum rum yapın, sonra girin."

# Optional (disabled): strip all punctuation first, then collapse the double/triple
# spaces left behind, to test restoration from fully unpunctuated text.
# sample_text = re.sub(r'[^\w\s]', '', sample_text).replace('   ', ' ').replace('  ', ' ')

# Raw predictions for inspection.
out = pipe(sample_text)
print(out)

# Text with the predicted punctuation applied.
restored_text = restore_punctuation(sample_text, out)
print(restored_text)

# Optional (disabled): print one sentence per line.
# for sentence in restored_text.split('. '):
#     print(sentence)

