In [4]:
import re
import os

def clean_and_reconstruct_sentences(raw_text):
    # Step 1: Normalize whitespace and remove spurious symbols
    text = raw_text.replace('\r', ' ').replace('\n', ' ')  # flatten newlines
    text = re.sub(r'(?<=[A-Za-z])[-–]\s+', '', text)       # fix hyphenation across lines
    text = re.sub(r'\s+', ' ', text)                       # normalize multiple spaces

    # Step 2: Fix spacing around punctuation
    text = re.sub(r'\s*([.,!?;:])\s*', r'\1 ', text)

    # Step 3: Split into sentences (based on ., ?, ! with capital letter following)
    # Use lookahead for better control
    sentence_end_re = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
    sentences = sentence_end_re.split(text)

    # Step 4: Strip and remove junk
    clean_sentences = []
    for sentence in sentences:
        s = sentence.strip()
        s = re.sub(r'[^\w\s,.;!?()\[\]áàäâèéëêíìïîòóöôùúüûñʻ’‘“”\"\'-]+', '', s)
        if len(s.split()) > 2:  # filter out junk
            clean_sentences.append(s)

    return clean_sentences

# Batch process all files
if __name__ == "__main__":
    input_dir = r"outputs/sentences"
    output_dir = r"outputs/full_sentences"
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)

            with open(input_path, 'r', encoding='utf-8') as f:
                raw_text = f.read()

            sentences = clean_and_reconstruct_sentences(raw_text)

            with open(output_path, 'w', encoding='utf-8') as f:
                for sentence in sentences:
                    f.write(sentence + '\n')

            print(f"✅ {filename}: Extracted {len(sentences)} clean sentences.")


✅ 2015.464359.Ki-Jingthoh-Halor-Ka-Kolshor-Bad-Ka-Politik-Ed-1st_sentences.txt: Extracted 1282 clean sentences.
✅ 2015.464360.Ka-Niam-ki-khasi-Ka-Niam-Tip-blei-tip-brieu-Ed-1st_sentences.txt: Extracted 425 clean sentences.
✅ 2015.464361.Manik-Raitong_sentences.txt: Extracted 190 clean sentences.
✅ 2015.464362.Tynrai-Jingkheinfundamental-Arithmetic-Ed-4th_sentences.txt: Extracted 461 clean sentences.
✅ 2015.464363.Improbed-Instruction-In-Khasi-Ka-Jingpynroi_sentences.txt: Extracted 2832 clean sentences.
✅ 2015.464364.Ki-Saimuka-Na-Diengkynthong-1980_sentences.txt: Extracted 328 clean sentences.
✅ 2015.464365.Yillop-Sajeki-Passah_sentences.txt: Extracted 288 clean sentences.
✅ 2015.464366.Na-Ka-Hyndai-Sha-Ka-Lawai_sentences.txt: Extracted 1151 clean sentences.
✅ 2015.464367.U-Khain-Bad-Ka-Ngen_sentences.txt: Extracted 455 clean sentences.
✅ 2015.464368.Ka-History-Ka-Ktien-Khasi-Ed-2nd_sentences.txt: Extracted 1273 clean sentences.
✅ 2015.464369.Ki-Sermon-Synod-Bad-Assembly-Kitab-2_senten