# Overview

**Data Preprocessing Pipeline**

- Split large files (memory constraints with spacy)
- Sentence segmentation with LatinCy
- Cleaning the sentences with regex
- Filter short sentences
- Reconstruct the document files
- Build author files
- Build one corpus file
- Transform into .jsonl

To save checkpoints and ensure reproducibility, the working folder was duplicated and renamed after each step.

# Code

## Pipeline

In [None]:
# Step 1: split oversized .txt files in place. The functions are defined in the
# "Parts" cells further down - run those and the imports before this cell.
split_large_files("../../data/preprocessing/1-smaller-files/") 

In [None]:
# Step 2: rewrite each .txt file in place with one sentence per line.
sentence_segmentation("../../data/preprocessing/2-segmented-sentences/")

In [None]:
# Step 3: clean every file in place (regex cleanup, then capitalization normalization).
clean_sentences("../../data/preprocessing/3-cleaned-sentences/")

In [None]:
# Step 4: keep only sentences with more than TOKEN_LIMIT tokens, rewriting files in place.
filter_short_sentences("../../data/preprocessing/4-only-longer-sentences/")

In [None]:
# Step 5: merge files that share a filename prefix from the step-4 folder into the documents folder.
reconstruct_documents("../../data/preprocessing/4-only-longer-sentences/", "../../data/preprocessing/5-documents/")

In [None]:
# Step 6: merge document files that share an author prefix into one file per prefix.
build_author_files("../../data/preprocessing/5-documents/", "../../data/preprocessing/6-authors/")

In [None]:
# Step 7: concatenate all author files into a single corpus file.
build_corpus_file("../../data/preprocessing/6-authors/", "../../data/preprocessing/7-corpus/corpus.txt")

In [None]:
# Step 8: export each level (documents, authors, whole corpus) as JSONL,
# one {"sentence": ...} object per non-empty input line.
transform_to_jsonl("../../data/preprocessing/5-documents/", "../../data/corpus/documents/")
transform_to_jsonl("../../data/preprocessing/6-authors/", "../../data/corpus/authors/")
transform_to_jsonl("../../data/preprocessing/7-corpus/", "../../data/corpus/corpus/")

## Imports and Initialisation

In [None]:
import os
import spacy
import re 
import json
from collections import defaultdict

# Load the spaCy pipeline once; the segmentation, capitalization and filtering
# steps below all reuse this shared `nlp` object.
nlp = spacy.load("la_core_web_lg")
# Raise spaCy's per-document character limit so longer texts are accepted.
nlp.max_length = 4000000

# Default threshold for filter_short_sentences: a sentence is kept only if it
# has MORE than this many tokens.
TOKEN_LIMIT = 10 

## Parts

### Splitting Files

In [None]:
def split_large_files(root_folder, max_lines=250):
    """Split every oversized ``.txt`` file under ``root_folder`` into parts.

    A file with more than ``max_lines`` lines is replaced by consecutive files
    named ``<name>_part<k><ext>`` (``k`` starting at 1) in the same directory,
    each holding at most ``max_lines`` lines. The original is deleted only
    after all parts have been written. Files at or below the limit are left
    untouched. Any exception raised while handling a file is printed and the
    walk continues with the next file.
    """
    for dirpath, _subdirs, filenames in os.walk(root_folder):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue

            file_path = os.path.join(dirpath, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as source:
                    lines = source.readlines()

                total = len(lines)
                if total <= max_lines:
                    continue

                stem, suffix = os.path.splitext(filename)
                chunk_starts = range(0, total, max_lines)
                for part_no, start in enumerate(chunk_starts, start=1):
                    part_path = os.path.join(dirpath, f"{stem}_part{part_no}{suffix}")
                    with open(part_path, 'w', encoding='utf-8') as part_file:
                        part_file.writelines(lines[start:start + max_lines])

                # Only reached when every part was written successfully.
                os.remove(file_path)
            except Exception as e:
                print(f"Failed to process {file_path}: {e}")

### Sentence Segmentation

In [None]:
def sentence_segmentation(root_folder):
    """Rewrite every ``.txt`` file under ``root_folder`` as one sentence per line.

    The extension check is case-insensitive. Each matching file is read in
    full, segmented with ``sentence_segmentation_in_text`` and overwritten in
    place; its path is printed once it has been written.
    """
    for subdir, _, files in os.walk(root_folder):
        txt_names = (name for name in files if name.lower().endswith('.txt'))
        for name in txt_names:
            file_path = os.path.join(subdir, name)

            with open(file_path, 'r', encoding='utf-8') as source:
                content = source.read()

            sentences = sentence_segmentation_in_text(content)

            with open(file_path, 'w', encoding='utf-8') as target:
                target.writelines(sentence + "\n" for sentence in sentences)
            print(file_path)

In [None]:
def sentence_segmentation_in_text(text):
    """Return the sentences of ``text`` as a list of stripped strings.

    Newlines are replaced by spaces before the text is passed to the shared
    ``nlp`` pipeline, so line breaks in the source do not reach the parser.
    """
    flattened = text.replace('\n', ' ')
    return [span.text.strip() for span in nlp(flattened).sents]

### Cleaning 

In [None]:
def clean_sentences(folder):
    """Clean every file under ``folder`` in place, line by line.

    Each line goes through ``clean`` and then ``normalize_capitalization``.
    The results are written back joined by newlines, with no trailing newline.
    All files are processed regardless of extension. Any exception for a file
    is printed and that file is left as it was.
    """
    for root, _dirs, files in os.walk(folder):
        for name in files:
            file_path = os.path.join(root, name)
            try:
                with open(file_path, "r", encoding="utf-8") as source:
                    raw_lines = source.readlines()

                # The file is rewritten only after every line was processed.
                cleaned_sentences = [
                    normalize_capitalization(clean(line)) for line in raw_lines
                ]

                with open(file_path, "w", encoding="utf-8") as target:
                    target.write("\n".join(cleaned_sentences))

            except Exception as e:
                print(f"Error processing file {file_path}: {e}")

In [None]:
def clean(text):
    """Strip editorial residue from one sentence and normalise its spacing.

    The removal patterns run in order, all case-insensitively; later steps
    rely on the output of earlier ones. After the removals, every character
    that is not a word character, whitespace, '.' or ',' becomes a space, and
    spacing around punctuation is normalised.

    Runs of a repeated whitespace character inside the text are collapsed to
    a single space instead of being deleted. Earlier removals (bracketed
    notes, page markers, tokens with digits) leave a double space behind, and
    deleting it outright would glue the neighbouring words together.

    Args:
        text: A single sentence or line; a trailing newline is fine.

    Returns:
        The cleaned string, stripped of leading and trailing whitespace. It is
        empty if the whole line was removed.
    """

    def collapse_repeat(match):
        # Repeated punctuation is dropped. A leading whitespace run is dropped
        # too, so the start-anchored patterns below still see the first word.
        # Interior whitespace keeps one space so words stay separated.
        if match.start() == 0 or not match.group(1).isspace():
            return ''
        return ' '

    steps = [
        (r'\bpage\b\s*\d*', ''),                # 'page' optionally followed by digits
        (r'\b\w*\d+\w*\b', ''),                 # any token containing a digit
        (r'\[.*?\]', ''),                       # square brackets and their content
        (r'\bUERS\b[.,\s]*', ''),               # 'UERS' plus trailing punctuation/space
        (r'\bCAPUT\b\s*[IVXLCDM]+\.', ''),      # 'CAPUT' + Roman numeral + '.'
        (r'\bCAP\.\s*[IVXLCDM]+\.', ''),        # 'CAP.' + Roman numeral + '.'
        (r'\bGo back to text\b', ''),           # navigation phrase
        (r'\bFront Matter\b', ''),              # navigation phrase
        (r'^(\b\w+\b[.,\s]*){1,3}$', ''),       # whole line of only one to three words
        (r'\.{2,}', ''),                        # runs of periods
        (r'([.,\s])\1{1,}', collapse_repeat),   # the same '.', ',' or whitespace char repeated
        (r'[,.]{2,}', ''),                      # mixed runs of ',' and '.'
        (r'^\s*[.,]+', ''),                     # leading punctuation
        # NOTE(review): with IGNORECASE this also strips a sentence-initial
        # word spelled only with the letters i, v, x, l, c, d, m (e.g. 'Illi',
        # 'Dixi') - confirm whether that is intended.
        (r'^\b[IVXLCDM]+\b\.?', ''),            # leading Roman numeral
        (r'^\bibid\b\.?', ''),                  # leading 'ibid.'
        (r'\b[a-z]\.\b', ''),                   # single letter + '.' directly before a word char
        (r'\b[b-df-hj-np-tv-z]\b', ''),         # standalone single consonant
    ]

    for pattern, replacement in steps:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # Replace non-word characters (except spaces, dots, and commas) with a space
    text = re.sub(r'[^\w\s.,]', ' ', text)

    # Remove spaces before commas and periods
    text = re.sub(r'\s+([,.])', r'\1', text)

    # Ensure space after punctuation (if followed by a letter or number)
    text = re.sub(r'([,.])(\w)', r'\1 \2', text)

    # Normalize spaces (remove extra spaces)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
def normalize_capitalization(text):
    """Re-case fully uppercase tokens and return the rebuilt text.

    The text is run through the shared ``nlp`` pipeline. A token whose text is
    all uppercase is capitalized (first letter only) when it carries an entity
    type and lower-cased otherwise. All other tokens are kept as they are.
    Each token's trailing whitespace is preserved.
    """

    def recase(token):
        word = token.text
        if word.isupper():
            word = word.capitalize() if token.ent_type_ else word.lower()
        return word + token.whitespace_

    return "".join(recase(token) for token in nlp(text))

### Filtering Short Sentences

In [None]:
def filter_short_sentences(folder, token_limit=TOKEN_LIMIT):
    """Drop short sentences from every ``.txt`` file under ``folder``.

    Each non-blank line is stripped and tokenized with the shared ``nlp``
    pipeline. Only lines with more than ``token_limit`` tokens are kept. The
    file is overwritten with the kept lines joined by newlines, followed by
    one trailing newline.
    """
    for root, _dirs, files in os.walk(folder):
        for name in files:
            if not name.endswith('.txt'):
                continue

            file_path = os.path.join(root, name)

            with open(file_path, 'r', encoding='utf-8') as source:
                lines = [stripped for stripped in map(str.strip, source) if stripped]

            # nlp.pipe processes the lines as a batch, one Doc per line.
            kept = [doc.text for doc in nlp.pipe(lines) if len(doc) > token_limit]

            with open(file_path, 'w', encoding='utf-8') as target:
                target.write("\n".join(kept) + "\n")

            print(f"Finished processing {name}")

### Reconstruct Document Files and Build Author Files

In [None]:
def reconstruct_documents(input_folder, output_folder):
    """Merge split files back into one file per document prefix.

    Walks ``input_folder`` recursively and calls ``merge`` once per directory.
    Files are grouped by the part of their name matched by the pattern below:
    two underscore-separated components followed by another underscore.
    Merged files are written flat into ``output_folder``.
    """
    # NOTE(review): a filename without an underscore after the prefix does not
    # match and is skipped by merge - confirm against the naming scheme.
    pattern = re.compile(r"^([A-Za-z0-9_-]+_[A-Za-z0-9_-]+)_")

    # exist_ok avoids the check-then-create race and matches transform_to_jsonl.
    os.makedirs(output_folder, exist_ok=True)

    for root, _, files in os.walk(input_folder):
        merge(pattern, root, output_folder, files)

In [None]:
def build_author_files(input_folder, output_folder):
    """Merge document files into one file per author prefix.

    Only the top level of ``input_folder`` is read (no recursion). Files are
    grouped by the part of their name matched by the pattern below and merged
    into ``output_folder`` via ``merge``.
    """
    # NOTE(review): the character class includes '_' and is greedy, so the
    # prefix is everything before the LAST underscore - confirm that matches
    # the intended "Author" part of the filenames.
    pattern = re.compile(r"^([A-Za-z0-9_-]+)_")

    # exist_ok avoids the check-then-create race and matches transform_to_jsonl.
    os.makedirs(output_folder, exist_ok=True)

    merge(pattern, input_folder, output_folder, os.listdir(input_folder))

In [None]:
def _natural_sort_key(path):
    """Sort key that orders numbers in a filename numerically (part2 before part10)."""
    name = os.path.basename(path)
    # re.split with a capturing group alternates text and digit chunks, so odd
    # positions are always digit runs and types line up when keys are compared.
    chunks = [
        int(chunk) if index % 2 else chunk.lower()
        for index, chunk in enumerate(re.split(r'(\d+)', name))
    ]
    # The raw name breaks ties between names that differ only in case.
    return chunks, name


def merge(pattern, input_folder, output_folder, files):
    """Concatenate ``.txt`` files that share a filename prefix.

    Args:
        pattern: Compiled regex applied with ``match``; group 1 is the prefix.
        input_folder: Directory that contains the names listed in ``files``.
        output_folder: Existing directory that receives ``<prefix>.txt``.
        files: Filenames (not paths) to consider. Names that do not end in
            ``.txt`` or do not match ``pattern`` are ignored.

    Files within a group are merged in natural filename order, because the
    order of a directory listing is arbitrary. A file that does not end with
    a newline gets one appended, so its last line cannot run into the first
    line of the next file.
    """
    # Group the input paths by prefix.
    file_groups = defaultdict(list)
    for filename in files:
        if not filename.endswith(".txt"):
            continue
        match = pattern.match(filename)
        if match:
            file_groups[match.group(1)].append(os.path.join(input_folder, filename))

    # Merge each group; sorted prefixes keep the log output deterministic.
    for prefix in sorted(file_groups):
        file_list = sorted(file_groups[prefix], key=_natural_sort_key)
        merged_filepath = os.path.join(output_folder, f"{prefix}.txt")

        with open(merged_filepath, "w", encoding="utf-8") as merged_file:
            for file_path in file_list:
                with open(file_path, "r", encoding="utf-8") as infile:
                    content = infile.read()
                merged_file.write(content)
                if content and not content.endswith("\n"):
                    merged_file.write("\n")

        print(f"Merged {len(file_list)} files into {merged_filepath}")

    print("Merging complete!")

### Build Corpus File

In [None]:
def build_corpus_file(input_folder, output_file):
    """Concatenate every ``.txt`` file in ``input_folder`` into ``output_file``.

    Only the top level of ``input_folder`` is read. Files are merged in sorted
    filename order so repeated runs give the same corpus. Each file's content
    is followed by one extra newline. The parent directory of ``output_file``
    is created if it is missing.
    """
    output_abs = os.path.abspath(output_file)

    # Sorted because os.listdir returns names in arbitrary order. The output
    # file is skipped in case it sits inside the input folder from an earlier
    # run; it is truncated on open and must not be read back into itself.
    txt_files = sorted(
        name for name in os.listdir(input_folder)
        if name.endswith(".txt")
        and os.path.abspath(os.path.join(input_folder, name)) != output_abs
    )

    # open() does not create directories; create the target folder as the
    # other output-producing steps do.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as merged_file:
        for txt_file in txt_files:
            file_path = os.path.join(input_folder, txt_file)

            with open(file_path, "r", encoding="utf-8") as infile:
                merged_file.write(infile.read() + "\n")

            print(f"Merged: {txt_file}")

    print(f"\nAll files merged into: {output_file}")


### Transforming into jsonl

In [None]:
def transform_to_jsonl(input_folder, output_folder):
    """Convert every regular file directly inside ``input_folder`` to JSONL.

    ``output_folder`` is created if needed. Subdirectories are skipped and
    files are not filtered by extension. Each file is handed to
    ``txt_to_jsonl`` and a progress line is printed afterwards.
    """
    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(input_folder):
        candidate = os.path.join(input_folder, entry)
        if not os.path.isfile(candidate):
            continue
        txt_to_jsonl(candidate, output_folder)
        print(f"Finished processing {entry}")

In [None]:
def txt_to_jsonl(file_path, output_folder):
    """Write ``<name>.jsonl`` into ``output_folder`` from the text file ``file_path``.

    Every non-empty line, stripped of surrounding whitespace, becomes one JSON
    object ``{"sentence": <line>}`` on its own output line. Non-ASCII
    characters are written as-is rather than escaped. An existing output file
    is overwritten.
    """
    stem = os.path.splitext(os.path.basename(file_path))[0]
    jsonl_file = os.path.join(output_folder, stem + ".jsonl")

    with open(file_path, 'r', encoding='utf-8') as infile, \
            open(jsonl_file, 'w', encoding='utf-8') as outfile:
        stripped_lines = (line.strip() for line in infile)
        outfile.writelines(
            json.dumps({"sentence": sentence}, ensure_ascii=False) + '\n'
            for sentence in stripped_lines
            if sentence
        )