In [20]:
'''

Please note that some segments still exceed 70 characters.
This is because some segments do not contain any of the breakpoints listed in the rule set.

'''
import os

def segment_file_preserving_lines(file_path, max_chars=70):
    """Read a UTF-8 text file and split its over-long lines into shorter segments.

    A line whose text (ignoring the trailing newline) fits in ``max_chars`` is
    returned exactly as read. A longer line is split, in order of preference:

    1. after every comma (the comma stays at the end of the left-hand piece);
    2. after the first occurrence of a Portuguese conjunction (``e``, ``mas``,
       ``porém``, ``contudo``, ``no entanto``), provided the left-hand piece
       fits; the remainder is then examined again in the same way;
    3. at word boundaries, packing as many words as fit into each segment.

    A single word longer than ``max_chars`` is never broken, so that is the
    only case in which an emitted segment can exceed the limit.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        max_chars: Maximum number of characters allowed per segment, not
            counting the newline terminator.

    Returns:
        A list of strings suitable for ``writelines``. Segments produced by
        splitting are stripped of surrounding whitespace and end with a
        newline character; lines that were not split are returned unchanged.
    """
    # Conjunctions are tried in this order; the surrounding spaces ensure that
    # only whole words match.
    secondary_breakpoints = (' e ', ' mas ', ' porém ', ' contudo ', ' no entanto ')

    def wrap_words(text):
        """Greedily pack the words of ``text`` into segments of at most ``max_chars``."""
        wrapped = []
        current = ''
        for word in text.split():
            candidate = f'{current} {word}' if current else word
            # Only flush when there is something to flush: an over-long first
            # word must not produce an empty segment.
            if current and len(candidate) > max_chars:
                wrapped.append(current + '\n')
                current = word
            else:
                current = candidate
        if current:
            wrapped.append(current + '\n')
        return wrapped

    def split_long(text):
        """Split stripped, non-empty ``text`` into newline-terminated segments."""
        segments = []
        while len(text) > max_chars:
            for marker in secondary_breakpoints:
                head, found, tail = text.partition(marker)
                if not found:
                    continue
                # Keep the conjunction at the end of the left-hand piece,
                # separated from the previous word by a single space.
                head_segment = f'{head.strip()} {marker.strip()}'
                if len(head_segment) <= max_chars:
                    segments.append(head_segment + '\n')
                    # The remainder may still be too long: loop again on it.
                    text = tail.strip()
                    break
            else:
                # No conjunction yields a fitting left-hand piece.
                segments.extend(wrap_words(text))
                return segments
        if text:
            segments.append(text + '\n')
        return segments

    def segment_line_preserving(line):
        """Return ``line`` untouched if it fits, otherwise its list of segments."""
        # The newline terminator is not part of the visible line length.
        if len(line.rstrip('\n')) <= max_chars:
            return [line]

        pieces = line.split(',')
        # Re-attach the comma to every piece except the last one.
        primary_segments = [piece + ',' for piece in pieces[:-1]] + [pieces[-1]]

        final_segments = []
        for segment in primary_segments:
            trimmed_segment = segment.strip()
            # Skip empty pieces (e.g. what follows a trailing comma) so they
            # do not become spurious blank lines.
            if trimmed_segment:
                final_segments.extend(split_long(trimmed_segment))

        # A whitespace-only line still counts as a line: keep it as a blank one.
        return final_segments or ['\n']

    segmented_text = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            segmented_text.extend(segment_line_preserving(line))

    return segmented_text

def segment_all_txt_files_in_folder(input_folder_path, output_folder_path, max_chars=70):
    """Segment every ``.txt`` file of a folder and save the results to another folder.

    Each regular file in ``input_folder_path`` whose name ends with ``.txt`` is
    passed to ``segment_file_preserving_lines`` and the result is written, as
    UTF-8, to ``output_folder_path`` under the name ``segmented_<original name>``.
    A progress message is printed for every file processed.

    Args:
        input_folder_path: Folder containing the source ``.txt`` files.
        output_folder_path: Folder receiving the segmented files; it is
            created (including missing parents) if it does not exist.
        max_chars: Maximum segment length forwarded to
            ``segment_file_preserving_lines``.
    """
    # Create the destination up front so that opening the output file cannot
    # fail merely because the folder is missing.
    os.makedirs(output_folder_path, exist_ok=True)

    # Sorted for a deterministic processing (and logging) order.
    for filename in sorted(os.listdir(input_folder_path)):
        input_file_path = os.path.join(input_folder_path, filename)
        # Ignore other extensions and directories that happen to end in '.txt'.
        if not filename.endswith('.txt') or not os.path.isfile(input_file_path):
            continue

        output_file_path = os.path.join(output_folder_path, f"segmented_{filename}")

        segmented_content = segment_file_preserving_lines(input_file_path, max_chars)
        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.writelines(segmented_content)

        print(f"Segmented and saved: {filename}")

# Example run: segment every .txt file found in the input folder and write the
# results to the output folder. Replace both folder paths with your own.
max_chars = 70  # target maximum number of characters per segment
input_folder_path = 'D:/OneDrive/Job/TDM/20230811_Tianxia/0101'
output_folder_path = 'D:/OneDrive/Job/TDM/20230811_Tianxia/0202'
segment_all_txt_files_in_folder(
    input_folder_path,
    output_folder_path,
    max_chars=max_chars,
)


Segmented and saved: Green Silk Road.txt
