In [None]:
import os
import glob
from tqdm.notebook import tqdm

In [None]:
# Root of the CoNLL-U data: os.listdir() of this directory yields the work names,
# and each work's sub-folder is globbed for '*.conllu' files.
DCS_DATA_PATH = "./dcs_source/dcs/data/conllu/files"
# Directory that receives one '<work_name>.txt' file per processed work.
OUTPUT_PATH = './extract'

In [None]:
# Function to process a single folder
def _value_after(line, prefix):
    """Return the text following ``prefix`` on ``line``, stripped of surrounding whitespace."""
    return line[len(prefix):].strip()


def _build_identifier(chapter, counter, subcounter, sent_id):
    """Build the sentence identifier, or return None when no counter/sent_id is known.

    Preference order for the sentence part: "counter.subcounter", then "counter",
    then "sent_id". The sentence part is always preceded by a single space, even
    when ``chapter`` is empty, which keeps the established output format.
    """
    if counter and subcounter:
        sentence_part = f"{counter}.{subcounter}"
    elif counter:
        sentence_part = counter
    elif sent_id:
        sentence_part = sent_id
    else:
        return None
    return f"{chapter} {sentence_part}"


def process_folder(work_name, data_path=None, output_path=None):
    """Flatten all ``*.conllu`` files of one work into a single plain-text file.

    The files in ``<data_path>/<work_name>`` are read in sorted order and the
    result is written to ``<output_path>/<work_name>.txt``:

    * every ``## chapter: X`` line produces a ``{X}`` header line;
    * every sentence produces ``<text> | [<chapter> <id>]``, written as soon as
      the token line starting with ``1<TAB>`` is reached.

    The current chapter is remembered until the next chapter line or the end of
    the file; the per-sentence fields are cleared on every blank line.

    Args:
        work_name: Name of the work's sub-folder; also the stem of the output file.
        data_path: Directory containing the work folders. Defaults to DCS_DATA_PATH.
        output_path: Directory for the output file; created if missing.
            Defaults to OUTPUT_PATH.

    Raises:
        ValueError: If a sentence has neither a counter nor a sent_id, or has no
            ``# text = `` line, by the time its first token line is reached.
    """
    if data_path is None:
        data_path = DCS_DATA_PATH
    if output_path is None:
        output_path = OUTPUT_PATH

    input_folder_path = os.path.join(data_path, work_name)
    output_filepath = os.path.join(output_path, f"{work_name}.txt")

    # The output directory does not exist on a fresh checkout.
    os.makedirs(output_path, exist_ok=True)

    # Escape the folder part so work names containing glob metacharacters
    # ('[', ']', '*', '?') are matched literally.
    pattern = os.path.join(glob.escape(input_folder_path), '*.conllu')

    with open(output_filepath, 'w', encoding='utf-8') as outfile:
        for conllu_file in sorted(glob.glob(pattern)):
            with open(conllu_file, 'r', encoding='utf-8') as infile:
                chapter = ""
                text_line = sent_id = counter = subcounter = ""

                for line in infile:
                    # Values are taken as "everything after the prefix" so that a
                    # ':' or '=' inside the value does not truncate it.
                    if line.startswith("## chapter: "):
                        chapter = _value_after(line, "## chapter: ")
                        outfile.write(f"{{{chapter}}}\n")
                    elif line.startswith("# text = "):
                        text_line = _value_after(line, "# text = ")
                    elif line.startswith("# sent_id = "):
                        sent_id = _value_after(line, "# sent_id = ")
                    elif line.startswith("# sent_counter = "):
                        counter = _value_after(line, "# sent_counter = ")
                    elif line.startswith("# sent_subcounter = "):
                        subcounter = _value_after(line, "# sent_subcounter = ")

                    elif line.startswith("1\t"):
                        # Analysis data starting, now have enough to output line
                        identifier = _build_identifier(chapter, counter, subcounter, sent_id)
                        if identifier is None:
                            raise ValueError(
                                f"No identifier info: work_name {work_name}, "
                                f"conllu_file {conllu_file}, text_line {text_line}"
                            )

                        if not text_line:
                            raise ValueError(
                                f"No text_line: sent_id {sent_id}, "
                                f"counter {counter}, subcounter {subcounter}"
                            )

                        outfile.write(f"{text_line} | [{identifier}]\n")

                    elif not line.strip():
                        # Sentence separator: forget the per-sentence metadata
                        # (the chapter stays in effect).
                        text_line = sent_id = counter = subcounter = ""

In [None]:
# Process all folders
# Each work lives in its own sub-folder, so keep directories only. This skips
# stray files such as '.DS_Store' and, unlike list.remove('.DS_Store'), does not
# raise ValueError when that file is absent.
all_works = sorted(
    entry
    for entry in os.listdir(DCS_DATA_PATH)
    if os.path.isdir(os.path.join(DCS_DATA_PATH, entry))
)

for work_name in tqdm(all_works):
    process_folder(work_name)

print("Extraction complete.")

In [5]:
# Sanity check: number of entries in all_works, the list the extraction loop iterates over.
len(all_works)

257

In [6]:
# analyze newly output plain-text version
# Maps work name (file name without '.txt') -> size in KiB, truncated to one decimal.
file_sizes_kbs = {}
filenames_sorted = sorted(os.listdir(OUTPUT_PATH))
for filename in filenames_sorted:
    filename_sans_ext, extension = os.path.splitext(filename)
    if extension != '.txt':
        # Skip anything that is not an extracted text file (e.g. '.DS_Store');
        # blindly slicing off the last four characters would mangle its name.
        continue
    size = os.path.getsize(os.path.join(OUTPUT_PATH, filename))
    # int() truncates rather than rounds, e.g. 1999 bytes -> 1.9
    file_sizes_kbs[filename_sans_ext] = int(size / 1024 * 10) / 10

# Explicit UTF-8, matching the extracted files, so the report does not depend
# on the platform's default encoding.
with open('file_sizes.tsv', 'w', encoding='utf-8') as f:
    f.write('\n'.join(f"{k}\t{v}" for k, v in file_sizes_kbs.items()))