In [None]:
import os
import json
import re

def _strip_special_chars(text):
    """Drop everything except word characters, whitespace and ``- , . ! ? —``; then trim and lowercase."""
    return re.sub(r'[^\w\s\-,.!?—]', '', text).strip().lower()


def _build_entry(title, body_lines, title_key, content_key):
    """Build the JSON entry for one chunk, or return ``None`` if its title or content is empty after cleaning."""
    # Join the body lines with spaces and collapse any whitespace run into a single space
    content = re.sub(r'\s+', ' ', " ".join(body_lines)).strip()

    cleaned_title = _strip_special_chars(title) if title else ""
    cleaned_content = _strip_special_chars(content)

    if cleaned_title and cleaned_content:
        return {title_key: cleaned_title, content_key: cleaned_content}
    return None


def split_file_to_json(input_file, fixed_text, author, output_folder):
    """Split a plain-text book into chunks and write them to a JSON file.

    The text is read as UTF-8, ``{...}`` tags are removed from every line, and
    the lines are grouped into chunks. A chunk ends at a line that is exactly
    ``"\\n"`` and is immediately followed by another such line. The first
    non-empty line of a chunk is its title; the remaining lines are its content.

    The resulting JSON object has the keys ``"book"`` (``fixed_text`` as given),
    ``"author"`` (``author`` with punctuation removed, trimmed, lowercased) and
    ``"stories"``, a list in which:

    * the first stored chunk is ``{"author": title, "content": content}``;
    * every later chunk closed by a double blank line is
      ``{"story": title, "content": content}``;
    * the lines left after the last double blank line are stored as
      ``{"id": title, "sources": content}``.

    Chunks whose title or content is empty after cleaning are skipped.

    Args:
        input_file: Path to the UTF-8 text file to split.
        fixed_text: Book title stored under ``"book"``.
        author: Author name; cleaned before being stored under ``"author"``.
        output_folder: Directory that receives the JSON file (created if missing).

    Returns:
        None. The JSON is written to ``output_folder`` under the input file's
        base name, with a trailing ``.txt`` replaced by ``.json`` (or ``.json``
        appended when there is no ``.txt`` suffix). The path is printed.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
    os.makedirs(output_folder, exist_ok=True)

    # Always derive the output name from the base name so the JSON lands inside
    # output_folder, and swap only the trailing ".txt" rather than every occurrence.
    base_name = os.path.basename(input_file)
    if base_name.endswith('.txt'):
        output_file_name = base_name[:-len('.txt')] + '.json'
    else:
        output_file_name = base_name + '.json'
    output_file = os.path.join(output_folder, output_file_name)

    # Remove punctuation and other symbols (anything that is neither a word
    # character nor whitespace) from the author, then trim and lowercase it
    author = re.sub(r'[^\w\s]', '', author).strip().lower()

    # 'utf-8-sig' drops a leading BOM so it cannot be mistaken for the first title
    with open(input_file, 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()

    # Remove all tags of the format {tag} from each line
    lines = [re.sub(r'\{.*?\}', '', line) for line in lines]

    stories = []
    output_data = {
        "book": fixed_text,
        "author": author,
        "stories": stories
    }

    body_lines = []
    title = None
    for index, line in enumerate(lines):
        cleaned_line = line.strip()

        # The first non-empty line of a chunk is its title; everything else is body
        if title is None and cleaned_line:
            title = cleaned_line
        else:
            body_lines.append(cleaned_line)

        # A chunk ends at a blank line that is directly followed by another blank line
        next_is_blank = index + 1 < len(lines) and lines[index + 1] == "\n"
        if line == "\n" and next_is_blank:
            if body_lines:
                # The first stored chunk carries the "author" key, later ones "story"
                tag = "story" if stories else "author"
                entry = _build_entry(title, body_lines, tag, "content")
                if entry is not None:
                    stories.append(entry)

            # Start collecting the next chunk
            body_lines = []
            title = None

    # Whatever follows the last double blank line is stored as the "sources" entry
    if body_lines:
        entry = _build_entry(title, body_lines, "id", "sources")
        if entry is not None:
            stories.append(entry)

    # Write the JSON to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(output_data, file, ensure_ascii=False, indent=4)

    # Print the output file location
    print(f"Output file created: {output_file}")


In [None]:
# Example usage: convert a single book from the "books" directory into JSON

directory = "books"
# Replace the file name, book title, and author below with your own values
input_file = os.path.join(directory, 'Velichko_Vylchev_-_Hityr_Petyr_-_1484-b.txt')
book_name = 'Хитър Петър - НАРОДНИТЕ АНЕКДОТИ ЗА ХИТЪР ПЕТЪР'
author = 'Величко Вълчев'
output_folder = 'output_json_files'

split_file_to_json(
    input_file=input_file,
    fixed_text=book_name,
    author=author,
    output_folder=output_folder,
)

In [None]:
# Batch configuration: one entry per source text.
#   "input_file" - file name of the text (joined with the "books" directory by the loop below)
#   "book_name"  - book title passed to split_file_to_json as its second argument
#   "author"     - author name passed to split_file_to_json as its third argument
data = [
    {"input_file": "Velichko_Vylchev_-_Hityr_Petyr_-_1484-b.txt", "book_name": "Хитър Петър - НАРОДНИТЕ АНЕКДОТИ ЗА ХИТЪР ПЕТЪР", "author": "Величко Вълчев"},
    {"input_file": "Sava_Popov_-_Hityr_Petyr_-_1503-b.txt", "book_name": "Хитър Петър", "author": "Сава Попов"},
    {"input_file": "Ran_Bosilek_-_Nerodena_moma._Neznaen_junak._Zhiva_voda_-_Narodni_prikazki_-_4123-b.txt", "book_name": "Неродена мома. Незнаен юнак. Жива вода", "author": "Ран Босилек"},
    {"input_file": "Ran_Bosilek_-_Garvan_grachi_-_547-b.txt", "book_name": "Гарван грачи", "author": "Ран Босилек"},
    {"input_file": "Leda_Mileva_-_Veseli_baloni_-_4580-b.txt", "book_name": "Весели Балони", "author": "Леда Милева"},
    {"input_file": "Leda_Mileva_-_V_edna_zoologicheska_gradina_-_7541-b.txt", "book_name": "В една зоологическа градина", "author": "Леда Милева"},
    {"input_file": "Konstantin_Konstantinov_-_Prikazki_za_tebe_-_Izbrani_prikazki_i_razkazi_-_264-b.txt", "book_name": "Приказки за тебе", "author": "Константин Константинов"},
    {"input_file": "Kalina_Malina_-_Simovata_cheta_-_9964-b.txt", "book_name": "Симовата чета", "author": "Калина Малина"},
    {"input_file": "Ivan_Zheglov_-_Hityr_Petyr_i_sin_-_5493-b.txt", "book_name": "Хитър Петър и син", "author": "Иван Жеглов"},
    {"input_file": "Gorska_hubavitsa_-_Narodni_prikazki_-_10270-b.txt", "book_name": "Горска хубавица", "author": "Народни приказки"},
    {"input_file": "Emilijan_Stanev_-_Prez_vodi_i_gori_-_3796-b.txt", "book_name": "През води и гори", "author": "Емилиян Станев"},
    {"input_file": "Emilijan_Stanev_-_Povest_za_edna_gora_-_4107-b.txt", "book_name": "Повест за една гора", "author": "Емилиян Станев"},
    {"input_file": "Emilijan_Stanev_-_Lakomoto_meche_-_4114-b.txt", "book_name": "Лакомото мече", "author": "Емилиян Станев"},
    {"input_file": "Elin_Pelin_-_Sychinenija_v_shest_toma_tom_peti_-_Jan_Bibijan_Jan_Bibijan_na_Lunata_Prikazki_-_965-b.txt", "book_name": "Ян Бибиян - Ян Бибиян на Луната - Приказки", "author": "Елин Пелин"},
    {"input_file": "Elin_Pelin_-_Sychinenija_v_shest_toma_-_tom_peti_-_683-b.txt", "book_name": "Съчинения в шест тома — том пети", "author": "Елин Пелин"},
    {"input_file": "Elin_Pelin_-_Svatbata_na_Chervenushko_-_Vesela_prikazka_v_stihove_-_10192-b.txt", "book_name": "Сватбата на Червенушко", "author": "Елин Пелин"},
    {"input_file": "Elin_Pelin_-_Povesti_i_razkazi_-_419-b.txt", "book_name": "Повести и разкази", "author": "Елин Пелин"},
    {"input_file": "Elin_Pelin_-_Pod_manastirskata_loza_-_487-b.txt", "book_name": "Под манастирската лоза", "author": "Елин Пелин"},
    {"input_file": "Elin_Pelin_-_Pizho_i_Pendo_-_486-b.txt", "book_name": "Пижо и Пендо", "author": "Елин Пелин"},
    {"input_file": "Elin_Pelin_-_Gori_Tililejski_-_697-b.txt", "book_name": "Гори Тилилейски", "author": "Елин Пелин"},
    {"input_file": "Djado_baba_i_vnuche_-_Narodni_prikazki_-_6548-b.txt", "book_name": "Дядо, баба и внуче", "author": "Николай Хайтов"},
    {"input_file": "Angel_Karalijchev_-_Toshko_Afrikanski_i_prikazki_-_5919-b.txt", "book_name": "Тошко Африкански и приказки", "author": "Ангел Каралийчев"},
    {"input_file": "Angel_Karalijchev_-_Prikazki_i_razkazi_-_437-b.txt", "book_name": "Приказки и разкази", "author": "Ангел Каралийчев"},
    {"input_file": "Angel_Karalijchev_-_Lyv_bez_opashka_-_5372-b.txt", "book_name": "Лъв без опашка", "author": "Ангел Каралийчев"},
    {"input_file": "Angel_Karalijchev_-_Imalo_edno_vreme_-_Prikazki_-_332-b.txt", "book_name": "Имало едно време", "author": "Ангел Каралийчев"},
    {"input_file": "Angel_Karalijchev_-_Bylgarski_narodni_prikazki_-_Tom_2_-_8471-b.txt", "book_name": "Български народни приказки - Том 2", "author": "Ангел Каралийчев"},
    {"input_file": "Angel_Karalijchev_-_Bylgarski_narodni_prikazki_-_Tom_1_-_5904-b.txt", "book_name": "Български народни приказки - Том 1", "author": "Ангел Каралийчев"},
    {"input_file": "Angel_Karalijchev_-_Bylgarski_narodni_prikazki_-_3889-b.txt", "book_name": "Български народни приказки ак", "author": "Ангел Каралийчев"},
    {"input_file": "Angel_Karalijchev_-_Bylgarski_narodni_prikazki_-_652-b.txt", "book_name": "Български народни приказки ак2", "author": "Ангел Каралийчев"}
]

# Convert every configured book: resolve its path under the books directory,
# report what is being processed, then split it into the JSON output folder
directory = "books"
output_folder = 'output_json_files'

for entry in data:
    input_file = os.path.join(directory, entry["input_file"])
    book_name, author = entry["book_name"], entry["author"]

    # Report the file, book, and author that are about to be processed
    print(f"Processing file: {input_file}, Book: {book_name}, Author: {author}")

    split_file_to_json(
        input_file=input_file,
        fixed_text=book_name,
        author=author,
        output_folder=output_folder,
    )