##### The `os` module lets us navigate the operating system's file system, `pydub` lets us manipulate audio files, and `json` lets us read and write JSON files.


In [None]:
import os
from pydub import AudioSegment
import json

### How it works - Example

The `chapter_book_map` function reads two text files: one containing book information and another with chapter metadata. It creates a mapping between each chapter ID and its corresponding book title.

**Given:**
`books.txt`
```txt
1001 | The Adventures of Sherlock Holmes
1002 | Pride and Prejudice
```
`chapters.txt`
```text
128104 | 1272 | ... | ... | ... | 1001 | A Scandal in Bohemia
128105 | 1272 | ... | ... | ... | 1001 | The Red-Headed League
147956 | 1988 | ... | ... | ... | 1002 | Chapter 1
```


Calling `chapter_book_map("books.txt", "chapters.txt")` **would return:**

```text
{
    "128104": "The Adventures of Sherlock Holmes - (A Scandal in Bohemia)",
    "128105": "The Adventures of Sherlock Holmes - (The Red-Headed League)",
    "147956": "Pride and Prejudice - (Chapter 1)"
}
```

In [None]:
def chapter_book_map(books_path: str, chapters_path: str) -> dict:
    """ Build a lookup from chapter ID to a "book title - (chapter title)" label.

    Both input files are read line by line as "|"-separated records; every
    field is stripped of surrounding whitespace before it is used.

    Args:
        books_path (str): path to the books file. The first field of each
        record is the book ID and the second one is the book title.
        chapters_path (str): path to the chapters file. Fields 1, 6 and 7 of
        each record are the chapter ID, the book ID and the chapter title.

    Returns:
        dict: chapter IDs (str) mapped to "<book title> - (<chapter title>)".
        Chapters whose book ID is not in the books file use "Unknown Book".
    """
    titles_by_book = {}
    with open(books_path, 'r', encoding='utf-8') as books_file:
        for raw in books_file:
            fields = raw.strip().split("|")
            # A line without any "|" yields a single field: not a record
            if len(fields) < 2:
                continue
            titles_by_book[fields[0].strip()] = fields[1].strip()

    labels_by_chapter = {}
    with open(chapters_path, 'r', encoding='utf-8') as chapters_file:
        for raw in chapters_file:
            fields = [field.strip() for field in raw.split("|")]
            # Records need at least 7 fields to reach the chapter title
            if len(fields) < 7:
                continue
            chapter_id, book_id, chapter_title = fields[0], fields[5], fields[6]
            book_title = titles_by_book.get(book_id, "Unknown Book")
            labels_by_chapter[chapter_id] = f"{book_title} - ({chapter_title})"
            # Progress trace, one line per mapped chapter
            print(
                f"Chapter ID: {chapter_id}, Book ID: {book_id}, "
                f"Book Title: {book_title}, Chapter Title: {chapter_title}"
            )
    return labels_by_chapter

### How it works - Example
The `load_transcriptions` function reads a transcription text file 
where each line starts with an audio ID followed by its corresponding 
transcription. It returns a dictionary that maps each audio ID to its 
transcription text.

**Given:**
`transcriptions.txt`
```text
1272-128104-0000 THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG
1272-128104-0001 HELLO WORLD
1272-128104-0002 THIS IS A TEST
```

Calling `load_transcriptions("transcriptions.txt")` **would return:**
```text
{
    "1272-128104-0000": "THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG",
    "1272-128104-0001": "HELLO WORLD",
    "1272-128104-0002": "THIS IS A TEST"
}
```

In [None]:
def load_transcriptions(route_txt: str) -> dict:
    """ Read a transcription file into an {audio ID: text} dictionary.

    Each line is stripped and cut at its first space: what comes before is
    the audio ID and what comes after is the transcription. Blank lines and
    lines that contain no space after stripping are ignored.

    Args:
        route_txt (str): path to the transcription file

    Returns:
        dict: dictionary with audio IDs as keys and their corresponding text
        as values
    """
    by_audio_id = {}
    with open(route_txt, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            audio_id, separator, sentence = raw_line.strip().partition(" ")
            # An empty separator means the line had no "ID text" structure
            if not separator:
                continue
            by_audio_id[audio_id] = sentence
    return by_audio_id

### How it works - Example

The `process_chapter` function takes a chapter directory containing .flac audio 
files and a transcription file, and concatenates consecutive audio files into 
segments of up to 30 seconds. For each segment, it creates a .flac file and a 
corresponding .txt file with timestamped transcriptions. It also includes the 
book title (retrieved from the `chapter_to_book` dictionary) at the top of the 
transcription, and returns a dictionary describing the chapter and its segments.

Suppose that you have:
**A folder chap_001** with audio files like:
```text
1272-128104-0000.flac
1272-128104-0001.flac
1272-128104-0002.flac
```
A transcription file **chap_001.txt** like:
```text
1272-128104-0000 THE QUICK BROWN FOX
1272-128104-0001 JUMPS OVER THE LAZY DOG
1272-128104-0002 HELLO WORLD
```

A mapping dictionary:
```text
chapter_to_book = {
    "chap_001": "English Stories - (Chapter One)"
}
```
Calling `process_chapter("chap_001", "chap_001.txt", "processed_chapters", chapter_to_book)` **would produce:**

* A new folder `processed_chapters/chap_001` containing:
  - `segment_0.flac`
  - `segment_0.txt`

Where the `.txt` might look like:
```text
Book title: English Stories - (Chapter One)

0: 1272-128104-0000 [0.00s - 4.23s]: THE QUICK BROWN FOX
1: 1272-128104-0001 [4.23s - 7.89s]: JUMPS OVER THE LAZY DOG
2: 1272-128104-0002 [7.89s - 10.52s]: HELLO WORLD
```



In [None]:
def _write_segment(segment, text_lines: list, index: int,
                   dir_chapter: str, book_title: str) -> dict:
    """ Export one accumulated segment (audio + transcript) and describe it.

    Writes `segment_<index>.flac` and `segment_<index>.txt` inside
    `dir_chapter` and returns the dictionary stored in the chapter JSON.
    """
    name = f"segment_{index}"
    segment.export(os.path.join(dir_chapter, name + ".flac"), format="flac")

    with open(os.path.join(dir_chapter, name + ".txt"),
              "w", encoding="utf-8") as file:
        file.write(f"Book title: {book_title}\n\n")
        for i, line in enumerate(text_lines):
            file.write(f"{i}: {line}\n")

    return {
        "audio_file": name + ".flac",
        "duration": round(len(segment) / 1000, 2),
        "text_lines": text_lines,
        # Every line looks like "<id> [<start>s - <end>s]: <text>"; keep the
        # part after the first "]: " to rebuild the plain text
        "full_text": " ".join(line.split("]: ", 1)[1] for line in text_lines),
    }


def process_chapter(route_chapter: str,
                    route_txt: str,
                    dest_dir: str,
                    chapter_to_book: dict,
                    max_duration: int = 30 * 1000
                    ) -> dict:
    """ Group the audio files of a chapter into segments and write them out.

    The .flac files of `route_chapter` are processed in sorted order and
    concatenated until adding the next file would push the segment past
    `max_duration`; the segment is then exported and a new one is started.
    A single file longer than `max_duration` becomes a segment on its own.
    Every segment produces a .flac file and a .txt file (book title followed
    by one timestamped transcription line per source file) inside
    `<dest_dir>/<chapter directory name>`.

    Args:
        route_chapter (str): path to the chapter directory
        route_txt (str): path to the transcription file
        dest_dir (str): path to the destination directory for the segments
        chapter_to_book (dict): mapping of chapter IDs to book titles; the
        chapter directory name is used as the lookup key
        max_duration (int): maximum length of a segment in milliseconds
        (defaults to 30 seconds)

    Returns:
        dict: JSON structure containing the chapter ID, book name, and a list
        of segments with their audio files, durations, and transcriptions.
    """
    trans = load_transcriptions(route_txt)
    # sorting the audio files to ensure they are processed in order
    audios = sorted(f for f in os.listdir(route_chapter) if f.endswith(".flac"))

    # Create the directory for the chapter
    chapter_name = os.path.basename(route_chapter)
    dir_chapter = os.path.join(dest_dir, chapter_name)
    os.makedirs(dir_chapter, exist_ok=True)

    book_title = chapter_to_book.get(chapter_name, "Unknown Book")

    # Create the JSON structure for the chapter
    chapter_json = {
        "chapter_id": chapter_name,
        "book_name": book_title,
        "transcript": []
    }

    # Audio accumulated for the segment being built
    act_segment = AudioSegment.empty()
    # Transcription lines of the segment being built
    act_trans = []
    # Sum of the durations (ms) of the files already in the segment
    act_duration = 0
    count = 0

    for audio in audios:
        # Delete the extension in the file
        id_audio = os.path.splitext(audio)[0]
        actual_audio = AudioSegment.from_file(os.path.join(route_chapter, audio))
        duration = len(actual_audio)

        # Close the current segment when this file does not fit. The
        # `act_trans` check prevents exporting an empty segment when the
        # very first file of a segment is already longer than the limit.
        if act_trans and len(act_segment) + duration > max_duration:
            chapter_json["transcript"].append(
                _write_segment(act_segment, act_trans, count,
                               dir_chapter, book_title)
            )
            count += 1
            act_segment = AudioSegment.empty()
            act_trans = []
            act_duration = 0

        # Timestamps are relative to the beginning of the segment
        start = act_duration
        end = act_duration + duration

        # Look for the transcription in the dictionary
        trans_line = trans.get(id_audio, " ")
        act_trans.append(
            f"{id_audio} "
            f"[{start/1000:.2f}s - {end/1000:.2f}s]: "
            f"{trans_line}"
        )

        act_segment += actual_audio
        act_duration += duration

    # Export the last segment if it has any audio
    if len(act_segment) > 0:
        chapter_json["transcript"].append(
            _write_segment(act_segment, act_trans, count,
                           dir_chapter, book_title)
        )

    return chapter_json

### How it works - Example

The `process_all_chapters` function iterates over a root directory containing 
multiple chapter folders. For each chapter, it identifies the corresponding transcription file and processes the audio and transcription using 
the process_chapter function.

Given:

```text
root_dir/
├── 1272_128104/
│   ├── 0000.flac
│   ├── 0001.flac
│   └── 1272_128104.txt
├── 1272_128105/
│   ├── 0000.flac
│   └── 1272_128105.txt
```

Calling `process_all_chapters("root_dir", "processed", chapter_to_book)` **would result in:**

- The function going through each chapter folder inside root_dir
- It will find the .txt transcription file
- Then it will call process_chapter to generate audio segments and 
transcription .txt files
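- The resulting files will be stored in one sub-folder per chapter inside the destination directory (`processed/1272_128104`, `processed/1272_128105`, etc.), and a `data.json` file describing every processed chapter will be written to `processed/`.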

In [None]:
def process_all_chapters(root_dir: str, dest_dir: str, chapter_to_book: dict) -> None:
    """ Process every chapter directory found in `root_dir`.

    Each sub-directory that contains at least one .txt file is passed to
    `process_chapter`, using the alphabetically first .txt file as its
    transcription. The dictionaries returned for all chapters are written,
    as a list, to `<dest_dir>/data.json`.

    Chapters are visited in sorted order so the content of data.json is
    reproducible, and `dest_dir` is created when missing so the summary can
    be written even if no chapter was processed.

    Args:
        root_dir (str): path to the root directory containing chapter
        directories
        dest_dir (str): path to the destination directory for the segments
        chapter_to_book (dict): mapping of chapter IDs to book titles,
        forwarded to `process_chapter`
    """
    # Create a list to store the dictionaries for each chapter
    data = []
    for chapter in sorted(os.listdir(root_dir)):
        chapter_path = os.path.join(root_dir, chapter)
        if not os.path.isdir(chapter_path):
            continue
        txt_files = sorted(
            f for f in os.listdir(chapter_path) if f.endswith(".txt")
        )
        # Chapters without a transcription file are skipped
        if not txt_files:
            continue
        route_txt = os.path.join(chapter_path, txt_files[0])
        data.append(
            process_chapter(chapter_path, route_txt, dest_dir, chapter_to_book)
        )

    # Create the json file with the data list
    os.makedirs(dest_dir, exist_ok=True)
    with open(
        os.path.join(dest_dir, "data.json"),
        "w",
        encoding="utf-8"
    ) as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

In [98]:
def _round_robin_segments(chapters: list):
    """ Yield (chapter_id, segment) pairs, one segment per chapter per round.

    Round 0 yields the first segment of every chapter, round 1 the second
    one, and so on; chapters that have run out of segments are skipped.
    """
    rounds = max((len(chapter["transcript"]) for chapter in chapters), default=0)
    for cycle in range(rounds):
        for chapter in chapters:
            segments = chapter["transcript"]
            if cycle < len(segments):
                yield chapter["chapter_id"], segments[cycle]


def new_json_file(json_file: str, dest_dir: str, max_segments: int = 100) -> None:
    """ Write a reduced JSON file with segments sampled across all chapters.

    Segments are taken in round-robin order (first segment of every chapter,
    then the second one, ...) until `max_segments` have been collected or no
    segments are left. Chapters with fewer segments than the others are
    skipped once exhausted instead of raising an IndexError. The result is
    saved as `<dest_dir>/hundred_segments.json`, and one progress line is
    printed per selected segment.

    Args:
        json_file (str): path to a JSON file holding a list of chapters, each
        with a "chapter_id" and a "transcript" list of segments
        dest_dir (str): directory where hundred_segments.json is written
        max_segments (int): maximum number of segments to keep (default 100)
    """
    with open(json_file, "r", encoding="utf-8") as file:
        json_chapters = json.load(file)

    all_chapters = []
    for segment_id, transcript in _round_robin_segments(json_chapters):
        if len(all_chapters) >= max_segments:
            break
        all_chapters.append({
            "chapter_id": segment_id,
            "transcript": transcript,
        })
        print(f"number: {len(all_chapters)}, "
              f"chapter: {segment_id}, "
              f"transcript: {transcript}")

    os.makedirs(dest_dir, exist_ok=True)
    with open(
        os.path.join(dest_dir, "hundred_segments.json"),
        "w",
        encoding="utf-8"
    ) as file:
        json.dump(all_chapters, file, indent=4, ensure_ascii=False)

In [None]:
# Directory whose sub-folders are the chapters to process
root_dir = "datasets/LibriSpeech/chapter-groups/group_1"
# Directory that receives one folder of segments per chapter plus data.json
dest_dir = "datasets/LibriSpeech/audio_segments"
os.makedirs(dest_dir, exist_ok=True)

# Metadata files used to build the chapter ID -> "book - (chapter)" mapping
books_path = "datasets/LibriSpeech/BOOKS.TXT"
chapters_path = "datasets/LibriSpeech/CHAPTERS.TXT"

# Build the mapping (prints one line per chapter), then segment every chapter
chapter_to_book = chapter_book_map(books_path, chapters_path)
process_all_chapters(root_dir, dest_dir, chapter_to_book)

In [99]:
# data.json is the file written by process_all_chapters into dest_dir
json_route = "datasets/LibriSpeech/audio_segments/data.json"
# Writes hundred_segments.json into dest_dir (defined in the previous cell)
new_json_file(json_route, dest_dir)

number: 1, chapter: 105575, transcript: {'audio_file': 'segment_0.flac', 'duration': 24.18, 'text_lines': ['7976-105575-0000 [0.00s - 9.16s]: GRANT WAS ONLY A FEW MILES AWAY BUT ALTHOUGH COMMANDER IN CHIEF HE KNEW NOTHING OF THE HARDEST FOUGHT BATTLE OF THE CIVIL WAR UNTIL IT WAS OVER', '7976-105575-0001 [9.16s - 11.96s]: MY OWN REGIMENT WAS IN THE ADVANCE', '7976-105575-0002 [11.96s - 15.01s]: OUR BRIGADE WAS FEARFULLY OUTNUMBERED', "7976-105575-0003 [15.01s - 24.18s]: THERE WERE NO BREASTWORKS YET THAT ONE LITTLE BRIGADE OF HAMILTON'S DIVISION STOOD THERE IN THE OPEN AND REPULSED ASSAULT AFTER ASSAULT"], 'full_text': "GRANT WAS ONLY A FEW MILES AWAY BUT ALTHOUGH COMMANDER IN CHIEF HE KNEW NOTHING OF THE HARDEST FOUGHT BATTLE OF THE CIVIL WAR UNTIL IT WAS OVER MY OWN REGIMENT WAS IN THE ADVANCE OUR BRIGADE WAS FEARFULLY OUTNUMBERED THERE WERE NO BREASTWORKS YET THAT ONE LITTLE BRIGADE OF HAMILTON'S DIVISION STOOD THERE IN THE OPEN AND REPULSED ASSAULT AFTER ASSAULT"}
number: 2, chapte