<a href="https://colab.research.google.com/github/wasimmadha/dubbing-project-research/blob/main/librspeech_dataset_eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!tar -xzf '/content/drive/MyDrive/Dubbing Project/libspeech/dev-clean.tar.gz' -C /content/

In [None]:
!tar -xzf '/content/drive/MyDrive/Dubbing Project/libspeech/train-clean-100.tar.gz' -C /content/

In [None]:
!pip install pronouncing

Collecting pronouncing
  Downloading pronouncing-0.2.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cmudict>=0.4.0 (from pronouncing)
  Downloading cmudict-1.0.18-py3-none-any.whl (939 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pronouncing
  Building wheel for pronouncing (setup.py) ... [?25l[?25hdone
  Created wheel for pronouncing: filename=pronouncing-0.2.0-py2.py3-none-any.whl size=6234 sha256=596b9ee0187ae997a4668cf9071b3f9ab6e45891797b0062ef68ed2030dc8a85
  Stored in directory: /root/.cache/pip/wheels/05/f6/1d/599c67da1fa48c086d8c49e8fc6bd5f05bc9fa66fb04bed5db
Successfully built pronouncing
Installing collected packages: cmudict, pronouncing
Successfully installed cmudict-1.0.18 pronouncing-0.2.0


In [None]:
import re
import os

import torchaudio
import pronouncing


In [None]:
def split_filename_and_text(input_string):
    parts = input_string.split(None, 1)

    if len(parts) == 2:
        filename, text = parts
        return filename, text
    else:
        print("Error: Unable to split filename and text.")
        return None, None


In [None]:
def convert_text_to_phonemes(text):
    result = []
    for word in text.split():
        # Get phonemes for the word
        phones = pronouncing.phones_for_word(word)

        if len(phones) > 1:
            phones = [phones[1]]
        # Append the phonemes to the result
        result.extend(phones)

    return result


In [None]:
text = "foreign french"
convert_text_to_phonemes(text)

['F AA1 R AH0 N', 'F R EH1 N CH']

In [None]:
def get_mapped_librspeech_data(main_folder_path):
    output_dict = {}

    for main_speaker_dir in os.listdir(main_folder_path):
        main_speaker_path = os.path.join(main_folder_path, main_speaker_dir)
        for sub_speaker_dir in os.listdir(main_speaker_path):
            sub_speaker_path = os.path.join(main_speaker_path, sub_speaker_dir)
            files_list = os.listdir(sub_speaker_path)
            txt_file = [file for file in files_list if file.endswith(".txt")][0]
            txt_file = os.path.join(sub_speaker_path, txt_file)
            with open(txt_file, 'r', encoding='utf-8') as file:
                lines = file.readlines()

            for line in lines:
              filename, text = split_filename_and_text(line)

              file_path = os.path.join(sub_speaker_path, filename+'.flac')

              waveform, sample_rate = torchaudio.load(file_path)

              phonemes_result = convert_text_to_phonemes(text)
              phonemes = ' '.join(phonemes_result)

              if '# foreign french' in phonemes:
                  phonemes = phonemes.replace('# foreign french', '')

              if 'foreign' in phonemes or 'french' in phonemes:
                print(text)
                print(phonemes_result)

              output_dict[filename] = {
                  "file_path": file_path,
                  "text": text,
                  "phonemes": phonemes.split()
              }


    return output_dict



        # for files in os.listdir(sub_speaker_path):
        #   print(files)

In [None]:
train_folder_path = '/content/LibriSpeech/train-clean-100'
valid_folder_path = '/content/LibriSpeech/dev-clean'

train_output_dict = get_mapped_librspeech_data(train_folder_path)
valid_output_dict = get_mapped_librspeech_data(valid_folder_path)

In [None]:
with open('train_output_dict.json', 'w') as file:
    json.dump(train_output_dict, file)

with open('valid_output_dict.json', 'w') as file:
    json.dump(valid_output_dict, file)

In [None]:
len(list(train_output_dict.keys()))

28539

In [None]:
train_total_phonemes = []
for key in list(train_output_dict.keys()):
    phonemes = train_output_dict[key]["phonemes"]
    if 'foreign' in phonemes:
      print(key, train_output_dict[key])
    train_total_phonemes.extend(phonemes)

train_total_phonemes = set(train_total_phonemes)
print("total train phonemes: ", len(train_total_phonemes), train_total_phonemes)

total train phonemes:  72 {'AA2', 'EY1', 'CH', 'ZH', 'L', 'DH', 'AA1', 'W', 'SH', 'AY1', 'IY0', 'UW0', 'OY0', 'UH2', 'ER0', 'D', 'Y', 'UH1', 'EY2', 'K', 'S', 'AH2', 'OY2', 'HH', 'AO2', 'IY1', 'AY2', 'OW0', 'IH0', 'T', 'IH1', 'AW2', 'EH0', 'org,', 'AO1', 'EH1', 'EH2', 'R', 'NG', '#', 'UW2', 'IH2', 'EY0', 'UH0', 'JH', 'ER2', 'V', 'IY2', 'G', 'M', 'AH1', 'AE2', 'OY1', 'AE1', 'AW0', 'OW2', 'N', 'B', 'irish', 'AE0', 'AA0', 'AY0', 'OW1', 'F', 'P', 'TH', 'AH0', 'UW1', 'AW1', 'ER1', 'AO0', 'Z'}


In [None]:
valid_total_phonemes = []
for key in list(valid_output_dict.keys()):
    phonemes = valid_output_dict[key]["phonemes"]
    valid_total_phonemes.extend(phonemes)

valid_total_phonemes = set(valid_total_phonemes)
print("total train phonemes: ", len(valid_total_phonemes), valid_total_phonemes)

total train phonemes:  71 {'AA2', 'EY1', 'CH', 'ZH', 'L', 'DH', 'IY0', 'AY1', 'SH', 'W', 'AA1', 'UW0', 'UH2', 'ER0', 'Y', 'D', 'UH1', 'EY2', 'K', 'S', 'AH2', 'OY2', 'HH', 'AO2', 'IY1', 'AY2', 'OW0', 'IH0', 'T', 'IH1', 'AW2', 'EH0', 'org,', 'AO1', 'EH1', 'R', 'NG', 'EH2', '#', 'UW2', 'IH2', 'EY0', 'UH0', 'JH', 'ER2', 'V', 'IY2', 'G', 'M', 'AH1', 'AE2', 'OY1', 'AE1', 'AW0', 'OW2', 'B', 'N', 'irish', 'AE0', 'AA0', 'AY0', 'OW1', 'F', 'P', 'UW1', 'AH0', 'AW1', 'ER1', 'TH', 'AO0', 'Z'}


In [None]:
'foreign' in train_total_phonemes

False

In [None]:
train_total_phonemes

In [None]:
total_phonemes = list(train_total_phonemes.union(valid_total_phonemes))
len(list(total_phonemes)), list(total_phonemes)

In [None]:
reverse_phoneme_dict = {index: phoneme for index, phoneme in enumerate(total_phonemes)}
phoneme_dict = {phoneme: index for index, phoneme in enumerate(total_phonemes)}


In [None]:
reverse_phoneme_dict
phoneme_dict

In [None]:
import json

with open('phoneme_dict.json', 'w') as file:
    json.dump(phoneme_dict, file)

with open('reverse_phoneme_dict.json', 'w') as file:
    json.dump(reverse_phoneme_dict, file)


In [None]:
with open('phoneme_dict.json', 'r') as file:
    loaded_phoneme_dict = json.load(file)


In [None]:
loaded_phoneme_dict

{'AA2': 0,
 'CH': 1,
 'ZH': 2,
 'DH': 3,
 'UW0': 4,
 'W': 5,
 'IY0': 6,
 'OY0': 7,
 'UH2': 8,
 'D': 9,
 'Y': 10,
 'Z': 11,
 'OY2': 12,
 'HH': 13,
 'AY2': 14,
 'IH0': 15,
 'EH1': 16,
 'UW2': 17,
 'UH0': 18,
 'JH': 19,
 'ER2': 20,
 'V': 21,
 'IY2': 22,
 'G': 23,
 'M': 24,
 'AH1': 25,
 'irish': 26,
 'AA0': 27,
 'AY0': 28,
 'OW1': 29,
 'ER1': 30,
 'AO0': 31,
 'EY1': 32,
 'L': 33,
 'AA1': 34,
 'AY1': 35,
 'SH': 36,
 'ER0': 37,
 'AH0': 38,
 'UH1': 39,
 'K': 40,
 'S': 41,
 'AH2': 42,
 'AO2': 43,
 'IY1': 44,
 'OW0': 45,
 'T': 46,
 'IH1': 47,
 'AW2': 48,
 'EH0': 49,
 'org,': 50,
 'AO1': 51,
 'NG': 52,
 'EH2': 53,
 'R': 54,
 '#': 55,
 'EY0': 56,
 'AW0': 57,
 'AE2': 58,
 'AE1': 59,
 'OW2': 60,
 'N': 61,
 'B': 62,
 'AE0': 63,
 'F': 64,
 'AW1': 65,
 'P': 66,
 'TH': 67,
 'EY2': 68,
 'IH2': 69,
 'UW1': 70,
 'OY1': 71}