## Install Conda
The runtime will restart after installation; run the remaining cells once the restart has finished.

In [2]:
# Install the condacolab helper package, then let it install conda into this runtime.
# The runtime restarts once the installation finishes, so run this cell on its own
# and continue with the remaining cells afterwards.
!pip install -q condacolab
import condacolab
condacolab.install()

[0m✨🍰✨ Everything looks OK!


### Setup dependencies

In [3]:
# Mount Google Drive at /gdrive; the dataset paths used by this notebook
# (e.g. /gdrive/MyDrive/subset) live below this mount point.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [1]:
# Fetch the FastSpeech2 sources into /content/FastSpeech2.
# The clone is skipped when the checkout already exists, so re-running this cell
# (e.g. after a runtime restart) does not fail with
# "destination path 'FastSpeech2' already exists".
%cd /content/
!test -d FastSpeech2 || git clone https://github.com/ming024/FastSpeech2

/content
Cloning into 'FastSpeech2'...
remote: Enumerating objects: 991, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 991 (delta 10), reused 7 (delta 7), pack-reused 978[K
Receiving objects: 100% (991/991), 330.31 MiB | 18.59 MiB/s, done.
Resolving deltas: 100% (175/175), done.


In [None]:
# Python packages used by the cells below, pinned to fixed versions.
!pip install pyyaml==6.0 unidecode==1.3.6 tgt==1.4.4 pyworld==0.2.10
!pip install librosa==0.9.2 numpy==1.24.3 numba==0.57.0

# If you run into issues, try re-running the installs above with --force-reinstall.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting librosa
  Using cached librosa-0.10.0.post2-py3-none-any.whl (253 kB)
Collecting numpy
  Using cached numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Collecting numba
  Downloading numba-0.57.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-extensions>=4.1.1
  Using cached typing_extensions-4.5.0-py3-none-any.whl (27 kB)
Collecting msgpack>=1.0
  Downloading msgpack-1.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.8/316.8 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib>=

### Dataset requirements and settings
Each audio file (`.wav`) must have an accompanying text file with the same base name that contains its transcription.

In [2]:
import os

# The input dataset
# dataset_name: names the config directory and the preprocessed-data folder.
# dataset_path: folder that holds the input .wav files and their transcript files.
# speaker_name: names the per-speaker folders and is written to model.yaml as the vocoder speaker.
dataset_name = 'MyDataset' #@param {type:'string'}
dataset_path = '/gdrive/MyDrive/subset' #@param {type:'string'}
speaker_name = 'universal' #@param {type:'string'}

# The output folder for processed data
output_path = '/content/output_dataset' #@param {type:'string'}

# MFA settings
# text_file_extension: suffix used to find the transcript files in dataset_path.
# corpus_name: file name of the generated corpus file.
# lexicon_path: written to preprocess.yaml as the lexicon path.
# allow_overwrite_existing_corpus: when False, building the corpus fails if the corpus file already exists.
text_file_extension = '.lab' #@param ['.txt','.lab']
corpus_name = 'metadata.csv' #@param {type:'string'}
lexicon_path = '/content/FastSpeech2/lexicon/librispeech-lexicon.txt' #@param {type:'string'}
allow_overwrite_existing_corpus = True #@param {type:'boolean'}

# Acoustic model name and dictionary file passed to `mfa align`.
acoustic_model = 'english_us_arpa' #@param {type:'string'}
dictionary_file = '/content/FastSpeech2/lexicon/librispeech-lexicon.txt' #@param {type:'string'}

# Locations derived from the settings (raw audio, preprocessed data, MFA output)
raw_data_path = os.path.join(output_path, 'raw_data')
raw_data_speaker_path = os.path.join(raw_data_path, speaker_name)

preprocessed_data_path = os.path.join(output_path, 'preprocessed_data')
preprocessed_data_speaker_path = os.path.join(preprocessed_data_path, dataset_name)

# The corpus file is stored next to the speaker's raw data
corpus_path = raw_data_speaker_path
corpus_file_path = os.path.join(corpus_path, corpus_name)

# MFA writes into the mfa_user home; the TextGrids are copied to textgrid_dir afterwards
mfa_output_path = os.path.join('/home/mfa_user', dataset_name, 'TextGrid')
textgrid_dir = os.path.join(preprocessed_data_speaker_path, 'TextGrid', speaker_name)

# Create directory structure.
# os.makedirs is used instead of `%mkdir -p $var` because the shell form splits
# a path that contains spaces into several arguments and creates the wrong
# directories; exist_ok=True keeps the cell re-runnable.
for directory in (output_path, corpus_path, preprocessed_data_speaker_path,
                  raw_data_speaker_path, textgrid_dir):
    os.makedirs(directory, exist_ok=True)

### Install MFA

In [4]:
# Install the Montreal Forced Aligner from conda-forge.
# -y answers conda's "Proceed ([y]/n)?" prompt automatically, so the cell does
# not sit waiting for interactive confirmation.
!conda install -y -c conda-forge montreal-forced-aligner

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | failed

CondaError: KeyboardInterrupt



In [None]:
# Excellent MFA tutorial: https://eleanorchodroff.com/mfa_tutorial.html
# Names of the pretrained MFA models to download.
acoustic_model = 'english_us_arpa' #@param
dictionary_model = 'english_us_arpa' #@param

# MFA commands are run as an unprivileged user.
# The account is created only when it does not exist yet, so the cell can be
# re-run without a "user already exists" error. -m asks useradd to create the
# home directory: the alignment output is later written below /home/mfa_user.
!id -u mfa_user >/dev/null 2>&1 || useradd -m mfa_user
# Sanity check that commands can be executed as mfa_user
!su - mfa_user -c "echo hello as mfa_user"

!su - mfa_user -c "mfa version"
# Download the pretrained acoustic model and pronunciation dictionary
!su - mfa_user -c "mfa model download acoustic $acoustic_model"
!su - mfa_user -c "mfa model download dictionary $dictionary_model"

### Debug utilities
Copy files

In [None]:
import shutil
import os

src = '/gdrive/MyDrive/dataset/raw_data/universal' #@param {type:'string'}
dest = '/gdrive/MyDrive/subset/'

# exist_ok=True keeps the cell re-runnable; `%mkdir $dest` reported
# "File exists" on every run after the first.
os.makedirs(dest, exist_ok=True)

# Copy utterances p303_001 .. p303_009 (audio first, then transcript) into dest.
for index in range(1, 10):
  fname = f'p303_{index:03d}'
  for extension in ('.wav', '.lab'):
    shutil.copy(os.path.join(src, fname + extension), dest)
%ls $dest
  


mkdir: cannot create directory ‘/gdrive/MyDrive/subset/’: File exists
p303_001.lab  p303_003.lab  p303_005.lab  p303_007.lab  p303_009.lab
p303_001.wav  p303_003.wav  p303_005.wav  p303_007.wav  p303_009.wav
p303_002.lab  p303_004.lab  p303_006.lab  p303_008.lab
p303_002.wav  p303_004.wav  p303_006.wav  p303_008.wav


# Preprocess Data

### Make metadata.csv corpus
The corpus file is saved to `output_path/raw_data/speaker_name/`.

In [45]:
import os

# Refuse to replace a corpus file from an earlier run unless overwriting was enabled
assert allow_overwrite_existing_corpus or not os.path.exists(corpus_file_path), \
  'Corpus file already exists, enable `allow_overwrite_existing_corpus` to disable this behavior.'


def concatenate_file_contents(filename):
    """Build one corpus line from a transcript file.

    Args:
        filename: Path of the transcript text file to read.

    Returns:
        A string of the form ``"<base name>|<text>|<text>\\r\\n"``, where
        ``<base name>`` is the file name without its extension and ``<text>``
        is the file content with surrounding whitespace stripped.
    """
    # splitext drops only the trailing extension. The previous str.replace()
    # call also deleted the extension text when it appeared in the middle of
    # the file name (e.g. "my.label_01.lab" became "myel_01").
    filename_no_ext = os.path.splitext(os.path.basename(filename))[0]
    # Read as UTF-8 explicitly: the corpus built from these lines is read
    # back as UTF-8, so do not depend on the platform default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        contents = file.read().strip()
    return f"{filename_no_ext}|{contents}|{contents}\r\n"


def process_files_in_path(text_files_path, output_corpus_file_path):
    """Write one corpus line per transcript file found in a directory.

    Args:
        text_files_path: Directory scanned (non-recursively) for files whose
            names end with the notebook setting ``text_file_extension``.
        output_corpus_file_path: File that receives the corpus lines. It is
            created or truncated even when no transcript file is found.
    """
    # List the directory before touching the output file, so a missing or
    # unreadable input directory no longer leaves a truncated corpus behind.
    # Sorted because os.listdir() returns entries in arbitrary order and the
    # corpus should be reproducible between runs.
    txt_files = sorted(
        name for name in os.listdir(text_files_path)
        if name.endswith(text_file_extension)
    )
    if not txt_files:
        print(f'No text files with extension {text_file_extension} found in {text_files_path}, try changing `text_file_extension` in settings')
    # The corpus is read back as UTF-8, so write it with the same encoding.
    with open(output_corpus_file_path, 'w', encoding='utf-8') as f:
        for name in txt_files:
            f.write(concatenate_file_contents(os.path.join(text_files_path, name)))
# Report the locations involved, then build the corpus file
print(f'Dataset path: {dataset_path}', f'Corpus path: {corpus_file_path}', sep='\n')
process_files_in_path(dataset_path, corpus_file_path)
print('Done')

Dataset path: /gdrive/MyDrive/subset
Corpus path: /content/output_dataset/raw_data/universal/metadata.csv
Done


## Make TextGrids

Create new configuration files

In [46]:
import os
import yaml
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

# Per-dataset config directory inside the FastSpeech2 checkout, plus a folder
# below output_path that receives a copy of the configs.
config_dir = f'/content/FastSpeech2/config/{dataset_name}'
copied_config_dir = os.path.join(output_path, 'configs')

# -p on both calls keeps the cell re-runnable; without it the second mkdir
# reported "File exists" on every run after the first.
!mkdir -p $config_dir
!mkdir -p $copied_config_dir
# Start from the LJSpeech configs found in the FastSpeech2 checkout
!cp -r /content/FastSpeech2/config/LJSpeech/* $config_dir


def get_yaml_path(name):
  """Return the path of `<name>.yaml` inside the notebook's `config_dir`."""
  yaml_filename = name + '.yaml'
  return os.path.join(config_dir, yaml_filename)


def get_yaml_contents(name):
  """Read the config file `<name>.yaml` and return its parsed YAML content."""
  yaml_path = get_yaml_path(name)
  with open(yaml_path, 'r') as f:
    raw_text = f.read()
  return yaml.safe_load(raw_text)
  

def write_yaml(name, contents):
  """Overwrite the config file `<name>.yaml` with `contents` serialized by `yaml.dump`."""
  target_path = get_yaml_path(name)
  with open(target_path, 'w') as f:
    serialized = yaml.dump(contents)
    f.write(serialized)


# model.yaml: point the vocoder at the configured speaker
model = get_yaml_contents('model')
model['vocoder'].update({'speaker': speaker_name})
write_yaml('model', model)

# preprocess.yaml: dataset name, data locations and the text length limit
pp = get_yaml_contents('preprocess')
pp['dataset'] = dataset_name
pp['path'].update({
    'corpus_path': corpus_path,
    'lexicon_path': lexicon_path,
    'raw_path': raw_data_path,
    'preprocessed_path': preprocessed_data_speaker_path,
})
# Needed for training EfficientSpeech models
pp['preprocessing']['text'].update({'max_length': 4096})
write_yaml('preprocess', pp)

# train.yaml: per-dataset checkpoint, log and result folders
tr = get_yaml_contents('train')
tr['path'].update({
    'ckpt_path': f'./output/ckpt/{dataset_name}',
    'log_path': f'./output/log/{dataset_name}',
    'result_path': f'./output/result/{dataset_name}',
})
write_yaml('train', tr)

# Keep a copy of the generated configs together with the output data
print(f'Wrote configs in {config_dir}, copying to {copied_config_dir}')
!cp -r $config_dir $copied_config_dir
print('Done')

mkdir: cannot create directory ‘/content/output_dataset/configs’: File exists
Wrote configs in /content/FastSpeech2/config/MyDataset, copying to /content/output_dataset/configs
Done


## Prepare align
The following code is modified from https://github.com/ming024/FastSpeech2/blob/master/preprocessor/ljspeech.py


In [48]:
import os
import yaml
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

import librosa
import numpy as np
from scipy.io import wavfile
from tqdm import tqdm

# Workaround for importing text
import sys
sys.path.append('/content/FastSpeech2')
from text import _clean_text


def _scale_to_int16(wav, max_wav_value):
    """Peak-normalize a float waveform to `max_wav_value` and return int16 samples."""
    peak = np.max(np.abs(wav)) if wav.size else 0.0
    # An empty or all-zero (silent) signal has no peak to normalize against;
    # dividing by it would fill the array with NaN values.
    if peak > 0:
        wav = wav / peak * max_wav_value
    # Scaling the peak to max_wav_value can land just outside the int16 range
    # (e.g. +32768 when max_wav_value is 32768), which would wrap around on
    # conversion, so clip to the representable range first.
    int16_range = np.iinfo(np.int16)
    return np.clip(wav, int16_range.min, int16_range.max).astype(np.int16)


def prepare_align(config):
    """Write a normalized .wav and a cleaned .lab file for every corpus entry.

    Reads the notebook globals `corpus_file_path` (lines of the form
    ``name|text|text``), `dataset_path` (folder with the source .wav files) and
    `raw_data_speaker_path` (output folder).

    Args:
        config: Parsed preprocess config. The values read are
            ``config["preprocessing"]["audio"]["sampling_rate"]``,
            ``config["preprocessing"]["audio"]["max_wav_value"]`` and
            ``config["preprocessing"]["text"]["text_cleaners"]``.

    Corpus lines with fewer than three ``|``-separated fields and entries whose
    .wav file is missing are skipped; non-blank skipped entries are reported.
    """
    sampling_rate = config["preprocessing"]["audio"]["sampling_rate"]
    max_wav_value = config["preprocessing"]["audio"]["max_wav_value"]
    cleaners = config["preprocessing"]["text"]["text_cleaners"]

    # Created once up front instead of once per utterance
    os.makedirs(raw_data_speaker_path, exist_ok=True)

    with open(corpus_file_path, encoding="utf-8") as f:
        for line in tqdm(f):
            parts = line.strip().split("|")
            if len(parts) < 3:
                # Blank or malformed line: there is no third field to read
                if line.strip():
                    tqdm.write("Skipping malformed corpus line: {!r}".format(line))
                continue
            base_name = parts[0]
            text = _clean_text(parts[2], cleaners)

            wav_path = os.path.join(dataset_path, "{}.wav".format(base_name))
            if not os.path.exists(wav_path):
                tqdm.write("Skipping {}: {} not found".format(base_name, wav_path))
                continue

            # librosa resamples to the configured sampling rate while loading
            wav, _ = librosa.load(wav_path, sr=sampling_rate)
            wavfile.write(
                os.path.join(raw_data_speaker_path, "{}.wav".format(base_name)),
                sampling_rate,
                _scale_to_int16(wav, max_wav_value),
            )
            # Written as UTF-8 to match the encoding the corpus was read with
            with open(
                os.path.join(raw_data_speaker_path, "{}.lab".format(base_name)),
                "w",
                encoding="utf-8",
            ) as f1:
                f1.write(text)


# Reload preprocess.yaml from the dataset's config directory and
# run the alignment preparation with it.
config = get_yaml_contents('preprocess')
prepare_align(config)
print('Prepare align done')

9it [00:02,  3.37it/s]

Prepare align done





### Run Forced alignment


In [57]:
# MFA writes its TextGrid files to mfa_output_path (created as mfa_user);
# they are copied to textgrid_dir at the end of this cell.
!su - mfa_user -c "mkdir -p $mfa_output_path"
%mkdir -p $textgrid_dir

# Make mfa_user the owner of the TextGrid directory
!chown mfa_user $textgrid_dir

# Command line options passed to `mfa align`
# --clean: cleans output dir for subsequent runs (if off,
#             does not overwrite old data)
# --single_speaker: multiprocessing for only one speaker
# (A `-m fast` option was also tried and noted as not working; it is not passed.)
mfa_cmd_opts = f'--clean --single_speaker'
# Positional arguments: corpus directory, dictionary file, acoustic model, output directory
align_cmd_opts = f'{corpus_path} {dictionary_file} {acoustic_model} {mfa_output_path}'

# Command must be run as unprivileged user
!echo Running mfa align with arguments: $mfa_cmd_opts $align_cmd_opts
!su - mfa_user -c "mfa align $mfa_cmd_opts $align_cmd_opts"

# Copy the alignment results from the MFA output folder into textgrid_dir
!echo Copying TextGrid files to $textgrid_dir
!cp $mfa_output_path/*.* $textgrid_dir 

Running mfa align with arguments: --clean --single_speaker /content/output_dataset/raw_data/universal /content/FastSpeech2/lexicon/librispeech-lexicon.txt english_us_arpa /home/mfa_user/MyDataset/TextGrid
waiting for server to start.... done
server started
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m9[0m files, average number of utterances per      
[2;36m [0m         speaker: [1;36m9.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m[32mINFO    [0m Normalizing 

## Preprocess textgrids

In [58]:
# If this cell raises errors, try installing an older version of librosa
# (the dependency cell of this notebook pins librosa==0.9.2).

# Make the FastSpeech2 checkout importable so its `preprocessor` package can be loaded
import sys
sys.path.append('/content/FastSpeech2')
from preprocessor.preprocessor import Preprocessor

# Run the FastSpeech2 preprocessor with the dataset's preprocess.yaml
config = get_yaml_contents('preprocess')
preprocessor = Preprocessor(config)
preprocessor.build_from_path()

  fft_window = pad_center(fft_window, filter_length)
  mel_basis = librosa_mel_fn(


Processing Data ...


  0%|          | 0/1 [00:00<?, ?it/s]


RuntimeError: ignored