In [1]:
# Unpack the public test sample into data/public_test.
# `-p` and `-o` keep this cell re-runnable: mkdir no longer fails when data/
# already exists, and unzip overwrites earlier extractions instead of stopping
# to ask for confirmation (which cannot be answered from a notebook cell).
!mkdir -p data
!unzip -o public_test_sample.zip -d data/public_test

Archive:  public_test_sample.zip
  inflating: data/public_test/json_labels/37303234365f3231.json  
  inflating: data/public_test/json_labels/37303234365f3431.json  
  inflating: data/public_test/json_labels/37303234365f3732.json  
  inflating: data/public_test/labels/37303234365f3231.txt  
  inflating: data/public_test/labels/37303234365f3431.txt  
 extracting: data/public_test/labels/37303234365f3732.txt  
  inflating: data/public_test/songs/37303234365f3231.wav  
  inflating: data/public_test/songs/37303234365f3431.wav  
  inflating: data/public_test/songs/37303234365f3732.wav  


In [2]:
import csv
import os
import shutil
import subprocess
from time import time

from demucs_utils.seperate_vocal import separate
from mfa.align import create_parser, run_align_corpus
from mfa.src.postprocessing import post_process_helper
from utils.process_label_to_txt import convert_txt



In [6]:
# --- Input data (extracted from public_test_sample.zip) ---
PUBLIC_TEST = "data/public_test"
SONG_RAW_DIR = f"{PUBLIC_TEST}/songs"          # raw .wav songs
RAW_LYRIC_JSON = f"{PUBLIC_TEST}/json_labels"  # per-song lyric .json files

# --- Intermediate / alignment outputs ---
SEPARATED_DATA_DIR = "data/output"                            # output root passed to separate()
PUBLIC_TEST_OUTPUT_RAW = f"{SEPARATED_DATA_DIR}/public_test_raw"  # aligner output directory
OUTPUT_DIR = f"{SEPARATED_DATA_DIR}/public_test_json"             # post-processed JSON results

# --- Aligner resources (assigned to the MFA argument namespace) ---
DICTIONARY_PATH = "mfa/models/vietnamese_mfa_dict_ver3.dict"
ACOUSTIC_MODEL_PATH = "mfa/models/mfa_vn_vocal_train_combine_train_public_test.zip"

# --- Submission artefacts ---
SUBMISSION_DIR = "./result"
OUTPUT_FILE = f"{SUBMISSION_DIR}/submission.zip"
OUTPUT_TIME_SUBMISSION = f"{SUBMISSION_DIR}/time_submission.csv"  # per-song runtime CSV
OUTPUT_JUPYTER_FILE = f"{SUBMISSION_DIR}/jupyter_submission"      # archive base name (no extension)

In [4]:
# Collect the songs to align. os.listdir() returns entries in arbitrary order
# and includes anything that happens to be in the folder (checkpoint
# directories, OS metadata files), so keep only .wav files and sort them to
# make the run deterministic.
test_cases = sorted(
    entry for entry in os.listdir(SONG_RAW_DIR)
    if entry.lower().endswith(".wav")
)
len(test_cases)

3

In [7]:
# Align every test song and record how long each one takes.
#
# Per song: separate the vocals, resample them to 16 kHz mono, write the lyric
# transcript next to the audio, run the aligner on that one-file corpus, then
# post-process the aligner output into the final JSON.
all_predicted_time = []

# The aligner arguments are built once; only the corpus directory changes per song.
parser = create_parser()
args, unknown = parser.parse_known_args(["align"])
args.dictionary_path = DICTIONARY_PATH
args.acoustic_model_path = ACOUSTIC_MODEL_PATH
args.output_directory = PUBLIC_TEST_OUTPUT_RAW

for file_name in test_cases:
    # Strip only the real extension. Slicing with [:-4] or replacing "wav"
    # anywhere in the name breaks on names that contain "wav" elsewhere or
    # have a different extension length.
    song_id = os.path.splitext(file_name)[0]

    t1 = time()

    # 1. Vocal separation; the stems are read back from
    #    <SEPARATED_DATA_DIR>/mdx_extra_q/<song_id>/.
    separate(os.path.join(SONG_RAW_DIR, file_name), SEPARATED_DATA_DIR)
    current_dir_separate = os.path.join(SEPARATED_DATA_DIR, "mdx_extra_q", song_id)
    separate_optimized_dir = os.path.join(current_dir_separate, song_id)
    os.makedirs(separate_optimized_dir, exist_ok=True)

    # 2. Resample the vocal stem to 16 kHz mono.
    #    subprocess.run blocks until ffmpeg has finished and raises on failure;
    #    os.popen returned immediately, so the aligner could start on a missing
    #    or half-written file. Passing an argument list (no shell) also keeps
    #    paths containing spaces or shell metacharacters intact.
    subprocess.run(
        [
            "ffmpeg",
            "-i", os.path.join(current_dir_separate, "vocals.wav"),
            "-ar", "16000",
            "-ac", "1",
            "-y", os.path.join(separate_optimized_dir, file_name),
        ],
        check=True,
    )

    # 3. Write the transcript for this song into the corpus directory.
    convert_txt(song_id + ".json", RAW_LYRIC_JSON, separate_optimized_dir)

    # 4. Run the aligner on the single-song corpus.
    args.corpus_directory = separate_optimized_dir
    run_align_corpus(args, unknown)

    # 5. Convert the raw aligner output into the submission JSON.
    post_process_helper.post_process_json(
        file_name=song_id,
        raw_output=PUBLIC_TEST_OUTPUT_RAW,
        raw_lyric=RAW_LYRIC_JSON,
        output_dir=OUTPUT_DIR,
    )

    t2 = time()
    predicted_time = int((t2 - t1) * 1000)  # wall-clock milliseconds for this song
    all_predicted_time.append((file_name, predicted_time))

os.makedirs(SUBMISSION_DIR, exist_ok=True)

# Save time submission.
# newline='' is required by the csv module so it controls line endings itself
# (otherwise an extra blank line appears after every row on Windows).
with open(OUTPUT_TIME_SUBMISSION, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["fname", "time (millisecond)"])
    writer.writerows(all_predicted_time)

# Save jupyter submission: zip the post-processed JSON directory.
# Kept as the last expression so the cell displays the archive path.
shutil.make_archive(OUTPUT_JUPYTER_FILE, 'zip', OUTPUT_DIR)

Separate audio: data/public_test/songs/37303234365f3231.wav
With command:  python -m demucs.separate -o data/output -n mdx_extra_q -j 2 --float32 --two-stems=vocals
Selected model is a bag of 4 models. You will see that many progress bars per track.
Separated tracks will be stored in /mnt/c/Users/Modern 14/projects/CTA-Zero9-ZAIC2022-Lyric-Alignment/data/output/mdx_extra_q
Separating track data/public_test/songs/37303234365f3231.wav


100%|██████████████████████████████████████████████████████████████████████████| 33.0/33.0 [00:18<00:00,  1.81seconds/s]
100%|██████████████████████████████████████████████████████████████████████████| 33.0/33.0 [00:16<00:00,  1.95seconds/s]
100%|██████████████████████████████████████████████████████████████████████████| 33.0/33.0 [00:14<00:00,  2.30seconds/s]
100%|██████████████████████████████████████████████████████████████████████████| 33.0/33.0 [00:15<00:00,  2.13seconds/s]
ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 10.4.0 (conda-forge gcc 10.4.0-18)
  configuration: --prefix=/home/vnk/miniconda3/envs/lyric-test --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1666357487580/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1666357487580/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1666357487580/_build_env/bin/x86_64-conda-linux-gnu-nm --

[32mINFO[0m - Setting up corpus information...
[32mINFO[0m - Loading corpus from source files...


100%|██████████| 1/1 [00:01<00:00,  1.56s/it]

[32mINFO[0m - Found 1 speaker across 1 file, average number of utterances per speaker: 1.0
[32mINFO[0m - Initializing multiprocessing jobs...





[32mINFO[0m - Creating corpus split for feature generation...
[32mINFO[0m - Generating base features (mfcc)...
[32mINFO[0m - Generating MFCCs...


  0%|          | 0/1 [00:01<?, ?it/s]

[32mINFO[0m - Calculating CMVN...





[32mINFO[0m - Creating corpus split with features...
[32mINFO[0m - Compiling training graphs...


100%|██████████| 1/1 [00:01<00:00,  1.85s/it]

[32mINFO[0m - Performing first-pass alignment...
[32mINFO[0m - Generating alignments...



100%|██████████| 1/1 [00:03<00:00,  3.40s/it]


[32mINFO[0m - Calculating fMLLR for speaker adaptation...


100%|██████████| 1/1 [00:01<00:00,  1.32s/it]

[32mINFO[0m - Performing second-pass alignment...
[32mINFO[0m - Generating alignments...



100%|██████████| 1/1 [00:02<00:00,  3.00s/it]


[32mINFO[0m - Exporting TextGrids to data/output/public_test_raw...
[32mINFO[0m - Collecting phone and word alignments from alignment lattices...


100%|██████████| 1/1 [00:01<00:00,  1.55s/it]
100%|██████████| 1/1 [00:00<00:00, 23.60it/s]

[32mINFO[0m - Finished exporting TextGrids to data/output/public_test_raw!
[32mINFO[0m - Done! Everything took 27.046233892440796 seconds





Separate audio: data/public_test/songs/37303234365f3431.wav
With command:  python -m demucs.separate -o data/output -n mdx_extra_q -j 2 --float32 --two-stems=vocals
Selected model is a bag of 4 models. You will see that many progress bars per track.
Separated tracks will be stored in /mnt/c/Users/Modern 14/projects/CTA-Zero9-ZAIC2022-Lyric-Alignment/data/output/mdx_extra_q
Separating track data/public_test/songs/37303234365f3431.wav


100%|██████████████████████████████████████████████████████████████████████████| 33.0/33.0 [00:28<00:00,  1.16seconds/s]
100%|██████████████████████████████████████████████████████████████████████████| 33.0/33.0 [00:31<00:00,  1.06seconds/s]
100%|██████████████████████████████████████████████████████████████████████████| 33.0/33.0 [00:54<00:00,  1.67s/seconds]
100%|██████████████████████████████████████████████████████████████████████████| 33.0/33.0 [00:52<00:00,  1.59s/seconds]
ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 10.4.0 (conda-forge gcc 10.4.0-18)
  configuration: --prefix=/home/vnk/miniconda3/envs/lyric-test --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1666357487580/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1666357487580/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1666357487580/_build_env/bin/x86_64-conda-linux-gnu-nm --

[32mINFO[0m - Setting up corpus information...
[32mINFO[0m - Loading corpus from source files...


100%|██████████| 1/1 [00:01<00:00,  1.35s/it]

[32mINFO[0m - Found 1 speaker across 1 file, average number of utterances per speaker: 1.0
[32mINFO[0m - Initializing multiprocessing jobs...
[32mINFO[0m - Creating corpus split for feature generation...
[32mINFO[0m - Generating base features (mfcc)...





[32mINFO[0m - Generating MFCCs...


  0%|          | 0/1 [00:02<?, ?it/s]


[32mINFO[0m - Calculating CMVN...
[32mINFO[0m - Creating corpus split with features...
[32mINFO[0m - Compiling training graphs...


100%|██████████| 1/1 [00:02<00:00,  2.39s/it]

[32mINFO[0m - Performing first-pass alignment...
[32mINFO[0m - Generating alignments...



100%|██████████| 1/1 [00:09<00:00,  9.00s/it]


[32mINFO[0m - Calculating fMLLR for speaker adaptation...


100%|██████████| 1/1 [00:01<00:00,  1.54s/it]

[32mINFO[0m - Performing second-pass alignment...
[32mINFO[0m - Generating alignments...



100%|██████████| 1/1 [00:04<00:00,  4.32s/it]


[32mINFO[0m - Exporting TextGrids to data/output/public_test_raw...
[32mINFO[0m - Collecting phone and word alignments from alignment lattices...


100%|██████████| 1/1 [00:01<00:00,  1.90s/it]
100%|██████████| 1/1 [00:00<00:00, 16.70it/s]

[32mINFO[0m - Finished exporting TextGrids to data/output/public_test_raw!
[32mINFO[0m - Done! Everything took 52.5624635219574 seconds





Separate audio: data/public_test/songs/37303234365f3732.wav
With command:  python -m demucs.separate -o data/output -n mdx_extra_q -j 2 --float32 --two-stems=vocals
Selected model is a bag of 4 models. You will see that many progress bars per track.
Separated tracks will be stored in /mnt/c/Users/Modern 14/projects/CTA-Zero9-ZAIC2022-Lyric-Alignment/data/output/mdx_extra_q
Separating track data/public_test/songs/37303234365f3732.wav


100%|██████████████████████████████████████████████████████████████████████████| 33.0/33.0 [00:12<00:00,  2.71seconds/s]
100%|██████████████████████████████████████████████████████████████████████████| 33.0/33.0 [00:11<00:00,  2.80seconds/s]
100%|██████████████████████████████████████████████████████████████████████████| 33.0/33.0 [00:20<00:00,  1.63seconds/s]
100%|██████████████████████████████████████████████████████████████████████████| 33.0/33.0 [00:17<00:00,  1.84seconds/s]
ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 10.4.0 (conda-forge gcc 10.4.0-18)
  configuration: --prefix=/home/vnk/miniconda3/envs/lyric-test --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1666357487580/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1666357487580/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1666357487580/_build_env/bin/x86_64-conda-linux-gnu-nm --

[32mINFO[0m - Setting up corpus information...
[32mINFO[0m - Loading corpus from source files...


100%|██████████| 1/1 [00:01<00:00,  1.06s/it]

[32mINFO[0m - Found 1 speaker across 1 file, average number of utterances per speaker: 1.0
[32mINFO[0m - Initializing multiprocessing jobs...
[32mINFO[0m - Creating corpus split for feature generation...
[32mINFO[0m - Generating base features (mfcc)...
[32mINFO[0m - Generating MFCCs...



  0%|          | 0/1 [00:01<?, ?it/s]

[32mINFO[0m - Calculating CMVN...
[32mINFO[0m - Creating corpus split with features...





[32mINFO[0m - Compiling training graphs...


100%|██████████| 1/1 [00:01<00:00,  1.53s/it]

[32mINFO[0m - Performing first-pass alignment...
[32mINFO[0m - Generating alignments...



100%|██████████| 1/1 [00:01<00:00,  1.96s/it]


[32mINFO[0m - Calculating fMLLR for speaker adaptation...


100%|██████████| 1/1 [00:01<00:00,  1.27s/it]

[32mINFO[0m - Performing second-pass alignment...
[32mINFO[0m - Generating alignments...



100%|██████████| 1/1 [00:01<00:00,  1.68s/it]


[32mINFO[0m - Exporting TextGrids to data/output/public_test_raw...
[32mINFO[0m - Collecting phone and word alignments from alignment lattices...


100%|██████████| 1/1 [00:01<00:00,  1.96s/it]
100%|██████████| 1/1 [00:00<00:00, 22.45it/s]

[32mINFO[0m - Finished exporting TextGrids to data/output/public_test_raw!
[32mINFO[0m - Done! Everything took 26.04665994644165 seconds





'/mnt/c/Users/Modern 14/projects/CTA-Zero9-ZAIC2022-Lyric-Alignment/result/jupyter_submission.zip'