In [None]:
# load yaml 
import yaml

config_path = 'Configs/config.yml'
# Read the YAML configuration inside a context manager so the file handle is
# closed as soon as parsing is done (the bare open() call never closed it).
with open(config_path, encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)
# Show both access styles side by side: direct indexing and .get() with a fallback of 10.
print("batch size in config is {}; config.get('batch_size', 10) returns a batch size with {}.".format(
    config['batch_size'], config.get('batch_size', 10)
))

## Data Preprocess

1. convert video to audio
2. cut the audio into 2-second segments, and check their attributes
3. check the availability
   1. Python Script
   2. human filter
4. select 20 minutes of audio for each speaker
5. add these audio files into the train_list.txt and val_list.txt

In [None]:
# get the demo audio format

import sndhdr, os

# Walk every speaker folder of the demo corpus and print the header
# information that sndhdr reports for each file found inside it.
audio_path = os.path.join(os.getcwd(), 'Demo', 'VCTK-corpus')
for speaker_dir in os.listdir(audio_path):
    speaker_path = os.path.join(audio_path, speaker_dir)
    if not os.path.isdir(speaker_path):
        continue  # only speaker folders are inspected
    for file_name in os.listdir(speaker_path):
        file_path = os.path.join(speaker_path, file_name)
        header = sndhdr.what(file_path)
        print('{} {}'.format(file_name, header))

In [None]:
# rename the videos

import os 

# Rename every not-yet-renamed video of a speaker to '<speaker>_<n>.mp4'.
# A file counts as already renamed when its name contains an underscore.
video_path = os.path.join(os.getcwd(), 'Video')
for speaker in sorted(os.listdir(video_path)):
    speaker_path = os.path.join(video_path, speaker)
    if not os.path.isdir(speaker_path):
        continue
    cnt = 0
    # Sorted so that the numbering is the same on every run and every machine.
    for file in sorted(os.listdir(speaker_path)):
        input_file = os.path.join(speaker_path, file)
        # Leave hidden files, sub-folders and already renamed videos alone.
        if file.startswith('.') or '_' in file or not os.path.isfile(input_file):
            continue
        # Advance to the next index that is still free: os.rename can silently
        # replace an existing destination, which would destroy a video that
        # was numbered during an earlier run of this cell.
        while True:
            cnt += 1
            output_name = '{}_{}.mp4'.format(speaker, cnt)
            output_file = os.path.join(speaker_path, output_name)
            if not os.path.exists(output_file):
                break
        os.rename(input_file, output_file)

In [None]:
# convert video to audio

# install dependency
# REF: https://blog.fat-nerds.com/dot-nerd/install-ffmpeg-mac-os-in-chinese/
# %brew install ffmpeg

import os, subprocess, sndhdr

# REF: https://ffmpeg.org/ffmpeg.html#Audio-Options
###
# :-i: input file
# :-ac: Set the number of audio channels.
# :-ar: Set the audio sampling frequency.
# :-sample_fmt: Set the audio sample format.
# :-c:a: select an audio encoder
# :-y: Overwrite output files without asking
###

# Convert every video under Video/<speaker>/ into a mono, 24 kHz, 16-bit PCM
# wav file stored flat in Data/raw, then verify the header of each result.
video_path = os.path.join(os.getcwd(), 'Video')
raw_dir = os.path.join(os.getcwd(), 'Data', 'raw')
# Make sure the output folder exists before ffmpeg is asked to write into it.
os.makedirs(raw_dir, exist_ok=True)

for speaker in sorted(os.listdir(video_path)):
    speaker_path = os.path.join(video_path, speaker)
    if not os.path.isdir(speaker_path):
        continue
    for file in sorted(os.listdir(speaker_path)):
        if file.endswith('.DS_Store'):
            continue
        input_file = os.path.join(speaker_path, file)
        # output file name: everything before the first dot, plus '.wav'
        output_name = file.split('.')[0] + '.wav'
        # output file path
        output_file = os.path.join(raw_dir, output_name)
        # Pass the arguments as a list (no shell) so that paths containing
        # spaces or shell metacharacters reach ffmpeg unchanged.
        convert_cmd = [
            'ffmpeg', '-i', input_file, '-y',
            '-ac', '1', '-ar', '24000', '-c:a', 'pcm_s16le',
            output_file,
        ]
        subprocess.run(convert_cmd, check=True)
        # Inspect the header once; sndhdr.what returns None for files it cannot identify.
        header = sndhdr.what(output_file)
        assert header is not None, f'{file} -> {output_name}: not a recognisable audio file'
        assert header[0] == 'wav', f'{file} -> {output_name}: filetype is {header[0]!r}'
        assert header[1] == 24000, f'{file} -> {output_name}: framerate is {header[1]!r}'
        assert header[2] == 1, f'{file} -> {output_name}: nchannels is {header[2]!r}'
        assert header[4] == 16, f'{file} -> {output_name}: sampwidth is {header[4]!r}'

In [None]:
# Split the whole audio into 2-second segments

import os, subprocess, sndhdr

# Data/ holds the per-speaker segment folders; Data/raw holds the full-length wav files.
data_path = os.path.join(os.getcwd(), 'Data')
raw_path = os.path.join(os.getcwd(), 'Data', 'raw')

def get_duration(file):
    """
    Get the duration of a media file in whole seconds.

    Runs `ffmpeg -i <file>` and reads the 'Duration: HH:MM:SS.xx' entry from
    the report ffmpeg prints on stderr. Fractions of a second are dropped.

    :file: path of the media file
    :returns: duration as an int number of seconds
    :raises ValueError: if the report contains no readable duration
        (for example when the file is missing or is not a media file)
    """
    import re  # local import: the cell's own import line does not provide `re`

    # Argument list instead of a shell pipeline, so paths with spaces or shell
    # metacharacters work. No return-code check: only the stderr report is
    # needed, and the pipeline this replaces did not check the status either.
    proc = subprocess.run(
        ['ffmpeg', '-i', file],
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
    )
    report = proc.stderr.decode('utf-8', errors='replace')
    match = re.search(r'Duration:\s*(\d+):(\d+):(\d+)', report)
    if match is None:
        raise ValueError('could not read a duration for {}'.format(file))
    time_hour, time_minute, time_second = (int(part) for part in match.groups())
    return time_hour * 3600 + time_minute * 60 + time_second

def cut_segments(audio: str, raw_path, data_path, duration = 2):
    """
    Cut one audio file into consecutive fixed-length segments.

    The segments are written to <data_path>/<speaker>/ as
    '<speaker>_<NNNNN>.wav', where <speaker> is the first two '_'-separated
    parts of the file name. Numbering continues after the number of wav files
    already present in that folder. A trailing remainder shorter than
    `duration` is not written.

    :audio: file name of the source audio inside raw_path
    :raw_path: folder that contains the source audio
    :data_path: folder that receives one sub-folder per speaker
    :duration: segment length in seconds, must be positive
    :raises ValueError: if duration is not positive
    :raises subprocess.CalledProcessError: if ffmpeg exits with an error
    :raises AssertionError: if a segment is not a mono, 24000 Hz wav file
        with a sample width of 16
    """
    if duration <= 0:
        raise ValueError('duration must be positive, got {}'.format(duration))
    audio_path = os.path.join(raw_path, audio)
    speaker_name = '_'.join(audio.split('_')[:2])
    speaker_data_path = os.path.join(data_path, speaker_name)
    # Create the speaker folder on first use; listing a missing folder would fail.
    os.makedirs(speaker_data_path, exist_ok=True)
    audio_duration = get_duration(audio_path)
    # Continue with the existing count of wav files in the speaker folder.
    cnt = sum(1 for f in os.listdir(speaker_data_path) if f.endswith('.wav'))
    for i in range(audio_duration // duration):
        cnt += 1
        output_name = '{}_{:05}.wav'.format(speaker_name, cnt)
        output_file = os.path.join(speaker_data_path, output_name)
        # -ss is the start offset and -t the length of the segment. Passing
        # a list (no shell) keeps paths with spaces intact.
        cut_cmd = [
            'ffmpeg', '-y',
            '-ss', str(i * duration),
            '-i', audio_path,
            '-t', str(duration),
            '-c', 'copy',
            output_file,
        ]
        subprocess.run(cut_cmd, check=True)
        # Inspect the header once; sndhdr.what returns None for files it cannot identify.
        header = sndhdr.what(output_file)
        assert header is not None, f'{audio} {output_name}: not a recognisable audio file'
        assert header[0] == 'wav', f'{audio} {output_name}: filetype is {header[0]!r}'
        assert header[1] == 24000, f'{audio} {output_name}: framerate is {header[1]!r}'
        assert header[2] == 1, f'{audio} {output_name}: nchannels is {header[2]!r}'
        assert header[4] == 16, f'{audio} {output_name}: sampwidth is {header[4]!r}'

# Segment only the wav recordings whose name starts with 'Luo_Xiang', in sorted order.
for audio in sorted(os.listdir(raw_path)):
    if audio.endswith('.wav') and audio.startswith('Luo_Xiang'):
        cut_segments(audio, raw_path, data_path, duration = 2)


In [None]:
# construct train_list and val_list

import random, os

# Build Data/train_list.txt and Data/val_list.txt. Every line has the form
# '<relative wav path>|<speaker index>', where the index is the position of
# the speaker in the sorted list of entries found under Video/.
speakers = []
video_path = os.path.join(os.getcwd(), 'Video')
data_path = os.path.join('.', 'Data') # relative dir, so the lists hold relative paths
train_list_path = os.path.join(data_path, 'train_list.txt')
val_list_path = os.path.join(data_path, 'val_list.txt')

# get the speakers
for speaker in os.listdir(video_path):
    if speaker == '.DS_Store':
        continue
    speakers.append(speaker)
speakers.sort() # get fixed order
for i in enumerate(speakers):
    print(i)
speaker_index = {name: idx for idx, name in enumerate(speakers)}

# Slices of the shuffled segment list that go into each split.
# NOTE(review): training currently takes the first 100 segments (a 600-segment
# variant was commented out) while validation still takes 600:675, so speakers
# with 600 or fewer segments contribute nothing to validation - confirm intended.
train_end = 100
val_start, val_end = 600, 675

# Mode 'w' truncates both lists, so each run starts from empty files, and each
# file is opened once instead of once per written line.
with open(train_list_path, 'w') as train_file, open(val_list_path, 'w') as val_file:
    for speaker in speakers:
        speaker_path = os.path.join(data_path, speaker)
        if not os.path.isdir(speaker_path):
            continue  # no segments were produced for this speaker
        # Sort before shuffling: os.listdir order is arbitrary, so a seeded
        # shuffle is only reproducible when its input order is fixed. Only
        # wav files are listed, which keeps stray files out of the lists.
        files = sorted(name for name in os.listdir(speaker_path) if name.endswith('.wav'))
        random.seed(2022)
        random.shuffle(files)
        train_data = files[:train_end]
        val_data = files[val_start:val_end]
        label = speaker_index[speaker]
        for name in train_data:
            train_file.write(f'{os.path.join(speaker_path, name)}|{label}' + '\n')
        for name in val_data:
            val_file.write(f'{os.path.join(speaker_path, name)}|{label}' + '\n')

In [None]:
# construct pred segments

# Source recordings are read from Data/raw; the 15-second prediction segments
# are written below Pred/yisa (data_path is re-pointed there).
raw_path = os.path.join(os.path.join(os.getcwd(), 'Data'), 'raw')
data_path = os.path.join(os.getcwd(), 'Pred' + os.sep + 'yisa')

for audio in sorted(os.listdir(raw_path)):
    if audio.endswith('.wav'):
        cut_segments(audio, raw_path, data_path, duration = 15)

# Obtain Domain Index

In [None]:
import torch

# Map the checkpoint onto the CPU so this cell also runs on machines without
# the device the checkpoint was saved from.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
m = torch.load('./Models/yisa/epoch_v2_00248.pth', map_location='cpu')


In [4]:
# Hand-written speaker table, keyed by an arbitrary index.
speakers = {
    0: 'Li_Fanping',
    1: 'Shi_Zhuguo',
    2: 'Wang_Cheng',
    3: 'Wang_Kun',
    4: 'Zhao_Lijian',
    5: 'Hua_Chunying',
    6: 'Luo_Xiang',
    7: 'Li_Gan',
    8: 'Dong_Mingzhu',
    9: 'Ma_Yun'
}

# Re-index the speaker names by their position in alphabetical order and display the result.
k = sorted(speakers.values())
s = dict(enumerate(k))
s

{0: 'Dong_Mingzhu',
 1: 'Hua_Chunying',
 2: 'Li_Fanping',
 3: 'Li_Gan',
 4: 'Luo_Xiang',
 5: 'Ma_Yun',
 6: 'Shi_Zhuguo',
 7: 'Wang_Cheng',
 8: 'Wang_Kun',
 9: 'Zhao_Lijian'}