In [8]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd
from ipywidgets import widgets, HBox

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
# Inference device, pinned to the CPU. To pick a GPU automatically when one is
# available, use instead:
#   torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device = 'cpu'

from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForPreTraining,
    Wav2Vec2Model,
)
from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices
import librosa

# Hugging Face identifier of the pretrained wav2vec 2.0 checkpoint used to
# embed reference audio (see get_w2v_feature below).
model_path = "TencentGameMate/chinese-wav2vec2-base"

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)

# Load the encoder and place it on the inference device.
# (Half precision is intentionally left disabled: model.half() is not applied.)
model = Wav2Vec2Model.from_pretrained(model_path).to(device)
model.eval()

from scipy.io.wavfile import write

def get_text(text, hps):
    """Turn a text string into a ``torch.LongTensor`` of symbol ids.

    Args:
        text: The sentence to convert.
        hps: Hyper-parameters; ``hps.data.text_cleaners`` is forwarded to
            ``text_to_sequence`` and ``hps.data.add_blank`` toggles blank insertion.

    Returns:
        A 1-D ``torch.LongTensor`` built from the id sequence.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Interleave the id 0 with the sequence via commons.intersperse.
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)

def get_w2v_feature(wav_file):
    """Compute one pooled wav2vec 2.0 feature for an audio file.

    The file is read with ``librosa.load`` at ``sr=16000``, passed through the
    module-level ``feature_extractor`` and ``model`` with gradients disabled,
    and the model's ``last_hidden_state`` is averaged over dimension 1.

    Args:
        wav_file: Path of the audio file to embed.

    Returns:
        ``last_hidden_state.mean(dim=1)`` as a tensor on ``device``
        (presumably a mean over time frames giving ``(1, hidden)`` — TODO confirm).
    """
    waveform, sample_rate = librosa.load(wav_file, sr=16000)
    extracted = feature_extractor(waveform, return_tensors="pt", sampling_rate=sample_rate)
    input_values = extracted.input_values.to(device)

    with torch.no_grad():
        hidden_states = model(input_values).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled

Some weights of the model checkpoint at TencentGameMate/chinese-wav2vec2-base were not used when initializing Wav2Vec2Model: ['project_q.weight', 'quantizer.weight_proj.bias', 'quantizer.codevectors', 'quantizer.weight_proj.weight', 'project_hid.bias', 'project_q.bias', 'project_hid.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Directory that holds the synthesizer's config.json and generator checkpoint.
# NOTE(review): this rebinds `model_path`, which the first cell used for the
# wav2vec2 checkpoint id.
model_path = 'models/model_no_spk'
hps = utils.get_hparams_from_file(f'{model_path}/config.json')

# Map each speaker name to its index in the config's speaker list.
speaker_names = hps['speakers']
print(speaker_names)
speakers = {name: index for index, name in enumerate(speaker_names)}

# Build the generator on the inference device and switch it to eval mode.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
).to(device).eval()

# Assign to `_` so the return value is not echoed as the cell's output.
_ = utils.load_checkpoint(f'{model_path}/G_4000.pth', net_g, None)

FileNotFoundError: [Errno 2] No such file or directory: 'models/models/model_no_spk/config.json'

In [None]:
# One reference recording per voice; each is embedded with get_w2v_feature().
# NOTE(review): 'gaoyue' points at a file under Guide_F/ — confirm this is the
# intended reference recording.
reference_wavs = {
    'johnny': '/home/bigfoot/gits/data/cp2077/wav16k/johnny/johnny_inland_avenue_f_17a01dfa8a46b000.wav',
    'bandashi': '/home/bigfoot/gits/data/qs/wav_22k/BanDaShi/02_NPC_vo_s1_q104001.wav',
    'chilian': '/home/bigfoot/gits/data/qs/wav_22k/ChiLian/3_VO_SB_S1_vo_s1_q107369.wav',
    'gaoyue': '/home/bigfoot/gits/data/qs/wav_22k/Guide_F/guide_vo_1-01.wav',
}
w2v_features = {
    voice: get_w2v_feature(wav_path)
    for voice, wav_path in reference_wavs.items()
}

## Single Speaker

In [7]:
# Synthesize one sentence, conditioning the generator on the stored 'gaoyue'
# wav2vec feature, then play the result inline.
stn_tst = get_text("你好啊，你叫什么名字？", hps)
x_tst = stn_tst.to(device).unsqueeze(0)
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)

with torch.no_grad():
    infer_outputs = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1,
        w2v_feature=w2v_features['gaoyue'],
    )

# Take the first returned value and its [0, 0] slice
# (presumably batch item 0, channel 0 — confirm against SynthesizerTrn.infer).
audio = infer_outputs[0][0, 0].data.cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

## Multiple Speakers

In [189]:
# Sentence that gen_audio() synthesizes for every previewed speaker.
# NOTE(review): the '~' and '#' characters look like prosody/pause markers
# consumed by the text cleaners — confirm against text_to_sequence.
txt = '去和老~朋友们~#打个招呼吧。'
# Longer alternative sentence (disabled):
#txt = '好啦别的东西你慢慢熟悉现在去和老朋友们打个招呼吧。'

# Speaker names for reference — presumably the list printed from hps['speakers']; confirm against the loaded config:
#['johnny', 'v_q_f', 'v_q_m', 'BaiFeng', 'BanDaShi', 'ChiLian', 'ChiLian0', 'ChuLingEr', 'ChuNanGong', 'DaSiMing', 'DianXiaoEr', 'DongHuangTaiYi', 'DuanMuRong', 'FanZeng', 'FuNian', 'FuSu', 'GaoJianLi', 'GaoYue', 'GeNie', 'GongShuChou', 'GongShuZhi', 'GongSunLingLong', 'Guide_F', 'Guide_M', 'Guide_T', 'JingKe', 'JuZi', 'LongJu', 'MengTian', 'PlayerYYJ', 'Player_F', 'Player_M', 'ShaoSiMing', 'ShaoYu', 'ShiLan', 'TanSonYun', 'TianCi', 'TianHu', 'TianMeng', 'TianMi', 'TianMing', 'TianYan', 'WeiZhuang', 'WuShuangGui', 'XiaoYu', 'XiaoZhi', 'XingHun', 'XueNv', 'YanLu', 'YunZhongJun', 'ZhangLiang']
def gen_audio(speaker):
    """Synthesize the module-level ``txt`` with one speaker and save it as a WAV file.

    Reads the globals ``txt``, ``hps``, ``speakers``, ``net_g`` and ``device``.

    Args:
        speaker: Speaker name; must be a key of the ``speakers`` mapping.

    Returns:
        Path of the written file, ``output/<speaker>.wav``. The ``output``
        directory is created if it does not exist yet.

    Raises:
        KeyError: If ``speaker`` is not present in ``speakers``.
    """
    speakerid = speakers[speaker]

    stn_tst = get_text(txt, hps)
    with torch.no_grad():
        x_tst = stn_tst.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        sid = torch.LongTensor([speakerid]).to(device)

        audio = net_g.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=0.6,
            noise_scale_w=0.7,
            length_scale=1.0,
        )[0][0, 0].data.cpu().float().numpy()

    # Write the header with the configured sampling rate (as the playback cells
    # do) instead of a hard-coded 22050, so the file plays at the right speed
    # whatever rate the loaded config uses.
    # NOTE(review): Audio._make_wav is a private IPython helper; the third
    # positional argument is presumably its `normalize` flag — confirm against
    # the installed IPython version.
    wav_data = ipd.Audio._make_wav(audio, hps.data.sampling_rate, 1)

    # Make sure the target directory exists; open() would otherwise fail on a
    # fresh checkout.
    os.makedirs('output', exist_ok=True)
    wav_path = f'output/{speaker}.wav'
    with open(wav_path, 'wb') as f:
        f.write(wav_data)
    return wav_path

# Render the sentence once per listed speaker and show the players side by side.
preview_speakers = ['johnny', 'v_q_f', 'v_q_m', 'BaiFeng', 'BanDaShi', 'ChiLian', 'Player_F', 'Player_M', 'Guide_F']
audios = [
    widgets.Audio.from_file(gen_audio(name), loop = False)
    for name in preview_speakers
]

display(HBox(audios))

HBox(children=(Audio(value=b'RIFF$\xbe\x01\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00"V\x00\x00D\xac\x00\x00\…

### Voice Conversion

In [111]:
# NOTE(review): this rebinds `hps`, replacing the config that was loaded for
# the synthesizer in an earlier cell.
hps = utils.get_hparams_from_file("configs/chinese_val.json")

# Validation set, served one utterance per batch in file order.
dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
collate_fn = TextAudioSpeakerCollate()
loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    drop_last=True,
    num_workers=8,
    pin_memory=True,
    collate_fn=collate_fn,
)

# Materialize every batch up front so later cells can index into the list.
data_list = [batch for batch in loader]

In [112]:
# Convert the first validation utterance to speaker ids 0, 1 and 2 and play the
# original followed by each conversion.
with torch.no_grad():
    # Move the batch to the same device as net_g. The previous hard-coded
    # .cuda() calls fail on CPU-only hosts and mismatch a generator that was
    # placed with .to(device).
    x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [t.to(device) for t in data_list[0]]

    sid_tgt1, sid_tgt2, sid_tgt3 = [torch.LongTensor([i]).to(device) for i in range(3)]

    # One conversion per target id; the [0][0, 0] indexing is unchanged from
    # the original cell.
    audio1, audio2, audio3 = [
        net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=tgt)[0][0, 0].data.cpu().float().numpy()
        for tgt in (sid_tgt1, sid_tgt2, sid_tgt3)
    ]

print("Original SID: %d" % sid_src.item())
ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False))
for tgt, converted in ((sid_tgt1, audio1), (sid_tgt2, audio2), (sid_tgt3, audio3)):
    print("Converted SID: %d" % tgt.item())
    ipd.display(ipd.Audio(converted, rate=hps.data.sampling_rate, normalize=False))

Original SID: 0


Converted SID: 0


Converted SID: 1


Converted SID: 2
