diff --git a/app/src/main/cpp/CMakeLists.txt b/app/src/main/cpp/CMakeLists.txt index ce41d23..908604e 100644 --- a/app/src/main/cpp/CMakeLists.txt +++ b/app/src/main/cpp/CMakeLists.txt @@ -37,7 +37,7 @@ add_library(moereng SHARED ${libassetmanager_source} ${libmecab_source} ${libjpc ${libnjd_set_digit_source} ${libnjd_set_long_vowel_source} ${libnjd_set_pronunciation_source} ${libnjd_set_unvoiced_vowel_source} ${libtext2mecab_source} ${libmecabapi_source} ${fft_pack_source} ${audio_process_source} - ${ncnn_vits_source} native-lib.cpp ) + ${ncnn_vits_source} vitsncnn_jni.cpp) find_library( # Sets the name of the path variable. android-lib diff --git a/app/src/main/cpp/vits/SynthesizerTrn.cpp b/app/src/main/cpp/vits/SynthesizerTrn.cpp index ce0c3dd..919d669 100644 --- a/app/src/main/cpp/vits/SynthesizerTrn.cpp +++ b/app/src/main/cpp/vits/SynthesizerTrn.cpp @@ -1,6 +1,6 @@ #include #include "SynthesizerTrn.h" -#include "custom_layer.h" +#include "custom_layers.h" #include "../mecab_api/api.h" DEFINE_LAYER_CREATOR(expand_as) @@ -30,7 +30,7 @@ DEFINE_LAYER_CREATOR(ZerosLike) DEFINE_LAYER_CREATOR(RandnLike) bool SynthesizerTrn::load_model(const std::string &folder, bool multi, Net &net, - const string& name, const Option &opt) { + const string &name, const Option &opt) { LOGI("loading %s...\n", name.c_str()); net.register_custom_layer("Tensor.expand_as", expand_as_layer_creator); net.register_custom_layer("modules.Transpose", Transpose_layer_creator); @@ -48,9 +48,9 @@ bool SynthesizerTrn::load_model(const std::string &folder, bool multi, Net &net, net.opt = opt; std::string bin_path = join_path(folder, name + ".ncnn.bin"); - std::string param_path = name + ".ncnn.param"; - if (multi) param_path = "multi/" + param_path; - else param_path = "single/" + param_path; + std::string param_path; + if (multi) param_path = "multi/" + name + ".ncnn.param"; + else param_path = "single/" + name + ".ncnn.param"; bool param_success = !net.load_param(assetManager, param_path.c_str()); bool bin_success = !net.load_model(bin_path.c_str()); if (param_success && bin_success) { @@ -62,8 +62,20 @@ bool SynthesizerTrn::load_model(const std::string &folder, bool multi, Net &net, return false; } -bool SynthesizerTrn::load_weight(const std::string &folder, const std::string &name, const int w, - Mat &weight, const int n) { +void SynthesizerTrn::clear_nets(){ + emb_t.release(); + emb_g.release(); + enc_p.clear(); + enc_q.clear(); + dec.clear(); + flow_reverse.clear(); + flow.clear(); + dp.clear(); +} + +bool SynthesizerTrn::load_weight(const std::string &folder, Mat &weight, const std::string &name, + const int w, + const int n) { LOGI("loading %s...\n", "text embedding"); std::string path = join_path(folder, name + ".bin"); FILE *fp = fopen(path.c_str(), "rb"); @@ -86,42 +98,34 @@ bool SynthesizerTrn::load_weight(const std::string &folder, const std::string &n } bool SynthesizerTrn::init(const std::string &model_folder, bool voice_convert, bool multi, - const int n_vocab, AssetJNI *assetJni, Nets *nets, Option &opt) { - + const int n_vocab, AssetJNI *assetJni, Option &opt) { + clear_nets(); assetManager = AAssetManager_fromJava(assetJni->env, assetJni->assetManager); - opt.lightmode = true; - opt.use_packing_layout = true; - opt.use_bf16_storage = true; - - // use vulkan compute - if (ncnn::get_gpu_count() != 0) - opt.use_vulkan_compute = true; - if (voice_convert) { - if (load_weight(model_folder, "emb_t", 192, nets->emb_t, n_vocab) && - load_weight(model_folder, "emb_g", 256, nets->emb_g_weight, -1) && - load_model(model_folder, multi, nets->enc_q, "enc_q", opt) && - load_model(model_folder, multi, nets->dec, "dec", opt) && - load_model(model_folder, multi, nets->flow, "flow", opt) && - load_model(model_folder, multi, nets->flow_reverse, "flow.reverse", opt)) + if (load_weight(model_folder, emb_t, "emb_t", 192, n_vocab) && + load_weight(model_folder, emb_g, "emb_g", 256, -1) && + load_model(model_folder, multi, enc_q, "enc_q", opt) && + load_model(model_folder, multi, dec, "dec", opt) && + load_model(model_folder, multi, flow, "flow", opt) && + load_model(model_folder, multi, flow_reverse, "flow.reverse", opt)) return true; } else if (multi) { - if (load_weight(model_folder, "emb_t", 192, nets->emb_t, n_vocab) && - load_weight(model_folder, "emb_g", 256, nets->emb_g_weight, -1) && - load_model(model_folder, multi, nets->enc_p, "enc_p", opt) && - load_model(model_folder, multi, nets->enc_q, "enc_q", opt) && - load_model(model_folder, multi, nets->dec, "dec", opt) && - load_model(model_folder, multi, nets->flow, "flow", opt) && - load_model(model_folder, multi, nets->flow_reverse, "flow.reverse", opt) && - load_model(model_folder, multi, nets->dp, "dp", opt)) + if (load_weight(model_folder, emb_t, "emb_t", 192, n_vocab) && + load_weight(model_folder, emb_g, "emb_g", 256, -1) && + load_model(model_folder, multi, enc_p, "enc_p", opt) && + load_model(model_folder, multi, enc_q, "enc_q", opt) && + load_model(model_folder, multi, dec, "dec", opt) && + load_model(model_folder, multi, flow, "flow", opt) && + load_model(model_folder, multi, flow_reverse, "flow.reverse", opt) && + load_model(model_folder, multi, dp, "dp", opt)) return true; } else { - if (load_weight(model_folder, "emb_t", 192, nets->emb_t, n_vocab) && - load_model(model_folder, multi, nets->enc_p, "enc_p", opt) && - load_model(model_folder, multi, nets->dec, "dec", opt) && - load_model(model_folder, multi, nets->flow_reverse, "flow.reverse", opt) && - load_model(model_folder, multi, nets->dp, "dp", opt)) + if (load_weight(model_folder, emb_t, "emb_t", 192, n_vocab) && + load_model(model_folder, multi, enc_p, "enc_p", opt) && + load_model(model_folder, multi, dec, "dec", opt) && + load_model(model_folder, multi, flow_reverse, "flow.reverse", opt) && + load_model(model_folder, multi, dp, "dp", opt)) return true; } @@ -129,8 +133,7 @@ bool SynthesizerTrn::init(const std::string &model_folder, bool voice_convert, b } std::vector -SynthesizerTrn::enc_p_forward(const Mat &x, const Mat &weight, const Net &enc_p, - bool vulkan, const Option &opt) { +SynthesizerTrn::enc_p_forward(const Mat &x, bool vulkan, const Option &opt) { Mat length(1); length[0] = float(x.w); Extractor ex = enc_p.create_extractor(); @@ -138,7 +141,7 @@ SynthesizerTrn::enc_p_forward(const Mat &x, const Mat &weight, const Net &enc_p, ex.set_vulkan_compute(vulkan); ex.input("in0", x); ex.input("in1", length); - ex.input("in2", weight); + ex.input("in2", emb_t); Mat out0, out1, out2, out3; ex.extract("out0", out0); ex.extract("out1", out1); @@ -151,8 +154,7 @@ SynthesizerTrn::enc_p_forward(const Mat &x, const Mat &weight, const Net &enc_p, } std::vector -SynthesizerTrn::enc_q_forward(const Mat &x, const Mat &g, const Net &enc_q, bool vulkan, - const Option &opt) { +SynthesizerTrn::enc_q_forward(const Mat &x, const Mat &g, bool vulkan, const Option &opt) { Mat length(1); length[0] = float(x.w); Extractor ex = enc_q.create_extractor(); @@ -167,16 +169,16 @@ SynthesizerTrn::enc_q_forward(const Mat &x, const Mat &g, const Net &enc_q, bool return std::vector{out0, out1}; } -Mat SynthesizerTrn::emb_g_forward(int sid, const Mat &weight, const Option &opt) { +Mat SynthesizerTrn::emb_g_forward(int sid, const Option &opt) { Mat sid_mat(1); sid_mat[0] = (float) sid; - Mat out = embedding(sid_mat, weight, opt); + Mat out = embedding(sid_mat, emb_g, opt); return out; } Mat SynthesizerTrn::dp_forward(const Mat &x, const Mat &x_mask, const Mat &z, const Mat &g, float noise_scale, - const Net &dp, bool vulkan, const Option &opt) { + bool vulkan, const Option &opt) { Mat out; Extractor ex = dp.create_extractor(); ex.set_num_threads(opt.num_threads); @@ -199,9 +201,8 @@ Mat SynthesizerTrn::dp_forward(const Mat &x, const Mat &x_mask, const Mat &z, co return out; } -Mat SynthesizerTrn::flow_reverse_forward(const Mat &x, const Mat &x_mask, const Mat &g, - const Net &flow_reverse, - bool vulkan, const Option &opt) { +Mat SynthesizerTrn::flow_reverse_forward(const Mat &x, const Mat &x_mask, const Mat &g, bool vulkan, + const Option &opt) { Extractor ex = flow_reverse.create_extractor(); ex.set_num_threads(opt.num_threads); ex.set_vulkan_compute(vulkan); @@ -213,8 +214,8 @@ Mat SynthesizerTrn::flow_reverse_forward(const Mat &x, const Mat &x_mask, const return out; } -Mat SynthesizerTrn::flow_forward(const Mat &x, const Mat &x_mask, const Mat &g, const Net &flow, - bool vulkan, const Option &opt) { +Mat SynthesizerTrn::flow_forward(const Mat &x, const Mat &x_mask, const Mat &g, bool vulkan, + const Option &opt) { Extractor ex = flow.create_extractor(); ex.set_num_threads(opt.num_threads); ex.set_vulkan_compute(vulkan); @@ -226,8 +227,7 @@ Mat SynthesizerTrn::flow_forward(const Mat &x, const Mat &x_mask, const Mat &g, return out; } -Mat SynthesizerTrn::dec_forward(const Mat &x, const Mat &g, const Net &dec, bool vulkan, - const Option &opt) { +Mat SynthesizerTrn::dec_forward(const Mat &x, const Mat &g, bool vulkan, const Option &opt) { Extractor ex = dec.create_extractor(); ex.set_num_threads(opt.num_threads); ex.set_vulkan_compute(vulkan); @@ -241,24 +241,24 @@ Mat SynthesizerTrn::dec_forward(const Mat &x, const Mat &g, const Net &dec, bool SynthesizerTrn::SynthesizerTrn() = default; // c++ implementation of SynthesizerTrn -Mat SynthesizerTrn::forward(const Mat &data, Nets *nets, const Option &opt, bool vulkan, bool multi, +Mat SynthesizerTrn::forward(const Mat &data, const Option &opt, bool vulkan, bool multi, int sid, float noise_scale, float noise_scale_w, float length_scale) { LOGI("processing...\n"); // enc_p - auto enc_p_out = enc_p_forward(data, nets->emb_t, nets->enc_p, vulkan, opt); + auto enc_p_out = enc_p_forward(data, vulkan, opt); Mat x = enc_p_out[0]; Mat m_p = enc_p_out[1]; Mat logs_p = enc_p_out[2]; Mat x_mask = enc_p_out[3]; Mat g; - if (multi){ - g = reducedims(mattranspose(emb_g_forward(sid, nets->emb_g_weight, opt), opt)); + if (multi) { + g = reducedims(mattranspose(emb_g_forward(sid, opt), opt)); } Mat z = randn(x.w, 2, opt, 1); - Mat logw = dp_forward(x, x_mask, z, g, noise_scale_w, nets->dp, vulkan, opt); + Mat logw = dp_forward(x, x_mask, z, g, noise_scale_w, vulkan, opt); Mat w = product(matproduct(matexp(logw, opt), x_mask, opt), length_scale, opt); @@ -291,20 +291,20 @@ Mat SynthesizerTrn::forward(const Mat &data, Nets *nets, const Option &opt, bool opt); z = flow_reverse_forward(expanddims(z_p), mattranspose(expanddims(y_mask), opt), expanddims(g), - nets->flow_reverse, vulkan, opt); + vulkan, opt); y_mask = mattranspose(y_mask, opt); y_mask = expand(y_mask, z.w, z.h, opt); - Mat o = dec_forward(reducedims(matproduct(z, y_mask, opt)), expanddims(g), nets->dec, + Mat o = dec_forward(reducedims(matproduct(z, y_mask, opt)), expanddims(g), vulkan, opt); LOGI("finished!\n"); return o; } Mat SynthesizerTrn::voice_convert(const Mat &audio, int raw_sid, int target_sid, - Nets *net, const Option& opt, bool vulkan) { + const Option &opt, bool vulkan) { LOGI("start converting...\n"); // stft transform @@ -312,15 +312,15 @@ Mat SynthesizerTrn::voice_convert(const Mat &audio, int raw_sid, int target_sid, spec = matsqrt(Plus(matpow(spec, 2, opt), 1e-6, opt), opt); // voice conversion - auto g_src = mattranspose(emb_g_forward(raw_sid, net->emb_g_weight, opt), opt); - auto g_tgt = mattranspose(emb_g_forward(target_sid, net->emb_g_weight, opt), opt); - auto enc_q_out = enc_q_forward(spec, g_src, net->enc_q, vulkan, opt); + auto g_src = mattranspose(emb_g_forward(raw_sid, opt), opt); + auto g_tgt = mattranspose(emb_g_forward(target_sid, opt), opt); + auto enc_q_out = enc_q_forward(spec, g_src, vulkan, opt); auto z = expanddims(enc_q_out[0]); auto y_mask = enc_q_out[1]; - auto z_p = flow_forward(z, y_mask, g_src, net->flow, vulkan, opt); - auto z_hat = flow_reverse_forward(z_p, y_mask, g_tgt, net->flow_reverse, vulkan, opt); + auto z_p = flow_forward(z, y_mask, g_src, vulkan, opt); + auto z_hat = flow_reverse_forward(z_p, y_mask, g_tgt, vulkan, opt); y_mask = expand(y_mask, z_hat.w, z_hat.h, opt); - auto o_hat = dec_forward(reducedims(matproduct(z_hat, y_mask, opt)), g_tgt, net->dec, + auto o_hat = dec_forward(reducedims(matproduct(z_hat, y_mask, opt)), g_tgt, vulkan, opt); LOGI("voice converted!\n"); return o_hat; diff --git a/app/src/main/cpp/vits/SynthesizerTrn.h b/app/src/main/cpp/vits/SynthesizerTrn.h index 970485c..2fb626d 100644 --- a/app/src/main/cpp/vits/SynthesizerTrn.h +++ b/app/src/main/cpp/vits/SynthesizerTrn.h @@ -4,70 +4,58 @@ #include "utils.h" #include "../asset_manager_api/manager.h" -struct Nets { +class SynthesizerTrn { +private: Mat emb_t; - Mat emb_g_weight; + Mat emb_g; Net enc_p; Net enc_q; Net dec; Net flow_reverse; Net flow; Net dp; -}; -class SynthesizerTrn { -private: AAssetManager *assetManager{}; + void clear_nets(); + static bool - load_weight(const std::string &folder, const std::string &name, const int w, Mat &weight, + load_weight(const std::string &folder, Mat &weight, const std::string &name, const int w, const int n); - bool load_model(const std::string &folder, bool multi, Net &net, const string& name, const Option &opt); + bool + load_model(const std::string &folder, bool multi, Net &net, const string& name, const Option &opt); - static std::vector - enc_p_forward(const Mat &x, const Mat &weight, const Net &enc_p, bool vulkan, - const Option &opt); + std::vector + enc_p_forward(const Mat &x, bool vulkan, const Option &opt); - static std::vector - enc_q_forward(const Mat &x, const Mat &g, const Net &enc_q, bool vulkan, const Option &opt); + std::vector enc_q_forward(const Mat &x, const Mat &g, bool vulkan, const Option &opt); - static Mat - emb_g_forward(int sid, const Mat &weight, const Option& opt); + Mat emb_g_forward(int sid, const Option& opt); - static Mat - dp_forward(const Mat &x, const Mat &x_mask, const Mat &z, const Mat &g, float noise_scale, - const Net &dp, + Mat dp_forward(const Mat &x, const Mat &x_mask, const Mat &z, const Mat &g, float noise_scale, bool vulkan, const Option &opt); - static Mat - flow_reverse_forward(const Mat &x, const Mat &x_mask, const Mat &g, const Net &flow_reverse, - bool vulkan, const Option &opt); + Mat flow_reverse_forward(const Mat &x, const Mat &x_mask, const Mat &g, bool vulkan, const Option &opt); - static Mat - flow_forward(const Mat &x, const Mat &x_mask, const Mat &g, const Net &flow, bool vulkan, - const Option &opt); + Mat flow_forward(const Mat &x, const Mat &x_mask, const Mat &g, bool vulkan, const Option &opt); - static Mat - dec_forward(const Mat &x, const Mat &g, const Net &dec, bool vulkan, const Option &opt); + Mat dec_forward(const Mat &x, const Mat &g, bool vulkan, const Option &opt); public: bool init(const std::string &model_folder, bool voice_convert, bool multi, const int n_vocab, - AssetJNI *assetJni, Nets *nets, Option &opt); + AssetJNI *assetJni, Option &opt); SynthesizerTrn(); - static Mat - forward(const Mat &x, Nets *nets, const Option &opt, bool vulkan = false, bool multi = false, + Mat forward(const Mat &x, const Option &opt, bool vulkan = false, bool multi = false, int sid = 0, float noise_scale = .667, float noise_scale_w = 0.8, float length_scale = 1); - static Mat voice_convert(const Mat &x, int raw_sid, int target_sid, - Nets *net, const Option &opt, bool vulkan = false); + Mat voice_convert(const Mat &x, int raw_sid, int target_sid, const Option &opt, bool vulkan = false); ~SynthesizerTrn(); }; #endif -#pragma once \ No newline at end of file diff --git a/app/src/main/cpp/vits/custom_layer.h b/app/src/main/cpp/vits/custom_layers.h similarity index 98% rename from app/src/main/cpp/vits/custom_layer.h rename to app/src/main/cpp/vits/custom_layers.h index 90291eb..b869ff9 100644 --- a/app/src/main/cpp/vits/custom_layer.h +++ b/app/src/main/cpp/vits/custom_layers.h @@ -10,8 +10,7 @@ class expand_as : public Layer { public: - expand_as() { - } + expand_as() = default; virtual int forward(const std::vector &bottom_blobs, std::vector &top_blobs, const Option &opt) const { @@ -94,8 +93,7 @@ class Transpose : public Layer { class PRQTransform : public Layer { public: - PRQTransform() { - } + PRQTransform() = default; virtual int forward(const std::vector &bottom_blobs, std::vector &top_blobs, const Option &opt) const { @@ -133,7 +131,6 @@ class PRQTransform : public Layer { Mat cumwidths = cumsum(widths, opt); - //cumwidths = pad(cumwidths, opt); cumwidths = pad(cumwidths, 0, 0, 1, 0, 0, opt); #pragma omp parallel for num_threads(opt.num_threads) @@ -179,7 +176,6 @@ class PRQTransform : public Layer { } } Mat cumheights = cumsum(heights, opt); - //cumheights = pad(cumheights, opt); cumheights = pad(cumheights, 0, 0, 1, 0, 0, opt); #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < cumheights.c; i++) { @@ -291,8 +287,7 @@ class ResidualReverse : public Layer { private: bool reverse; public: - ResidualReverse() { - } + ResidualReverse() = default; virtual int load_param(const ParamDict &pd) { reverse = bool(pd.get(0, 0)); @@ -330,8 +325,7 @@ class ResidualReverse : public Layer { class Embedding : public Layer { public: - Embedding() { - } + Embedding() = default; virtual int forward(const std::vector &bottom_blobs, std::vector &top_blobs, const Option &opt) const { @@ -346,8 +340,7 @@ class Embedding : public Layer { class SequenceMask : public Layer { public: - SequenceMask() { - } + SequenceMask() = default; virtual int forward(const std::vector &bottom_blobs, std::vector &top_blobs, const Option &opt) const { @@ -367,8 +360,7 @@ class Attention : public Layer { int k_channels = 96; int window_size = 4; public: - Attention() { - } + Attention() = default; virtual int forward(const std::vector &bottom_blobs, std::vector &top_blobs, const Option &opt) const { @@ -411,7 +403,6 @@ class Attention : public Layer { Mat p_attn = softmax(scores, opt); - //dropout(p_attn); Mat output = matmul(p_attn, value, opt); Mat relative_weights = absolute_position_to_relative_position(p_attn, opt); Mat value_relative_embeddings = get_relative_embeddings(emb_rel_v, t_s, window_size, opt); @@ -423,14 +414,11 @@ class Attention : public Layer { top_blob = output.reshape(t_t, d); return 0; } - - }; class ExpandDim : public Layer { public: - ExpandDim() { - } + ExpandDim() = default; virtual int forward(const std::vector &bottom_blobs, std::vector &top_blobs, const Option &opt) const { @@ -503,4 +491,3 @@ class ZerosLike : public Layer { }; #endif -#pragma once \ No newline at end of file diff --git a/app/src/main/cpp/vits/utils.cpp b/app/src/main/cpp/vits/utils.cpp index 91d5b05..baf7aa1 100644 --- a/app/src/main/cpp/vits/utils.cpp +++ b/app/src/main/cpp/vits/utils.cpp @@ -112,8 +112,8 @@ Mat softmax(const Mat &m, const Option &opt) { float *ptr = blob.channel(q); for (int i = 0; i < h; i++) { - float max = ptr[0]; - for (int j = 1; j < w; j++) { + float max = -FLT_MAX; + for (int j = 0; j < w; j++) { max = std::max(max, ptr[j]); } @@ -147,7 +147,6 @@ Mat cumsum(const Mat &blob, const Option &opt) { const float *ptr = blob.channel(i); float *outptr = res.channel(i); for (int j = 0; j < h; j++) { - auto *tmp = new float[w]; std::partial_sum(ptr, ptr + w, outptr); ptr = ptr + w; outptr = outptr + w; @@ -312,6 +311,7 @@ Mat searchsorted(Mat &bin_locations, const Mat &inputs, const Option &opt) { bin_ptr += w; } } + Mat res; res.create_like(inputs); // 100x1 #pragma omp parallel for num_threads(opt.num_threads) @@ -327,6 +327,7 @@ Mat searchsorted(Mat &bin_locations, const Mat &inputs, const Option &opt) { ge_ptr += w; } } + return res; } @@ -340,7 +341,7 @@ Mat gather(Mat &blob, Mat &index, const Option &opt) { const float *idx_ptr = index.channel(i); float *outptr = res.channel(i); for (int j = 0; j < blob.h; j++) { - int k = idx_ptr[j]; + int k = int(idx_ptr[j]); outptr[j] = ptr[k]; ptr += blob.w; } @@ -484,7 +485,7 @@ Mat sum(const Mat &m, const Option &opt) { for (int j = 0; j < m.w * m.h; j++) { summed += p[j]; } - res[0] = summed; + out[0] = summed; } return res; } @@ -522,15 +523,15 @@ Mat matsqrt(const Mat &m, const Option &opt) { float matmax(const Mat &m, const Option &opt) { if (m.empty()) return 0; if (m.w * m.h * m.c == 1) return m[0]; - float _max = m[0]; + float max = -FLT_MAX; #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < m.c; i++) { const float *p = m.channel(i); - for (int j = 1; j < m.w * m.h; j++) { - if (p[j] >= _max) _max = p[j]; + for (int j = 0; j < m.w * m.h; j++) { + if (p[j] >= max) max = p[j]; } } - return _max; + return max; } Mat expand(const Mat &m, int w, int h, const Option &opt) { @@ -579,17 +580,17 @@ Mat randn(int w, int h, const Option &opt, int c) { Mat sequence_mask(const Mat &length, const Option &opt, float max_length_) { if (length.empty()) return {}; - int max_length = 0; + int max_length; if (max_length_ == 0) { - max_length = matmax(length, opt); + max_length = int(matmax(length, opt)); } else { - max_length = max_length_; + max_length = int(max_length_); } Mat x(max_length, 1); float *p = x.channel(0); #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < max_length; i++) { - p[i] = i; + p[i] = float(i); } Mat res(x.w, length.w); #pragma omp parallel for num_threads(opt.num_threads) @@ -608,13 +609,14 @@ Mat sequence_mask(const Mat &length, const Option &opt, float max_length_) { return res; } -Mat generate_path(const Mat &duration, const Mat mask, const Option &opt) { +Mat generate_path(const Mat &duration, const Mat &mask, const Option &opt) { if (duration.empty() || mask.empty()) return {}; Mat cum_duration = cumsum(duration, opt); - int t_y = mask.h; + auto t_y = float(mask.h); Mat path = sequence_mask(cum_duration, opt, t_y); Mat padded_path = pad(path, 1, 0, 0, 0, 0, opt); - padded_path = Slice(padded_path, 0, padded_path.h - 1, 0, padded_path.w, 1, 1, opt); + padded_path = Slice(padded_path, 0, padded_path.h - 1, + 0, padded_path.w, 1, 1, opt); path = matminus(path, padded_path, opt); path = matproduct(reducedims(mattranspose(path, opt)), mask, opt); return path; @@ -626,8 +628,10 @@ Mat mattranspose(const Mat &m, const Option &opt) { int h = m.h; int c = m.c; - Mat res; - res.create(h, w, c); + int target_h = w; + int target_w = h; + + Mat res(target_w, target_h, c); #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < c; i++) { @@ -680,7 +684,6 @@ void mask_fill(Mat &m, const Mat &mask, const char *condition, float condition_v float *ptr = m.channel(i); const float *m_ptr = mask.channel(i); for (int j = 0; j < m.w * m.h; j++) { - if (!strcmp(condition, "=") && m_ptr[i] == condition_value) { ptr[i] = value; continue; @@ -712,7 +715,8 @@ Mat get_relative_embeddings(const Mat &relative_embeddings, int length, int wind int slice_end_position = slice_start_position + 2 * length - 1; Mat padded_relative_embeddings; if (pad_length > 0) { - padded_relative_embeddings = pad(relative_embeddings, pad_length, pad_length, 0, 0, 0, opt); + padded_relative_embeddings = pad(relative_embeddings, pad_length, + pad_length, 0, 0, 0, opt); } else { padded_relative_embeddings = relative_embeddings.clone(); } @@ -746,11 +750,13 @@ Mat relative_position_to_absolute_position(const Mat &x, const Option &opt) { Mat x_pad = pad(x, 0, 0, 0, 1, 0, opt); Mat x_flat = x_pad.reshape(length * 2 * length, heads, 1); // 2x20000 // padding - Mat x_flat_pad = pad(x_flat, 0, 0, 0, length - 1, 0, opt); + Mat x_flat_pad = pad(x_flat, 0, 0, 0, + length - 1, 0, opt); // reshape x_flat_pad = x_flat_pad.reshape(2 * length - 1, length + 1, heads); // slice - Mat x_final = Slice(x_flat_pad, 0, length, length - 1, x_flat_pad.w, 1, 1, opt); + Mat x_final = Slice(x_flat_pad, 0, length, length - 1, + x_flat_pad.w, 1, 1, opt); return x_final; } @@ -819,7 +825,7 @@ std::string join_path(const std::string &folder, const std::string &file) { } std::vector> -rfft1d(const fftpack_real *data, const size_t size, const Option &opt) { +rfft1d(const fftpack_real *data, const fftpack_int size, const Option &opt) { std::vector> res; auto *rfin = new fftpack_real[size]; @@ -900,7 +906,7 @@ Mat hanning_window(const int n, const Option &opt) { for (int i = 0; i < res.c; i++) { float *ptr = res.channel(i); for (int j = 0; j < n; j++) { - res[j] = 0.5 - 0.5 * cos(2 * PI * j / (float(n - 1))); + ptr[j] = float(0.5 - 0.5 * cos(2 * PI * j / (float(n - 1)))); } } return res; @@ -942,7 +948,8 @@ stft(const Mat &y, const int filter_length, const int hop_length, const int win_ int pad_window_size = (filter_length - win_length) / 2; // pad window - fft_window = pad(fft_window, 0, 0, pad_window_size, pad_window_size, 0, opt); + fft_window = pad(fft_window, 0, 0, + pad_window_size, pad_window_size, 0, opt); // pad y int pad_y_size = filter_length / 2; @@ -960,7 +967,8 @@ stft(const Mat &y, const int filter_length, const int hop_length, const int win_ #pragma omp parallel for num_threads(opt.num_threads) for (int bl_s = 0; bl_s < y_frames.w; bl_s += n_columns) { int bl_t = std::min(bl_s + n_columns, y_frames.w); - auto y_frame_sliced = Slice(y_frames, 0, y_frames.h, bl_s, bl_t, 1, 1, opt); + auto y_frame_sliced = Slice(y_frames, 0, y_frames.h, + bl_s, bl_t, 1, 1, opt); auto expaned_fft_window = expand(fft_window, y_frame_sliced.w, y_frame_sliced.h, opt); auto fft_input = matproduct(expaned_fft_window, y_frame_sliced, opt); auto fft_output = rfft(reducedims(mattranspose(fft_input, opt)), opt); @@ -991,17 +999,19 @@ Mat Plus(const Mat &m, float value, const Option &opt) { Mat embedding(const Mat &x, const Mat &weight, const Option &opt) { if (x.empty() || weight.empty()) return {}; - Mat output; - int size = x.total(); - output.create(weight.w, size); - if (output.empty()) return output; + Mat output(weight.w, x.w * x.h); + if (output.empty()) return {}; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < size; i++) { - float word_index = ((const float *) x)[i]; - const float *weight_row = weight.row((int) word_index); - float *out_row = output.row(i); - memcpy(out_row, weight_row, weight.w * sizeof(float)); + for (int i = 0; i < x.c; i++){ + const float *x_p = x.channel(i); + const float *w_p = weight.channel(i); + float *ptr = output.channel(i); + for (int j = 0; j < x.w * x.h; j++) { + int word_index = static_cast(x_p[j]); + memcpy(ptr, w_p + weight.w * word_index, weight.w * sizeof(float)); + ptr += output.w; + } } return output; } diff --git a/app/src/main/cpp/vits/utils.h b/app/src/main/cpp/vits/utils.h index 4b75971..3226011 100644 --- a/app/src/main/cpp/vits/utils.h +++ b/app/src/main/cpp/vits/utils.h @@ -58,7 +58,7 @@ Mat flip(const Mat& x, const Option& opt, int dim = 1); Mat gather(Mat& blob, Mat& index, const Option& opt); -Mat generate_path(const Mat& duration, const Mat mask, const Option& opt); +Mat generate_path(const Mat& duration, const Mat& mask, const Option& opt); Mat hanning_window(const int n, const Option& opt); @@ -98,7 +98,7 @@ Mat reducedims(const Mat& m); Mat randn(int w, int h, const Option& opt, int c = 0); -std::vector> rfft1d(const fftpack_real* data, const size_t size, const Option& opt); +std::vector> rfft1d(const fftpack_real* data, const fftpack_int size, const Option& opt); std::vector rfft(const Mat& m, const Option& opt); // rfft for dim 0 diff --git a/app/src/main/cpp/native-lib.cpp b/app/src/main/cpp/vitsncnn_jni.cpp similarity index 91% rename from app/src/main/cpp/native-lib.cpp rename to app/src/main/cpp/vitsncnn_jni.cpp index f4062f7..67a5c7c 100644 --- a/app/src/main/cpp/native-lib.cpp +++ b/app/src/main/cpp/vitsncnn_jni.cpp @@ -3,20 +3,12 @@ #include "vits/SynthesizerTrn.h" #include "audio_process/audio.h" -SynthesizerTrn net_g; -static Nets *nets = nullptr; +static ncnn::UnlockedPoolAllocator g_blob_pool_allocator; +static ncnn::PoolAllocator g_workspace_pool_allocator; +static SynthesizerTrn net_g; static OpenJtalk *openJtalk; static Option opt; -static -void clear_nets() { - if (nets != nullptr) { - delete nets; - nets = nullptr; - LOGI("all nets cleared"); - } -} - static void release_openjtalk(){ if (openJtalk != nullptr){ @@ -28,13 +20,14 @@ void release_openjtalk(){ JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *reserved) { LOGD("JNI_OnLoad"); + ncnn::create_gpu_instance(); return JNI_VERSION_1_4; } JNIEXPORT void JNI_OnUnload(JavaVM *vm, void *reserved) { LOGD("JNI_OnUnload"); release_openjtalk(); - clear_nets(); + ncnn::destroy_gpu_instance(); } // vits utils @@ -98,12 +91,20 @@ JNIEXPORT jboolean JNICALL Java_com_example_moereng_Vits_init_1vits(JNIEnv *env, jobject thiz, jobject asset_manager, jstring path, jboolean voice_convert, jboolean multi, jint n_vocab) { - clear_nets(); - nets = new Nets(); + const char *_path = env->GetStringUTFChars(path, nullptr); auto *assetJni = new AssetJNI(env, thiz, asset_manager); - bool ret = net_g.init(_path, voice_convert, multi, n_vocab, assetJni, nets, opt); + opt.lightmode = true; + opt.use_packing_layout = true; + opt.blob_allocator = &g_blob_pool_allocator; + opt.workspace_allocator = &g_workspace_pool_allocator; + + // use vulkan compute + if (ncnn::get_gpu_count() != 0) + opt.use_vulkan_compute = true; + + bool ret = net_g.init(_path, voice_convert, multi, n_vocab, assetJni, opt); delete assetJni; if (ret) return JNI_TRUE; else return JNI_FALSE; @@ -135,7 +136,7 @@ Java_com_example_moereng_Vits_forward(JNIEnv *env, jobject thiz, jintArray x, jb opt.num_threads = num_threads; LOGD("threads = %d", opt.num_threads); auto start = get_current_time(); - auto output = SynthesizerTrn::forward(data, nets, opt, vulkan, multi, sid, + auto output = net_g.forward(data, opt, vulkan, multi, sid, noise_scale, noise_scale_w, length_scale); auto end = get_current_time(); LOGI("time cost: %f ms", end - start); @@ -167,7 +168,7 @@ Java_com_example_moereng_Vits_voice_1convert(JNIEnv *env, jobject thiz, jfloatAr opt.num_threads = num_threads; LOGD("threads = %d", opt.num_threads); auto start = get_current_time(); - auto output = SynthesizerTrn::voice_convert(audio_mat, raw_sid, target_sid, nets, opt, + auto output = net_g.voice_convert(audio_mat, raw_sid, target_sid, opt, vulkan); auto end = get_current_time(); LOGI("time cost: %f ms", end - start); @@ -177,13 +178,6 @@ Java_com_example_moereng_Vits_voice_1convert(JNIEnv *env, jobject thiz, jfloatAr } -JNIEXPORT void JNICALL -Java_com_example_moereng_Vits_clear(JNIEnv *env, jobject thiz) { - if (nets != nullptr) { - clear_nets(); - } -} - // wave utils JNIEXPORT jbyteArray JNICALL Java_com_example_moereng_utils_audio_WaveUtils_convertAudioPCMToWaveByteArray(JNIEnv *env, diff --git a/app/src/main/java/com/example/moereng/Vits.kt b/app/src/main/java/com/example/moereng/Vits.kt index cefd3f3..1278313 100644 --- a/app/src/main/java/com/example/moereng/Vits.kt +++ b/app/src/main/java/com/example/moereng/Vits.kt @@ -21,8 +21,6 @@ object Vits { vulkan: Boolean, num_threads: Int ): FloatArray - external fun clear() - init { System.loadLibrary("moereng") }