Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

转换后的模型推理出的中文音频不对 #17

Open
potatoker opened this issue Oct 12, 2023 · 0 comments
Open

转换后的模型推理出的中文音频不对 #17

potatoker opened this issue Oct 12, 2023 · 0 comments

Comments

@potatoker
Copy link

potatoker commented Oct 12, 2023

我有一个 VITS 模型，其原始 config 中的 text_cleaners 为 chinese_cleaners，原始模型本身推理正常。使用 https://github.com/weirdseed/vits-ncnn-convert-tool 转换后，我将 config 中的 chinese_cleaners 改为 zh_ja_mixture_cleaners，然后把模型放入 SD 卡，在 Vits-Android-ncnn App 中加载，加载时没有报错，但生成的音频是意义不明的奇怪发音。请问有什么办法可以排查错误原因吗？我的完整 config 如下：

{
  "train": {
    "log_interval": 10,
    "eval_interval": 100,
    "seed": 1234,
    "epochs": 10000,
    "learning_rate": 0.0002,
    "betas": [
      0.8,
      0.99
    ],
    "eps": 1e-09,
    "batch_size": 16,
    "fp16_run": true,
    "lr_decay": 0.999875,
    "segment_size": 8192,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45,
    "c_kl": 1.0
  },
  "data": {
    "training_files": "final_annotation_train.txt",
    "validation_files": "final_annotation_val.txt",
    "text_cleaners": [
      "zh_ja_mixture_cleaners"
    ],
    "max_wav_value": 32768.0,
    "sampling_rate": 16000,
    "filter_length": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": null,
    "add_blank": true,
    "n_speakers": 1,
    "cleaned_text": true
  },
  "model": {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0.1,
    "resblock": "1",
    "resblock_kernel_sizes": [
      3,
      7,
      11
    ],
    "resblock_dilation_sizes": [
      [
        1,
        3,
        5
      ],
      [
        1,
        3,
        5
      ],
      [
        1,
        3,
        5
      ]
    ],
    "upsample_rates": [
      8,
      8,
      2,
      2
    ],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [
      16,
      16,
      4,
      4
    ],
    "n_layers_q": 3,
    "use_spectral_norm": false,
    "gin_channels": 256
  },
  "speakers": ["\u0078\u0075\u0065\u006c\u0069\u006e\u0067"],
  "symbols": [
    "_",
    "\uff0c",
    "\u3002",
    "\uff01",
    "\uff1f",
    "\u2014",
    "\u2026",
    "\u3105",
    "\u3106",
    "\u3107",
    "\u3108",
    "\u3109",
    "\u310a",
    "\u310b",
    "\u310c",
    "\u310d",
    "\u310e",
    "\u310f",
    "\u3110",
    "\u3111",
    "\u3112",
    "\u3113",
    "\u3114",
    "\u3115",
    "\u3116",
    "\u3117",
    "\u3118",
    "\u3119",
    "\u311a",
    "\u311b",
    "\u311c",
    "\u311d",
    "\u311e",
    "\u311f",
    "\u3120",
    "\u3121",
    "\u3122",
    "\u3123",
    "\u3124",
    "\u3125",
    "\u3126",
    "\u3127",
    "\u3128",
    "\u3129",
    "\u02c9",
    "\u02ca",
    "\u02c7",
    "\u02cb",
    "\u02d9",
    " "
  ]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant