<a href="https://colab.research.google.com/github/tg-bomze/TimbreTransfer/blob/master/TimbreTransfer_Rus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<b><font color="black" size="+4">Перенос тембра</font></b>

<b><font color="black" size="+2">Базируется на:</font></b>

**GitHub репозиторий**: [DDSP](https://github.com/magenta/ddsp)

Статья: [DDSP: Differentiable Digital Signal Processing](https://openreview.net/forum?id=B1x1ma4tDr)

Авторы: **[Jesse Engel](https://github.com/jesseengel), Lamtharn (Hanoi) Hantrakul, Chenjie Gu, [Adam Roberts](https://github.com/adarob).**

<b><font color="black" size="+2">Колаб собрал:</font></b>

GitHub: [@tg-bomze](https://github.com/tg-bomze),
Telegram: [@bomze](https://t.me/bomze),
Twitter: [@tg_bomze](https://twitter.com/tg_bomze).



```
Далее тыкай на кнопки (куда указывает красная стрелка) в каждом блоке поочередно. После нажатия дождись окончания выполнения.
```



In [0]:
#@title <b><font color="red" size="+3">←</font><font color="black" size="+3"> Устанавливаем все необходимые компоненты</font></b>

# Colab magic: select the TensorFlow 2.x runtime before TensorFlow is imported below.
%tensorflow_version 2.x

print('Installing from pip package...')
# Quietly install (or upgrade) the ddsp package with pip.
!pip install -qU ddsp

from IPython.display import clear_output
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Ignore a bunch of deprecation warnings
# (the "ignore" filter is installed without a category, so every warning is
# silenced, not only deprecation warnings).
import warnings
warnings.filterwarnings("ignore")

import copy
import os
import time

import crepe
import ddsp
import ddsp.training
from ddsp.colab.colab_utils import (download, play, record, specplot, upload,
                                    DEFAULT_SAMPLE_RATE)
import gin
from google.colab import files
import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds

# Shared constants.
sample_rate = DEFAULT_SAMPLE_RATE  # 16000
# Wipe the installation/import output so only the final status line remains.
clear_output()
print('Готово!')

In [0]:
#@title <b><font color="red" size="+3">←</font><font color="black" size="+3"> Запишите или загрузите (.wav или .mp3) аудио</font></b>

#@markdown * Внимание! Аудио должно быть одноголосным (*один голос или один инструмент*)
#@markdown * Если вы выбрали "**Record**", то запись начнется сразу же, как появится надпись "**Starting recording for X seconds...**"

# Input source chosen in the Colab form.
record_or_upload = "Record"  #@param ["Record", "Upload (.mp3 or .wav)"]

# Recording length in seconds; only read when "Record" is selected.
record_seconds =   10  #@param {type:"number", min:1, max:10, step:1}

if record_or_upload == "Record":
  audio = record(seconds=record_seconds)
else:
  # Load audio sample here (.mp3 or .wav file)
  # Just use the first file.
  filenames, audios = upload()
  audio = audios[0]
# Prepend an axis of length 1; later cells index the sample axis as
# audio.shape[1] (presumably the leading axis is the batch — TODO confirm).
audio = audio[np.newaxis, :]
clear_output()

# Plot.
specplot(audio)
play(audio)

# Setup the session.
ddsp.spectral_ops.reset_crepe()

# Compute features.
# NOTE(review): start_time is assigned but never read in the visible cells.
start_time = time.time()
audio_features = ddsp.training.eval_util.compute_audio_features(audio)
# Store loudness as float32.
audio_features['loudness_db'] = audio_features['loudness_db'].astype(np.float32)
# Reset any previous modification; the modification cell rebuilds this from
# audio_features.
audio_features_mod = None

In [0]:
#@title <b><font color="red" size="+3">←</font><font color="black" size="+3"> Выберите инструмент</font></b>
#@markdown * **Violin** (Скрипка)

#@markdown * **Flute** (Флейта)

#@markdown * **Trumpet** (Труба)

#@markdown * **Tenor_Saxophone** (Тенор-саксофон)

#@markdown * или загрузите свою собственную модель

#@markdown ---

model = 'Violin' #@param ['Violin', 'Flute', 'Flute2', 'Trumpet', 'Tenor_Saxophone','Upload your own (checkpoint folder as .zip)']
# Keep the selected name under a second variable: `model` is rebound to the
# network object at the end of this cell, while the auto-adjust step in the
# modification cell still needs the name and reads it from MODEL.
MODEL = model
#@markdown ---
# Bucket path the pretrained checkpoints are copied from.
GCS_CKPT_DIR = 'gs://ddsp/models/tf2'

def find_model_dir(dir_name):
  """Return the first directory under `dir_name` that holds a gin config.

  Walks `dir_name` with `os.walk` and stops at the first directory that
  contains a file whose name ends in ".gin" and does not start with "."
  (hidden files are skipped).

  Args:
    dir_name: Root directory to search.

  Returns:
    Path of the first matching directory, in `os.walk` order.

  Raises:
    FileNotFoundError: If no matching file exists anywhere under `dir_name`.
  """
  for root, _, filenames in os.walk(dir_name):
    # Return immediately: a `break` here would only leave the per-file check
    # and let the walk carry on into later directories.
    if any(name.endswith(".gin") and not name.startswith(".")
           for name in filenames):
      return root
  # Fail with an explicit message instead of an unbound-variable error.
  raise FileNotFoundError(
      'No .gin config file found under "{}".'.format(dir_name))


# Resolve model_dir and gin_file, either from a bundled pretrained checkpoint
# or from a user-uploaded zip archive.
if model in ('Violin', 'Flute', 'Flute2', 'Trumpet', 'Tenor_Saxophone'):
  # Pretrained models.
  PRETRAINED_DIR = '/content/pretrained'
  # Copy over from gs:// for faster loading.
  # Shell output is redirected to /dev/null, so a failed copy is not reported
  # at this point.
  !rm -r $PRETRAINED_DIR &> /dev/null
  !mkdir $PRETRAINED_DIR &> /dev/null
  model_dir = os.path.join(GCS_CKPT_DIR, 'solo_%s_ckpt' % model.lower())
  !gsutil cp $model_dir/* $PRETRAINED_DIR &> /dev/null
  # From here on, work with the local copy.
  model_dir = PRETRAINED_DIR
  gin_file = os.path.join(model_dir, 'operative_config-0.gin')

else:
  # User models.
  UPLOAD_DIR = '/content/uploaded'
  !mkdir $UPLOAD_DIR
  uploaded_files = files.upload()

  # Unpack every uploaded archive into the same folder.
  for fnames in uploaded_files.keys():
    print("Unzipping... {}".format(fnames))
    !unzip -o "/content/$fnames" -d $UPLOAD_DIR &> /dev/null
  model_dir = find_model_dir(UPLOAD_DIR)
  # NOTE(review): find_model_dir accepts any non-hidden *.gin file, but the
  # config is then expected under this exact file name.
  gin_file = os.path.join(model_dir, 'operative_config-0.gin')

# Parse gin config,
with gin.unlock_config():
  gin.parse_config_file(gin_file, skip_unknown=True)

# Assumes only one checkpoint in the folder, 'ckpt-[iter]`.
# The checkpoint name is the part of the first matching file name before the
# first '.'; indexing [0] raises IndexError if no file name contains 'ckpt'.
ckpt_files = [f for f in tf.io.gfile.listdir(model_dir) if 'ckpt' in f]
ckpt_name = ckpt_files[0].split('.')[0]
ckpt = os.path.join(model_dir, ckpt_name)

# Ensure dimensions and sampling rates are equal
# Samples per time step, derived from the configuration the model was trained with.
time_steps_train = gin.query_parameter('DefaultPreprocessor.time_steps')
n_samples_train = gin.query_parameter('Additive.n_samples')
hop_size = int(n_samples_train / time_steps_train)

# Whole number of time steps that fit in the input audio; n_samples is rounded
# down to a multiple of hop_size accordingly.
time_steps = int(audio.shape[1] / hop_size)
n_samples = time_steps * hop_size

# print("===Trained model===")
# print("Time Steps", time_steps_train)
# print("Samples", n_samples_train)
# print("Hop Size", hop_size)
# print("\n===Resynthesis===")
# print("Time Steps", time_steps)
# print("Samples", n_samples)
# print('')

# Override the trained lengths with the lengths of the current input.
gin_params = [
    'Additive.n_samples = {}'.format(n_samples),
    'FilteredNoise.n_samples = {}'.format(n_samples),
    'DefaultPreprocessor.time_steps = {}'.format(time_steps),
]

with gin.unlock_config():
  gin.parse_config(gin_params)


# Trim all input vectors to correct lengths 
for key in ['f0_hz', 'f0_confidence', 'loudness_db']:
  audio_features[key] = audio_features[key][:time_steps]
audio_features['audio'] = audio_features['audio'][:, :n_samples]


# Set up the model just to predict audio given new conditioning
# NOTE: this rebinds `model` from the selected name (a string) to the network
# object; the selected name stays available as MODEL.
model = ddsp.training.models.Autoencoder()
model.restore(ckpt)

# Build model by running a batch through it.
# NOTE(review): start_time is assigned but never read in the visible cells.
start_time = time.time()
_ = model(audio_features, training=False)

In [0]:
#@title <b><font color="red" size="+3">←</font><font color="black" size="+3"> Модификация и синтез аудио</font></b>

#@markdown Автокорректировка средней громкости, частоты и высоты тона (данная опция не всегда дает хороший результат):

auto_adjust = True #@param{type:"boolean"}


#@markdown *Контроль октавы (звук более глухой или звонкий):*
f0_octave_shift =  0 #@param {type:"slider", min:-2, max:2, step:1}
#@markdown *Контроль доверительного интервала звучания:*
f0_confidence_threshold =  0 #@param {type:"slider", min:0.0, max:1.0, step:0.05}
#@markdown *Контроль уровня громкости:*
loudness_db_shift = 0 #@param {type:"slider", min:-20, max:20, step:1}

#@markdown Экспериментируя с бегунками, вы можете добиваться более реалистичного звучания

# Work on per-entry copies: the helpers below edit their input in place, so
# copying keeps `audio_features` untouched and lets this cell be re-run from
# the unmodified features.
audio_features_mod = {k: v.copy() for k, v in audio_features.items()}


## Helper functions.
def shift_ld(audio_features, ld_shift=0.0):
  """Add a constant offset to the loudness curve.

  The offset is applied with an in-place add, so the `audio_features` mapping
  passed in is modified.

  Args:
    audio_features: Mapping with a 'loudness_db' entry.
    ld_shift: Offset added to every 'loudness_db' value.

  Returns:
    The same `audio_features` object that was passed in.
  """
  loudness = audio_features['loudness_db']
  loudness += ld_shift
  audio_features['loudness_db'] = loudness
  return audio_features


def shift_f0(audio_features, f0_octave_shift=0.0):
  """Transpose f0 by a number of octaves and clamp the result.

  'f0_hz' is first scaled in place by 2 ** f0_octave_shift, then replaced by
  a copy clipped to the range from 0 Hz to the frequency of MIDI note 110.

  Args:
    audio_features: Mapping with an 'f0_hz' entry.
    f0_octave_shift: Number of octaves to shift by; negative values shift down.

  Returns:
    The same `audio_features` object that was passed in.
  """
  octave_ratio = 2.0 ** (f0_octave_shift)
  audio_features['f0_hz'] *= octave_ratio

  f0_ceiling_hz = librosa.midi_to_hz(110.0)
  clipped_f0 = np.clip(audio_features['f0_hz'], 0.0, f0_ceiling_hz)
  audio_features['f0_hz'] = clipped_f0
  return audio_features


def mask_by_confidence(audio_features, confidence_level=0.1):
  """Zero out f0 wherever the pitch confidence is below a threshold.

  Entries of 'f0_hz' whose 'f0_confidence' is strictly less than
  `confidence_level` are set to 0.0 in place. Loudness is left untouched:
  per the original author's note, for the violin model the masking causes
  fast dips in loudness, and that quick transient is interpreted by the
  model as the "plunk" sound.

  Args:
    audio_features: Mapping with 'f0_hz' and 'f0_confidence' entries.
    confidence_level: Threshold below which f0 is zeroed.

  Returns:
    The same `audio_features` object that was passed in.
  """
  low_confidence = audio_features['f0_confidence'] < confidence_level
  pitch = audio_features['f0_hz']
  pitch[low_confidence] = 0.0
  return audio_features


def smooth_loudness(audio_features, filter_size=3):
  """Replace the loudness curve with its moving average.

  'loudness_db' is convolved with a uniform kernel of `filter_size` taps that
  sum to one; the output has the same length as the input.

  NOTE(review): `np.convolve(..., mode='same')` behaves as if the signal were
  zero-padded, so values near both ends are averaged with zeros.

  Args:
    audio_features: Mapping with a 'loudness_db' entry.
    filter_size: Number of taps in the averaging kernel.

  Returns:
    The same `audio_features` object that was passed in.
  """
  box_kernel = np.ones([filter_size]) / float(filter_size)
  smoothed = np.convolve(audio_features['loudness_db'], box_kernel, mode='same')
  audio_features['loudness_db'] = smoothed
  return audio_features

if auto_adjust:
  # Per-instrument presets keyed by the model name chosen in the form:
  # (peak loudness, mean loudness, mean pitch as a MIDI note number).
  # One table drives both the "is this a known model?" check and the lookups,
  # so a name can never pass the check yet be missing from the presets.
  model_stats = {
      'Violin': (-34.0, -44.0, 73.0),
      'Flute': (-45.0, -51.0, 81.0),
      'Flute2': (-44.0, -53.0, 74.0),
      'Trumpet': (-52.3, -69.2, 65.8),
      'Tenor_Saxophone': (-31.2, -50.8, 57.8),
  }
  if MODEL in model_stats:
    model_ld_avg_max, model_ld_mean, model_p_mean = model_stats[MODEL]

    # Adjust the peak loudness: shift so the input's maximum matches the preset.
    ld_max = np.max(audio_features['loudness_db'])
    ld_diff_max = model_ld_avg_max - ld_max
    audio_features_mod = shift_ld(audio_features_mod, ld_diff_max)

    # Further adjust the average loudness above a threshold.
    # `l` refers to the array stored in audio_features_mod; shift_ld updates
    # that array in place, so `l` also reflects the second shift when it is
    # used as a mask for the pitch statistics below.
    l = audio_features_mod['loudness_db']
    ld_thresh = -70.0
    ld_mean = np.mean(l[l > ld_thresh])
    ld_diff_mean = model_ld_mean - ld_mean
    audio_features_mod = shift_ld(audio_features_mod, ld_diff_mean)

    # Shift the pitch register by a whole number of octaves.
    p = librosa.hz_to_midi(audio_features['f0_hz'])
    # A frequency of 0 Hz maps to -inf; count those frames as pitch 0.
    p[p == -np.inf] = 0.0
    p_mean = p[l > ld_thresh].mean()
    p_diff = model_p_mean - p_mean
    p_diff_octave = p_diff / 12.0  # 12 semitones per octave.
    # NOTE(review): ceil is used for every difference up to 1.5 octaves, so
    # even a small positive difference rounds up to a full octave — confirm
    # this rounding rule is intended.
    round_fn = np.floor if p_diff_octave > 1.5 else np.ceil
    p_diff_octave = round_fn(p_diff_octave)
    audio_features_mod = shift_f0(audio_features_mod, p_diff_octave)

  else:
    print('\nUser uploaded model: disabling auto-adjust.')

  
# Apply the manual slider settings on top of any automatic adjustment.
audio_features_mod = shift_ld(audio_features_mod, loudness_db_shift)
audio_features_mod = shift_f0(audio_features_mod, f0_octave_shift)
audio_features_mod = mask_by_confidence(audio_features_mod, f0_confidence_threshold)

# Resynthesize Audio.
# NOTE(review): audio_features_mod is assigned a dict at the top of this cell,
# so the `is None` fallback is not expected to trigger here.
af = audio_features if audio_features_mod is None else audio_features_mod

# Run a batch of predictions.
# NOTE(review): start_time is assigned but never read in the visible cells.
start_time = time.time()
audio_gen = model(af, training=False)

# Banner telling the user how to download the result from the audio player.
print('----------------------------------------------------------------------------------------------------')
print('- Для скачивания аудиозаписи нажмите правой кнопкой мыши на плеер и выберите "Сохранить аудио как" -')
print('----------------------------------------------------------------------------------------------------')
print('\n')

# Audio.
print('Resynthesis')
play(audio_gen)

print('Original')
play(audio)

# Plot
# Each title call follows the specplot call it labels.
specplot(audio_gen)
plt.title("Resynthesis")

specplot(audio)
# Assign to `_` so the returned title object is not echoed as cell output.
_ = plt.title("Original")