# AUDIO-SR

### Paper - https://arxiv.org/pdf/2309.07314.pdf
### GitHub - https://audioldm.github.io/audiosr

This is the bare-minimum code needed to run AudioSR on a custom audio clip and enhance the speech recording.

In [None]:
!pip3 install audiosr==0.0.5
!pip install pydub



In [None]:
import librosa
import IPython
from IPython.core.display import display
import soundfile
import os
import soundfile
from pydub import AudioSegment

In [None]:
def get_wav_duration(wav_file_path: str) -> int:
    """Return the length of an audio file in whole seconds.

    The frame count and sample rate are taken from ``soundfile.info`` so the
    samples do not have to be decoded into an array just to be counted.

    Args:
        wav_file_path: Path to an audio file that ``soundfile`` can open.

    Returns:
        The duration in seconds as an ``int``. The fractional part is
        truncated, so a 113.9 s file reports 113.
    """
    info = soundfile.info(wav_file_path)
    return int(info.frames / info.samplerate)

In [None]:
import os
import torch
import logging
import argparse
from audiosr import super_resolution, build_model, save_wave

# Configure the root logger to let INFO-level (and higher) records through,
# and create a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# NOTE(review): presumably consumed by the Hugging Face tokenizers library to
# allow parallel tokenization -- confirm it is actually needed here.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Select PyTorch's "high" float32 matmul precision mode; see the
# torch.set_float32_matmul_precision documentation for what it permits.
torch.set_float32_matmul_precision("high")


# Models built so far, keyed by (model_name, device), so that calling main()
# once per audio chunk does not rebuild the model for every chunk.
_MODEL_CACHE = {}


def _get_model(model_name, device):
    """Return the AudioSR model for this name/device, building it on first use."""
    key = (model_name, device)
    if key not in _MODEL_CACHE:
        _MODEL_CACHE[key] = build_model(model_name=model_name, device=device)
    return _MODEL_CACHE[key]


def main(args):
    """Run AudioSR super-resolution on one file and save the result.

    Args:
        args: Dict of settings. Required keys are ``"model_name"``,
            ``"input_path"`` and ``"save_path"``. Optional keys and their
            defaults are ``"device"`` ("auto"), ``"seed"`` (42),
            ``"guidance_scale"`` (3.5) and ``"ddim_steps"`` (50).

    Returns:
        None. The waveform is passed to ``save_wave`` with the base name
        "output" and a 48000 Hz sample rate.

    Raises:
        KeyError: If a required key is missing from ``args``.
    """
    model = _get_model(args["model_name"], args.get("device", "auto"))

    # Sampling settings come from the caller; the defaults are the values
    # that used to be hard-coded here.
    waveform = super_resolution(
        model,
        args["input_path"],
        seed=args.get("seed", 42),
        guidance_scale=args.get("guidance_scale", 3.5),
        ddim_steps=args.get("ddim_steps", 50),
        latent_t_per_second=12.8,
    )

    save_wave(waveform, args["save_path"], name="output", samplerate=48000)


def get_upscaled_audio_path(wav_file_path):
    """Upscale one WAV file with AudioSR and return the path of the result.

    Background on the method: https://arxiv.org/pdf/2309.07314.pdf

    Args:
        wav_file_path: Path to the input waveform file.

    Returns:
        Absolute path of the ``output.wav`` file produced by ``main``.
        The path is derived from the save directory handed to ``main``
        instead of assuming the working directory is ``/content``.
    """
    # An empty save directory makes the output land in the current working
    # directory (the run log reports "Save audio to output.wav.").
    save_dir = ""
    args = {
        "input_path": wav_file_path,
        "save_path": save_dir,
        # Name of the model to be used: "basic" or "speech".
        "model_name": "speech",
        # Let the library choose the device instead of assuming a GPU.
        "device": "auto",
        # The sampling step for DDIM.
        "ddim_steps": 50,
        # Guidance scale (large => better quality and relevancy;
        # small => better diversity).
        "guidance_scale": 3.5,
        # Changing this integer leads to a different generation result.
        "seed": 42,
    }
    main(args)
    return os.path.abspath(os.path.join(save_dir, "output.wav"))


In [None]:

# Upscale the recording in fixed-length chunks and collect the enhanced
# chunks in playback order.
chunk_length_ms = 5000
wav_file_path = "/content/Recording3.wav"
# One absolute path is used both to write the chunk and to hand it to the
# upscaler, so the two cannot disagree about the working directory.
temp_chunk_path = os.path.abspath("temp.wav")
duration = get_wav_duration(wav_file_path)  # whole seconds, for the log line

# Load the source once instead of re-reading it for every chunk.
source_audio = AudioSegment.from_wav(wav_file_path)
# len() of an AudioSegment is its length in milliseconds; using it as the
# loop bound keeps the trailing sub-second part that the truncated
# whole-second duration could drop.
total_ms = len(source_audio)

upscaled_fragments_list = []
start = 0
end = 0
while start < total_ms:
    # Compute the chunk end before logging so the message shows the range
    # that is actually being processed.
    end = min(start + chunk_length_ms, total_ms)
    print(f"duration: {duration} start: {start} end: {end}")
    source_audio[start:end].export(temp_chunk_path, format="wav")
    output_path = get_upscaled_audio_path(temp_chunk_path)
    upscaled_fragments_list.append(AudioSegment.from_wav(output_path))
    start = end





duration: 113 start: 0 end: 0
Loading AudioSR: speech
Loading model on cuda:0




DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:10<00:00,  4.61it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 5000 end: 5000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:10<00:00,  4.65it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 10000 end: 10000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:10<00:00,  4.71it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 15000 end: 15000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:09<00:00,  5.49it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 20000 end: 20000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:08<00:00,  5.62it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 25000 end: 25000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:10<00:00,  4.78it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 30000 end: 30000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:10<00:00,  4.70it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 35000 end: 35000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:09<00:00,  5.05it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 40000 end: 40000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:08<00:00,  5.77it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 45000 end: 45000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:09<00:00,  5.35it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 50000 end: 50000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:10<00:00,  4.71it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 55000 end: 55000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:10<00:00,  4.84it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 60000 end: 60000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:08<00:00,  5.76it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 65000 end: 65000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:09<00:00,  5.33it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 70000 end: 70000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:10<00:00,  4.70it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 75000 end: 75000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:10<00:00,  4.74it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 80000 end: 80000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:09<00:00,  5.23it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 85000 end: 85000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:08<00:00,  5.71it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 90000 end: 90000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:09<00:00,  5.24it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 95000 end: 95000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:10<00:00,  4.67it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 100000 end: 100000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:10<00:00,  4.72it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 105000 end: 105000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:09<00:00,  5.17it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.
duration: 113 start: 110000 end: 110000
Loading AudioSR: speech
Loading model on cuda:0
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [00:08<00:00,  5.69it/s]


[98m Don't forget to try different seeds by setting --seed <int> so that AudioSR can have optimal performance on your hardware.[00m
Save audio to output.wav.


In [None]:
# Join the upscaled chunks back into a single segment, in order. The first
# chunk is the starting value, so an empty list still raises IndexError.
sound_final = sum(upscaled_fragments_list[1:], upscaled_fragments_list[0])

In [None]:
# Leave the combined segment as the cell's last expression so the notebook
# displays it.
sound_final

In [None]:
# Write the stitched, upscaled recording to disk in WAV format.
sound_final.export("/content/recording3_audioSR.wav", format="wav")

<_io.BufferedRandom name='/content/recording3_audioSR.wav'>