# Install spleeter

In [None]:
!apt install ffmpeg
!apt install imagemagick
!pip install virtualenv
!virtualenv spleeter-env
!virtualenv whisper-env


Install Spleeter in a separate venv

In [None]:
!source spleeter-env


In [None]:
pip install spleeter

Install Whisper in a separate venv.

In [3]:
from IPython.display import Audio

# Separate from command line

In [None]:
# @title 🌴 Change the values in this section

## @markdown Select the source of the audio/video file to be transcribed
#input_format = "youtube" #@param ["youtube", "gdrive", "local"]

# https://hindi2.djpunjab.app/load/_YUa7Bq9EGGsRerDnJIktw==/Yamma%20Yamma%20\(From%20Shaan\).mp3

# @markdown Enter the URL of the YouTube video or the path of the audio file to be transcribed
file = "https://hindi2.djpunjab.app/load/_YUa7Bq9EGGsRerDnJIktw==/Yamma%20Yamma%20\\(From%20Shaan\\).mp3" #@param {type:"string"}
!wget -O 'song.mp3' $file

In [None]:
# Inline player for the downloaded track (song.mp3 is written by the wget cell).
Audio('song.mp3')

In [None]:
# Show the options accepted by `spleeter separate`.
!spleeter separate --help

In [None]:
# Separate song.mp3 into stems under output/; later cells read
# output/song/vocals.wav and output/song/accompaniment.wav.
!spleeter separate -o output/ 'song.mp3'

In [None]:
# List the files produced for this song.
!ls output/song

In [None]:
# Plays the instrumental stem; pass 'output/song/vocals.wav' instead to audition
# the vocal stem.
Audio('output/song/accompaniment.wav')

In [None]:
!source whisper-env
# Fix for whisper + torch DispatchKey.Meta bug in Colab
!pip uninstall -y torch torchvision torchaudio
!pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2
!pip install -U openai-whisper


In [None]:
!source whisper-env

In [None]:
import whisper
import json
import gc
import torch

# Transcribe the isolated vocal stem. The language is passed explicitly, so the
# value printed below is the one supplied here rather than a detected one.
model = whisper.load_model("medium") # base, small, medium, large
result = model.transcribe("output/song/vocals.wav", language='HI')
print(result["language"])  # language associated with this transcription
print(result["text"])      # full transcript as a single string

# Persist the timed segments for the video cell. ensure_ascii=False stores the
# transcript as readable UTF-8 text instead of \uXXXX escapes; json.load returns
# the same data either way.
with open("output/song/segments.json", "w", encoding="utf-8") as f:
    json.dump(result["segments"], f, indent=3, ensure_ascii=False)

# Drop the model and its output before the video step, then ask Python (and
# CUDA, when present) to hand the memory back.
del model
del result
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# List the installed Noto fonts; the video cell loads NotoSansMono-Regular.ttf
# from this directory.
!ls /usr/share/fonts/truetype/noto/


Convert Song to Karaoke Video

In [None]:
from moviepy.editor import AudioFileClip, ImageClip, CompositeVideoClip
from PIL import Image, ImageDraw, ImageFont
from moviepy.editor import ColorClip
import os, json

# --- Rendering configuration -------------------------------------------------
# Typefaces: FONT_PATH draws the original lyric line, FONT_Roman its romanised
# counterpart; both are rendered at FONT_SIZE.
FONT_PATH = "Mangal Regular.ttf"  # adjust for your OS
FONT_Roman = '/usr/share/fonts/truetype/noto/NotoSansMono-Regular.ttf'
FONT_SIZE = 48

# Frame geometry (width, height) in pixels and output frame rate.
SCREEN_SIZE = (1280, 720)
LINE_HEIGHT = 60  # not referenced by the code visible in this notebook
FPS = 12

# Inputs written by the transcription and separation cells.
segmentFile = 'output/song/segments.json'
intrumentFile = 'output/song/accompaniment.wav'
def make_line_image(text, roman, size=SCREEN_SIZE, highlight=False):
    """Render a lyric line and its romanised form onto an opaque black frame.

    Args:
        text: Lyric line, drawn with ``FONT_PATH`` 150 px above the bottom edge.
        roman: Romanised line, drawn with ``FONT_Roman`` 400 px above the
            bottom edge.
        size: ``(width, height)`` of the frame in pixels.
        highlight: Draw the text in yellow when true, white otherwise.

    Returns:
        A new RGBA ``PIL.Image.Image`` of the requested size.
    """
    frame = Image.new("RGBA", size, (0, 0, 0, 255))
    draw = ImageDraw.Draw(frame)
    color = (255, 255, 0, 255) if highlight else (255, 255, 255, 255)
    width, height = size

    def draw_centered(line, font, y):
        # Measure each line with its own font: the two lines use different
        # typefaces and lengths, so reusing one line's offset for the other
        # would leave the second line off-centre.
        left, _, right, _ = draw.textbbox((0, 0), line, font=font)
        x = (width - (right - left)) // 2
        draw.text((x, y), line, font=font, fill=color)

    draw_centered(text, ImageFont.truetype(FONT_PATH, FONT_SIZE), height - 150)
    draw_centered(roman, ImageFont.truetype(FONT_Roman, FONT_SIZE), height - 400)

    return frame

def main():
    """Build the karaoke video: timed lyric frames over the instrumental track.

    Reads the transcript segments from ``segmentFile``, renders one frame per
    segment with ``make_line_image`` (original text plus ITRANS romanisation),
    shows each frame between the segment's ``start`` and ``end`` times on a
    black background, and writes ``output/song/karaoke_fast.mp4`` with
    ``intrumentFile`` as the soundtrack.
    """
    # Imported locally so the function does not depend on a later notebook cell
    # having been executed first. Requires the indic-transliteration package.
    import tempfile
    from indic_transliteration.sanscript import transliterate, DEVANAGARI, ITRANS

    with open(segmentFile, "r", encoding="utf-8") as f:
        segments = json.load(f)

    audio = AudioFileClip(intrumentFile)
    try:
        # To use a picture instead of a plain black backdrop:
        # background = ImageClip("bg.jpg").set_duration(audio.duration).resize(SCREEN_SIZE)
        background = ColorClip(size=SCREEN_SIZE, color=(0, 0, 0)).set_duration(audio.duration)

        # Frames go to a private temporary directory that is removed even when
        # rendering fails, instead of littering (and later glob-deleting from)
        # the working directory.
        with tempfile.TemporaryDirectory(prefix="karaoke_frames_") as frame_dir:
            clips = []
            for i, seg in enumerate(segments):
                duration = seg["end"] - seg["start"]
                if duration <= 0:
                    # Nothing to show for an empty or inverted time span.
                    continue

                roman = transliterate(seg["text"], DEVANAGARI, ITRANS)
                img = make_line_image(seg["text"], roman, highlight=True)
                fname = os.path.join(frame_dir, f"_line_{i}.png")
                img.save(fname)

                clip = ImageClip(fname).set_start(seg["start"]).set_duration(duration).fadein(0.3).fadeout(0.3)
                clips.append(clip)

            final = CompositeVideoClip([background] + clips).set_audio(audio).set_duration(audio.duration)
            final.write_videofile("output/song/karaoke_fast.mp4", fps=FPS)
    finally:
        # Release the audio reader whether or not rendering succeeded.
        audio.close()

if __name__ == "__main__":
    main()


In [None]:
!pip install indic-transliteration

In [None]:
from indic_transliteration.sanscript import transliterate, DEVANAGARI, ITRANS
import os, json

# Preview: print every transcribed line next to its ITRANS romanisation.
segmentFile = 'output/song/segments.json'
with open(segmentFile, "r", encoding="utf-8") as f:
    segments = json.load(f)

for seg in segments:
    dev_text = seg["text"]
    roman = transliterate(dev_text, DEVANAGARI, ITRANS)
    print(f"{dev_text} : {roman}")  # "<original line> : <romanised line>"