In [None]:
import zipfile
# Unpack the dataset archive into the current working directory.
with zipfile.ZipFile("./YouTube-music-video-5M.zip", mode='r') as archive:
    archive.extractall(path=".")

In [None]:
# Mount Google Drive at /content/drive (google.colab is presumably only
# importable on a Colab runtime - confirm before running elsewhere).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Directory holding the per-shard id lists, and directory receiving the
# extracted feature files.
dir_to_files = "./YouTube-music-video-5M-master/youtube_ids"
dataset_dir = "./drive/MyDrive/dataset2"

# makedirs(exist_ok=True) also creates missing parent directories and is safe
# to re-run, unlike the exists()/mkdir() pair it replaces.
# NOTE(review): if Drive is not mounted this now creates a local directory
# instead of failing - make sure the mount cell has run first.
os.makedirs(dataset_dir, exist_ok=True)
# exteract_feature_queue = []
# ids = []

In [None]:
import os

# os.listdir() returns entries in arbitrary order. The combined id list built
# from these files is later sliced by index into numbered output files, so the
# paths are sorted to make that slicing reproducible across runs and machines.
# NOTE(review): output files written before this change may correspond to a
# different (unsorted) ordering.
files = sorted(dir_to_files + '/' + name for name in os.listdir(dir_to_files))
files

['./YouTube-music-video-5M-master/youtube_ids/youtube_video_ids_02_242353.txt',
 './YouTube-music-video-5M-master/youtube_ids/youtube_video_ids_17_247362.txt',
 './YouTube-music-video-5M-master/youtube_ids/youtube_video_ids_18_250524.txt',
 './YouTube-music-video-5M-master/youtube_ids/youtube_video_ids_04_249301.txt',
 './YouTube-music-video-5M-master/youtube_ids/youtube_video_ids_15_250890.txt',
 './YouTube-music-video-5M-master/youtube_ids/youtube_video_ids_09_251585.txt',
 './YouTube-music-video-5M-master/youtube_ids/youtube_video_ids_01_235614.txt',
 './YouTube-music-video-5M-master/youtube_ids/youtube_video_ids_16_250165.txt',
 './YouTube-music-video-5M-master/youtube_ids/youtube_video_ids_00_206947.txt',
 './YouTube-music-video-5M-master/youtube_ids/youtube_video_ids_07_255766.txt',
 './YouTube-music-video-5M-master/youtube_ids/youtube_video_ids_08_252801.txt',
 './YouTube-music-video-5M-master/youtube_ids/youtube_video_ids_11_248030.txt',
 './YouTube-music-video-5M-master/youtub

In [None]:
# import shutil

# shutil.rmtree('./audio')

In [None]:
import re

def get_music_ids(file_path):
  """
  Return every music id listed in one id file.

  The file is expected to follow this pattern (header lines start with '#'):

    # new artist: Drake 3TVXtAsR1Inumwj472S9r4

    3t195yz9xCc
    VkXjvHfP3MM
    7LnBvuzjpr4

  Every non-empty line that follows a newline and does not start with '#' is
  returned unchanged (nothing is stripped). Because the pattern anchors on the
  preceding newline, the very first line of the file is never returned.

  :param file_path: path of the text file to read
  :return: list of id strings in file order
  """
  regex = r"\n(?![#])(.+)"

  # The context manager closes the handle; it was previously left open.
  with open(file_path, 'r') as f:
    return re.findall(regex, f.read())

In [None]:
# Sanity check: number of ids parsed from the second entry of `files`.
len(get_music_ids(files[1]))

247362

In [None]:
!pip install youtube_dl

Collecting youtube_dl
  Downloading youtube_dl-2021.6.6-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 4.1 MB/s 
[?25hInstalling collected packages: youtube-dl
Successfully installed youtube-dl-2021.6.6


In [None]:
from __future__ import unicode_literals
import youtube_dl


def download_music(music_urls: list):
  """
  Download the audio of every URL and store it as ./audio/<video id>.wav.

  The keys below are the option names of the embedded ``YoutubeDL`` API. The
  previous dict used command-line spellings ('force-ipv4',
  'external-downloader', 'extractaudio', 'audioformat', 'max-filesize',
  'no-warnings'). The API ignores unknown keys, so the WAV conversion, the
  size cap and the aria2c hand-off were never applied and the raw stream was
  merely saved under a ``.wav`` name.

  :param music_urls: list of YouTube watch URLs
  :return: None. URLs that fail are skipped because of 'ignoreerrors'.
  """
  if not music_urls:
    # Nothing to fetch; avoid spinning up a downloader for an empty batch.
    return

  ydl_opts = {
      # 'proxy': 'socks5://5.252.161.48:1080',
      'source_address': '0.0.0.0',            # API equivalent of --force-ipv4
      'external_downloader': 'aria2c',        # used only when aria2c is installed
      'external_downloader_args': ['-x', '16', '-s', '16', '-k', '1M'],
      'ignoreerrors': True,
      'format': 'bestaudio/best',
      # Convert the downloaded stream to real WAV (needs ffmpeg/avconv).
      'postprocessors': [{
          'key': 'FFmpegExtractAudio',
          'preferredcodec': 'wav',
      }],
      # %(ext)s lets the post-processor write <id>.wav; with a hard-coded
      # ".wav" it would see the target already exists and skip conversion.
      'outtmpl': './audio/%(id)s.%(ext)s',    # name the file the ID of the video
      'noplaylist': True,
      'max_filesize': 10 * 1024 * 1024,       # bytes (10 MiB)
      'quiet': True,
      'no_warnings': True,
  }
  with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(music_urls)

In [None]:
def get_youtube_urls(music_ids: list) -> list:
  """
  Turn video ids into full watch-page URLs, keeping the input order.
  e.g. 7ZUMRECYLOQ -> https://www.youtube.com/watch?v=7ZUMRECYLOQ
  """
  to_url = "https://www.youtube.com/watch?v={}".format
  return list(map(to_url, music_ids))

In [None]:
# Audio analysis settings; the two derived values are counts of samples.
sample_rate = 16000  # rate passed to librosa.load when decoding
hop_length = int(sample_rate * 0.01)  # 1% of sample_rate -> 160 samples
n_fft = int(sample_rate * 0.02)  # 2% of sample_rate -> 320 samples; not referenced by the cells visible here

In [None]:
import librosa
import numpy as np

def get_audio_features(file_path: str) -> np.ndarray:
  """
  Load one audio file, compute its CENS chromagram and delete the file.

  :param file_path: path of the audio file to analyse
  :return: the ``librosa.feature.chroma_cens`` result, transposed; or None
           when the file does not exist or cannot be decoded/analysed.
           The file is removed only after a successful extraction.
  """
  # Ids that were filtered out or failed to download have no file on disk;
  # skip them quietly instead of printing one exception per missing file.
  if not os.path.isfile(file_path):
    return None
  try:
    # Pass `sr` and `y` by keyword: librosa 0.10+ accepts them only as
    # keywords, and the positional form raises TypeError there (which the
    # handler below would turn into a silent None for every file).
    x, sr = librosa.load(file_path, sr=sample_rate, res_type="kaiser_fast")
    chromagram = librosa.feature.chroma_cens(y=x, sr=sr, hop_length=hop_length, n_octaves=6)
  except Exception as e:
    # Best effort: report the problem and let the caller skip this track.
    print(e)
    return None
  os.remove(file_path)
  return chromagram.transpose()

In [None]:
import time

def check_limits_for_download(music_urls: list) -> list:
  """
  Keep only the URLs whose reported duration and file size are within limits.

  Metadata is fetched with ``download=False``. A URL is kept when both
  'duration' (<= MAX_DURATION) and 'filesize' (<= MAX_SIZE) are reported and
  within bounds. A missing or None value is treated as over the limit, so
  entries of unknown size are dropped.

  :param music_urls: list of YouTube watch URLs
  :return: the allowed subset of ``music_urls``, in input order
  """
  MAX_DURATION = 600
  MAX_SIZE = 10485760 # 10M

  def within_limit(info, key, limit):
    # A key may be present with value None; `None <= limit` raises TypeError,
    # so None is handled exactly like a missing key.
    value = info.get(key)
    return value is not None and value <= limit

  allowed = list()
  # One instance serves every URL. Only metadata is requested, so downloader
  # options are not needed here.
  with youtube_dl.YoutubeDL({'format': 'bestaudio',
                             'ignoreerrors': True,
                             'quiet': True,
                             'no_warnings': True}) as ydl:
    for url in music_urls:
      info = ydl.extract_info(url, download=False)
      if not info:
        # No metadata came back; pause briefly before the next request.
        time.sleep(0.2)
        continue
      if within_limit(info, 'duration', MAX_DURATION) and within_limit(info, 'filesize', MAX_SIZE):
        allowed.append(url)
  return allowed

  

In [None]:
def block_management(music_ids: list) -> list:
  """
  Process one block of ids: build their URLs, download the ones that pass the
  limit check, then extract features from ./audio/<id>.wav for every id.

  Returns the features that could be extracted (None results are dropped).
  """
  urls = get_youtube_urls(music_ids)
  allowed = check_limits_for_download(urls)
  download_music(allowed)

  # Evaluated lazily, in id order, while the list below is being built.
  extracted = (get_audio_features(f"./audio/{music_id}.wav") for music_id in music_ids)
  features = [feature for feature in extracted if feature is not None]

  print(f'+++++++++++++++++++++++++++++++++++over=>{len(features)}element, allowed: {len(allowed)}+++++++++++++++++++++++++++++')
  return features

In [None]:
# Concatenate the ids of every id file, in the order of `files`.
music_ids_list = []
for ids_file in files:
    music_ids_list.extend(get_music_ids(ids_file))

len(music_ids_list)

5081000

In [None]:
# Batch sizing used by the driver loop in the last cell.
MAX_NUM_PROCESS = 10  # size of the multiprocessing Pool
MAX_NUM_ELEMENT_BLOCK = 20  # ids handed to a single block_management call
MAX_NUM_IDS = 200  # ids whose features are saved into one output file

In [None]:
def save_array(file_name: str, array: np.array, path=dataset_dir):
  """
  Print the number of items and write ``array`` to ``<path>/<file_name>.npy``.

  :param file_name: output file name without the ``.npy`` extension
  :param array: array, or list of arrays, to store
  :param path: target directory (defaults to ``dataset_dir``)

  When the items cannot be stacked into one regular array (for example
  per-track arrays of different lengths), they are stored as a 1-D object
  array. Such a file has to be read with ``np.load(..., allow_pickle=True)``.
  """
  print(len(array))
  try:
    payload = np.asanyarray(array)
  except ValueError:
    # NumPy 1.24+ refuses to build an array from a ragged sequence implicitly;
    # build the 1-D object array that older releases produced.
    payload = np.empty(len(array), dtype=object)
    for index, item in enumerate(array):
      payload[index] = item
  with open(f'{path}/{file_name}.npy', 'wb') as f:
    np.save(f, payload)

In [None]:
from multiprocessing import Pool

# The context manager shuts the worker processes down once the loop finishes
# or fails; the pool was previously never closed.
with Pool(MAX_NUM_PROCESS) as pool:
  for file_id in range(1000, 1010):
    # Ids whose features end up in <dataset_dir>/<file_id>.npy.
    file_music_ids = music_ids_list[file_id * MAX_NUM_IDS: (file_id + 1) * MAX_NUM_IDS]
    # Split into blocks of MAX_NUM_ELEMENT_BLOCK ids. Stepping by the block
    # size avoids the empty trailing block that `len/size + 1` produced
    # whenever the length was an exact multiple of the block size.
    args = [file_music_ids[start: start + MAX_NUM_ELEMENT_BLOCK]
            for start in range(0, len(file_music_ids), MAX_NUM_ELEMENT_BLOCK)]
    results = pool.map(block_management, args)
    save_array(str(file_id), [feature for result in results for feature in result])