## Install dependencies and import libraries

In [1]:
!pip install selenium
!apt-get update
!apt-get install -y wget
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!apt-get install -y ./google-chrome-stable_current_amd64.deb
!wget https://storage.googleapis.com/chrome-for-testing-public/136.0.7103.0/linux64/chromedriver-linux64.zip
!unzip chromedriver-linux64.zip
!mv chromedriver-linux64/chromedriver /usr/bin/chromedriver
!chmod +x /usr/bin/chromedriver



import os
import requests
import time
import json

from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import sys
# NOTE(review): this prepends a chromedriver location to Python's module search
# path (sys.path). The driver binary is passed explicitly to Service(...)
# elsewhere in this notebook, so this line looks like a leftover; confirm
# before removing.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')
from selenium.webdriver.chrome.options import Options

Collecting selenium
  Downloading selenium-4.32.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.32.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1

In [2]:
# Browser configuration: every switch below is handed to Chrome at start-up
chrome_options = Options()
for chrome_flag in (
    "--headless",  # Run without interface
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "window-size=1920x1080",
):
    chrome_options.add_argument(chrome_flag)

# Launch Chrome through the chromedriver binary installed at /usr/bin
service = Service("/usr/bin/chromedriver")
driver = webdriver.Chrome(service=service, options=chrome_options)

# Waiting policy: a 15 s implicit wait on the driver, plus a shared explicit
# wait object with the same 15 s timeout
driver.implicitly_wait(15)
wait = WebDriverWait(driver, 15)

## Extract links from menu

In [4]:
def extract_audio_links_from_menu(menu_url, driver):
    """Collect the track-page URLs listed on one menu (listing) page.

    Args:
        menu_url: URL of the listing page to open.
        driver: Selenium WebDriver used to load and query the page.

    Returns:
        list: ``href`` values of the track links, in page order. Items that
        have no track link, or whose ``href`` is missing/empty, are skipped.

    Raises:
        selenium.common.exceptions.TimeoutException: if the list container
            is not present within 15 seconds.
    """
    driver.get(menu_url)
    # Wait on the driver that was passed in (not on a module-level wait bound
    # to some other driver), so the function works with any WebDriver instance.
    container = WebDriverWait(driver, 15).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "div.w-full.flex.flex-col.gap-3.pt-3")
    ))
    links = []
    for item in container.find_elements(By.CSS_SELECTOR, "div.play-item"):
        try:
            href = item.find_element(By.CSS_SELECTOR, ".ptxt-track a").get_attribute("href")
        except Exception:
            # Best effort: an item without a track link is simply skipped.
            continue
        # get_attribute returns None when the attribute is absent; never hand
        # a None/empty URL to the caller.
        if href:
            links.append(href)
    return links

## Extract information

In [5]:
def extract_track_info(driver):
    """Read the JSON track description embedded in the current page.

    Waits up to 15 seconds for a ``div`` carrying a ``data-track-info``
    attribute, then returns that attribute's value parsed as JSON.
    """
    locator = (By.CSS_SELECTOR, "div[data-track-info]")
    track_div = WebDriverWait(driver, 15).until(EC.presence_of_element_located(locator))
    raw_payload = track_div.get_attribute("data-track-info")
    return json.loads(raw_payload)

def extract_genres(driver):
    """Return the genre names shown on the current track page.

    Args:
        driver: Selenium WebDriver positioned on a track page.

    Returns:
        list: stripped, non-empty link texts found inside the genre span, in
        page order; an empty list when the span cannot be located or any
        lookup fails (best effort, no exception is propagated).
    """
    try:
        genre_elem = driver.find_element(By.CSS_SELECTOR, "span.md\\:col-span-6.flex.flex-wrap.gap-3")
        genres = []
        for anchor in genre_elem.find_elements(By.TAG_NAME, "a"):
            # Read `.text` once per link: each access is a separate request to
            # the browser, so filtering and collecting share one read.
            name = anchor.text.strip()
            if name:
                genres.append(name)
        return genres
    except Exception:
        return []

def extract_duration(driver):
    """Return the stripped text of the duration span on the current page.

    Falls back to an empty string when the span cannot be found or read.
    """
    selector = "span.w-12.ml-auto.md\\:ml-0.col-span-2.inline-flex.justify-end.items-center"
    try:
        return driver.find_element(By.CSS_SELECTOR, selector).text.strip()
    except Exception:
        return ""

In [6]:
def extract_extra_info(driver):
    """Read the "Instrumental" and "AI generated?" rows of the info panel.

    Args:
        driver: Selenium WebDriver positioned on a track page.

    Returns:
        tuple: ``(instrumental, ai_generated)`` as the stripped text shown
        next to each label. Either value stays ``"No"`` when its row (or the
        whole panel) cannot be found; no exception is propagated.
    """
    instrumental = "No"
    ai_generated = "No"
    try:
        info_container = driver.find_element(By.CSS_SELECTOR, "div.px-8.py-2.bg-gray-light.flex.flex-col.divide-y.divide-gray")
        # Rows are matched on their base classes only. A responsive column
        # class (`md:grid-cols-8` in Tailwind naming) is deliberately not
        # required, so the lookup does not hinge on that class's exact
        # spelling in the page markup.
        # NOTE(review): confirm against the live page that this does not
        # match unrelated divs inside the panel.
        info_divs = info_container.find_elements(By.CSS_SELECTOR, "div.grid.grid-cols-1.py-6")
    except Exception:
        return instrumental, ai_generated

    for div in info_divs:
        try:
            label = div.find_element(By.CSS_SELECTOR, "span.font-\\[500\\].md\\:col-span-2").text.strip()
            value = div.find_element(By.CSS_SELECTOR, "span.md\\:col-span-6").text.strip()
        except Exception:
            # A row without the expected label/value spans must not prevent
            # the remaining rows from being read.
            continue
        if "Instrumental" in label:
            instrumental = value
        if "AI generated?" in label:
            ai_generated = value
    return instrumental, ai_generated


In [10]:
def download_audio_file(file_url, audio_filepath, timeout=30, chunk_size=8192):
    """Stream a remote file to disk.

    Args:
        file_url: URL to download. An empty value is reported as a failure
            without issuing a request.
        audio_filepath: Destination path. Its parent directory is created
            when the path has one.
        timeout: Seconds passed to ``requests.get`` as the request timeout,
            so a stalled server cannot hang the crawl indefinitely.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        None. Failures (empty URL, non-200 status, request errors) are
        reported with a printed message instead of an exception.
    """
    target_dir = os.path.dirname(audio_filepath)
    # A bare filename has no directory part; os.makedirs("") would raise.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    if not file_url:
        print(f"Failed to download file from {file_url}")
        return

    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download file from {file_url}")
                return
            with open(audio_filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
    except requests.RequestException as exc:
        # NOTE: a file interrupted mid-stream is left on disk as written so far.
        print(f"Failed to download file from {file_url}: {exc}")

In [11]:
def process_audio_page(audio_url, driver, index, output_dir="crawled_data"):
    """Scrape one track page, download its audio and write a JSON sidecar.

    Args:
        audio_url: URL of the track page to open.
        driver: Selenium WebDriver used to load and query the page.
        index: Integer used to build the zero-padded output file names
            (``audio_0001.mp3`` / ``audio_0001.json``).
        output_dir: Root directory for the results. Audio goes to
            ``<output_dir>/audio`` and metadata to ``<output_dir>``.

    Returns:
        dict: the metadata that was written to the JSON file.
    """
    driver.get(audio_url)

    track_info = extract_track_info(driver)
    # `or ""` also covers keys that are present but null in the JSON payload,
    # where `.get(key, "")` would return None and `.strip()` would fail.
    file_url = track_info.get("fileUrl") or ""
    audio_name = (track_info.get("title") or "").strip()
    author = (track_info.get("artistName") or "").strip()

    genres = extract_genres(driver)
    duration = extract_duration(driver)
    instrumental, ai_generated = extract_extra_info(driver)

    metadata = {
        "audioName": audio_name,
        "author": author,
        "genres": genres,
        "instrumental": instrumental,
        "ai_generated": ai_generated,
        "duration": duration,
        "audio_url": audio_url
    }
    file_stem = f"audio_{index:04d}"
    audio_filepath = os.path.join(output_dir, "audio", f"{file_stem}.mp3")
    meta_filepath = os.path.join(output_dir, f"{file_stem}.json")

    if file_url:
        download_audio_file(file_url, audio_filepath)
    else:
        print(f"No fileUrl found for {audio_url}; skipping download")

    # Do not rely on the download step having created the output directory.
    os.makedirs(output_dir, exist_ok=True)
    with open(meta_filepath, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=4)

    return metadata

In [None]:
def loop_over_menu_pages(base_url, total_pages, driver):
    """Gather track links from listing pages 1..total_pages of ``base_url``.

    Each page is requested as ``{base_url}?page={n}``. A page that raises is
    reported with a printed message and skipped; links from the other pages
    are returned in page order.
    """
    collected = []
    page_numbers = range(1, total_pages + 1)
    for page in tqdm(page_numbers, desc="Extracting Links", unit="page"):
        try:
            collected.extend(
                extract_audio_links_from_menu(f"{base_url}?page={page}", driver)
            )
        except Exception as e:
            print(f"Error on page {page}: {e}")
    return collected
# Crawl configuration: the genre listing to walk and how many listing pages to read.
GENRE_URL = 'https://freemusicarchive.org/genre/Soul-RB/'
TOTAL_PAGES = 40

links = loop_over_menu_pages(GENRE_URL, TOTAL_PAGES, driver)

os.makedirs("crawled_data/audio", exist_ok=True)

# Process every track, isolating failures per track so that one bad page
# (for example a wait timeout) does not abort the rest of the crawl. Failed
# URLs are kept in `failed_urls` so they can be inspected or retried.
# Call driver.quit() once crawling is finished to release the browser.
failed_urls = []
for index, audio_url in enumerate(tqdm(links), start=1):
    try:
        process_audio_page(audio_url, driver, index)
    except Exception as e:
        failed_urls.append(audio_url)
        print(f"Error on track {index} ({audio_url}): {e}")

Extracting Links: 100%|██████████| 5/5 [00:12<00:00,  2.60s/page]
 26%|██▌       | 26/100 [07:35<21:22, 17.34s/it]