In [4]:
import os
import time
import datetime
import random
import urllib.request
import json

In [5]:
# Dataset layout: one root folder holding an audio/ and a metadata/ subfolder.
dataset_name = "RadioWaveBench"
base_dir = "./" + dataset_name
audio_dir = base_dir + "/audio"
metadata_dir = base_dir + "/metadata"

# exist_ok=True makes this cell safe to re-run; missing parents (base_dir)
# are created on the way.
os.makedirs(metadata_dir, exist_ok=True)
os.makedirs(audio_dir, exist_ok=True)

In [6]:
# Candidate MP3 streams to record from. Each entry carries a display name
# (also used to build output file names) and the stream URL.
radio_stations = [
    dict(name="SomaFM Groove Salad", url="https://ice1.somafm.com/groovesalad-128-mp3"),
    dict(name="SomaFM Drone Zone", url="https://ice1.somafm.com/dronezone-128-mp3"),
    dict(name="Ambient Sleeping Pill", url="http://radio.stereoscenic.com/asp-h"),
    dict(name="Positively Vibe", url="http://stream.positivity.radio/pr-positive.mp3"),
    dict(name="Radio Paradise", url="http://stream.radioparadise.com/mp3-128"),
]

In [7]:
def download_stream_simple(station, duration_seconds, output_file,
                           bitrate_kbps=128, chunk_size=8192):
    """Record a short excerpt of an HTTP audio stream into ``output_file``.

    Reading stops at whichever comes first: the byte budget implied by
    ``duration_seconds`` at the assumed bitrate, ``duration_seconds`` of
    wall-clock time, or the end of the stream.

    Args:
        station: Mapping with a ``"name"`` (used in log messages and metadata)
            and a ``"url"`` (the stream address).
        duration_seconds: Requested clip length in seconds.
        output_file: Path the raw stream bytes are written to.
        bitrate_kbps: Assumed stream bitrate; drives both the byte budget and
            the duration estimate. Defaults to 128.
        chunk_size: Maximum number of bytes requested per read. Defaults to
            8192.

    Returns:
        A metadata dict describing the recording, or ``None`` when the
        connection failed, the stream produced no data, or an error occurred
        mid-download. On failure any partially written ``output_file`` is
        removed so that every file left on disk has matching metadata.

    Raises:
        ValueError: If ``bitrate_kbps`` or ``chunk_size`` is not positive.
    """
    if bitrate_kbps <= 0:
        raise ValueError("bitrate_kbps must be positive")
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")

    # Bytes per second at the assumed bitrate, using the 1 kbit = 1024 bit
    # approximation (128 kbps -> 16 KiB/s).
    bytes_per_second = bitrate_kbps * 128
    bytes_to_read = int(duration_seconds * bytes_per_second)

    # Set once output_file has been opened for writing, so the failure path
    # only ever deletes a file this call created or truncated.
    file_opened = False

    try:
        print(f"Attempting to download from {station['name']}...")

        req = urllib.request.Request(station['url'])

        start_time = datetime.datetime.now()
        timestamp = start_time.strftime("%Y-%m-%d %H:%M:%S")

        # The 10 s timeout applies to the connection and to each blocking read.
        with urllib.request.urlopen(req, timeout=10) as response:
            if response.status != 200:
                print(f"Failed to connect to stream: HTTP {response.status}")
                return None

            print(f"Connected to {station['name']}, downloading for {duration_seconds} seconds...")

            with open(output_file, 'wb') as f:
                file_opened = True
                bytes_read = 0
                download_start = time.time()

                while bytes_read < bytes_to_read:
                    # Never request more than what is left of the budget, so
                    # the file cannot exceed it for any chunk_size.
                    chunk = response.read(min(chunk_size, bytes_to_read - bytes_read))
                    if not chunk:
                        break  # stream ended

                    f.write(chunk)
                    bytes_read += len(chunk)

                    # Wall-clock cap for streams slower than the assumed bitrate.
                    if time.time() - download_start >= duration_seconds:
                        break

        # The connection is closed here; only the local file is inspected.
        file_size = os.path.getsize(output_file) if os.path.exists(output_file) else 0
        if file_size > 0:
            metadata = {
                "station_name": station["name"],
                "station_url": station["url"],
                "timestamp": timestamp,
                "requested_duration": duration_seconds,
                # Derived from the assumed bitrate, not measured from the audio.
                "estimated_duration": file_size / bytes_per_second,
                "file_format": "mp3",
                "file_path": output_file,
                "file_size_bytes": file_size
            }

            print(f"Successfully downloaded {file_size} bytes from {station['name']}")
            return metadata

        print(f"Failed to save data from {station['name']}")

    except Exception as e:
        # Deliberately best-effort: a failing station must not abort a batch.
        print(f"Error downloading from {station['name']}: {str(e)}")

    # Failure path: drop the truncated/empty file so it is not mistaken for a
    # valid sample later.
    if file_opened:
        try:
            os.remove(output_file)
        except OSError:
            pass
    return None

In [8]:
def collect_samples(num_samples=5, min_duration=10, max_duration=20):
    """Record clips from randomly chosen stations and write dataset metadata.

    For each chosen station a clip of random length is recorded via
    ``download_stream_simple``. Every successful clip gets a JSON metadata
    file next to it in ``metadata_dir``, and a ``dataset_summary.json`` is
    written to ``base_dir`` at the end.

    Files from earlier runs are not removed; a file with the same name is
    overwritten.

    Args:
        num_samples: Number of download attempts. If it exceeds the number of
            configured stations, stations are drawn with replacement;
            otherwise each station is used at most once.
        min_duration: Shortest clip length in seconds (inclusive).
        max_duration: Longest clip length in seconds (inclusive).

    Returns:
        List of metadata dicts, one per successful download (may be shorter
        than ``num_samples``).

    Raises:
        ValueError: If ``num_samples`` is negative.
    """
    if num_samples < 0:
        raise ValueError("num_samples must be non-negative")

    # Make sure the output folders exist even if they were removed after the
    # setup cell ran.
    os.makedirs(audio_dir, exist_ok=True)
    os.makedirs(metadata_dir, exist_ok=True)

    if not radio_stations or num_samples == 0:
        # random.choices raises IndexError on an empty population; an empty
        # run still produces a (zero-sample) summary below.
        stations_to_use = []
    elif len(radio_stations) < num_samples:
        # Not enough distinct stations: draw with replacement.
        stations_to_use = random.choices(radio_stations, k=num_samples)
    else:
        stations_to_use = random.sample(radio_stations, k=num_samples)

    all_metadata = []
    total = len(stations_to_use)

    for i, station in enumerate(stations_to_use):
        duration = random.randint(min_duration, max_duration)

        # Keep only letters, digits and spaces, then turn spaces into
        # underscores to get a filesystem-friendly name.
        safe_name = ''.join(c for c in station['name'] if c.isalnum() or c == ' ').strip()
        safe_name = safe_name.replace(' ', '_')
        sample_stem = f"sample_{i+1:02d}_{safe_name}"
        output_file = f"{audio_dir}/{sample_stem}.mp3"

        print(f"Recording {station['name']} for {duration} seconds... ({i+1}/{total})")
        metadata = download_stream_simple(station, duration, output_file)

        if metadata:
            all_metadata.append(metadata)

            # Per-sample metadata shares the audio file's stem.
            metadata_file = f"{metadata_dir}/{sample_stem}.json"
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=4)

        # Pause between downloads only; nothing follows the last one.
        if i < total - 1:
            time.sleep(2)

    summary = {
        "dataset_name": dataset_name,
        "creation_date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "total_samples": len(all_metadata),
        "total_estimated_duration": sum(m["estimated_duration"] for m in all_metadata),
        "file_format": "mp3",
        # Sorted so the summary is stable from run to run.
        "stations": sorted({m["station_name"] for m in all_metadata})
    }

    with open(f"{base_dir}/dataset_summary.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=4)

    return all_metadata

In [12]:
# Launch the recording run: 30 attempts, each clip 5-10 seconds long.
# `metadata` ends up holding one entry per successful download.
print(f"Starting collection of {dataset_name} dataset...")
metadata = collect_samples(
    num_samples=30,
    min_duration=5,
    max_duration=10,
)
print(f"Dataset collection complete! Collected {len(metadata)} samples.")

Starting collection of RadioWaveBench dataset...
Recording Ambient Sleeping Pill for 8 seconds... (1/30)
Attempting to download from Ambient Sleeping Pill...
Connected to Ambient Sleeping Pill, downloading for 8 seconds...
Successfully downloaded 131072 bytes from Ambient Sleeping Pill
Recording SomaFM Drone Zone for 9 seconds... (2/30)
Attempting to download from SomaFM Drone Zone...
Connected to SomaFM Drone Zone, downloading for 9 seconds...
Successfully downloaded 147456 bytes from SomaFM Drone Zone
Recording SomaFM Groove Salad for 9 seconds... (3/30)
Attempting to download from SomaFM Groove Salad...
Connected to SomaFM Groove Salad, downloading for 9 seconds...
Successfully downloaded 147456 bytes from SomaFM Groove Salad
Recording SomaFM Groove Salad for 5 seconds... (4/30)
Attempting to download from SomaFM Groove Salad...
Connected to SomaFM Groove Salad, downloading for 5 seconds...
Successfully downloaded 81920 bytes from SomaFM Groove Salad
Recording Positively Vibe for 9 

In [13]:
# Display the dataset summary written by collect_samples.
summary_path = f"{base_dir}/dataset_summary.json"
try:
    with open(summary_path, "r") as f:
        summary = json.load(f)
except FileNotFoundError:
    print("No summary file created.")
except (OSError, json.JSONDecodeError) as e:
    # An unreadable or corrupt file is a different problem from a missing one;
    # report it as such instead of hiding it behind a bare except.
    print(f"Could not read summary file {summary_path}: {e}")
else:
    for key, value in summary.items():
        print(f"{key}: {value}")

dataset_name: RadioWaveBench
creation_date: 2025-03-09 22:52:33
total_samples: 25
total_estimated_duration: 192.0
file_format: mp3
stations: ['Radio Paradise', 'SomaFM Groove Salad', 'SomaFM Drone Zone', 'Ambient Sleeping Pill']


In [14]:
# Show every entry in the audio folder together with its size on disk.
print("\nFiles collected:")
for entry_name in os.listdir(audio_dir):
    size_in_bytes = os.path.getsize(os.path.join(audio_dir, entry_name))
    print(f" - {entry_name} ({size_in_bytes} bytes)")


Files collected:
 - sample_01_Ambient_Sleeping_Pill.mp3 (131072 bytes)
 - sample_01_Radio_Paradise.mp3 (147456 bytes)
 - sample_02_Radio_Paradise.mp3 (131072 bytes)
 - sample_02_SomaFM_Drone_Zone.mp3 (147456 bytes)
 - sample_03_Ambient_Sleeping_Pill.mp3 (131072 bytes)
 - sample_03_SomaFM_Groove_Salad.mp3 (147456 bytes)
 - sample_04_SomaFM_Groove_Salad.mp3 (81920 bytes)
 - sample_07_SomaFM_Groove_Salad.mp3 (114688 bytes)
 - sample_08_Radio_Paradise.mp3 (163840 bytes)
 - sample_09_SomaFM_Drone_Zone.mp3 (131072 bytes)
 - sample_10_Radio_Paradise.mp3 (81920 bytes)
 - sample_11_Radio_Paradise.mp3 (114688 bytes)
 - sample_12_Ambient_Sleeping_Pill.mp3 (98304 bytes)
 - sample_14_Radio_Paradise.mp3 (147456 bytes)
 - sample_15_SomaFM_Groove_Salad.mp3 (147456 bytes)
 - sample_16_Ambient_Sleeping_Pill.mp3 (163840 bytes)
 - sample_17_Ambient_Sleeping_Pill.mp3 (131072 bytes)
 - sample_18_Ambient_Sleeping_Pill.mp3 (163840 bytes)
 - sample_19_SomaFM_Drone_Zone.mp3 (163840 bytes)
 - sample_21_SomaFM_D

Potential use cases for RadioWaveBench dataset:
1. Audio fingerprinting
2. Music genre classification
3. Audio compression testing
4. Stream quality analysis
5. Audio dataset for machine learning models