In [None]:
"""
RadioStreamCollector: A tool for collecting audio samples from online radio stations.
Uses pure Python libraries without FFmpeg dependency.
Adapted for running in Jupyter notebooks.
"""

import os
import requests
import datetime
import time
import json
import random
import csv
from contextlib import closing

Defaulting to user installation because normal site-packages is not writeable


ERROR: Could not find a version that satisfies the requirement pyaudioop (from versions: none)

[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for pyaudioop


In [None]:


class RadioStreamCollector:
    """
    A class to collect audio samples from online radio stations and store them with metadata.
    """
    def __init__(self, output_dir="radio_dataset", metadata_file="metadata.csv"):
        """Initialize the collector with output directory and metadata file."""
        self.output_dir = output_dir
        self.metadata_file = os.path.join(output_dir, metadata_file)
        self.stations = self._load_stations()
        
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        # Initialize metadata file if it doesn't exist
        if not os.path.exists(self.metadata_file):
            with open(self.metadata_file, 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(['filename', 'station_name', 'station_url', 'timestamp', 'duration', 'format', 'size_bytes'])

    def _load_stations(self):
        """Load a list of radio stations that offer public streams."""
        # This is a sample list of stations with direct MP3 streams that work well with direct requests
        return [
            {"name": "NPR", "url": "https://npr-ice.streamguys1.com/live.mp3"},
            {"name": "SomaFM Groove Salad", "url": "https://ice4.somafm.com/groovesalad-128-mp3"},
            {"name": "Radio Paradise", "url": "https://stream.radioparadise.com/mp3-128"},
            {"name": "France Info", "url": "https://icecast.radiofrance.fr/franceinfo-midfi.mp3"},
            {"name": "WAMU", "url": "https://hd1.wamu.org/stream.mp3"},
            {"name": "1.FM - Top Hits", "url": "https://strm112.1.fm/top40_mobile_mp3"}
        ]

    def find_more_stations(self, count=20):
        """
        Find additional radio stations from online directory APIs.
        """
        try:
            # Example: use the radio-browser.info API
            response = requests.get("https://de1.api.radio-browser.info/json/stations/bycodec/MP3", 
                                    params={"limit": count, "hidebroken": "true"})
            
            if response.status_code == 200:
                stations = response.json()
                added_count = 0
                
                for station in stations:
                    if station.get("url_resolved") and station.get("name"):
                        self.stations.append({
                            "name": station["name"],
                            "url": station["url_resolved"]
                        })
                        added_count += 1
                        
                print(f"Added {added_count} stations from radio-browser.info")
            else:
                print(f"Failed to fetch stations: {response.status_code}")
                
        except Exception as e:
            print(f"Error finding more stations: {e}")

    def record_stream(self, station, duration=60, file_format="mp3"):
        """
        Record an audio stream for the specified duration using direct HTTP requests.
        
        Args:
            station (dict): Station information with name and URL
            duration (int): Recording duration in seconds
            file_format (str): Output file format (only mp3 supported)
            
        Returns:
            dict: Metadata about the recording
        """
        timestamp = datetime.datetime.now()
        safe_name = ''.join(c if c.isalnum() or c in [' ', '_', '-'] else '_' for c in station['name'])
        filename = f"{safe_name.replace(' ', '_')}_{timestamp.strftime('%Y%m%d_%H%M%S')}.{file_format}"
        output_path = os.path.join(self.output_dir, filename)
        
        print(f"Recording {station['name']} for {duration} seconds...")
        
        try:
            # Handle .pls or .m3u playlist files
            stream_url = station['url']
            if stream_url.endswith('.pls') or stream_url.endswith('.m3u'):
                playlist_response = requests.get(stream_url, timeout=10)
                if playlist_response.status_code == 200:
                    playlist_content = playlist_response.text
                    # Very simple playlist parser
                    for line in playlist_content.splitlines():
                        if line.startswith('File1=') or line.startswith('http'):
                            stream_url = line.split('=', 1)[-1].strip()
                            if stream_url.startswith('http'):
                                break
                    print(f"Extracted stream URL from playlist: {stream_url}")
            
            start_time = time.time()
            
            # Stream using requests with a timeout
            with requests.get(stream_url, stream=True, timeout=10) as response:
                response.raise_for_status()  # Raise an exception for HTTP errors
                
                with open(output_path, 'wb') as f:
                    # Process the stream in chunks
                    for chunk in response.iter_content(chunk_size=8192):
                        # Check if we've recorded for long enough
                        if time.time() - start_time >= duration:
                            break
                        
                        if chunk:  # Filter out keep-alive chunks
                            f.write(chunk)
            
            # Get the file size
            file_size = os.path.getsize(output_path)
            
            # Calculate actual recording duration
            actual_duration = min(duration, time.time() - start_time)
            
            # Prepare metadata
            metadata = {
                'filename': filename,
                'station_name': station['name'],
                'station_url': station['url'],
                'timestamp': timestamp.isoformat(),
                'duration': round(actual_duration, 1),
                'format': file_format,
                'size_bytes': file_size
            }
            
            # Save metadata to CSV
            with open(self.metadata_file, 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow([
                    metadata['filename'],
                    metadata['station_name'],
                    metadata['station_url'],
                    metadata['timestamp'],
                    metadata['duration'],
                    metadata['format'],
                    metadata['size_bytes']
                ])
            
            print(f"Successfully recorded: {output_path} ({file_size/1024:.1f} KB)")
            return metadata
            
        except requests.exceptions.RequestException as e:
            print(f"Error recording {station['name']}: {e}")
            return None
        except Exception as e:
            print(f"Unexpected error recording {station['name']}: {e}")
            return None

    def collect_dataset(self, num_samples=30, duration_range=(30, 90), formats=None):
        """
        Collect a dataset of audio samples from various radio stations.
        
        Args:
            num_samples (int): Number of audio files to collect
            duration_range (tuple): Min and max duration in seconds
            formats (list): List of formats to save in (default: ["mp3"])
        """
        if formats is None:
            formats = ["mp3"]
        
        if "mp3" not in formats:
            print("Warning: Only MP3 format is supported without FFmpeg. Adding MP3 to formats.")
            formats.append("mp3")
            
        if len(self.stations) < 5:
            self.find_more_stations(30)
        
        collected_count = 0
        attempts = 0
        max_attempts = num_samples * 3  # Allow for more failures since we're using direct streaming
        
        while collected_count < num_samples and attempts < max_attempts:
            # Select a random station
            station = random.choice(self.stations)
            
            # Select a random duration within the range
            duration = random.randint(duration_range[0], duration_range[1])
            
            # Record the stream (only MP3 is supported without FFmpeg)
            result = self.record_stream(station, duration, "mp3")
            
            if result:
                collected_count += 1
                print(f"Progress: {collected_count}/{num_samples} samples collected")
                
                # Add a short delay between recordings to avoid overwhelming servers
                time.sleep(2)
            else:
                # If station fails, try to find more stations
                if attempts % 5 == 0 and attempts > 0:
                    self.find_more_stations(10)
            
            attempts += 1
        
        print(f"Dataset collection complete. {collected_count} samples collected.")
        print(f"Metadata saved to: {self.metadata_file}")



In [None]:
# Run this directly instead of using a command-line interface
def run_collector(output_dir="RadioWaves_Dataset", num_samples=30, min_duration=30, max_duration=90):
    """
    Run the collector with specified parameters
    """
    collector = RadioStreamCollector(output_dir=output_dir)
    collector.collect_dataset(
        num_samples=num_samples,
        duration_range=(min_duration, max_duration),
        formats=["mp3"]  # Only mp3 is supported without FFmpeg
    )

    # Example use case demonstration
    print("\nDataset Name: RadioWaves Dataset")
    print("\nUse Case Demonstration: Audio Content Analysis")
    print("This dataset can be used for:")
    print("1. Building audio fingerprinting systems")
    print("2. Training speech/music classification models")
    print("3. Analyzing advertisement patterns across stations")
    print("4. Developing automated content categorization systems")
    print("5. Studying music genre distributions across different stations")

# To run this in Jupyter, execute the following line:
# run_collector(num_samples=5, min_duration=10, max_duration=20)

In [16]:
# Example: Collect 5 samples with shorter durations for testing
run_collector(num_samples=30, min_duration=10, max_duration=90)

# After testing, you can run with the full parameters:
# run_collector(num_samples=30, min_duration=30, max_duration=90)

Recording WAMU for 46 seconds...
Error recording WAMU: HTTPSConnectionPool(host='hd1.wamu.org', port=443): Max retries exceeded with url: /stream.mp3 (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'hd1.wamu.org'. (_ssl.c:1028)")))
Recording NPR for 85 seconds...
Successfully recorded: RadioWaves_Dataset\NPR_20250308_222102.mp3 (1040.0 KB)
Progress: 1/30 samples collected
Recording WAMU for 72 seconds...
Error recording WAMU: HTTPSConnectionPool(host='hd1.wamu.org', port=443): Max retries exceeded with url: /stream.mp3 (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'hd1.wamu.org'. (_ssl.c:1028)")))
Recording Radio Paradise for 17 seconds...
Successfully recorded: RadioWaves_Dataset\Radio_Paradise_20250308_222232.mp3 (392.0 KB)
Progress: 2/30 samples collected
Recording W