In [8]:
# Dataset Downloader and Preprocessor
# Downloads and prepares the four datasets for training

import os
import requests
import zipfile
import tarfile
import shutil
from pathlib import Path
import pandas as pd
import librosa
import soundfile as sf
import numpy as np
from tqdm import tqdm
import json

class DatasetDownloader:
    def __init__(self, base_dir='./datasets'):
        self.base_dir = Path(base_dir)
        self.base_dir.mkdir(exist_ok=True)
        
        # Dataset URLs and configurations
        self.datasets = {
            'urban8k': {
                'url': 'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
                'type': 'background',
                'extract_to': 'urban8k'
            },
            'zenodo': {
                'url': 'https://zenodo.org/record/7004819/files/gunshot_dataset.zip',
                'type': 'gunshot',
                'extract_to': 'zenodo_gunshots'
            },
            'kaggle': {
                'url': 'manual_download',  # Requires Kaggle API
                'type': 'gunshot',
                'extract_to': 'kaggle_gunshots'
            }
        }
    
    def download_file(self, url, filename):
        """Download file with progress bar"""
        response = requests.get(url, stream=True)
        total_size = int(response.headers.get('content-length', 0))
    
        with open(filename, 'wb') as file, tqdm(
            desc=str(filename),
            total=total_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as pbar:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
                pbar.update(len(chunk))

    def extract_archive(self, archive_path, extract_to):
        """Extract various archive formats"""
        if archive_path.suffix == '.zip':
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                zip_ref.extractall(extract_to)
        elif archive_path.suffix in ['.tar', '.gz', '.tgz']:
            with tarfile.open(archive_path, 'r:*') as tar_ref:
                tar_ref.extractall(extract_to, filter='data')  # <== Add this

    
    def download_urban8k(self):
        """Download Urban8K dataset"""
        print("Downloading Urban8K dataset...")
        dataset_dir = self.base_dir / 'urban8k'
        dataset_dir.mkdir(exist_ok=True)
        
        # Download
        archive_path = dataset_dir / 'UrbanSound8K.tar.gz'
        if not archive_path.exists():
            self.download_file(self.datasets['urban8k']['url'], archive_path)
        
        # Extract
        extract_dir = dataset_dir / 'extracted'
        if not extract_dir.exists():
            print("Extracting Urban8K...")
            self.extract_archive(archive_path, extract_dir)
        
        return extract_dir
    
    def download_zenodo_gunshots(self):
        """Download Zenodo gunshot dataset"""
        print("Downloading Zenodo gunshot dataset...")
        dataset_dir = self.base_dir / 'zenodo_gunshots'
        dataset_dir.mkdir(exist_ok=True)
        
        # Download
        archive_path = dataset_dir / 'gunshot_dataset.zip'
        if not archive_path.exists():
            self.download_file(self.datasets['zenodo']['url'], archive_path)
        
        # Extract
        extract_dir = dataset_dir / 'extracted'
        if not extract_dir.exists():
            print("Extracting Zenodo gunshots...")
            self.extract_archive(archive_path, extract_dir)
        
        return extract_dir
    
    def setup_kaggle_download(self):
        """Setup instructions for Kaggle dataset"""
        print("Setting up Kaggle gunshot dataset...")
        print("To download Kaggle dataset, you need to:")
        print("1. Install kaggle: pip install kaggle")
        print("2. Setup API key from https://www.kaggle.com/settings")
        print("3. Run: kaggle datasets download -d emrahaydemr/gunshot-audio-dataset")
        print("4. Extract to: datasets/kaggle_gunshots/")
        
        kaggle_dir = self.base_dir / 'kaggle_gunshots'
        kaggle_dir.mkdir(exist_ok=True)
        return kaggle_dir
    
    def download_bgg_dataset(self):
        """Setup BGG dataset (manual download from GitHub)"""
        print("Setting up BGG (PUBG) dataset...")
        print("To download BGG dataset:")
        print("1. Clone: git clone https://github.com/junwoopark92/PUBG-Gun-Sound-Dataset.git")
        print("2. Copy audio files to: datasets/bgg/")
        
        bgg_dir = self.base_dir / 'bgg'
        bgg_dir.mkdir(exist_ok=True)
        return bgg_dir
    
    def download_all(self):
        """Download all available datasets"""
        print("Starting dataset download process...")
        
        # Download automatic datasets
        self.download_urban8k()
        self.download_zenodo_gunshots()
        
        # Setup manual download instructions
        self.setup_kaggle_download()
        self.download_bgg_dataset()
        
        print("Dataset download setup complete!")

class DatasetPreprocessor:
    def __init__(self, datasets_dir='./datasets', output_dir='./processed_data'):
        self.datasets_dir = Path(datasets_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        
        # Create output structure
        (self.output_dir / 'gunshots' / 'pistol').mkdir(parents=True, exist_ok=True)
        (self.output_dir / 'gunshots' / 'rifle').mkdir(parents=True, exist_ok=True)
        (self.output_dir / 'gunshots' / 'shotgun').mkdir(parents=True, exist_ok=True)
        (self.output_dir / 'gunshots' / 'unknown').mkdir(parents=True, exist_ok=True)
        (self.output_dir / 'background').mkdir(parents=True, exist_ok=True)
        
        # Audio settings
        self.target_sr = 16000
        self.duration = 2.0
        self.target_length = int(self.target_sr * self.duration)
    
    def preprocess_audio(self, input_path, output_path, label_info=None):
        """Preprocess single audio file"""
        try:
            # Load audio
            y, sr = librosa.load(input_path, sr=self.target_sr, mono=True)
            
            # Normalize
            y = librosa.util.normalize(y)
            
            # Handle length
            if len(y) > self.target_length:
                # Split into segments
                segments = []
                for i in range(0, len(y) - self.target_length + 1, self.target_length // 2):
                    segment = y[i:i + self.target_length]
                    if len(segment) == self.target_length:
                        segments.append(segment)
                
                # Save segments
                for i, segment in enumerate(segments):
                    segment_path = output_path.parent / f"{output_path.stem}_{i}.wav"
                    sf.write(segment_path, segment, self.target_sr)
            
            elif len(y) < self.target_length:
                # Pad with zeros
                y = np.pad(y, (0, self.target_length - len(y)), mode='constant')
                sf.write(output_path, y, self.target_sr)
            
            else:
                # Perfect length
                sf.write(output_path, y, self.target_sr)
            
            return True
            
        except Exception as e:
            print(f"Error processing {input_path}: {e}")
            return False
    
    def process_urban8k(self):
        """Process Urban8K dataset for background sounds"""
        print("Processing Urban8K dataset...")

        urban8k_dir = self.datasets_dir / 'urban8k' / 'extracted'
        if not urban8k_dir.exists():
            print("Urban8K not found, skipping...")
            return

        metadata_path = None
        for path in urban8k_dir.rglob('*.csv'):
            if 'UrbanSound8K' in path.name:
                metadata_path = path
                break
        
        if not metadata_path:
            print("Metadata not found for Urban8K.")
            return
        
        metadata = pd.read_csv(metadata_path)
        audio_base_dir = urban8k_dir / 'UrbanSound8K' / 'audio'

        for _, row in tqdm(metadata.iterrows(), total=len(metadata)):
            file_path = audio_base_dir / f"fold{row['fold']}" / row['slice_file_name']
            if not file_path.exists():
                continue
            
            label = row['class']
            # Use only background sounds (non-gunshots)
            if label.lower() in ['gun_shot', 'gunshot']:
                continue
            
            output_path = self.output_dir / 'background' / file_path.name
            self.preprocess_audio(file_path, output_path)

    def process_zenodo(self):
        """Process Zenodo gunshot dataset"""
        print("Processing Zenodo dataset...")
        zenodo_dir = self.datasets_dir / 'zenodo_gunshots' / 'extracted'
        if not zenodo_dir.exists():
            print("Zenodo dataset not found.")
            return
        
        for file_path in tqdm(zenodo_dir.rglob('*.wav')):
            label = 'unknown'
            if 'pistol' in file_path.stem.lower():
                label = 'pistol'
            elif 'rifle' in file_path.stem.lower():
                label = 'rifle'
            elif 'shotgun' in file_path.stem.lower():
                label = 'shotgun'

            output_path = self.output_dir / 'gunshots' / label / file_path.name
            self.preprocess_audio(file_path, output_path)

    def process_kaggle(self):
        """Process Kaggle gunshot dataset (manually downloaded)"""
        print("Processing Kaggle dataset...")
        kaggle_dir = self.datasets_dir / 'kaggle_gunshots'
        if not kaggle_dir.exists():
            print("Kaggle dataset not found.")
            return

        for file_path in tqdm(kaggle_dir.rglob('*.wav')):
            label = 'unknown'
            if 'pistol' in file_path.stem.lower():
                label = 'pistol'
            elif 'rifle' in file_path.stem.lower():
                label = 'rifle'
            elif 'shotgun' in file_path.stem.lower():
                label = 'shotgun'
            
            output_path = self.output_dir / 'gunshots' / label / file_path.name
            self.preprocess_audio(file_path, output_path)

    def process_bgg(self):
        """Process BGG (PUBG) gunshot dataset"""
        print("Processing BGG dataset...")
        bgg_dir = self.datasets_dir / 'bgg'
        if not bgg_dir.exists():
            print("BGG dataset not found.")
            return

        for file_path in tqdm(bgg_dir.rglob('*.wav')):
            label = 'unknown'
            if 'pistol' in file_path.stem.lower():
                label = 'pistol'
            elif 'rifle' in file_path.stem.lower():
                label = 'rifle'
            elif 'shotgun' in file_path.stem.lower():
                label = 'shotgun'
            
            output_path = self.output_dir / 'gunshots' / label / file_path.name
            self.preprocess_audio(file_path, output_path)

    def process_all(self):
        """Process all available datasets"""
        self.process_urban8k()
        self.process_zenodo()
        self.process_kaggle()
        self.process_bgg()
        print("All datasets processed.")


In [9]:
!pip install --upgrade tqdm

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
# Step 1: Download datasets
downloader = DatasetDownloader()
downloader.download_all()

Starting dataset download process...
Downloading Urban8K dataset...
Downloading Zenodo gunshot dataset...
Setting up Kaggle gunshot dataset...
To download Kaggle dataset, you need to:
1. Install kaggle: pip install kaggle
2. Setup API key from https://www.kaggle.com/settings
3. Run: kaggle datasets download -d emrahaydemr/gunshot-audio-dataset
4. Extract to: datasets/kaggle_gunshots/
Setting up BGG (PUBG) dataset...
To download BGG dataset:
1. Clone: git clone https://github.com/junwoopark92/PUBG-Gun-Sound-Dataset.git
2. Copy audio files to: datasets/bgg/
Dataset download setup complete!


In [11]:
preprocessor = DatasetPreprocessor()
preprocessor.process_all()

Processing Urban8K dataset...


100%|██████████████████████████████████████████████████████████████████████████████| 8732/8732 [04:03<00:00, 35.93it/s]


Processing Zenodo dataset...


2148it [00:57, 37.38it/s]


Processing Kaggle dataset...


851it [00:23, 36.67it/s]


Processing BGG dataset...


0it [00:00, ?it/s]

All datasets processed.



