# Drone RF Dataset

This notebook builds a **drone-only RF dataset** from two public UAV datasets:

1. **AirID**  
   - Format: `.mat`
   - Content: raw IQ samples of UAV transmissions with RF impairments

2. **Hovering UAVs RF Fingerprinting Dataset**
   - Format: `.bin` + `.json` (SigMF)
   - Content: raw IQ samples of hovering DJI M100 UAVs

we randomly sample both of them because there are too many files

The output is a **master dataset** of overlapping windows stored as `.npz` files.

I uploaded these two datasets and the  to google drive

#### Master Dataset Format

Each sample is stored as a `.npz` file with:

- `x` : spectrogram (float32, shape = [freq_bins, time_bins])
- `y` : label (1 = drone)
- `meta` : metadata (dataset source, sample rate, center frequency)


in the end we get approx 6k windows from each dataset with random sampling (takes around 500 MB)

In [7]:
import os
import json
import numpy as np
import scipy.io as sio
from scipy.signal import stft
from tqdm import tqdm

In [8]:
# Windowing
WINDOW_SAMPLES = 4096
HOP_SAMPLES = 2048

# Spectrogram
NFFT = 512

# Output
OUT_DIR = "drone_dataset_npz"
os.makedirs(OUT_DIR, exist_ok=True)


In [9]:
def normalize_iq(iq):
    """Power normalize IQ samples"""
    return iq / np.sqrt(np.mean(np.abs(iq)**2) + 1e-12)


def iq_to_spectrogram(iq, fs):
    """Convert IQ window to log-magnitude spectrogram"""
    f, t, Z = stft(iq, fs=fs, nperseg=NFFT, noverlap=NFFT//2)
    spec = np.log1p(np.abs(Z))
    return spec.astype(np.float32)


In [None]:
def extract_iq_from_mat(mat):
    for k, v in mat.items():
        if isinstance(v, np.ndarray) and np.iscomplexobj(v):
            return v.flatten()
        if isinstance(v, np.ndarray) and v.ndim == 1:
            return v.flatten()
    return None


import random

def process_airid_root(root_dir, out_dir):
    mat_files = []

    for root, _, files in os.walk(root_dir):
        for f in files:
            if f.endswith(".mat"):
                mat_files.append(os.path.join(root, f))

    print(f"[AirID] Found {len(mat_files)} .mat files")

    files_processed = 0
    total_windows = 0
    kept_windows = 0
    WINDOW_KEEP_PROB = 1/30

    pbar = tqdm(mat_files, desc="AirID files", leave=True)

    for path in pbar:
        try:
            mat = sio.loadmat(path)
            iq = extract_iq_from_mat(mat)

            if iq is None:
                pbar.set_postfix(status="no IQ")
                continue

            files_processed += 1
            iq = normalize_iq(iq)
            fs = 10e6

            win_count = 0
            kept_count = 0

            for start in range(0, len(iq) - WINDOW_SAMPLES, HOP_SAMPLES):
                total_windows += 1

                if random.random() > WINDOW_KEEP_PROB:
                    continue

                window = iq[start:start + WINDOW_SAMPLES]
                spec = iq_to_spectrogram(window, fs)

                np.savez(
                    os.path.join(out_dir, f"airid_{kept_windows}.npz"),
                    x=spec,
                    y=1,
                    meta={
                        "dataset": "AirID",
                        "source_file": path,
                        "fs": fs
                    }
                )

                kept_windows += 1
                kept_count += 1
                win_count += 1

            pbar.set_description(f"AirID ({os.path.basename(path)})")
            pbar.set_postfix(
                files=files_processed,
                total_windows_seen=total_windows,
                windows_kept=kept_windows,
                kept_this_file=kept_count
            )

        except Exception as e:
            pbar.set_postfix(error=str(e))

    print(
        f"[AirID] Done | "
        f"Files processed: {files_processed}/{len(mat_files)} | "
        f"Total windows seen: {total_windows} | "
        f"Windows kept: {kept_windows}"
    )


In [13]:
def load_sigmf_iq(bin_path, json_path):
    with open(json_path, "r") as f:
        meta = json.load(f)

    fs = meta["global"]["core:sample_rate"]
    fc = meta["captures"]["core:center_frequency"]

    raw = np.fromfile(bin_path, dtype=np.float16)
    iq = raw[0::2] + 1j * raw[1::2]
    iq = normalize_iq(iq)

    return iq, fs, fc

def parse_hover_filename(fname):
    parts = fname.replace(".bin", "").split("_")
    return {
        "uav": parts[0], 
        "distance": parts[1],
        "burst": parts[2],
    }

def process_hovering_root(root_dir, out_dir):
    bin_files = [f for f in os.listdir(root_dir) if f.endswith(".bin")]

    from collections import defaultdict

    groups = defaultdict(list)

    for f in bin_files:
        info = parse_hover_filename(f)
        key = (info["uav"], info["distance"])
        groups[key].append(f)

        #print(f"[HoveringUAV] Found {len(bin_files)} .bin files")
    
    #print(len(groups), "groups found")

    import random

    MAX_FILES_PER_GROUP = 5

    sampled_files = []

    for key, files in groups.items():
        sampled = random.sample(files, min(MAX_FILES_PER_GROUP, len(files)))
        sampled_files.extend(sampled)

    files_processed = 0
    total_windows = 0

    print(f"[HoveringUAV] Samples {len(sampled_files)} .bin files")

    pbar = tqdm(sampled_files, desc="Sampled hovering UAV files", leave=True)

    for f in pbar:
        bin_path = os.path.join(root_dir, f)
        json_path = bin_path.replace(".bin", ".json")

        if not os.path.exists(json_path):
            pbar.set_postfix(status="missing json")
            continue

        try:
            iq, fs, fc = load_sigmf_iq(bin_path, json_path)
            files_processed += 1

            win_count = 0
            for start in range(0, len(iq) - WINDOW_SAMPLES, HOP_SAMPLES):
                window = iq[start:start + WINDOW_SAMPLES]
                spec = iq_to_spectrogram(window, fs)

                np.savez(
                    os.path.join(out_dir, f"hover_{total_windows}.npz"),
                    x=spec,
                    y=1,
                    meta={
                        "dataset": "HoveringUAV",
                        "source_file": bin_path,
                        "fs": fs,
                        "fc": fc
                    }
                )

                total_windows += 1
                win_count += 1

            # Update progress bar text instead of printing
            pbar.set_description(f"Hovering UAV ({f})")
            pbar.set_postfix(
                files=files_processed,
                windows=total_windows,
                last_file_windows=win_count
            )

        except Exception as e:
            pbar.set_postfix(error=str(e))

    print(
        f"[HoveringUAV] Done | "
        f"Files processed: {files_processed}/{len(bin_files)} | "
        f"Windows created: {total_windows}"
    )


In [14]:
AIRID_ROOT = "drone_datasets/AirID-Globecom2020_dataset"
HOVER_ROOT = "drone_datasets/UAV-Sigmf-float16"

process_airid_root(AIRID_ROOT, OUT_DIR)
process_hovering_root(HOVER_ROOT, OUT_DIR)


[AirID] Found 102 .mat files


  f, t, Z = stft(iq, fs=fs, nperseg=NFFT, noverlap=NFFT//2)
AirID (WiFiRxKRI_air_radio2_amp_2_ph_0_CBW5_MCS3.mat): 100%|██████████| 102/102 [00:33<00:00,  3.01it/s, files=102, kept_this_file=256, total_windows_seen=182060, windows_kept=6025]                  


[AirID] Done | Files processed: 102/102 | Total windows seen: 182060 | Windows kept: 6025
[HoveringUAV] Samples 136 .bin files


Hovering UAV (uav7_15ft_burst2_1.bin): 100%|██████████| 136/136 [00:04<00:00, 30.55it/s, files=136, last_file_windows=8, windows=5647]   

[HoveringUAV] Done | Files processed: 136/13893 | Windows created: 5647



