In [None]:
import h5py
import numpy as np
from collections import defaultdict
import pickle
from matplotlib import pyplot as plt

# Paths to dataset and output
dataset_path = "/kaggle/input/radioml2018/GOLD_XYZ_OSC.0001_1024.hdf5"
positive_snr_path = "GOLD_XYZ_OSC_POSITIVE_FILTERED.hdf5"

# Modulation classes to keep. The position of a name in this list becomes its
# new contiguous label, so the order must agree with the index -> name tables
# used when the filtered file is read back. Those tables call label 5
# 'AM-DSB-SC'; this list previously selected 'AM-DSB-WC' for that slot, which
# kept one class from the source file and labelled it as another.
target_classes = [
    'BPSK', 'QPSK', '8PSK', '16QAM', '64QAM', 'AM-DSB-SC', 'AM-SSB-SC', 'FM', 'GMSK'
]
# Names for every column of the one-hot labels, in column order.
# NOTE(review): this order is taken as given; confirm it against the dataset.
all_classes = [
    'OOK', '4ASK', '8ASK', 'BPSK', 'QPSK', '8PSK', '16PSK', '32PSK',
    '16APSK', '32APSK', '64APSK', '128APSK', '16QAM', '32QAM',
    '64QAM', '128QAM', '256QAM', 'AM-SSB-WC', 'AM-SSB-SC',
    'AM-DSB-WC', 'AM-DSB-SC', 'FM', 'GMSK', 'OQPSK'
]

# Map target classes to their indices in all_classes
# (list.index raises ValueError if a target name is missing from all_classes).
class_indices_to_keep = [all_classes.index(cls) for cls in target_classes]
# Original class index -> new label (position in target_classes).
class_mapping = {orig_idx: new_idx for new_idx, orig_idx in enumerate(class_indices_to_keep)}

# SNR threshold: samples with SNR below this value are dropped.
snr_threshold = 0

# Chunk size for processing (number of samples read from the file at a time)
chunk_size = 50000

# Initialize storage for filtered data
filtered_X = []
filtered_Y = []
filtered_Z = []

# Process the dataset in chunks: keep the samples whose class is in
# class_mapping and whose SNR is at least snr_threshold, relabelled with the
# new contiguous class index.
with h5py.File(dataset_path, 'r') as f:
    X = f['X']
    Y = f['Y']
    Z = f['Z']
    num_samples = X.shape[0]

    # Validate dimensions
    assert X.shape[0] == Y.shape[0] == Z.shape[0], "Dimension mismatch in datasets"

    # Lookup table: one-hot column index -> new label, -1 for dropped classes.
    # argmax over a row is always < num_label_columns, so indexing is safe.
    num_label_columns = Y.shape[1]
    label_lookup = np.full(num_label_columns, -1, dtype=np.int32)
    for orig_idx, new_idx in class_mapping.items():
        if orig_idx < num_label_columns:
            label_lookup[orig_idx] = new_idx

    for start_idx in range(0, num_samples, chunk_size):
        end_idx = min(start_idx + chunk_size, num_samples)

        # Load the chunk
        X_chunk = X[start_idx:end_idx]
        Y_chunk = Y[start_idx:end_idx]
        Z_chunk = Z[start_idx:end_idx]

        # Vectorised filter: class index from the one-hot rows, SNR from
        # column 0 of Z.
        chunk_labels = label_lookup[np.argmax(Y_chunk, axis=1)]
        keep = (chunk_labels >= 0) & (Z_chunk[:, 0] >= snr_threshold)
        if not keep.any():
            continue

        # Boolean indexing copies only the kept rows, so the full chunk can be
        # released instead of being pinned in memory by per-sample views.
        filtered_X.append(X_chunk[keep].astype(np.float32, copy=False))
        filtered_Y.append(chunk_labels[keep])
        filtered_Z.append(Z_chunk[keep].astype(np.float32, copy=False))

# Join the per-chunk pieces; fall back to empty arrays when nothing matched.
if filtered_X:
    filtered_X = np.concatenate(filtered_X, axis=0)
    filtered_Y = np.concatenate(filtered_Y, axis=0)
    filtered_Z = np.concatenate(filtered_Z, axis=0)
else:
    filtered_X = np.array([], dtype=np.float32)
    filtered_Y = np.array([], dtype=np.int32)
    filtered_Z = np.array([], dtype=np.float32)

# Save the filtered dataset
with h5py.File(positive_snr_path, 'w') as output_file:
    output_file.create_dataset('X', data=filtered_X, compression='gzip')
    output_file.create_dataset('Y', data=filtered_Y, compression='gzip')
    output_file.create_dataset('Z', data=filtered_Z, compression='gzip')

print("Filtered dataset saved to", positive_snr_path)


In [None]:
# Names of the nine modulation classes, listed in label order.
modulation_schemes = [
    'BPSK', 'QPSK', '8PSK', '16QAM', '64QAM',
    'AM-DSB-SC', 'AM-SSB-SC', 'FM', 'GMSK',
]

# Class name -> label index, using the same names as modulation_schemes.
orig_mapped_classes_index = dict([
    ('BPSK', 0),
    ('QPSK', 1),
    ('8PSK', 2),
    ('16QAM', 3),
    ('64QAM', 4),
    ('AM-DSB-SC', 5),
    ('AM-SSB-SC', 6),
    ('GMSK', 8),
    ('FM', 7),
])

# Class name -> index for the alternative naming scheme (QAM16, GFSK, WBFM, ...).
# Indices are simply the positions in the list below.
gen_mapped_classes_index = {
    name: position
    for position, name in enumerate(
        ['BPSK', 'QPSK', '8PSK', 'QAM16', 'QAM64', 'GFSK', 'WBFM', 'AM-DSB', 'AM-SSB']
    )
}


# Output name for the pickled dictionary and location of the filtered HDF5 file.
positive_snr_dict = "GOLD_XYZ_OSC_POSITIVE_FILTERED_DICT"
positive_snr_path = "/kaggle/working/GOLD_XYZ_OSC_POSITIVE_FILTERED.hdf5"


# Read the three datasets fully into memory:
# X = IQ samples, Y = modulation class indices, Z = SNR values.
with h5py.File(positive_snr_path, 'r') as filtered_file:
    X, Y, Z = (filtered_file[dataset_name][:] for dataset_name in ('X', 'Y', 'Z'))

# Report the shape of each array.
for shape_label, loaded in (("X shape:", X), ("Y shape:", Y), ("Z shape:", Z)):
    print(shape_label, loaded.shape)

# Label index -> modulation class name (the list position is the label).
orig_mapped_classes = dict(enumerate([
    'BPSK', 'QPSK', '8PSK', '16QAM', '64QAM',
    'AM-DSB-SC', 'AM-SSB-SC', 'FM', 'GMSK',
]))

# Work with a 1-D SNR array so Z[i] is a scalar.
Z = Z.flatten()

# Group the IQ samples by (modulation name, SNR). The SNR part of each key is
# whatever scalar Z yields, i.e. a NumPy scalar rather than a Python number.
data_orig = defaultdict(list)
for i, iq_sample in enumerate(X):
    group_key = (orig_mapped_classes[Y[i]], Z[i])
    data_orig[group_key].append(iq_sample)

# Stack each group's samples into one array (values are replaced in place;
# no keys are added or removed while iterating).
for group_key, samples in data_orig.items():
    data_orig[group_key] = np.array(samples)

# Quick sanity check: number of groups and the shape of the first five.
print("Total keys in data_orig:", len(data_orig))
for key in list(data_orig)[:5]:
    value = data_orig[key]
    print(f"Key: {key}, Shape: {value.shape}")

# Persist as a plain dict so unpickling does not yield a defaultdict.
with open(positive_snr_dict, "wb") as pickle_file:
    pickle.dump(dict(data_orig), pickle_file)

print(f"Dictionary saved to {positive_snr_dict}")


In [7]:
import h5py
import numpy as np
import pickle

# Locations of the two generated datasets and of the pickled
# (modulation, snr) dictionary built from the filtered file.
gen_dataset_awgn = "/kaggle/input/radioml/RML25AWGN"
gen_dataset_clean = "/kaggle/input/radioml/RML25CLEAN"
positive_snr_dict = "/kaggle/input/radioml-filtered/GOLD_XYZ_OSC_POSITIVE_FILTERED_DICT"


def _load_pickle(path):
    """Return the object unpickled from ``path`` (decoded with latin1).

    NOTE(review): unpickling can execute arbitrary code, so these files must
    come from a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle, encoding="latin1")


data_orig = _load_pickle(positive_snr_dict)

data_awgn = _load_pickle(gen_dataset_awgn)
print(data_awgn.keys())

data_clean = _load_pickle(gen_dataset_clean)
print(data_clean.keys())


dict_keys([('BPSK', 0), ('BPSK', 2), ('BPSK', 4), ('BPSK', 6), ('BPSK', 8), ('BPSK', 10), ('BPSK', 12), ('BPSK', 14), ('BPSK', 16), ('BPSK', 18), ('BPSK', 20), ('BPSK', 22), ('BPSK', 24), ('BPSK', 26), ('BPSK', 28), ('BPSK', 30), ('QPSK', 0), ('QPSK', 2), ('QPSK', 4), ('QPSK', 6), ('QPSK', 8), ('QPSK', 10), ('QPSK', 12), ('QPSK', 14), ('QPSK', 16), ('QPSK', 18), ('QPSK', 20), ('QPSK', 22), ('QPSK', 24), ('QPSK', 26), ('QPSK', 28), ('QPSK', 30), ('8PSK', 0), ('8PSK', 2), ('8PSK', 4), ('8PSK', 6), ('8PSK', 8), ('8PSK', 10), ('8PSK', 12), ('8PSK', 14), ('8PSK', 16), ('8PSK', 18), ('8PSK', 20), ('8PSK', 22), ('8PSK', 24), ('8PSK', 26), ('8PSK', 28), ('8PSK', 30), ('QAM16', 0), ('QAM16', 2), ('QAM16', 4), ('QAM16', 6), ('QAM16', 8), ('QAM16', 10), ('QAM16', 12), ('QAM16', 14), ('QAM16', 16), ('QAM16', 18), ('QAM16', 20), ('QAM16', 22), ('QAM16', 24), ('QAM16', 26), ('QAM16', 28), ('QAM16', 30), ('QAM64', 0), ('QAM64', 2), ('QAM64', 4), ('QAM64', 6), ('QAM64', 8), ('QAM64', 10), ('QAM64', 12

dict_keys([('BPSK', 0), ('BPSK', 2), ('BPSK', 4), ('BPSK', 6), ('BPSK', 8), ('BPSK', 10), ('BPSK', 12), ('BPSK', 14), ('BPSK', 16), ('BPSK', 18), ('BPSK', 20), ('BPSK', 22), ('BPSK', 24), ('BPSK', 26), ('BPSK', 28), ('BPSK', 30), ('QPSK', 0), ('QPSK', 2), ('QPSK', 4), ('QPSK', 6), ('QPSK', 8), ('QPSK', 10), ('QPSK', 12), ('QPSK', 14), ('QPSK', 16), ('QPSK', 18), ('QPSK', 20), ('QPSK', 22), ('QPSK', 24), ('QPSK', 26), ('QPSK', 28), ('QPSK', 30), ('8PSK', 0), ('8PSK', 2), ('8PSK', 4), ('8PSK', 6), ('8PSK', 8), ('8PSK', 10), ('8PSK', 12), ('8PSK', 14), ('8PSK', 16), ('8PSK', 18), ('8PSK', 20), ('8PSK', 22), ('8PSK', 24), ('8PSK', 26), ('8PSK', 28), ('8PSK', 30), ('QAM16', 0), ('QAM16', 2), ('QAM16', 4), ('QAM16', 6), ('QAM16', 8), ('QAM16', 10), ('QAM16', 12), ('QAM16', 14), ('QAM16', 16), ('QAM16', 18), ('QAM16', 20), ('QAM16', 22), ('QAM16', 24), ('QAM16', 26), ('QAM16', 28), ('QAM16', 30), ('QAM64', 0), ('QAM64', 2), ('QAM64', 4), ('QAM64', 6), ('QAM64', 8), ('QAM64', 10), ('QAM64', 12

In [8]:
# Modulation names used by the generated datasets -> names used by data_orig.
MODULATION_RENAMES = {
    "QAM16": "16QAM",
    "QAM64": "64QAM",
    "GFSK": "GMSK",
    "WBFM": "FM",
    "AM-SSB": "AM-SSB-SC",
    "AM-DSB": "AM-DSB-SC",
}


def _rename_modulations(dataset, renames):
    """Rename the modulation part of each ``(modulation, snr)`` key in place.

    Args:
        dataset: dict keyed by ``(modulation, snr)`` tuples; mutated in place.
        renames: mapping of old modulation name -> new modulation name.
            Keys whose modulation is not listed are left untouched.
    """
    # Snapshot the keys first: the dict is modified while we walk them.
    for modulation, snr in list(dataset.keys()):
        new_name = renames.get(modulation)
        if new_name is not None:
            dataset[(new_name, snr)] = dataset.pop((modulation, snr))


# Each dictionary is renamed from its own keys, so a key present in only one
# of them no longer raises KeyError or gets skipped.
for generated_dataset in (data_awgn, data_clean):
    _rename_modulations(generated_dataset, MODULATION_RENAMES)

In [None]:
# Value stored in the SNR slot of every collapsed data_clean key.
CLEAN_SNR_SENTINEL = 9999

# Gather the per-SNR entries of each modulation, in key order, removing them
# from data_clean as we go.
clean_parts = {}
for modulation, snr in list(data_clean.keys()):
    if snr == CLEAN_SNR_SENTINEL:
        # Already collapsed (e.g. the cell is being re-run): leave it alone
        # rather than concatenating the entry with itself.
        continue
    clean_parts.setdefault(modulation, []).append(data_clean.pop((modulation, snr)))

# Store one entry per modulation under (modulation, CLEAN_SNR_SENTINEL).
for modulation, parts in clean_parts.items():
    merged_key = (modulation, CLEAN_SNR_SENTINEL)
    if merged_key in data_clean:
        # Keep samples collapsed by an earlier run ahead of the new ones.
        parts.insert(0, data_clean.pop(merged_key))
    # A single entry is stored unchanged; several are joined with one
    # concatenate call instead of one call per SNR level.
    data_clean[merged_key] = parts[0] if len(parts) == 1 else np.concatenate(parts, axis=0)

In [None]:
# Swap axes 1 and 2 of every entry in both generated datasets.
# NOTE: this rewrites the dictionaries in place, so running the cell a second
# time swaps the axes back.
for generated in (data_awgn, data_clean):
    for sample_key in generated:
        generated[sample_key] = np.array(generated[sample_key]).transpose(0, 2, 1)

(16000, 1024, 2)


In [None]:
# Fold the AWGN dataset into data_orig: append along axis 0 when the key is
# already present, otherwise store the AWGN array under a new key.
# NOTE: data_orig is modified in place, so re-running this cell appends the
# AWGN samples again.
for key in data_awgn:
    awgn_samples = data_awgn[key]
    data_orig[key] = (
        np.concatenate((data_orig[key], awgn_samples), axis=0)
        if key in data_orig
        else awgn_samples
    )

(5096, 1024, 2)


In [14]:
# Keep at most the first 5096 samples of every clean entry.
# NOTE(review): 5096 presumably matches the per-key sample count of data_orig
# after the AWGN merge — confirm before changing either dataset.
data_clean_new = {key: value[:5096] for key, value in data_clean.items()}

In [None]:
# Fold the truncated clean dataset into data_orig, key by key.
# NOTE: data_orig is modified in place, so re-running this cell appends the
# clean samples again.
for key, clean_samples in data_clean_new.items():
    if key not in data_orig:
        # New key: store the clean samples as they are.
        data_orig[key] = clean_samples
        continue
    # Existing key: append the clean samples along the sample axis.
    data_orig[key] = np.concatenate((data_orig[key], clean_samples), axis=0)

dict_keys([('BPSK', np.float32(0.0)), ('BPSK', np.float32(2.0)), ('BPSK', np.float32(4.0)), ('BPSK', np.float32(6.0)), ('BPSK', np.float32(8.0)), ('BPSK', np.float32(10.0)), ('BPSK', np.float32(12.0)), ('BPSK', np.float32(14.0)), ('BPSK', np.float32(16.0)), ('BPSK', np.float32(18.0)), ('BPSK', np.float32(20.0)), ('BPSK', np.float32(22.0)), ('BPSK', np.float32(24.0)), ('BPSK', np.float32(26.0)), ('BPSK', np.float32(28.0)), ('BPSK', np.float32(30.0)), ('QPSK', np.float32(0.0)), ('QPSK', np.float32(2.0)), ('QPSK', np.float32(4.0)), ('QPSK', np.float32(6.0)), ('QPSK', np.float32(8.0)), ('QPSK', np.float32(10.0)), ('QPSK', np.float32(12.0)), ('QPSK', np.float32(14.0)), ('QPSK', np.float32(16.0)), ('QPSK', np.float32(18.0)), ('QPSK', np.float32(20.0)), ('QPSK', np.float32(22.0)), ('QPSK', np.float32(24.0)), ('QPSK', np.float32(26.0)), ('QPSK', np.float32(28.0)), ('QPSK', np.float32(30.0)), ('8PSK', np.float32(0.0)), ('8PSK', np.float32(2.0)), ('8PSK', np.float32(4.0)), ('8PSK', np.float32(6.

In [19]:
# Relative output path of the combined HDF5 file; it is resolved against the
# kernel's current working directory when the file is opened for writing.
combined_dataset_path = "GOLD_XYZ_OSC_POSITIVE_COMBINED.hdf5"

In [20]:
# Class name -> label index; the label is the position of the name in the list.
orig_mapped_classes_index = {
    class_name: label
    for label, class_name in enumerate([
        'BPSK', 'QPSK', '8PSK', '16QAM', '64QAM',
        'AM-DSB-SC', 'AM-SSB-SC', 'FM', 'GMSK',
    ])
}

X_list = []
Y_list = []
Z_list = []

# Flatten the (modulation, snr) -> samples dictionary into three parallel
# arrays. Labels and SNRs are created with explicit dtypes: the keys mix NumPy
# float scalars with a plain int (the 9999 entries), which otherwise promotes
# the SNR column to float64 and leaves the labels as the platform default int.
for (modulation, snr), data in data_orig.items():
    mapped_class = orig_mapped_classes_index[modulation]  # KeyError on an unknown name
    n_samples = data.shape[0]
    X_list.append(data)
    Y_list.append(np.full(n_samples, mapped_class, dtype=np.int32))
    Z_list.append(np.full(n_samples, snr, dtype=np.float32))

# float32 / int32 / float32 matches the dtypes used for the filtered file.
# Passing dtype to concatenate casts while joining, avoiding a second copy of X.
X_total = np.concatenate(X_list, axis=0, dtype=np.float32)
Y_total = np.concatenate(Y_list, axis=0)
Z_total = np.concatenate(Z_list, axis=0)

with h5py.File(combined_dataset_path, 'w') as output_file:
    output_file.create_dataset('X', data=X_total, compression='gzip')
    output_file.create_dataset('Y', data=Y_total, compression='gzip')
    output_file.create_dataset('Z', data=Z_total, compression='gzip')


In [21]:
import h5py
# Absolute location of the combined dataset file.
total_dataset_path = "/kaggle/working/GOLD_XYZ_OSC_POSITIVE_COMBINED.hdf5"

# Read the three datasets fully into memory:
# X = IQ samples, Y = class indices, Z = SNR values.
with h5py.File(total_dataset_path, 'r') as file:
    X, Y, Z = (file[dataset_name][:] for dataset_name in ('X', 'Y', 'Z'))