In [None]:
import numpy as np
import matplotlib.pyplot as plt
import h5py  # For handling large datasets

def gaussian_peak(mz_range, center, width):
    """Evaluate an unnormalised Gaussian profile over ``mz_range``.

    Computes ``exp(-(x - center)**2 / (2 * width**2))`` for every ``x`` in
    ``mz_range``, so the curve has height 1 at ``x == center``.
    """
    offset = mz_range - center
    denominator = 2 * width**2
    exponent = -offset**2 / denominator
    return np.exp(exponent)

def generate_spectrum(mz_range, peak_info):
    """Sum Gaussian peaks into one spectrum sampled on ``mz_range``.

    Args:
        mz_range: Sample positions (array-like).
        peak_info: Iterable of ``(center, width)`` pairs, one per peak.

    Returns:
        Array with the shape of ``mz_range`` holding the sum of all peaks.
        The dtype is floating point: that of ``mz_range`` when it is already
        an inexact type, otherwise float64. An empty ``peak_info`` yields an
        all-zero array.
    """
    mz_values = np.asarray(mz_range)
    # np.zeros_like would inherit an integer dtype from an integer grid, and
    # the in-place addition of float peak values below would then fail with a
    # casting error. Allocate a floating accumulator explicitly instead.
    if np.issubdtype(mz_values.dtype, np.inexact):
        out_dtype = mz_values.dtype
    else:
        out_dtype = np.float64
    spectrum = np.zeros(mz_values.shape, dtype=out_dtype)
    for center, width in peak_info:
        spectrum += gaussian_peak(mz_values, center, width)
    return spectrum

def generate_data_pairs(num_pairs, max_groups, blur_range,
                        output_path='mass_spectra_test_v1.hdf5'):
    """Generate blurred/distinct spectrum pairs and write them to an HDF5 file.

    Every spectrum is sampled on a 400-point grid from 50 to 150. For each
    pair, between 1 and ``max_groups`` peak groups are drawn; a group is two
    peaks 5 units apart that share the same centre in both spectra. The
    blurred spectrum uses a width drawn uniformly from ``blur_range`` and
    Gaussian noise with standard deviation 0.05; the distinct spectrum uses a
    fixed width of 0.5 and noise with standard deviation 0.01.

    Args:
        num_pairs: Number of spectrum pairs (rows) to write.
        max_groups: Inclusive upper bound on peak groups per spectrum.
        blur_range: ``(low, high)`` bounds for the blurred peak width.
        output_path: File to write; it is opened in ``'w'`` mode. Defaults to
            the previously hard-coded ``'mass_spectra_test_v1.hdf5'``.

    The file receives two float32 datasets, ``'blurred_spectra'`` and
    ``'distinct_spectra'``, each of shape ``(num_pairs, 400)``. Random numbers
    come from the global ``np.random`` state.
    """
    mz_range = np.linspace(50, 150, 400)
    peak_spacing = 5      # offset of the second peak within a group
    distinct_width = 0.5  # fixed width of the sharp (target) peaks
    dset_shape = (num_pairs, len(mz_range))

    with h5py.File(output_path, 'w') as f:
        blurred_dset = f.create_dataset('blurred_spectra', dset_shape, dtype='float32')
        distinct_dset = f.create_dataset('distinct_spectra', dset_shape, dtype='float32')

        for i in range(num_pairs):
            num_groups = np.random.randint(1, max_groups + 1)  # upper bound is exclusive
            blurred_peaks = []
            distinct_peaks = []
            for _ in range(num_groups):
                # Centres stop at 144 so the companion peak (center + 5)
                # stays inside the 50-150 grid.
                center = np.random.randint(50, 145)
                blurred_width = np.random.uniform(blur_range[0], blur_range[1])
                for peak_center in (center, center + peak_spacing):
                    blurred_peaks.append((peak_center, blurred_width))
                    distinct_peaks.append((peak_center, distinct_width))

            # Draw the blurred noise before the distinct noise so that a
            # seeded global RNG produces the same data as before.
            blurred_spectrum = (generate_spectrum(mz_range, blurred_peaks)
                                + np.random.normal(0, 0.05, mz_range.size))
            distinct_spectrum = (generate_spectrum(mz_range, distinct_peaks)
                                 + np.random.normal(0, 0.01, mz_range.size))
            blurred_dset[i, :] = blurred_spectrum
            distinct_dset[i, :] = distinct_spectrum

# Example usage: write one blurred/distinct spectrum pair to the HDF5 file.
num_pairs = 1            # how many spectrum pairs to generate
max_groups = 10          # upper bound on peak groups within one spectrum
blur_range = (1.5, 2.2)  # (low, high) width bounds for the blurred peaks

generate_data_pairs(
    num_pairs=num_pairs,
    max_groups=max_groups,
    blur_range=blur_range,
)
