In [441]:
import numpy as np
import pandas as pd
from noise import *
import random
import os 
from sklearn.model_selection import train_test_split  # Import train_test_split

In [442]:
# x values
def generate_x_values(start_x, end_x, n_datapoints, n_samples):
    # create n x values
    xs = np.linspace(start_x, end_x, n_datapoints)
    xs = np.tile(xs, (n_samples, 1))
    return xs

In [443]:
# True functions
def sin(xs):
    return np.sin(xs)

In [444]:
# Noise functions
def pink(xs, amplitude):
    # Maybe change to y_true as input (instead of xs)
    y_added_noise = np.zeros(xs.shape)
    for sample in range(xs.shape[0]):
        y_added_noise[sample, :] = amplitude*pink_noise(xs[sample, :])
    return y_added_noise

def uniform(xs, lower_bound, upper_bound):
    return np.random.uniform(lower_bound, upper_bound)

def gaussian(xs, std):
    return np.random.normal(0, std, xs.shape)

In [445]:
# SNR
def calculate_snr(y_true, y_noise): ### write own code later
    # Calculate signal power
    signal_power = np.mean(y_true ** 2)
    # Calculate noise power
    noise_power = np.mean((y_noise - y_true) ** 2)
    # Calculate (linear) SNR
    snr = signal_power / noise_power
    # Convert to dB
    snr_db = 10 * np.log10(snr)
    return snr, snr_db

In [446]:
# Dataset parameters
n_datapoints = 30  # number of (total) datapoints generated
n_samples = 100
start_x, end_x = -10, 10  # x values to be used

# Train/validation/test split parameters
train_size = 0.7
validation_size = 0.1
test_size = 0.2

# Function parameters
func = sin  # Available: x, x_sq, sin

# Noise parameters
noise = gaussian  #  #TODO Available: pink, uniform, gaussian

amplitude = None  
upper_bound, lower_bound = None, None
std = None

if noise == pink:
    amplitude = 0.001 # We want values: amplitude in (0.01, 20), to get snr_db in approx. (-30, 40) for sin(x) 
elif noise == uniform:
    upper_bound, lower_bound = -0.01, 0.01  # We want values: upper_bound, lower_bound in (-0.01, 0.01) to (-20, 20), to ge snr_db in approx. (-30, 40) for sin(x)
elif noise == gaussian:
    std = 4.5 #Mellan [0.01, 100]

In [447]:
# Dataset generation
xs = generate_x_values(start_x, end_x, n_datapoints, n_samples)  # x values
#print(xs.shape)
y_true = func(xs)

if noise == pink:
    y_added_noise = noise(xs, amplitude)
elif noise == uniform:
    y_added_noise = noise(xs, lower_bound, upper_bound)
elif noise == gaussian:
    y_added_noise = noise(xs, std)

y_noise = y_true + y_added_noise
print(y_noise.shape)

# Calculate snr, snr_db for the entire dataset
snr, snr_db = calculate_snr(y_true, y_noise)

(100, 30)


In [448]:
sum_snr = 0
sum_snr_db = 0
for i in range(y_noise.shape[0]):
        
    snr_, snr_db = calculate_snr(y_true, y_noise)
    sum_snr += snr_
    sum_snr_db += snr_db

snr_ = sum_snr / y_noise.shape[0]
snr_db_ = sum_snr_db / y_noise.shape[0]
print(f"SNR: {snr_} DB: {snr_db_}")

SNR: 0.022667274416647558 DB: -16.446006976491002


In [449]:
# Saving data
# create the datasets in dataframe format
true_data_df = pd.DataFrame(y_true)

noisy_data_df = pd.DataFrame(y_noise)


# Step 1: Split 80% for training/validation and 20% for testing
X_temp, X_test, y_temp, y_test = train_test_split(
    noisy_data_df,  
    true_data_df,
    test_size=0.2,   # 20% for testing
    random_state=42  
)

# Step 2: Split the remaining 80% into 70% training and 10% validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, 
    y_temp,
    test_size=0.125,  # 10% of the original total = 0.1 / 0.8 = 0.125
    random_state=42  
)


In [450]:
save = True
if save:
    func_name = func.__name__
    noise_name = noise.__name__
    name = func_name + "_" + noise_name + "_" + str(round(snr_db_, 2))
    folder_name = f"../datasets/seq_{name}"

    os.makedirs(folder_name, exist_ok=True)

    y_train.to_csv(f'{folder_name}/y_data_train.csv', index=False)
    y_val.to_csv(f'{folder_name}/y_data_val.csv', index=False)
    y_test.to_csv(f'{folder_name}/y_data_test.csv', index=False)
    X_train.to_csv(f'{folder_name}/X_data_train.csv', index=False)
    X_val.to_csv(f'{folder_name}/X_data_val.csv', index=False)
    X_test.to_csv(f'{folder_name}/X_data_test.csv', index=False)

    

    with open(f'{folder_name}/meta_data.txt', "w") as file:
        # Write text to the file
        file.write(f"n_samples: {n_samples}\n")
        file.write(f"n_datapoints: {n_datapoints}\n")
        file.write(f"start_x, end_x: {start_x, end_x }\n")
        file.write(f"function: {func_name}\n")
        file.write(f"noise: {noise_name}\n")
        file.write(f"snr, snr_db: {snr, snr_db}\n\n")
        file.write(f"Noise specific\n")
        file.write(f"(pink noise only) amplitude: {amplitude}\n")
        file.write(f"(uniform noise only) upper_bound, lower_bound: {upper_bound, lower_bound}\n")
        file.write(f"(gaussian noise only) std: {std}")