New data generation file

In [61]:
import numpy as np
import pandas as pd
from noise import *
import random
import os 
from sklearn.model_selection import train_test_split  # Import train_test_split

Functions Used for Data Generation

In [62]:
# x values
def generate_x_values(start_x, end_x, n_datapoints):
    """Return `n_datapoints` evenly spaced x values from `start_x` to `end_x`.

    Both endpoints are included (default behaviour of `np.linspace`).
    """
    return np.linspace(start_x, end_x, num=n_datapoints)

In [63]:
# True functions
def x(xs):
    """Identity function y = x: return the input values unchanged.

    The parameter is named `xs` (not `x`) because `x` is the function's own
    name; returning `x` would hand back the function object instead of data.
    """
    return xs

def x_sq(xs):
    """Quadratic function y = x^2, applied to `xs` via the power operator."""
    return pow(xs, 2)

def sin(xs):
    """Sine function y = sin(x), evaluated with `np.sin` (input in radians)."""
    values = np.sin(xs)
    return values

In [64]:
# Noise functions
def pink(xs, amplitude):
    """Return pink noise for `xs`, scaled by `amplitude`.

    `pink_noise` is not defined in this notebook; presumably it comes from the
    star-import of the local `noise` module -- TODO confirm its return shape.
    """
    unscaled = pink_noise(xs)
    return amplitude * unscaled

def uniform(xs, lower_bound, upper_bound):
    """Return independent uniform noise samples, one per element of `xs`.

    Parameters:
        xs: x values; only their shape is used.
        lower_bound: lower edge of the sampling interval.
        upper_bound: upper edge of the sampling interval.

    Returns:
        Array with the same shape as `xs`, drawn from
        `np.random.uniform(lower_bound, upper_bound)`.
    """
    # Without `size`, np.random.uniform returns a single scalar, which would
    # shift every datapoint by the same offset instead of adding noise.
    return np.random.uniform(lower_bound, upper_bound, size=np.shape(xs))

def gaussian(xs, std):
    """Return zero-mean Gaussian noise with standard deviation `std`.

    One sample is drawn per element of `xs`; `xs` must expose `.shape`.
    """
    shape = xs.shape
    return np.random.normal(loc=0, scale=std, size=shape)


In [65]:
# SNR
def calculate_snr(y_true, y_noise):
    """Compute the signal-to-noise ratio of a noisy signal.

    Parameters:
        y_true: clean signal values.
        y_noise: noisy signal values (clean signal plus noise).

    Returns:
        Tuple `(snr, snr_db)`: the linear power ratio and the same ratio in dB.
    """
    # The noise component is whatever separates the noisy signal from the clean one.
    residual = y_noise - y_true
    # Ratio of mean signal power to mean noise power.
    snr = np.mean(y_true ** 2) / np.mean(residual ** 2)
    return snr, 10 * np.log10(snr)

Change Parameters for Data Generation Here

In [None]:
# Dataset parameters
n_datapoints = 1000  # number of (total) datapoints generated
start_x, end_x = -10, 10  # x values to be used

# Train/validation/test split parameters
# (set test_size = 0 to put everything outside the training set into validation)
train_size = 0.7
validation_size = 0.2
test_size = 0.1

# Function parameters
func = sin  # Available: x, x_sq, sin

# Noise parameters
noise = gaussian  # Available: pink, uniform, gaussian

# Only the parameter(s) belonging to the selected noise type are set below;
# the others stay None and are written as None to params.txt.
amplitude = None
upper_bound, lower_bound = None, None
std = None

if noise == pink:
    amplitude = 0.01  # We want values: amplitude in (0.01, 20), to get snr_db in approx. (-30, 40) for sin(x)
elif noise == uniform:
    # Names are listed in (lower, upper) order so that each value lands in the
    # matching variable; the original order assigned -0.01 to upper_bound.
    # We want values: (lower_bound, upper_bound) from (-0.01, 0.01) to (-20, 20), to get snr_db in approx. (-30, 40) for sin(x)
    lower_bound, upper_bound = -0.01, 0.01
elif noise == gaussian:
    std = 1  # Between [0.01, 100]



Generation of Datasets

In [67]:
# Dataset generation
xs = generate_x_values(start_x, end_x, n_datapoints)  # x values
y_true = func(xs)  # noise-free target values

# Each noise function takes its own parameters, so dispatch on the selection.
if noise == pink:
    y_added_noise = noise(xs, amplitude)
elif noise == uniform:
    y_added_noise = noise(xs, lower_bound, upper_bound)
elif noise == gaussian:
    y_added_noise = noise(xs, std)
else:
    # Fail early with a clear message instead of a NameError on y_added_noise.
    raise ValueError(
        f"Unsupported noise function: {noise!r}. Available: pink, uniform, gaussian"
    )

y_noise = y_true + y_added_noise

# Calculate snr, snr_db for the entire dataset
snr, snr_db = calculate_snr(y_true, y_noise)

In [68]:
# Collect x values, noisy targets and clean targets in one table
data = {'x': xs, 'y_noise': y_noise, 'y_true': y_true}
df = pd.DataFrame(data)

# First carve out the training set; the remainder holds validation + test rows
train_df, test_val_test_df = train_test_split(df, train_size=train_size, random_state=42)
if test_size != 0:
    # Split the remainder so validation and test keep their configured proportions
    validation_df, test_df = train_test_split(
        test_val_test_df,
        train_size=validation_size / (validation_size + test_size),
        random_state=42,
    )
else:
    # No test set requested: the whole remainder becomes the validation set
    validation_df, test_df = test_val_test_df, None

In [69]:
# Preview the first rows of each split (train, test, validation)
print(train_df.head())
print("------------")
# test_df is None when test_size == 0, so guard before calling .head()
print(test_df.head() if test_df is not None else "No test set (test_size == 0)")
print("------------")
print(validation_df.head())

            x   y_noise    y_true
541  0.830831  0.760921  0.738492
440 -1.191191 -0.886211 -0.928811
482 -0.350350 -0.229023 -0.343227
422 -1.551552 -1.047368 -0.999815
778  5.575576 -0.581305 -0.650019
------------
            x   y_noise    y_true
557  1.151151  0.976650  0.913234
798  5.975976 -0.311541 -0.302400
977  9.559560 -0.109017 -0.134374
136 -7.277277 -0.784943 -0.838264
575  1.511512  0.967507  0.998243
------------
            x   y_noise    y_true
904  8.098098  0.942832  0.970351
543  0.870871  0.757773  0.764890
139 -7.217217 -0.762336 -0.804024
526  0.530531  0.502603  0.505991
868  7.377377  0.842863  0.888558


Save Data

In [70]:
# Naming of the folder and file: <function>_<noise>_<snr in dB, 2 decimals>
func_name = func.__name__
noise_name = noise.__name__
folder_name = func_name + "_" + noise_name + "_" + str(round(snr_db, 2))
print(folder_name)

# Create folder (no error if it already exists; existing files are overwritten)
os.makedirs(f'./datasets/{folder_name}', exist_ok=True)

# Save the full dataset and the training, validation, and test sets to CSV files
df.to_csv(f'./datasets/{folder_name}/true_data.csv', index=False)
train_df.to_csv(f'./datasets/{folder_name}/train_data.csv', index=False)
validation_df.to_csv(f'./datasets/{folder_name}/validation_data.csv', index=False)
# test_df is None when test_size == 0; skip the file instead of crashing
if test_df is not None:
    test_df.to_csv(f'./datasets/{folder_name}/test_data.csv', index=False)

# Save parameters to a text file
file_path = f'./datasets/{folder_name}/params.txt'

with open(file_path, "w") as file:
    file.write(f"n_datapoints: {n_datapoints}\n")
    file.write(f"start_x, end_x: {start_x, end_x }\n")
    file.write(f"function: {func_name}\n")
    file.write(f"noise: {noise_name}\n")
    file.write(f"snr, snr_db: {snr, snr_db}\n\n")
    file.write(f"Noise specific\n")
    # Parameters of noise types that were not selected are written as None
    file.write(f"(pink noise only) amplitude: {amplitude}\n")
    file.write(f"(uniform noise only) upper_bound, lower_bound: {upper_bound, lower_bound}\n")
    file.write(f"(gaussian noise only) std: {std}")

sin_gaussian_22.84
