# Library


In [1]:
import os
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
import torch
from resources.data_utils import get_RFP_type_single, check_smoothness_dataset, peak_to_ring_num, random_sampling, check_smoothness, get_RFP_type, scale_dataset, scaleback_dataset

In [2]:
# Execute the companion notebooks inside this session so that the names they
# define become available here (presumably the VAE and CombinedModel classes
# used further down -- confirm against those notebooks).
%run VAE_core.ipynb
%run MLP_VAE_core.ipynb

In [None]:
# Device: use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
def moving_average_filter(data, window_size):
    """Smooth a 1-D signal with a centered moving average.

    Each interior sample is replaced by the mean of the samples within
    ``window_size // 2`` positions on either side of it (so the effective
    window spans ``2 * (window_size // 2) + 1`` samples). Samples closer to
    either end than that radius are copied through unchanged.

    Args:
        data: 1-D NumPy array to smooth.
        window_size: nominal width of the averaging window.

    Returns:
        A new float array with the same length as ``data``.
    """
    radius = window_size // 2
    n_samples = data.shape[0]
    result = np.zeros(n_samples)

    # Leading and trailing edges: not enough neighbours, keep the raw values
    result[:radius] = data[:radius]
    result[n_samples - radius:] = data[n_samples - radius:]

    # Interior samples: average over the symmetric neighbourhood
    for center in range(radius, n_samples - radius):
        neighbourhood = data[center - radius: center + radius + 1]
        result[center] = neighbourhood.mean()

    return result

def remove_rand_peaks(data, zero_threshold=0, min_zero_length=20, peak_threshold=0.05):
    """Zero out everything from the first quiet stretch of the signal onwards.

    The signal is scanned from the start for the first run of
    ``min_zero_length`` consecutive samples that are all ``<= zero_threshold``
    and contain no sample ``> peak_threshold``. When such a run is found,
    every sample from its first position to the end is set to 0. When none is
    found the data is returned unchanged.

    Args:
        data: array-like signal; it is not modified.
        zero_threshold: largest value still treated as "zero".
        min_zero_length: number of consecutive quiet samples required.
        peak_threshold: a run containing a value above this is rejected.

    Returns:
        A copy of ``data`` with the tail zeroed when a quiet run exists.
    """
    cleaned = np.copy(data)
    total = len(data)
    cut_at = None

    for start in range(total):
        stop = start + min_zero_length
        if stop > total:
            # Not enough samples left for a full-length run at this position
            continue

        chunk = cleaned[start:stop]
        is_quiet = np.all(chunk <= zero_threshold)
        has_peak = np.any(chunk > peak_threshold)
        if is_quiet and not has_peak:
            cut_at = start
            break

    if cut_at is not None:
        # Everything after the quiet run is treated as a spurious peak
        cleaned[cut_at:] = 0

    return cleaned


In [162]:
def save_ML_by_type(idx, num_pattern, target_idx_list, all_ML_outputs, all_params, target_ring_num, output_dir):
    """Collect the smooth predictions of one ring class and save them as temp files.

    Args:
        idx: file ID, used as the suffix of the temp file names.
        num_pattern: max number of data points to collect.
        target_idx_list: indices (into ``all_ML_outputs`` / ``all_params``) of
            the samples belonging to the target class.
        all_ML_outputs: ML predictions; sample ``i`` is read as
            ``all_ML_outputs[i, 0, :]``.
        all_params: random parameters, indexed like ``all_ML_outputs``.
        target_ring_num: wanted ring number, used as file-name prefix.
        output_dir: directory prefix; it is concatenated directly with the
            file name, so it must end with a path separator.

    Writes three ``.npy`` files (outputs, params, types) and returns nothing.
    """
    kept_outputs, kept_params, kept_types = [], [], []

    for sample_idx in target_idx_list:
        profile = all_ML_outputs[sample_idx, 0, :]
        param_row = np.array(all_params[sample_idx])
        _, rfp_type = get_RFP_type_single(profile)

        # A sample is rejected when it fails the smoothness check or when its
        # last value is still above 5% of its maximum
        smooth_flag, _ = check_smoothness(profile)
        tail_too_high = profile[-1] > 0.05 * np.max(profile)
        if tail_too_high or smooth_flag != 1:
            continue

        kept_outputs.append(np.array(profile))
        kept_params.append(np.array(param_row))
        kept_types.append(np.array(rfp_type))

        # Stop as soon as enough data points have been gathered
        if len(kept_outputs) == num_pattern:
            break

    # Save temp files
    collected = (
        ('outputs', kept_outputs),
        ('params', kept_params),
        ('types', kept_types),
    )
    for label, values in collected:
        filename = f"{output_dir}{target_ring_num}_rings_{label}_{idx}.npy"
        np.save(filename, np.array(values))

def concatenate_temps_by_type(idx, num_pattern, target_ring_num, col_names, output_dir):
    """Merge the temp files of one ring class and save the combined dataset.

    Temp files with IDs ``1 .. idx`` are read from ``output_dir``. An ID is
    skipped when its outputs file is missing or empty. The merged data is
    truncated to ``num_pattern`` rows and written to three ``.npy`` files
    (outputs, params, types).

    Args:
        idx: highest temp-file ID to read (inclusive).
        num_pattern: maximum number of rows kept in the merged dataset.
        target_ring_num: ring number used as the file-name prefix.
        col_names: column names for the parameter DataFrame.
        output_dir: directory prefix; it is concatenated directly with the
            file name, so it must end with a path separator.

    Returns:
        A 3-tuple ``(num_found, target_df, outputs)``:
        ``num_found`` is the total number of merged rows before truncation,
        ``target_df`` holds the parameters plus an ``RFP_type`` column, and
        ``outputs`` is the array of merged outputs. Both ``target_df`` and
        ``outputs`` are limited to the first ``num_pattern`` rows so they stay
        aligned. When nothing could be loaded the result is
        ``(0, <empty DataFrame>, <empty array>)``.
    """
    all_outputs = []
    all_params = []
    all_types = []

    for i in range(1, idx + 1):

        # Outputs
        filename = output_dir + f'{target_ring_num}_rings_outputs_{i}.npy'
        if not os.path.exists(filename):  # Skip if file doesn't exist
            continue

        arr = np.load(filename)
        if len(arr) == 0:  # If it's empty
            continue
        all_outputs.append(arr)

        # Parameters
        filename = output_dir + f'{target_ring_num}_rings_params_{i}.npy'
        all_params.append(np.load(filename))

        # Pattern types
        filename = output_dir + f'{target_ring_num}_rings_types_{i}.npy'
        all_types.append(np.load(filename, allow_pickle=True))

    # If any is empty, skip. The same 3-tuple shape as the normal path is
    # returned so that callers can always unpack three values.
    if not all_outputs or not all_params or not all_types:
        print(f"{target_ring_num} ring -- Empty file")
        return 0, pd.DataFrame(columns=col_names), np.empty((0,))

    # Concatenate results
    all_outputs = np.concatenate(all_outputs)
    all_params = np.concatenate(all_params)
    all_types = np.concatenate(all_types)
    num_found = len(all_outputs)

    # df
    target_df = pd.DataFrame(all_params, columns=col_names)
    target_df['RFP_type'] = all_types  # peak number
    target_df = target_df.head(num_pattern)

    # Truncate once so the saved files, the DataFrame and the returned
    # outputs all describe the same rows
    kept_outputs = all_outputs[:num_pattern]
    kept_params = all_params[:num_pattern]
    kept_types = all_types[:num_pattern]

    # Save
    print(target_ring_num, ' ring -- ', len(all_types))
    filename = output_dir + str(target_ring_num) + '_rings_outputs.npy'
    np.save(filename, kept_outputs)
    filename = output_dir + str(target_ring_num) + '_rings_params.npy'
    np.save(filename, kept_params)
    filename = output_dir + str(target_ring_num) + '_rings_types.npy'
    np.save(filename, kept_types)

    return num_found, target_df, kept_outputs

# Sampling setup and model

In [174]:
# Folder layout: inference results go into a sub-folder of the data folder,
# trained model files are read from the model folder
data_dir = "./all_data/"
model_dir = './model/'
output_dir = f"{data_dir}inference_1/"
os.makedirs(output_dir, exist_ok=True)  # no error when the folder already exists

In [160]:
# Sampling setup
# One row per parameter: (name, lower bound, upper bound, scaling option).
# Both structures used by the rest of the notebook are derived from this
# single table, so the i-th scaling option always belongs to the i-th range.
_sampling_table = [
    ('DC', 0.5e-3, 12.5e-2, 'exp'),
    ('aC', 0.1, 1, 'linear'),
    ('aA', 100, 100000, 'exp'),
    ('aT', 10, 8000, 'exp'),
    ('aL', 5, 500, 'exp'),
    ('dA', 0.001, 0.1, 'exp'),
    ('dT', 3, 300, 'exp'),
    ('dL', 0.144, 14.4, 'exp'),
    ('alpha', 1, 5, 'linear'),
    ('beta', 2, 2000, 'exp'),
    ('Kphi', 1, 10, 'linear'),
    ('N0', 200000, 5000000, 'linear'),
]

# name -> [lower bound, upper bound]
scaling_ranges = {name: [low, high] for name, low, high, _ in _sampling_table}
# scaling option per parameter, in the same order as the keys above
scaling_options = [option for *_, option in _sampling_table]

In [None]:
# Model parameters
input_dim = 12
seq_length = 201
batch_size = 32
latent_dim = 16
latent_channel = 16

# Build the networks
# Initiate VAE (only its decoder is handed to the combined model below)
vae = VAE(seq_length, latent_dim, latent_channel)
# Initiate MLP
# NOTE(review): the meaning of the last argument (42) is not visible in this
# notebook -- confirm against the CombinedModel definition.
model = CombinedModel(input_dim, latent_dim, vae.decoder, 42)

# Load trained model
# NOTE(review): only the combined checkpoint is loaded; the decoder weights
# are presumably contained in it -- confirm.
filename = os.path.join(model_dir, "MLP_VAE.pt")
# map_location lets a checkpoint that was saved on a GPU be loaded on a
# CPU-only machine, matching the CPU fallback used when choosing `device`
model.load_state_dict(torch.load(filename, map_location=device))
model = model.to(device)
model.eval()  # inference mode
print(model)

# Loop for generating a target type

## Loop

In [None]:
# Repeatedly sample random parameter sets, run them through the model, keep
# the smooth predictions of the target ring class and accumulate them in temp
# files until enough have been collected (or the iteration budget is spent).
idx = 1  # ID of the temp files written in the current iteration
num_nonpattern = 5000  # Max data points kept per temp file (passed to save_ML_by_type)
num_pattern = 5000 # Target patterning sets
target_class = 1 # 1 ring
col_names = scaling_ranges.keys()
N = 10000 # Total number of random parameters to infer every loop

for iii in tqdm(range(0, 400)):

    # Randomly sample parameters
    rand_params = random_sampling(scaling_ranges, scaling_options, col_names, N)
    rand_params = scale_dataset(rand_params, scaling_ranges, scaling_options)    
    params_tensor = torch.tensor(rand_params, dtype=torch.float32)

    # Save and load: the round trip through the text file limits the
    # parameters to the 8 decimals of the '%0.8f' format before inference
    filename = os.path.join(output_dir, 'temp.txt')
    np.savetxt(filename, params_tensor, delimiter=',', fmt='%0.8f')
    params_tensor = np.loadtxt(filename, delimiter=',')
    params_tensor = torch.tensor(params_tensor, dtype=torch.float32).to(device)
  
    # Get start time
    start_time = time.time()

    # Predict 
    with torch.no_grad():
        ML_outputs, _, _ = model(params_tensor)
    # CUDA kernels run asynchronously; wait for them so the measured time
    # covers the actual computation
    if device.type == 'cuda':
        torch.cuda.synchronize()
    end_time = time.time()

    #  Total compute time
    comp_time = end_time - start_time
    comp_time_per_pred = comp_time/len(ML_outputs)
    print(' **************************************************************** ')
    print(f"Comp time: {comp_time} seconds")
    print(f"Averaged comp time: {comp_time_per_pred} seconds")
    print(str((iii + 1) * N) + ' data points have been generated so far')

    ML_outputs = ML_outputs.cpu().numpy()
    params_tensor = params_tensor.cpu()
    
    # Optional smoothing step, currently disabled
    # print(' -------------------------- Smoothing -------------------------- ')
    # ML_outputs = np.array(ML_outputs)
    # for i in range(0, len(ML_outputs)):
    #     data = ML_outputs[i, 0, :]
    #     data = remove_rand_peaks(data, zero_threshold=0.05, min_zero_length=5, peak_threshold=0.1)
    #     data = np.array(moving_average_filter(data, window_size=9)) 
    #     ML_outputs[i, 0, :] = data 
        
    print(' ---------------------- Get pattern type ----------------------- ')
    _, RFP_type_list = get_RFP_type(ML_outputs)
    if_smooth_list = check_smoothness_dataset(ML_outputs)
    
    print('------------------------ Process by type ------------------------ ')
    output_df = pd.DataFrame(params_tensor, columns=col_names)
    output_df['if_smooth'] = if_smooth_list
    output_df['RFP_type'] = RFP_type_list
    output_df['ring_num'] = peak_to_ring_num(np.array(RFP_type_list))

    # Remove nonsmooth predictions
    output_df = output_df[output_df['if_smooth'] == 1]

    # Get index for target type (index labels still equal the row positions
    # in ML_outputs because the frame was only filtered, never re-indexed)
    RFP_target_idx = output_df[output_df['ring_num'] == target_class].index
    print(len(RFP_target_idx))
    print(' ************************ Save temp files ************************ ')
    save_ML_by_type(idx, num_nonpattern, RFP_target_idx, ML_outputs, params_tensor, target_class, output_dir)

    print(' ************************** Concatenate ************************** ')
    num_target, target_df, concate_outputs = concatenate_temps_by_type(idx, num_pattern, target_class, col_names, output_dir)
    
    idx += 1
    # The accumulated count can jump past the target between two iterations,
    # so stop as soon as it is reached or exceeded
    if num_target >= num_pattern:
        print('Finished!')
        break


## Save

In [None]:
# Save the final dataset
print(len(target_df))
all_sets = target_df
all_set_outputs = concate_outputs
# Keep only as many output rows as there are parameter rows, so outputs,
# parameters and types saved below describe the same samples
ML_all_outputs = np.vstack((all_set_outputs))[:len(all_sets)]
ML_all_params = all_sets[list(col_names)]
ML_all_types = all_sets['RFP_type']

# Save normalized parameters (as produced by the sampling loop, before scaling back)
filename = output_dir + 'ML_norm_params.npy'
np.save(filename, np.array(ML_all_params)) 
filename = os.path.join(output_dir, 'ML_norm_params.txt')
np.savetxt(filename, ML_all_params, delimiter=',', fmt='%0.8f')

# Scale params
ML_all_params = ML_all_params.to_numpy()
ML_all_params = scaleback_dataset(ML_all_params, scaling_ranges, scaling_options)

# Save
filename = output_dir + 'ML_outputs.npy'
np.save(filename, np.array(ML_all_outputs))
filename = output_dir + 'ML_params.npy'
np.save(filename, np.array(ML_all_params)) # parameters after scaleback_dataset
filename = output_dir + 'ML_types.npy'
np.save(filename, np.array(ML_all_types))

filename = os.path.join(output_dir, 'ML_params.txt')
np.savetxt(filename, ML_all_params, delimiter=',', fmt='%0.8f')
print(filename)