In [None]:
from natsort import natsorted
from sklearn.mixture import GaussianMixture
import numpy as np
import os
import pandas as pd

# Root directory that is searched recursively for the input datasets (real trace data).
# Left empty here; fill in the dataset folder before running.
root_folder = ''

def find_and_sort_csv_files(folder):
    """Recursively collect the paths of all ``.csv`` files below ``folder``.

    Args:
        folder: Directory to walk.

    Returns:
        A list of file paths (each file name joined onto the directory it was
        found in), ordered by ``natsorted``.
    """
    matches = [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder)
        for name in names
        if name.endswith('.csv')
    ]
    return natsorted(matches)

# Every CSV file found below root_folder, gathered once in natsorted order.
all_csv_files_sorted = find_and_sort_csv_files(root_folder)
# Largest value representable as int64 (upper bound for integer casts).
max_int64 = np.iinfo(np.int64).max

def process_dataset(file_path, output_dir='...', n_components=5, random_state=None):
    """Fit a Gaussian mixture to one CSV dataset and write a synthetic copy of it.

    The CSV at ``file_path`` is loaded, a ``GaussianMixture`` is fitted to all of
    its columns, and as many rows as the input has are sampled from the fitted
    model. Sampled values are rounded, columns whose values safely fit into
    int64 are cast to integers, and the result is saved as
    ``<output_dir>/<parent folder name>_<file stem>.csv`` without the index.

    Args:
        file_path: Path of the input CSV file.
        output_dir: Folder for the synthetic trace data. The default ``'...'``
            is the original placeholder and should be replaced with a real
            folder.
        n_components: Number of mixture components (default 5, as before).
        random_state: Seed forwarded to ``GaussianMixture``. Pass an int to get
            reproducible synthetic data; ``None`` keeps the previous unseeded
            behaviour.

    Returns:
        None. The synthetic data is written to disk.
    """
    df = pd.read_csv(file_path)

    model = GaussianMixture(n_components=n_components, random_state=random_state)
    model.fit(df)
    # sample() returns (samples, component_labels); only the samples are kept.
    synthetic_data = model.sample(len(df))[0]

    df_predict = pd.DataFrame(data=synthetic_data, columns=df.columns).round()

    # Computed locally so the function does not depend on a module-level constant.
    int64_max = np.iinfo(np.int64).max
    for col in df_predict.columns:
        # Same 3x safety margin as before, but applied to the largest magnitude
        # so that very negative values are not cast (and overflowed) either.
        if df_predict[col].abs().max() * 3 < int64_max:
            df_predict[col] = df_predict[col].astype(np.int64)

    # Output name: "<parent folder>_<file stem>.csv".
    dir_name = os.path.basename(os.path.dirname(file_path))
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    final_name = f'{dir_name}_{file_name}'
    df_predict.to_csv(os.path.join(output_dir, f'{final_name}.csv'), index=False)

In [None]:
for csv_file in all_csv_files_sorted:
    # Each entry already starts with root_folder (it was built from
    # os.walk(root_folder)), so it is used as-is. Joining root_folder onto it
    # again would duplicate the prefix whenever root_folder is a relative path.
    file_path = csv_file
    process_dataset(file_path)
    print(f'{file_path}')