In [None]:
from natsort import natsorted
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import os
import pandas as pd

# Root directory that is searched recursively for the input CSV files (real trace data).
# NOTE(review): empty here -- presumably a placeholder to be set before running; confirm.
root_folder = ''

def find_and_sort_csv_files(folder):
    """Return the paths of all ``.csv`` files below ``folder`` in natural order.

    The directory tree is walked recursively with ``os.walk``. Each returned
    path is the walked directory joined with the file name, so it already
    starts with ``folder``. Only names ending in the lowercase suffix
    ``.csv`` are kept. The result is ordered with ``natsorted``.

    Parameters
    ----------
    folder : str
        Directory at which the recursive search starts.

    Returns
    -------
    list of str
        Naturally sorted CSV file paths (empty when nothing matches).
    """
    matches = [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder)
        for name in names
        if name.endswith('.csv')
    ]
    return natsorted(matches)

# Collect every input CSV below `root_folder` once; the run cells iterate over this list.
all_csv_files_sorted = find_and_sort_csv_files(root_folder)
# Lag-feature settings read as globals by process_dataset: each modelled column gets
# `count` lagged copies, and consecutive lags are `distance` rows apart.
distance = 1
count = 50

def process_dataset(file_path, output_dir='...'):
    """Predict the trailing rows of one CSV from its leading rows and save them.

    The file is split chronologically: the first ``1 - test_size`` share of
    the rows is the training part, the remaining rows are the part that is
    replaced by model predictions. For every column with more than one
    distinct training value, ``count`` lagged copies of the column are used
    as features together with the row index. Each lag is shifted by at least
    the length of the predicted part, so features of predicted rows only
    come from training rows. Columns with more than 15 distinct training
    values are fitted with ``LinearRegression`` on the last 5 % of the
    training rows; the other varying columns are fitted with
    ``LogisticRegression`` on all training rows. Columns with a single
    training value are copied from the original rows unchanged.

    The function reads the module-level names ``test_size``, ``count`` and
    ``distance``; they must be defined before it is called.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to process.
    output_dir : str, optional
        Directory the result is written to. Defaults to the original
        placeholder ``'...'``. The directory is not created.

    Returns
    -------
    None
        The predicted rows are written to
        ``<output_dir>/<parent dir>_<file stem>_<test_size in percent>.csv``.
    """
    df = pd.read_csv(file_path)

    # Chronological split: leading rows train the models, trailing rows get predicted.
    train_length = int(np.round(len(df) * (1 - test_size)))
    test_length = len(df) - train_length
    df_train = df.head(train_length)
    df_predict = df.tail(test_length).copy()

    # Columns that vary in the training part are modelled; more than 15 distinct
    # values selects the regression model, otherwise the classifier is used.
    non_immutable_columns = []
    continuous_columns = set()
    for col in df_train.columns:
        unique_values = df_train[col].nunique()
        if unique_values > 1:
            non_immutable_columns.append(col)
            if unique_values > 15:
                continuous_columns.add(col)

    # Build the lag features. Every lag is filled and cast *before* it is put
    # into the frame: the former `frame[name].fillna(-1, inplace=True)` is a
    # chained in-place update, which pandas deprecates and which no longer
    # modifies the frame under Copy-on-Write (leaving NaNs behind).
    lag_names = {}
    lag_series = {}
    for col in non_immutable_columns:
        is_int = df[col].dtype == 'int64'
        names = []
        for i in range(1, count + 1):
            lag = df[col].shift((i - 1) * distance + test_length).fillna(-1)
            if is_int:
                # shift() turns integers into floats to hold NaN; restore the dtype.
                lag = np.round(lag).astype('int64')
            name = f'{col}_{i}'
            lag_series[name] = lag
            names.append(name)
        lag_names[col] = names
    lagged = pd.DataFrame(lag_series, index=df.index)

    index_feature = df.index.values.reshape(-1, 1)
    for col in non_immutable_columns:
        X = np.hstack((index_feature, lagged[lag_names[col]].values))
        y = df[col].values
        X_train, X_test = X[:train_length], X[train_length:]
        y_train = y[:train_length]

        is_continuous = col in continuous_columns
        if is_continuous:
            # Regression is fitted on the most recent 5 % of the training rows only.
            tail_value = int(0.95 * len(X_train))
            X_train = X_train[tail_value:]
            y_train = y_train[tail_value:]

        scale = MinMaxScaler()
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)

        if is_continuous:
            model = LinearRegression()
        else:
            model = LogisticRegression(max_iter=10000, random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        # Clamp predictions from below at -1, the value also used to fill missing lags.
        y_pred = np.where(y_pred < -1, -1, y_pred)
        if df[col].dtype == 'int64':
            y_pred = np.round(y_pred).astype('int64')
        df_predict[col] = y_pred

    dir_name = os.path.basename(os.path.dirname(file_path))
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    final_name = f'{dir_name}_{file_name}_{int(np.round(test_size * 100))}'
    # Target folder of the results (synthetic trace data).
    df_predict.to_csv(os.path.join(output_dir, f'{final_name}.csv'), index=False)

In [None]:
# Share of each file's trailing rows that is replaced by predictions;
# process_dataset reads this as a global.
test_size = 0.2

for csv_file in all_csv_files_sorted:
    # The collected paths already start with root_folder (they are built from
    # os.walk(root_folder)); joining root_folder again would duplicate a
    # relative prefix and point at a non-existent file.
    process_dataset(csv_file)
    print(csv_file)

In [None]:
# Share of each file's trailing rows that is replaced by predictions;
# process_dataset reads this as a global.
test_size = 0.5

for csv_file in all_csv_files_sorted:
    # The collected paths already start with root_folder (they are built from
    # os.walk(root_folder)); joining root_folder again would duplicate a
    # relative prefix and point at a non-existent file.
    process_dataset(csv_file)
    print(csv_file)

In [None]:
# Share of each file's trailing rows that is replaced by predictions;
# process_dataset reads this as a global.
test_size = 0.8

for csv_file in all_csv_files_sorted:
    # The collected paths already start with root_folder (they are built from
    # os.walk(root_folder)); joining root_folder again would duplicate a
    # relative prefix and point at a non-existent file.
    process_dataset(csv_file)
    print(csv_file)

In [None]:
# Share of each file's trailing rows that is replaced by predictions;
# process_dataset reads this as a global.
test_size = 0.9

for csv_file in all_csv_files_sorted:
    # The collected paths already start with root_folder (they are built from
    # os.walk(root_folder)); joining root_folder again would duplicate a
    # relative prefix and point at a non-existent file.
    process_dataset(csv_file)
    print(csv_file)

In [None]:
# Share of each file's trailing rows that is replaced by predictions;
# process_dataset reads this as a global.
test_size = 0.95

for csv_file in all_csv_files_sorted:
    # The collected paths already start with root_folder (they are built from
    # os.walk(root_folder)); joining root_folder again would duplicate a
    # relative prefix and point at a non-existent file.
    process_dataset(csv_file)
    print(csv_file)

In [None]:
# Share of each file's trailing rows that is replaced by predictions;
# process_dataset reads this as a global.
test_size = 0.99

for csv_file in all_csv_files_sorted:
    # The collected paths already start with root_folder (they are built from
    # os.walk(root_folder)); joining root_folder again would duplicate a
    # relative prefix and point at a non-existent file.
    process_dataset(csv_file)
    print(csv_file)