# In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, r2_score
from sklearn.preprocessing import MinMaxScaler

max_int64 = np.iinfo(np.int64).max


def _load_trace_set(qlt_path, bclt_path, iolt_path):
    """Load one QLT/BCLT/IOLT CSV trio and join it column-wise into one frame.

    Missing values are replaced with -1, and every column whose maximum leaves
    a 3x headroom below the int64 maximum is cast to int64. Columns that do
    not satisfy the check keep the dtype pandas inferred.

    Args:
        qlt_path: Path of the QLT CSV file.
        bclt_path: Path of the BCLT CSV file.
        iolt_path: Path of the IOLT CSV file.

    Returns:
        The combined DataFrame (columns of the three files side by side).
    """
    frames = [pd.read_csv(path) for path in (qlt_path, bclt_path, iolt_path)]
    combined = pd.concat(frames, axis=1)
    combined.fillna(-1, inplace=True)
    for col in combined.columns:
        # Divide the limit instead of multiplying the column maximum: the
        # maximum is a fixed-width NumPy scalar, so `max * 3` can silently
        # wrap around for large int64/uint64 values and let the cast through.
        if np.max(combined[col]) < max_int64 / 3:
            combined[col] = combined[col].astype(np.int64)
    return combined


# For the datasets (real trace data)
# In this example, we use 3 sets of QLTs, BCLTs, and IOLTs for training data
train_trace_paths = [
    ('.../qlt.csv', '.../bclt.csv', '.../iolt.csv'),
    ('.../qlt.csv', '.../bclt.csv', '.../iolt.csv'),
    ('.../qlt.csv', '.../bclt.csv', '.../iolt.csv'),
]
train_frames = [_load_trace_set(*paths) for paths in train_trace_paths]
# Rows are stacked with the last listed set on top and the first at the bottom.
final_df = pd.concat(train_frames[::-1], ignore_index=True)

df_train = final_df.copy()

# In this example, we use 1 set of QLT, BCLT, and IOLT for testing data
df_test = _load_trace_set('.../qlt.csv', '.../bclt.csv', '.../iolt.csv')

# Columns copied verbatim from the test trace into the synthetic trace.
list_a = ['access_timestamp_in_microseconds_qlt', 'key', 'type_id', 'column_family_id_qlt', 'value_size']
df_pred = pd.DataFrame(index=df_test.index)

# A column with a single distinct training value is reproduced as that constant.
constant_columns = [name for name in df_train.columns if df_train[name].nunique() == 1]
for name in constant_columns:
    constant_value = df_train[name].iloc[0]
    df_pred[name] = pd.Series(constant_value).repeat(len(df_test)).values

for name in list_a:
    df_pred[name] = df_test[name]

# Whatever has not been filled in above is left for the models to predict.
already_filled = set(df_pred.columns.tolist())
not_df_pred = [name for name in df_train.columns.tolist() if name not in already_filled]

# Number of distinct training values for every column that must be predicted.
dict_unique = {name: df_train[name].nunique() for name in not_df_pred}
# More than 15 distinct values marks a column as continuous.
continuous_columns = [name for name, distinct in dict_unique.items() if distinct > 15]

feature = list_a
# The feature matrices are the same for every target column, so build them once.
X_train = df_train[feature].values
X_test = df_pred[feature].values
for col in not_df_pred:
    y_train = df_train[col].values
    # Continuous targets are fitted with a regressor, all others with a classifier.
    if col in continuous_columns:
        model = LinearRegression()
    else:
        model = LogisticRegression(max_iter=1000000)
    model.fit(X_train, y_train)
    # Force predictions to non-negative whole numbers.
    y_pred = np.round(np.absolute(model.predict(X_test)))
    # Cast to int64 only when the values leave a 3x headroom below the int64
    # maximum. The limit is divided (rather than the maximum multiplied) so a
    # large integer maximum cannot wrap around and slip past the check.
    if np.max(y_pred) < max_int64 / 3:
        y_pred = y_pred.astype(np.int64)
    df_pred[col] = y_pred

# Put the synthetic columns back into the training column order before saving.
column_order = df_train.columns
df_pred = df_pred.loc[:, column_order]
# For the target folder of the results (synthetic trace data)
output_path = f'.../combined_bclt_and_iolt.csv'
df_pred.to_csv(output_path, index=False)

def _relative_difference(actual, predicted):
    """Return the symmetric relative difference between two scalar statistics.

    The absolute gap is divided by the mean magnitude of the two values, so
    the result lies in [0, 2]. Two zeros count as identical (0.0).
    """
    # Work in floats so fixed-width integer statistics cannot wrap on subtraction.
    actual = float(actual)
    predicted = float(predicted)
    # Magnitudes are used in the denominator: a signed sum turns negative or
    # near zero when the two statistics differ in sign (e.g. the -1 fill value
    # against a non-negative prediction), which makes the ratio meaningless.
    scale = (abs(actual) + abs(predicted)) / 2
    if scale == 0:
        return 0.0
    return abs(actual - predicted) / scale


# Summary statistics compared between each real and synthetic column:
# mean, std, median, min, max, 25th/75th percentile, range and variance.
_summary_statistics = (
    np.mean,
    np.std,
    np.median,
    np.min,
    np.max,
    lambda values: np.percentile(values, 25),
    lambda values: np.percentile(values, 75),
    lambda values: np.max(values) - np.min(values),
    np.var,
)

# Maps column name -> (prediction score, statistical difference).
dict_score = {}
for col in df_pred.columns:
    actual_values = df_test[col]
    predicted_values = df_pred[col]

    # Average the relative differences of all summary statistics.
    difference = np.mean([
        _relative_difference(statistic(actual_values), statistic(predicted_values))
        for statistic in _summary_statistics
    ])
    if difference > 2:
        difference = 2

    if col in continuous_columns:
        score = r2_score(actual_values, predicted_values)
        # Scores outside [0, 1] (R^2 can be negative) are counted as 0.
        if score < 0 or score > 1:
            score = 0
    else:
        score = accuracy_score(actual_values, predicted_values)
    dict_score[col] = (score, difference)

average_score = np.mean([score for score, difference in dict_score.values()])
average_difference = np.mean([difference for score, difference in dict_score.values()])
print(f'Average Accuracy Score: {(average_score * 100):.2f}%')
print(f'Average Statistical Difference: {(average_difference * 100):.2f}%')