In [3]:
from itertools import product

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score

# Load the measurements used for training. The columns read below are
# grouping, workload_size, qs1, qs2, qs3, time and energy.
df = pd.read_excel('data_for_train.xlsx')

# Turn the workload label into an integer category code so it can be fed to
# the regressors as a numeric feature.
workload_categories = df['workload_size'].astype('category')
df['workload_code'] = workload_categories.cat.codes

# Candidate values for every qs knob: the powers of ten from 1 to 1,000,000.
# All three search spaces (and every qs3_allowed entry) alias this one list.
qs_values = [10 ** exponent for exponent in range(7)]
qs1_space = qs_values
qs2_space = qs_values
qs3_space = qs_values

# Per-grouping restriction on qs3; currently every grouping allows everything.
qs3_allowed = {grouping_id: qs_values for grouping_id in (1, 2, 3)}

# Feature matrix and the two regression targets.
feature_columns = ['grouping', 'workload_code', 'qs1', 'qs2', 'qs3']
X_all = df[feature_columns]
y_time = df['time']
y_energy = df['energy']

# A single 80/20 split over all three arrays keeps features and both targets
# row-aligned; the fixed seed makes the split reproducible.
(
    X_train, X_test,
    y_time_train, y_time_test,
    y_energy_train, y_energy_test,
) = train_test_split(X_all, y_time, y_energy, test_size=0.2, random_state=42)

def train_rf_model(X_train, y_train, target_name, *, n_estimators=100, cv=5, random_state=42):
    """Cross-validate and then fit a random-forest regressor for one target.

    Prints the mean and standard deviation of the cross-validated R² on the
    training data, then fits the model on all of it.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values aligned with ``X_train``.
        target_name: Label used as the prefix of the printed CV line.
        n_estimators: Number of trees in the forest.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
        random_state: Seed for the forest, for reproducible results.

    Returns:
        The ``RandomForestRegressor`` fitted on ``X_train`` / ``y_train``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)

    # cross_val_score works on clones of `model`, so the estimator is still
    # unfitted here and gets its real fit on the complete training set below.
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')
    print(f"[{target_name}] Cross-Validation R²: {scores.mean():.4f} ± {scores.std():.4f}")

    model.fit(X_train, y_train)
    return model

# Fit one forest per target on the training split; each call also prints the
# cross-validated R² for that target.
model_time = train_rf_model(X_train, y_train=y_time_train, target_name='time')
model_energy = train_rf_model(X_train, y_train=y_energy_train, target_name='energy')

# R² on the held-out 20% that neither cross-validation nor fitting has seen.
time_test_r2 = model_time.score(X_test, y_time_test)
energy_test_r2 = model_energy.score(X_test, y_energy_test)
print(f"[time] Final Test R²: {time_test_r2:.4f}")
print(f"[energy] Final Test R²: {energy_test_r2:.4f}")

# Exhaustive search: for every (grouping, workload) pair, find the qs1/qs2/qs3
# combination with the lowest predicted time and the one with the lowest
# predicted energy, and record both predictions for each winner.
groupings = df['grouping'].unique()
workloads = df['workload_size'].unique()
workload_to_code = dict(zip(df['workload_size'], df['workload_code']))

# Column order must match the frame the models were fitted on.
feature_order = ['grouping', 'workload_code', 'qs1', 'qs2', 'qs3']

results = []

for grouping in groupings:
    # product() varies its last argument fastest, i.e. the same visiting order
    # as nested qs1 / qs2 / qs3 loops, so tie-breaking is unchanged.
    grid = list(product(qs1_space, qs2_space, qs3_allowed[grouping]))
    if not grid:
        raise ValueError(f"No qs candidates to evaluate for grouping {grouping!r}")

    for workload in workloads:
        workload_code = workload_to_code[workload]

        # One frame holding every candidate lets each model score the whole
        # grid in a single predict() call instead of one call per candidate.
        candidates = pd.DataFrame(grid, columns=['qs1', 'qs2', 'qs3'])
        candidates['grouping'] = grouping
        candidates['workload_code'] = workload_code
        candidates = candidates[feature_order]

        time_predictions = model_time.predict(candidates)
        energy_predictions = model_energy.predict(candidates)

        for target, scores in (('time', time_predictions), ('energy', energy_predictions)):
            # argmin returns the first minimum, matching a strict `<` scan.
            best_index = int(np.argmin(scores))
            best_params = grid[best_index]

            results.append({
                'grouping': grouping,
                'workload_size': workload,
                'optimized_for': target,
                'qs1': best_params[0],
                'qs2': best_params[1],
                'qs3': best_params[2],
                'predicted_time': time_predictions[best_index],
                'predicted_energy': energy_predictions[best_index],
            })

results_df = pd.DataFrame(results)

# For each workload and each objective, keep the single row (across all
# groupings) whose prediction for that objective is the smallest.
objective_metrics = {'time': 'predicted_time', 'energy': 'predicted_energy'}
carried_columns = ['grouping', 'qs1', 'qs2', 'qs3', 'predicted_time', 'predicted_energy']

summary = []
for workload in workloads:
    per_workload = results_df[results_df['workload_size'] == workload]

    for target, metric in objective_metrics.items():
        contenders = per_workload[per_workload['optimized_for'] == target]
        best_row = contenders.nsmallest(1, metric).iloc[0]

        entry = {'Best for': target, 'workload_size': workload}
        entry.update({column: best_row[column] for column in carried_columns})
        summary.append(entry)

summary_df = pd.DataFrame(summary)

# Both tables go onto the same sheet: the full results at the top, and the
# summary further down with a gap of blank rows in between.
summary_start_row = len(results_df) + 5
with pd.ExcelWriter("optimal_qs_results.xlsx", engine="openpyxl") as writer:
    for frame, first_row in ((results_df, 0), (summary_df, summary_start_row)):
        frame.to_excel(writer, sheet_name="Optimal Configs", index=False, startrow=first_row)

print("\n Training, cross-validation, and opt are done in 'optimal_qs_results.xlsx'")

print("Results:")
print(results_df)
print("\n Summary:")
print(summary_df)


[time] Cross-Validation R²: 0.9835 ± 0.0041
[energy] Cross-Validation R²: 0.9590 ± 0.0137
[time] Final Test R²: 0.9658
[energy] Final Test R²: 0.8950

 Training, cross-validation, and opt are done in 'optimal_qs_results.xlsx'
Results:
    grouping workload_size optimized_for      qs1      qs2      qs3  \
0          1         large          time     1000     1000     1000   
1          1         large        energy     1000     1000     1000   
2          1         small          time  1000000  1000000  1000000   
3          1         small        energy   100000   100000   100000   
4          2         large          time       10       10       10   
5          2         large        energy   100000   100000   100000   
6          2         small          time    10000    10000     1000   
7          2         small        energy     1000     1000     1000   
8          3         large          time        1        1        1   
9          3         large        energy       10      