In [1]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import GRU, Dense, LSTM
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from keras.utils import to_categorical
import pandas as pd
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import os
import numpy as np
import datetime
from tensorflow import keras
import tensorflow as tf
from tensorboard.plugins.hparams import api as hp
import optuna
from optuna import logging
from keras.optimizers import Adam
from optuna.integration import TFKerasPruningCallback

# Location of the input CSV, relative to the notebook's working directory.
path = "../../dane/8CPU_20RAM/3600s/7_merged.csv"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the data from the CSV file referenced by `path`.
df = pd.read_csv(path)

In [3]:
# One-hot encoding (0/1 indicator columns) is deliberately not used here because
# it would greatly increase computation time; each categorical column is instead
# replaced in place by integer codes from its own freshly fitted LabelEncoder.
for col in ['replicaId', 'endpointUrl_methods']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [4]:
# Standardize the selected numeric columns.
features_to_scale = ['queueSizeForward_methods', 'queueSizeBack_methods',
                     'cpuUsage_stock', 'memoryUsage_stock',
                     'applicationTime_trading', 'databaseTime_trading',
                     'numberOfSellOffers_trading', 'numberOfBuyOffers_trading']

# Fit once on all columns together: StandardScaler standardizes every column
# independently, so the resulting values match a column-by-column fit, but the
# fitted `scaler` now keeps the statistics of *all* columns (re-fitting it per
# column left it holding only the last column's mean/scale, which made it
# unusable for transforming new data or inverting the scaling).
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# scaling statistics — consider fitting on the training rows only.
scaler = StandardScaler()  # MinMaxScaler was noted by the author as an alternative
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

In [5]:
# Encode the 'test' column (the classification target) as integer class ids.
# `le_test` is kept so the ids can be mapped back to the original values.
le_test = LabelEncoder()
df['test'] = le_test.fit_transform(df['test'])

In [6]:
# Feature selection: only the columns listed in `features` are used as model input.
# Columns present in the original list but currently disabled:
#   timestamp, apiTime_methods, applicationTime_methods, databaseTime_methods,
#   endpointUrl_methods, queueSizeForward_methods, queueSizeBack_methods,
#   applicationTime_trading, databaseTime_trading, numberOfSellOffers_trading,
#   numberOfBuyOffers_trading, cpuUsage_traffic, memoryUsage_traffic, replicaId
features = ['cpuUsage_stock', 'memoryUsage_stock']
df_features = df[features]

# Independent copy of the selected features with the encoded 'test' label
# appended as the last column (assign() copies before adding the column).
df_encoded = df_features.assign(test=df['test'])

In [7]:
def create_windows(X, y, window_size, step_size):
    """Cut a feature table into fixed-length sliding windows, one label per window.

    Args:
        X: pandas DataFrame (or Series) of features, rows in sequence order.
        y: pandas Series of labels, positionally aligned with ``X``.
        window_size: number of consecutive rows in each window.
        step_size: number of rows between the starts of consecutive windows.

    Returns:
        Tuple ``(X_windows, y_windows)`` of NumPy arrays. ``X_windows`` has shape
        ``(n_windows, window_size, n_features)`` and ``y_windows`` has shape
        ``(n_windows,)``, holding the label of the last row of each window.
        When ``X`` has fewer than ``window_size`` rows, ``n_windows`` is 0 but
        ``X_windows`` keeps its trailing dimensions so it can still be
        concatenated with non-empty results.

    Raises:
        ValueError: if ``step_size`` is 0 (raised by ``range``).
    """
    values = X.to_numpy()
    labels = y.to_numpy()

    # "+ 1" keeps the final window that ends exactly on the last row.
    starts = range(0, len(values) - window_size + 1, step_size)
    # Label of the last observation inside the window (not the row after it).
    last_rows = [start + window_size - 1 for start in starts]

    if not last_rows:
        empty_X = np.empty((0, window_size) + values.shape[1:], dtype=values.dtype)
        return empty_X, labels[:0]

    X_windows = np.stack([values[start:start + window_size] for start in starts])
    y_windows = labels[last_rows]
    return X_windows, y_windows

In [8]:
# Empty lists that collect the per-box window arrays for the training and
# test sets; each appended element is the array of windows for one box.
X_train = []
X_test = []
y_train = []
y_test = []

In [9]:
window_size = 5000  # rows per window
step_size = 1000    # rows between the starts of consecutive windows

# Reset the accumulators in this cell so that re-running it rebuilds the
# windows instead of appending a second copy to lists from an earlier run.
X_train, y_train = [], []
X_test, y_test = [], []

# Every distinct value of 'test' (a "box") is split and windowed on its own,
# so no window ever mixes rows from two different boxes.
for box in df_encoded['test'].unique():
    # Keep only the records belonging to this box
    box_data = df_encoded[df_encoded['test'] == box]
    box_features = box_data.drop('test', axis=1)
    box_labels = box_data['test']

    # Positional split: the first 80% of the box's rows go to training,
    # the remaining 20% to testing.
    split_point = int(len(box_data) * 0.8)
    X_train_box, y_train_box = box_features.iloc[:split_point], box_labels.iloc[:split_point]
    X_test_box, y_test_box = box_features.iloc[split_point:], box_labels.iloc[split_point:]

    # Build the sliding windows for each part separately
    X_train_windows, y_train_windows = create_windows(X_train_box, y_train_box, window_size, step_size)
    X_test_windows, y_test_windows = create_windows(X_test_box, y_test_box, window_size, step_size)

    # Collect this box's windows
    X_train.append(X_train_windows)
    y_train.append(y_train_windows)
    X_test.append(X_test_windows)
    y_test.append(y_test_windows)

In [10]:
# Merge the per-box training windows into single arrays by concatenating
# along the first (window) axis.
X_train_combined = np.concatenate(X_train, axis=0)
y_train_combined = np.concatenate(y_train, axis=0)

In [11]:
# One-hot encode the training labels; the number of classes is the number of
# distinct values in the encoded 'test' column.
y_train_combined_encoded = to_categorical(y_train_combined, num_classes=len(df['test'].unique()))

In [12]:
# Same treatment for the test windows: concatenate the per-box arrays along the
# first axis, then one-hot encode the labels with the same number of classes.
X_test_combined = np.concatenate(X_test, axis=0)
y_test_combined = np.concatenate(y_test, axis=0)
y_test_encoded_combined = to_categorical(y_test_combined, num_classes=len(df['test'].unique()))

In [13]:
def create_model(n_units:int, n_layers: int, learning_rate: float):
    """Build and compile a GRU (+ optional stacked LSTM) softmax classifier.

    The network is one GRU layer, followed by ``n_layers`` LSTM layers, followed
    by a softmax Dense layer with one unit per distinct value of ``df['test']``.
    The input shape is read from the notebook globals ``window_size`` and
    ``X_train_combined``.

    Args:
        n_units: number of units in the GRU layer and in every LSTM layer.
        n_layers: number of LSTM layers stacked after the GRU (0 for none).
        learning_rate: learning rate passed to the Adam optimizer.

    Returns:
        A compiled Sequential model (categorical cross-entropy loss, accuracy metric).
    """
    n_features = X_train_combined.shape[2]

    # The GRU must emit the full sequence only when an LSTM layer consumes it.
    stack = [GRU(n_units, return_sequences=n_layers > 0,
                 input_shape=(window_size, n_features))]
    # Every LSTM except the last one passes its full sequence to the next layer.
    stack.extend(LSTM(n_units, return_sequences=depth < n_layers - 1)
                 for depth in range(n_layers))
    stack.append(Dense(len(df['test'].unique()), activation='softmax'))

    model = Sequential(stack)
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(learning_rate=learning_rate),
                  metrics=['accuracy'])
    return model

In [14]:
def objective(trial):
    """Optuna objective: train one model with sampled hyperparameters.

    Samples ``n_units``, ``lr``, ``batch_size``, ``epochs`` and ``n_layers`` from
    the trial, trains the model returned by ``create_model`` on the combined
    training windows and returns its accuracy on the combined test windows,
    which the study maximizes.

    NOTE(review): the same test windows serve as validation data during
    training and as the final score, so the tuned value is an optimistic
    estimate — consider a separate validation split.

    Args:
        trial: the ``optuna.trial.Trial`` supplying hyperparameter suggestions.

    Returns:
        Accuracy (float) from ``model.evaluate`` on the test windows.
    """
    # Drop graph/layer state left behind by earlier trials so memory does not
    # keep growing over the course of the study.
    tf.keras.backend.clear_session()

    # Define the hyperparameters
    n_units = trial.suggest_int("n_units", 5, 175)
    # Learning rates span three orders of magnitude, so sample on a log scale;
    # linear sampling would place ~90% of the draws in [1e-3, 1e-2].
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    batch_size = trial.suggest_int('batch_size', 5, 125)
    epochs = trial.suggest_int('epochs', 3, 15)
    n_layers = trial.suggest_int('n_layers', 0, 2)

    # Build and compile the model
    model = create_model(n_units, n_layers, lr)

    # Early stopping (patience of 10 against at most 15 epochs, so it only
    # triggers on the longest runs)
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
    # The pruner compares intermediate values in the study's direction, so the
    # reported metric must be the one being optimized: validation accuracy
    # (reporting val_loss to a maximizing study inverts the comparison).
    pruning = TFKerasPruningCallback(trial, 'val_accuracy')

    print('start ', n_units, lr, batch_size, epochs,n_layers)

    # Train the model
    model.fit(X_train_combined, y_train_combined_encoded,
              validation_data=(X_test_combined, y_test_encoded_combined),
              epochs=epochs,
              batch_size=batch_size,
              callbacks=[es, pruning])

    # Evaluate the model; score is [loss, accuracy]
    score = model.evaluate(X_test_combined, y_test_encoded_combined, verbose=0)

    print(f"Trial {trial.number}, values: {trial.params}, result: {score[1]}")

    return score[1]  # return validation accuracy

In [15]:
# `logging` here is optuna's logging module (imported via `from optuna import
# logging`), not the standard-library one. Set the 'optuna' logger to INFO.
logger = logging.get_logger('optuna')
logger.setLevel(logging.INFO)

In [16]:
# Create a study that maximizes the value returned by `objective` and run
# 150 trials.
# NOTE(review): the captured log reports the study as "created in memory", so
# an interrupted kernel loses all finished trials — consider passing `storage=`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=150)

[32m[I 2023-05-22 01:09:59,430][0m A new study created in memory with name: no-name-2d73c4ac-ed3b-4429-99bc-cb9a38636979[0m


start  86 0.008049637669161341 92 8 1
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


[32m[I 2023-05-22 20:02:11,486][0m Trial 0 finished with value: 0.41969695687294006 and parameters: {'n_units': 86, 'lr': 0.008049637669161341, 'batch_size': 92, 'epochs': 8, 'n_layers': 1}. Best is trial 0 with value: 0.41969695687294006.[0m


Trial 0, values: {'n_units': 86, 'lr': 0.008049637669161341, 'batch_size': 92, 'epochs': 8, 'n_layers': 1}, result: 0.41969695687294006
start  37 0.0009817388994273065 28 4 2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


[32m[I 2023-05-22 20:50:29,517][0m Trial 1 finished with value: 0.49848484992980957 and parameters: {'n_units': 37, 'lr': 0.0009817388994273065, 'batch_size': 28, 'epochs': 4, 'n_layers': 2}. Best is trial 1 with value: 0.49848484992980957.[0m


Trial 1, values: {'n_units': 37, 'lr': 0.0009817388994273065, 'batch_size': 28, 'epochs': 4, 'n_layers': 2}, result: 0.49848484992980957
start  124 0.008697099267550874 78 10 1
Epoch 1/10

In [None]:
# Results: best hyperparameters and the best objective value found by the study.
best_params = study.best_params
best_accuracy = study.best_value
print(f"Best parameters: {best_params}")
print(f"Best validation accuracy: {best_accuracy}")

In [None]:
# Per-box evaluation of the best configuration found by the study.
# The previous version called `grid.score(...)`, but no `grid` object is
# defined in the visible notebook cells, so the cell failed with NameError.
# A model is rebuilt from the study's best hyperparameters and retrained on
# the combined training windows before scoring each box separately.
best_params = study.best_params
final_model = create_model(best_params['n_units'], best_params['n_layers'], best_params['lr'])
final_model.fit(X_train_combined, y_train_combined_encoded,
                epochs=best_params['epochs'],
                batch_size=best_params['batch_size'])

n_classes = len(df['test'].unique())
for i in range(len(X_test)):
    # A box whose test part yielded no windows cannot be evaluated.
    if len(X_test[i]) == 0:
        print(f"Test {i+1}: no windows, skipped")
        continue
    y_test_encoded = to_categorical(y_test[i], num_classes=n_classes)
    # evaluate() returns [loss, accuracy]; report the accuracy as the score.
    score = final_model.evaluate(X_test[i], y_test_encoded, verbose=0)[1]
    print(f"Test {i+1}: Score = {score}")