In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import numpy as np

# Relative path to the merged CSV file that is read into `df` with pd.read_csv.
path = "../../dane/8CPU_20RAM/3600s/few_merged.csv"

In [19]:
# Load the data from the CSV file at `path` into a DataFrame.
df = pd.read_csv(path)

In [20]:
# Replace the values of the identifier columns with integer codes.
# A fresh encoder is fitted for each column, so codes are per-column.
categorical_columns = ('replicaId', 'endpointUrl_methods')
for col in categorical_columns:
    le = LabelEncoder()
    le.fit(df[col])
    df[col] = le.transform(df[col])

In [21]:
# Standardize the selected numeric columns (each column is scaled independently).
features_to_scale = ['timestamp', 'queueSizeForward_methods', 'queueSizeBack_methods',
                     'cpuUsage_stock', 'memoryUsage_stock',
                     'applicationTime_trading', 'databaseTime_trading',
                     'numberOfSellOffers_trading', 'numberOfBuyOffers_trading']

# Fit a single scaler on all columns at once. Re-fitting the same scaler inside a
# per-column loop leaves it holding only the last column's parameters, which makes
# it useless for inverse-transforming e.g. the predicted target columns later on.
# After this call `scaler` keeps the statistics of every column, in the order of
# `features_to_scale`.
# NOTE(review): the scaler is fitted on every row of `df`, including the rows that
# are later placed in the per-box test split; consider fitting on training rows only.
scaler = StandardScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

In [22]:
# Feature selection: the columns used as model inputs.
# Columns that are currently switched off (add them to the list to enable them):
#   apiTime_methods, applicationTime_methods, databaseTime_methods,
#   queueSizeForward_methods, queueSizeBack_methods,
#   applicationTime_trading, databaseTime_trading,
#   numberOfSellOffers_trading, numberOfBuyOffers_trading,
#   cpuUsage_traffic, memoryUsage_traffic, replicaId
features = ['timestamp', 'endpointUrl_methods', 'cpuUsage_stock', 'memoryUsage_stock']

# Columns the model is asked to predict (both are also present in `features`).
targets = ['cpuUsage_stock', 'memoryUsage_stock']

df_features = df.loc[:, features]
df_targets = df.loc[:, targets]

# Encode the box identifier (the 'test' column) as integer codes, in place.
le = LabelEncoder()
df['test'] = le.fit_transform(df['test'])

# Working copy: the selected features plus the encoded box identifier.
df_encoded = df_features.assign(test=df['test'])


In [23]:
# Empty, independent accumulators for the training and test data
# (filled later with one array of windows per box).
X_train, X_test, y_train, y_test = [], [], [], []

In [24]:
def create_windows(X, y, window_size, step_size):
    """Cut a sequence into fixed-length input windows with next-step labels.

    A window starts every ``step_size`` rows and covers rows
    ``[start, start + window_size)`` of ``X``. Its label is row
    ``start + window_size`` of ``y``, i.e. the observation that comes right
    after the window (not the last row inside it).

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Input rows, indexed positionally via ``.iloc``.
    y : pandas.DataFrame or pandas.Series
        Label rows aligned positionally with ``X``.
    window_size : int
        Number of consecutive rows in each window.
    step_size : int
        Offset between the starts of consecutive windows.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_windows, y_windows)``. For DataFrame inputs the shapes are
        ``(n_windows, window_size, n_features)`` and ``(n_windows, n_targets)``.
        When the input is too short to yield any window, empty arrays with the
        same trailing dimensions are returned (rather than 1-D ``(0,)`` arrays)
        so that they can still be concatenated with windows from other sources.
    """
    starts = range(0, len(X) - window_size, step_size)

    X_windows = [X.iloc[start:start + window_size].values for start in starts]
    # Label = the row immediately following the window.
    y_windows = [y.iloc[start + window_size] for start in starts]

    if not X_windows:
        # Keep the dimensionality consistent with the non-empty case.
        empty_X = np.empty((0, window_size) + tuple(X.shape[1:]))
        empty_y = np.empty((0,) + tuple(y.shape[1:]))
        return empty_X, empty_y

    return np.array(X_windows), np.array(y_windows)

In [25]:
window_size = 700
step_size = 100

# Start from empty accumulators so that re-running this cell does not append the
# same windows a second time (the lists would otherwise keep growing).
X_train, X_test, y_train, y_test = [], [], [], []

# Handle every unique box (value of the encoded 'test' column) separately.
for box in df_encoded['test'].unique():
    # Keep only the records that belong to this box.
    box_data = df_encoded[df_encoded['test'] == box]

    # Split point: the first 80% of the box's rows are used for training.
    split_point = int(len(box_data) * 0.8)

    # First 80% of the rows -> training data.
    X_train_box = box_data[features].iloc[:split_point]
    y_train_box = box_data[targets].iloc[:split_point]

    # Remaining 20% of the rows -> test data.
    X_test_box = box_data[features].iloc[split_point:]
    y_test_box = box_data[targets].iloc[split_point:]

    # Windows are built inside each part separately, so no window spans the split.
    X_train_windows, y_train_windows = create_windows(X_train_box, y_train_box, window_size, step_size)
    X_test_windows, y_test_windows = create_windows(X_test_box, y_test_box, window_size, step_size)

    # Store one array of windows per box.
    X_train.append(X_train_windows)
    y_train.append(y_train_windows)
    X_test.append(X_test_windows)
    y_test.append(y_test_windows)

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from keras.utils import to_categorical

In [27]:
# Stack the per-box training windows (and their labels) along the first axis
# into a single training set covering all boxes.
X_train_combined = np.concatenate(X_train)
y_train_combined = np.concatenate(y_train)

In [31]:
# One-hot encode the labels.
# NOTE(review): y_train_combined holds the target columns
# ('cpuUsage_stock', 'memoryUsage_stock'), which were standardized earlier and are
# continuous values rather than class ids, while num_classes is the number of
# distinct 'test' values. The result is not passed to the model.fit call in this
# notebook (it trains on y_train_combined directly), so this looks like a leftover
# from a classification experiment — confirm and remove if unused.
y_train_combined_encoded = to_categorical(y_train_combined, num_classes=len(df['test'].unique()))

array([[[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.]],

       [[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.]],

       [[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.]],

       ...,

       [[1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.]],

       [[1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.]],

       [[1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.]]], dtype=float32)

In [29]:
# Build the model that is trained on the combined data set: a single GRU layer
# followed by a dense layer with one output per target column.
from tensorflow.keras.metrics import MeanAbsoluteError, RootMeanSquaredError

model = Sequential()
model.add(GRU(150, return_sequences=False, input_shape=(window_size, X_train_combined.shape[2])))
model.add(Dense(len(targets)))

# Mean squared error is the loss (regression task). RMSE and MAE are tracked as
# additional metrics: MSE penalizes large errors more than small ones, while MAE
# gives a better idea of the actual error of the predictions. With these metrics
# model.evaluate reports loss, RMSE and MAE, which is what the evaluation loop
# at the end of the notebook expects.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[RootMeanSquaredError(), MeanAbsoluteError()],
)

In [30]:
# Train on the windows of all boxes combined: 5 passes over the data,
# 64 windows per gradient update.
model.fit(
    X_train_combined,
    y_train_combined,
    epochs=5,
    batch_size=64,
)

Epoch 1/5

KeyboardInterrupt: 

In [None]:
# Evaluate the trained model separately on the test windows of every box.
for i, (X_box, y_box) in enumerate(zip(X_test, y_test), start=1):
    if len(X_box) == 0:
        # The test part of this box was too short to produce a single window.
        print(f"Test {i}: no test windows")
        continue

    # return_dict=True yields {metric name: value}; this works whether the model
    # was compiled with the loss only or with additional metrics, instead of
    # unpacking a fixed number of return values.
    scores = model.evaluate(X_box, y_box, return_dict=True)
    summary = ", ".join(f"{name} = {value}" for name, value in scores.items())
    print(f"Test {i}: {summary}")

Test 1: Loss = 0.9990668892860413
Test 2: Loss = 0.6317780017852783
Test 3: Loss = 0.6258329153060913
Test 4: Loss = 0.5060584545135498
Test 5: Loss = 0.4704350531101227
