In [9]:
import datetime
import os

import dask.dataframe as dd
import numpy as np
import optuna
import pandas as pd
import pyarrow.parquet as pq
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.metrics import f1_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

# Sanity message: reaching this line means every import in this cell succeeded.
print('Ok - import')

Ok - import


In [2]:
# ---------------- Data preparation ----------------
# Directory holding the competition data. The COMPETITION_PATH environment
# variable overrides the default so the notebook is not tied to one machine.
competition_path = os.environ.get(
    "COMPETITION_PATH",
    "M:/PycharmProjects/ds-zt-LGBMRegressor-main/kaggle-dtset",
)
batch_size = 1000000  # Training batch size (not used in this cell)
percent_of_dataset = 0.075  # Fraction of the dataset that is sampled - 7.5%

# Paths to the data files
train_path = os.path.join(competition_path, "train.parquet")
test_path = os.path.join(competition_path, "test.parquet")
features = [f'feature_{i:02}' for i in range(79)]  # feature_00 ... feature_78
target_train = 'responder_6'  # Target to predict
weights = 'weight'

# Columns that are read from disk and that must not contain missing values.
required_columns = features + [target_train, weights]

# Read only the required columns with Dask, draw a reproducible random sample
# and materialise it with .compute().
ddf_train = dd.read_parquet(train_path, columns=required_columns)
data_train = ddf_train.sample(frac=percent_of_dataset, random_state=42).compute()
data_train = data_train.dropna(subset=required_columns)  # Drop rows with missing values

X_train = data_train[features]
y_train = data_train[target_train]
sample_weights_train = data_train[weights]

print(f"Próbka danych: {data_train.shape}")


Próbka danych: (2652773, 81)


In [10]:
# ---------------- Optymalizacja z Optuna ----------------
def objective(trial):
    """Optuna objective: time-series cross-validated score of an LGBMClassifier.

    Samples one hyperparameter configuration from ``trial``, fits a binary
    ``LGBMClassifier`` on each of the 5 ``TimeSeriesSplit`` folds of the
    module-level ``X_train`` / ``y_train`` (with ``sample_weights_train`` as
    training weights) and scores the validation part of every fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ``0.7 * mean(F1) - 0.3 * mean(MAE)`` over the folds. MAE is subtracted
        because the study maximises this value and a lower MAE is better.
    """
    # Hyperparameter search space. suggest_float replaces the deprecated
    # suggest_uniform / suggest_loguniform and samples from the same ranges.
    # NOTE(review): LightGBM documents that bagging ('subsample') is only active
    # when 'subsample_freq' is non-zero; it is left at its default here, so
    # tuning 'subsample' may have no effect - confirm.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),                 # Number of trees
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),  # Learning rate
        'num_leaves': trial.suggest_int('num_leaves', 31, 100),                      # Number of leaves
        'feature_fraction': trial.suggest_float('feature_fraction', 0.8, 1.0),       # Fraction of features
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),                     # Fraction of samples
        'max_depth': trial.suggest_int('max_depth', -1, 15),                         # Maximum depth
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 50),         # Minimum samples per leaf
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 10.0, log=True),         # L1 regularisation
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 10.0, log=True),         # L2 regularisation
        'max_bin': trial.suggest_int('max_bin', 127, 255),                           # Maximum number of bins
        'random_state': 42
    }

    # LightGBM model for binary classification.
    # NOTE(review): y_train is taken unchanged from the 'responder_6' column;
    # the binary objective presumably needs 0/1 labels - confirm the target is
    # binarised before this cell is run.
    model = LGBMClassifier(objective='binary', **params)

    # Cross-validation with a time-ordered split
    tscv = TimeSeriesSplit(n_splits=5)
    f1_scores = []
    mae_scores = []

    for train_idx, valid_idx in tscv.split(X_train):
        # Training and validation data for this fold
        X_fold_train, X_fold_valid = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_fold_train, y_fold_valid = y_train.iloc[train_idx], y_train.iloc[valid_idx]
        weights_fold_train = sample_weights_train.iloc[train_idx]

        # Fit the model on the training part of the fold
        model.fit(
            X_fold_train, y_fold_train,
            sample_weight=weights_fold_train,
            eval_set=[(X_fold_valid, y_fold_valid)],
            eval_metric='logloss',
        )

        # Predicted probability of class 1, thresholded at 0.5 to get labels
        preds_prob = model.predict_proba(X_fold_valid)[:, 1]
        preds_class = (preds_prob > 0.5).astype(int)

        # F1-score of the predicted labels
        f1_scores.append(f1_score(y_fold_valid, preds_class, average='binary'))

        # MAE between the validation target and the predicted probabilities
        mae_scores.append(mean_absolute_error(y_fold_valid, preds_prob))

    # Value to maximise: reward F1 (weight 0.7), penalise MAE (weight 0.3).
    return 0.7 * np.mean(f1_scores) - 0.3 * np.mean(mae_scores)

# Run the Optuna search
study = optuna.create_study(direction="maximize")  # The weighted metric is maximised
study.optimize(objective, n_trials=50)

# Hyperparameters of the best trial
best_params = study.best_params

# Re-run the validation with the best parameters.
# study.best_params holds only the values sampled through trial.suggest_*, so
# the fixed random_state used inside objective() is passed explicitly here;
# without it this model would not match the configuration of the best trial.
model = LGBMClassifier(objective='binary', random_state=42, **best_params)

tscv = TimeSeriesSplit(n_splits=5)
best_f1_scores = []
best_mae_scores = []

for train_idx, valid_idx in tscv.split(X_train):
    # Training and validation data for this fold
    X_fold_train, X_fold_valid = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_fold_train, y_fold_valid = y_train.iloc[train_idx], y_train.iloc[valid_idx]
    weights_fold_train = sample_weights_train.iloc[train_idx]

    model.fit(
        X_fold_train, y_fold_train,
        sample_weight=weights_fold_train,
        eval_set=[(X_fold_valid, y_fold_valid)],
        eval_metric='logloss',
    )

    # Predicted probability of class 1, thresholded at 0.5 to get labels
    preds_prob = model.predict_proba(X_fold_valid)[:, 1]
    preds_class = (preds_prob > 0.5).astype(int)

    f1 = f1_score(y_fold_valid, preds_class, average='binary')
    mae = mean_absolute_error(y_fold_valid, preds_prob)

    best_f1_scores.append(f1)
    best_mae_scores.append(mae)

# Show the best parameters and the fold-averaged metrics
print("\nNajlepsze parametry:")
print(best_params)

print("\nNajlepszy średni wynik F1-score dla najlepszych parametrów:")
print(np.mean(best_f1_scores))

print("\nNajlepszy średni wynik MAE dla najlepszych parametrów:")
print(np.mean(best_mae_scores))

[I 2025-01-27 23:52:21,874] A new study created in memory with name: no-name-3e4e2afd-cf8a-471c-8c30-3853ddad1a28
[W 2025-01-27 23:52:22,870] Trial 0 failed with parameters: {'n_estimators': 413, 'learning_rate': 0.07344810536313844, 'num_leaves': 67, 'feature_fraction': 0.9241849454756739, 'subsample': 0.8339745798478381, 'max_depth': 8, 'min_child_samples': 47, 'lambda_l1': 1.7297466046738614, 'lambda_l2': 0.010697754025866452, 'max_bin': 162} because of the following error: TypeError("LGBMClassifier.fit() got an unexpected keyword argument 'verbose'").
Traceback (most recent call last):
  File "M:\PycharmProjects\ds-zt-LGBMRegressor-main\venv\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Szymon\AppData\Local\Temp\ipykernel_20932\2049778878.py", line 32, in objective
    model.fit(
TypeError: LGBMClassifier.fit() got an unexpected keyword argument 'verbose'
[W 2025-01-27 23:52:22,874] Trial 0 failed with value

TypeError: LGBMClassifier.fit() got an unexpected keyword argument 'verbose'

In [4]:
# ---------------- Optuna results ----------------
# The study maximises 0.7 * mean F1 - 0.3 * mean MAE (the value returned by
# objective()), so the best value is labelled with that metric, not "MSE".
print("\nNajlepsze parametry:")
print(study.best_params)

print("\nNajlepszy wynik (0.7*F1 - 0.3*MAE):")
print(study.best_value)

# Save the results to a timestamped text file
now = datetime.datetime.now()
date_str = now.strftime("%Y-%m-%d-%H-%M-%S")
file_name = f"optuna_results_{date_str}.txt"

# Explicit encoding keeps the file readable regardless of the platform default.
with open(file_name, 'w', encoding='utf-8') as f:
    f.write(f"Najlepsze parametry: {study.best_params}\n")
    f.write(f"Najlepszy wynik (0.7*F1 - 0.3*MAE): {study.best_value:.4f}\n")

print(f"\nWyniki zapisano do pliku: {file_name}")


Najlepsze parametry:
{'n_estimators': 459, 'learning_rate': 0.033441061327066174, 'num_leaves': 85, 'feature_fraction': 0.8086691830052026, 'subsample': 0.9862178362318438, 'max_depth': 9, 'min_child_samples': 25, 'lambda_l1': 0.0049638961104124055, 'lambda_l2': 0.006126150398752788, 'max_bin': 254}

Najlepszy wynik (MSE):
0.6385294016276032

Wyniki zapisano do pliku: optuna_results_2025-01-27-16-10-22.txt


In [None]:
from sklearn.metrics import f1_score
