In [None]:
pip install pandas numpy scikit-learn lightgbm optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
# Mount the user's Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
import optuna
import warnings


# Directory holding the competition CSV files on the mounted Drive. Change this
# single constant to run the notebook against a different data location.
DATA_DIR = '/content/drive/MyDrive/tjk_dataset/kaggle'

# Raw training and test frames, read unchanged from disk.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')


In [None]:
# Print the training frame's columns, dtypes, non-null counts and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB


In [None]:


TARGET_NAME = 'Calories'  # Name of the target column; change it to match your dataset.

# Stack train (without the target) on top of test so both receive identical
# preprocessing. The number of training rows is remembered so the frame can be
# split back by position afterwards.
n_train = len(train_df)
all_data = pd.concat([train_df.drop(TARGET_NAME, axis=1), test_df], axis=0)

# Label-encode every object-typed column (example preprocessing).
# Replace this with your own steps, e.g. one-hot encoding, if you already use them.
categorical_cols = all_data.select_dtypes(include='object').columns
for col in categorical_cols:
    le = LabelEncoder()
    # Casting to str lets missing values be encoded as a label of their own.
    all_data[col] = le.fit_transform(all_data[col].astype(str))

# Fill missing numeric values with the column median (example preprocessing).
# NOTE(review): the median is computed over train and test rows together.
for col in all_data.select_dtypes(include=np.number).columns:
    if all_data[col].isnull().any():
        all_data[col] = all_data[col].fillna(all_data[col].median())

# Split back into train/test by position. concat preserves row order, so the
# first n_train rows are exactly the training rows. Matching on 'id' instead
# would silently mis-assign rows if ids overlapped between the files or repeated.
X = all_data.iloc[:n_train].drop('id', axis=1)
X_test = all_data.iloc[n_train:].drop('id', axis=1)
y = train_df[TARGET_NAME]
assert len(X) == len(y), "feature rows and target rows are misaligned"
assert len(X_test) == len(test_df), "test rows were lost or duplicated during preprocessing"

# Train on log1p(target): RMSE on this scale equals RMSLE on the original scale.
# log1p is only defined for values greater than -1, so skip this step if the
# target can be negative.
y_transformed = np.log1p(y)
print(f"Hedef değişken '{TARGET_NAME}' log1p dönüşümü uygulandı.")

Hedef değişken 'Calories' log1p dönüşümü uygulandı.


In [None]:
# --- 1. RMSLE metric ---
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``.

    Both inputs are copied into new NumPy arrays, so the caller's objects are
    not modified. Negative predictions are replaced by 0 before scoring;
    ``y_true`` is passed to the metric unchanged.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    # Floor negative predictions at zero (in place, on the local copy).
    np.putmask(predicted, predicted < 0, 0)

    # mean_squared_log_error takes values on the original scale and applies
    # the log1p transform itself; only the square root is taken here.
    squared_log_error = mean_squared_log_error(actual, predicted)
    return np.sqrt(squared_log_error)

# RMSLE for targets and predictions that live on the log1p scale
def rmsle_log(y_true_log, y_pred_log):
    """Return the RMSLE for inputs given on the log1p scale.

    Both arguments are mapped back to the original scale with ``expm1`` and
    then scored with ``rmsle``.
    """
    return rmsle(np.expm1(y_true_log), np.expm1(y_pred_log))



In [None]:
# --- 2. Optuna objective function ---
def objective(trial):
    """Score one Optuna trial: mean 5-fold RMSLE of a LightGBM regressor.

    Hyperparameters are sampled from ``trial``, a model is trained per fold on
    the log1p-transformed target with early stopping on the validation fold,
    and each fold is scored with ``rmsle_log``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean of the per-fold RMSLE values (lower is better).

    Raises:
        ValueError: If a validation fold is empty, or re-raised from
            ``model.fit`` after the fold's shapes have been printed.
    """
    params = {
        'objective': 'regression_l1',
        'metric': 'rmse',  # RMSE on the log1p target; this is what early stopping monitors
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        # suggest_float replaces the deprecated suggest_uniform.
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        # LightGBM ignores `subsample` unless bagging is enabled with a non-zero
        # subsample_freq. It is sampled (rather than hard-coded) so that it is
        # part of study.best_params and reaches any model built from them.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1,  # silence LightGBM's own logging
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fold, (train_index, val_index) in enumerate(kf.split(X, y_transformed)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y_transformed.iloc[train_index], y_transformed.iloc[val_index]

        print(f"\n--- Fold {fold+1}/{kf.n_splits} (Trial {trial.number}) ---")
        print(f"  X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        print(f"  X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")

        # --- Sanity checks on the validation fold ---
        if X_val.empty or y_val.empty:
            print(f"  HATA: Fold {fold+1} için doğrulama seti boş çıktı. Bu durum HPO'yu etkileyebilir.")
            raise ValueError(f"Fold {fold+1} için boş doğrulama seti.")

        nan_in_X_val = X_val.isnull().sum().sum()
        if nan_in_X_val > 0:
            print(f"  UYARI: Fold {fold+1} için X_val içinde {nan_in_X_val} adet NaN değeri bulundu. Ortanca ile dolduruluyor.")
            # fillna returns a new frame, so the slice taken from X is never
            # written to (the per-column assignment triggered SettingWithCopyWarning).
            X_val = X_val.fillna(X_val.median(numeric_only=True))

        nan_in_y_val = y_val.isnull().sum()
        if nan_in_y_val > 0:
            print(f"  UYARI: Fold {fold+1} için y_val içinde {nan_in_y_val} adet NaN değeri bulundu. Ortanca ile dolduruluyor.")
            y_val = y_val.fillna(y_val.median())

        model = lgb.LGBMRegressor(**params)

        try:
            model.fit(X_train, y_train,
                      eval_set=[(X_val, y_val)],
                      eval_metric='rmse',
                      callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])
        except ValueError as e:
            # Print the fold's shapes for debugging, then let the error propagate.
            print(f"  MODEL FIT SIRASINDA HATA (Fold {fold+1}): {e}")
            print(f"  eval_set'in içeriği kontrol ediliyor: X_val.shape={X_val.shape}, y_val.shape={y_val.shape}")
            raise

        # Predictions are on the log1p scale; rmsle_log maps them back before scoring.
        val_preds_log = model.predict(X_val)
        fold_scores.append(rmsle_log(y_val, val_preds_log))

    avg_rmsle = np.mean(fold_scores)

    print(f"\nTrial {trial.number}: Average RMSLE = {avg_rmsle:.6f}")

    return avg_rmsle


In [None]:
# --- 3. Run the Optuna study and collect the best parameters ---
# Pruning can be used to get results faster.
# To persist trials (and browse them with `optuna-dashboard sqlite:///db.db`),
# create the study with a storage backend instead:
# study = optuna.create_study(direction='minimize', study_name='lightgbm_hpo',
#                             storage='sqlite:///db.db', load_if_exists=True)

# Number of trials to run. Tune it to your compute budget and patience;
# more trials can find better parameters.
N_TRIALS = 10

# In-memory study. The sampler is seeded so its random draws are reproducible
# from run to run.
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=42))

print("Optimizasyon başlatılıyor...")
study.optimize(objective, n_trials=N_TRIALS)

print("\nOptimizasyon tamamlandı.")
print(f"En iyi deneme skoru (RMSLE): {study.best_value:.6f}")
print("En iyi parametreler:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

# Drop n_estimators: the final model uses its own upper bound and lets early
# stopping choose the number of trees. Building a filtered dict (instead of
# deleting a key) keeps this cell safe to re-run.
best_params = {key: value for key, value in study.best_params.items()
               if key != 'n_estimators'}


[I 2025-05-10 08:32:48,045] A new study created in memory with name: no-name-7d8120dd-cb5c-472b-a51d-f0a2548ca4ac


Optimizasyon başlatılıyor...

--- Fold 1/5 (Trial 0) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 2/5 (Trial 0) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 3/5 (Trial 0) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 4/5 (Trial 0) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 5/5 (Trial 0) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)


[I 2025-05-10 08:44:26,003] Trial 0 finished with value: 0.060936104088863866 and parameters: {'n_estimators': 1829, 'learning_rate': 0.01726356271060478, 'num_leaves': 49, 'max_depth': 10, 'min_child_samples': 81, 'subsample': 0.8907799606083984, 'colsample_bytree': 0.895847209486863, 'reg_alpha': 0.00033198191687101674, 'reg_lambda': 0.3370101057369513}. Best is trial 0 with value: 0.060936104088863866.



Trial 0: Average RMSLE = 0.060936

--- Fold 1/5 (Trial 1) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 2/5 (Trial 1) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 3/5 (Trial 1) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 4/5 (Trial 1) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 5/5 (Trial 1) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)


[I 2025-05-10 08:54:50,171] Trial 1 finished with value: 0.0643388966268617 and parameters: {'n_estimators': 1364, 'learning_rate': 0.0058979345553018874, 'num_leaves': 42, 'max_depth': 14, 'min_child_samples': 96, 'subsample': 0.9512550752195228, 'colsample_bytree': 0.8482524925583584, 'reg_alpha': 2.075303047735287, 'reg_lambda': 0.00035728120527875974}. Best is trial 0 with value: 0.060936104088863866.



Trial 1: Average RMSLE = 0.064339

--- Fold 1/5 (Trial 2) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 2/5 (Trial 2) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 3/5 (Trial 2) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 4/5 (Trial 2) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 5/5 (Trial 2) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)


[I 2025-05-10 09:05:58,327] Trial 2 finished with value: 0.06096198294712905 and parameters: {'n_estimators': 1749, 'learning_rate': 0.023672394544512265, 'num_leaves': 64, 'max_depth': 8, 'min_child_samples': 63, 'subsample': 0.9014653363718121, 'colsample_bytree': 0.9977370020584656, 'reg_alpha': 2.836185472928493, 'reg_lambda': 8.02084884530579e-06}. Best is trial 0 with value: 0.060936104088863866.



Trial 2: Average RMSLE = 0.060962

--- Fold 1/5 (Trial 3) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 2/5 (Trial 3) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 3/5 (Trial 3) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 4/5 (Trial 3) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 5/5 (Trial 3) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)


[I 2025-05-10 09:23:09,828] Trial 3 finished with value: 0.061222524497985174 and parameters: {'n_estimators': 2466, 'learning_rate': 0.008024832100442561, 'num_leaves': 47, 'max_depth': 13, 'min_child_samples': 88, 'subsample': 0.748140683924433, 'colsample_bytree': 0.6915652941353931, 'reg_alpha': 0.48340265426993784, 'reg_lambda': 2.4662845302124973e-06}. Best is trial 0 with value: 0.060936104088863866.



Trial 3: Average RMSLE = 0.061223

--- Fold 1/5 (Trial 4) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 2/5 (Trial 4) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 3/5 (Trial 4) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 4/5 (Trial 4) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 5/5 (Trial 4) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)


[I 2025-05-10 09:51:15,611] Trial 4 finished with value: 0.06569042836454486 and parameters: {'n_estimators': 3365, 'learning_rate': 0.0021002482565242097, 'num_leaves': 84, 'max_depth': 11, 'min_child_samples': 64, 'subsample': 0.8314866562909842, 'colsample_bytree': 0.7731944350251134, 'reg_alpha': 4.795793860662359, 'reg_lambda': 1.1040029459998645e-06}. Best is trial 0 with value: 0.060936104088863866.



Trial 4: Average RMSLE = 0.065690

--- Fold 1/5 (Trial 5) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 2/5 (Trial 5) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 3/5 (Trial 5) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 4/5 (Trial 5) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 5/5 (Trial 5) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)


[I 2025-05-10 09:56:40,065] Trial 5 finished with value: 0.07689010968350028 and parameters: {'n_estimators': 652, 'learning_rate': 0.007997006444384626, 'num_leaves': 82, 'max_depth': 15, 'min_child_samples': 61, 'subsample': 0.63014655541949, 'colsample_bytree': 0.7629313045286148, 'reg_alpha': 0.028438038114414248, 'reg_lambda': 0.008220272118895583}. Best is trial 0 with value: 0.060936104088863866.



Trial 5: Average RMSLE = 0.076890

--- Fold 1/5 (Trial 6) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 2/5 (Trial 6) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 3/5 (Trial 6) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 4/5 (Trial 6) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 5/5 (Trial 6) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)


[I 2025-05-10 10:08:17,818] Trial 6 finished with value: 0.18362898829423518 and parameters: {'n_estimators': 1481, 'learning_rate': 0.0018379868965649815, 'num_leaves': 91, 'max_depth': 10, 'min_child_samples': 52, 'subsample': 0.6482715938314126, 'colsample_bytree': 0.7474918086022369, 'reg_alpha': 5.9248926938144e-06, 'reg_lambda': 0.6191401090154278}. Best is trial 0 with value: 0.060936104088863866.



Trial 6: Average RMSLE = 0.183629

--- Fold 1/5 (Trial 7) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 2/5 (Trial 7) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 3/5 (Trial 7) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 4/5 (Trial 7) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 5/5 (Trial 7) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)


[I 2025-05-10 10:16:05,418] Trial 7 finished with value: 0.060778643597551585 and parameters: {'n_estimators': 1514, 'learning_rate': 0.08817435952515264, 'num_leaves': 57, 'max_depth': 11, 'min_child_samples': 73, 'subsample': 0.7390601369462992, 'colsample_bytree': 0.7137770429916661, 'reg_alpha': 4.0843382882357355e-07, 'reg_lambda': 0.005875960492300708}. Best is trial 7 with value: 0.060778643597551585.



Trial 7: Average RMSLE = 0.060779

--- Fold 1/5 (Trial 8) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 2/5 (Trial 8) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 3/5 (Trial 8) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 4/5 (Trial 8) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 5/5 (Trial 8) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)


[I 2025-05-10 10:27:19,219] Trial 8 finished with value: 0.06185772338413063 and parameters: {'n_estimators': 1489, 'learning_rate': 0.00803824083295485, 'num_leaves': 85, 'max_depth': 7, 'min_child_samples': 91, 'subsample': 0.8850166791230181, 'colsample_bytree': 0.9290150643610119, 'reg_alpha': 0.009789533703930205, 'reg_lambda': 0.0017149443940718456}. Best is trial 7 with value: 0.060778643597551585.



Trial 8: Average RMSLE = 0.061858

--- Fold 1/5 (Trial 9) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 2/5 (Trial 9) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 3/5 (Trial 9) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 4/5 (Trial 9) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)

--- Fold 5/5 (Trial 9) ---
  X_train shape: (600000, 7), y_train shape: (600000,)
  X_val shape: (150000, 7), y_val shape: (150000,)


[I 2025-05-10 10:40:09,890] Trial 9 finished with value: 0.07730395518672638 and parameters: {'n_estimators': 1630, 'learning_rate': 0.0036793876195184323, 'num_leaves': 77, 'max_depth': 10, 'min_child_samples': 67, 'subsample': 0.841056447981263, 'colsample_bytree': 0.6032634048199287, 'reg_alpha': 1.6549083662039265e-05, 'reg_lambda': 0.010097587691125072}. Best is trial 7 with value: 0.060778643597551585.



Trial 9: Average RMSLE = 0.077304

Optimizasyon tamamlandı.
En iyi deneme skoru (RMSLE): 0.060779
En iyi parametreler:
  n_estimators: 1514
  learning_rate: 0.08817435952515264
  num_leaves: 57
  max_depth: 11
  min_child_samples: 73
  subsample: 0.7390601369462992
  colsample_bytree: 0.7137770429916661
  reg_alpha: 4.0843382882357355e-07
  reg_lambda: 0.005875960492300708


In [None]:
# --- 4. Final model training and test-set prediction (K-fold CV) ---
print("\nNihai model eğitimi ve test seti tahminleri (K-Fold CV ile)...")

# Upper bound on boosting rounds; early stopping picks the actual number.
# The previous cap of 2000 was reached before early stopping could trigger
# ("Did not meet early stopping"), so it is raised here.
FINAL_MAX_ESTIMATORS = 10000

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
oof_filled = np.zeros(len(X), dtype=bool)  # rows that received an out-of-fold prediction
test_preds = np.zeros(len(X_test))

models = []

# Merge the tuned parameters with the fixed ones. Later keys win, so a stray
# 'n_estimators' (or any other fixed key) in best_params cannot raise a
# duplicate-keyword TypeError.
final_params = {
    **best_params,
    'n_estimators': FINAL_MAX_ESTIMATORS,
    'objective': 'regression_l1',
    'metric': 'rmse',  # RMSE on the log1p target
    'random_state': 42,
    'n_jobs': -1,
}

for fold, (train_index, val_index) in enumerate(kf.split(X, y_transformed)):
    print(f"\n--- Nihai Model Eğitimi: Fold {fold+1}/{kf.n_splits} Başladı ---")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_transformed.iloc[train_index], y_transformed.iloc[val_index]

    # Same sanity checks as in the objective function, except that an empty
    # validation fold is skipped here instead of raising.
    if X_val.empty or y_val.empty:
        print(f"  HATA: Nihai Eğitim: Fold {fold+1} için doğrulama seti boş çıktı. Bu katlama atlanıyor.")
        continue

    nan_in_X_val_final = X_val.isnull().sum().sum()
    if nan_in_X_val_final > 0:
        print(f"  UYARI: Nihai Eğitim: Fold {fold+1} için X_val içinde {nan_in_X_val_final} adet NaN değeri bulundu. Ortanca ile dolduruluyor.")
        # fillna returns a new frame, so the slice taken from X is never
        # written to (the per-column assignment triggered SettingWithCopyWarning).
        X_val = X_val.fillna(X_val.median(numeric_only=True))

    nan_in_y_val_final = y_val.isnull().sum()
    if nan_in_y_val_final > 0:
        print(f"  UYARI: Nihai Eğitim: Fold {fold+1} için y_val içinde {nan_in_y_val_final} adet NaN değeri bulundu. Ortanca ile dolduruluyor.")
        y_val = y_val.fillna(y_val.median())

    model = lgb.LGBMRegressor(**final_params)

    try:
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='rmse',
                  # early_stopping's `verbose` is a boolean flag, not a logging period.
                  callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=True)])
    except ValueError as e:
        # Print the fold's shapes for debugging, then let the error propagate.
        print(f"  MODEL FIT SIRASINDA HATA (Nihai Eğitim Fold {fold+1}): {e}")
        print(f"  eval_set'in içeriği kontrol ediliyor: X_val.shape={X_val.shape}, y_val.shape={y_val.shape}")
        raise

    val_preds_log = model.predict(X_val)
    oof_preds[val_index] = val_preds_log
    oof_filled[val_index] = True

    fold_rmsle_score = rmsle_log(y_val, val_preds_log)
    print(f"Nihai Eğitim Fold {fold+1} RMSLE: {fold_rmsle_score:.6f}")

    # Accumulate the raw sum; it is averaged after the loop over the number of
    # folds that actually trained.
    test_preds += model.predict(X_test)

    models.append(model)
    print(f"--- Nihai Model Eğitimi: Fold {fold+1}/{kf.n_splits} Tamamlandı ---")

# Average over the models that were trained, so a skipped fold does not shrink
# the predictions (dividing by kf.n_splits would).
if models:
    test_preds /= len(models)

# Guard against test_preds never having been filled.
if np.all(test_preds == 0) and len(X_test) > 0:
    print("\nUYARI: test_preds hiç doldurulmamış veya tamamen sıfır! Nihai model eğitimi başarısız olmuş olabilir.")
    raise ValueError("test_preds değişkeni tanımlanmamış veya boş kaldı. Lütfen önceki hataları kontrol edin.")

# Overall out-of-fold RMSLE, restricted to rows that were actually predicted so
# that placeholder zeros from a skipped fold cannot distort the score.
total_oof_rmsle = rmsle_log(y_transformed[oof_filled], oof_preds[oof_filled])
print(f"\nOrtalama OOF RMSLE (Genel Train Seti Performansı): {total_oof_rmsle:.6f}")

# Map test predictions back from the log1p scale and floor them at zero.
final_test_preds = np.expm1(test_preds)
final_test_preds[final_test_preds < 0] = 0

print(f"Test seti tahminleri tamamlandı. Tahmin ortalaması: {final_test_preds.mean():.2f}")



Nihai model eğitimi ve test seti tahminleri (K-Fold CV ile)...

--- Nihai Model Eğitimi: Fold 1/5 Başladı ---
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1994]	valid_0's rmse: 0.0602336
Nihai Eğitim Fold 1 RMSLE: 0.060234
--- Nihai Model Eğitimi: Fold 1/5 Tamamlandı ---

--- Nihai Model Eğitimi: Fold 2/5 Başladı ---
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1834]	valid_0's rmse: 0.0616444
Nihai Eğitim Fold 2 RMSLE: 0.061644
--- Nihai Model Eğitimi: Fold 2/5 Tamamlandı ---

--- Nihai Model Eğitimi: Fold 3/5 Başladı ---
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1992]	valid_0's rmse: 0.0607028
Nihai Eğitim Fold 3 RMSLE: 0.060703
--- Nihai Model Eğitimi: Fold 3/5 Tamamlandı ---

--- Nihai Model Eğitimi: Fold 4/5 Başladı ---
Training until validation scores don't improve for 200 rounds
Di

In [None]:
# --- 5. Save the predictions (Kaggle submission format) ---
# Single source of truth for the output file, so the confirmation message
# always names the file that was actually written.
SUBMISSION_PATH = 'submission_optuna_lgbm0.csv'

submission_df = pd.DataFrame({'id': test_df['id'], TARGET_NAME: final_test_preds})
submission_df.to_csv(SUBMISSION_PATH, index=False)

print(f"\nTahmin dosyası '{SUBMISSION_PATH}' kaydedildi.")
print("İlk 5 satır:")
print(submission_df.head())


Tahmin dosyası 'submission_optuna_lgbm.csv' kaydedildi.
İlk 5 satır:
       id    Calories
0  750000   27.063780
1  750001  108.596913
2  750002   87.535356
3  750003  125.575009
4  750004   76.518977
