In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")
sample_submission = pd.read_csv("/content/sample_solution.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)

train.head()


Train shape: (2000, 65)
Test shape: (500, 56)


Unnamed: 0,Component1_fraction,Component2_fraction,Component3_fraction,Component4_fraction,Component5_fraction,Component1_Property1,Component2_Property1,Component3_Property1,Component4_Property1,Component5_Property1,...,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,0.21,0.0,0.42,0.25,0.12,-0.021782,1.981251,0.020036,0.140315,1.032029,...,0.489143,0.607589,0.32167,-1.236055,1.601132,1.384662,0.30585,0.19346,0.580374,-0.762738
1,0.02,0.33,0.19,0.46,0.0,-0.224339,1.148036,-1.10784,0.149533,-0.354,...,-1.257481,-1.475283,-0.437385,-1.402911,0.147941,-1.143244,-0.439171,-1.379041,-1.280989,-0.503625
2,0.08,0.08,0.18,0.5,0.16,0.457763,0.242591,-0.922492,0.908213,0.972003,...,1.784349,0.450467,0.622687,1.375614,-0.42879,1.161616,0.601289,0.87295,0.66,2.024576
3,0.25,0.42,0.0,0.07,0.26,-0.577734,-0.930826,0.815284,0.447514,0.455717,...,-0.066422,0.48373,-1.865442,-0.046295,-0.16382,-0.209693,-1.840566,0.300293,-0.351336,-1.551914
4,0.26,0.16,0.08,0.5,0.0,0.120415,0.666268,-0.626934,2.725357,0.392259,...,-0.118913,-1.172398,0.301785,-1.787407,-0.493361,-0.528049,0.286344,-0.265192,0.430513,0.735073


In [None]:
print("Missing values in train:\n", train.isnull().sum().sum())
print("Missing values in test:\n", test.isnull().sum().sum())

X = train.iloc[:, :55]
y = train.iloc[:, 55:]
print("Input shape:", X.shape)
print("Target shape:", y.shape)


Missing values in train:
 0
Missing values in test:
 0
Input shape: (2000, 55)
Target shape: (2000, 10)


In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test.drop('ID', axis=1))

X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
base_model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42)
model = MultiOutputRegressor(base_model)

model.fit(X_train, y_train)

y_pred_val = model.predict(X_val)

val_mape = mean_absolute_percentage_error(y_val, y_pred_val)
print(f"Validation MAPE: {val_mape:.4f}")


Validation MAPE: 2.8744


new start


In [None]:
!pip install lightgbm --quiet


In [None]:
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor

lgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'max_depth': 10,
    'num_leaves': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1
}

lgb_model = MultiOutputRegressor(lgb.LGBMRegressor(**lgb_params))

lgb_model.fit(X_train, y_train)

y_pred_val_lgb = lgb_model.predict(X_val)

val_mape_lgb = mean_absolute_percentage_error(y_val, y_pred_val_lgb)
print(f"Improved LightGBM Validation MAPE: {val_mape_lgb:.4f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000813 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12985
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.029208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12985
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.010603
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000732 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12985
[LightGBM] [Info] Number of data points in the train

new

In [None]:
# Reload full dataset for feature engineering
X_full = train.iloc[:, :55].copy()
y_full = train.iloc[:, 55:]
test_full = test.copy()

# Extract blend composition and component properties
blend_cols = X_full.columns[:5]
component_cols = X_full.columns[5:]

# ✨ Interaction features: Multiply each component property by its blend proportion
for i in range(5):  # Components 1 to 5
    for j in range(10):  # Properties 1 to 10
        base_col = f"Component{i+1}_Property{j+1}"
        weight_col = blend_cols[i]
        new_col = f"{base_col}_weighted"

        X_full[new_col] = X_full[weight_col] * X_full[base_col]
        test_full[new_col] = test_full[weight_col] * test_full[base_col]

# Drop original component properties if needed (optional)
X_full_fe = X_full.drop(columns=component_cols)
test_fe = test_full.drop(columns=component_cols)

print("Engineered features shape:", X_full_fe.shape)


Engineered features shape: (2000, 55)


In [None]:
# Scale
X_scaled_fe = scaler.fit_transform(X_full_fe)
test_scaled_fe = scaler.transform(test_fe.drop('ID', axis=1))

# Train/val split
X_train_fe, X_val_fe, y_train_fe, y_val_fe = train_test_split(X_scaled_fe, y_full, test_size=0.2, random_state=42)

# Train
lgb_model_fe = MultiOutputRegressor(lgb.LGBMRegressor(**lgb_params))
lgb_model_fe.fit(X_train_fe, y_train_fe)

# Validate
y_val_pred_fe = lgb_model_fe.predict(X_val_fe)
mape_fe = mean_absolute_percentage_error(y_val_fe, y_val_pred_fe)
print(f"Engineered LightGBM MAPE: {mape_fe:.4f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003746 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.029208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000833 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.010603
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000766 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train

In [None]:
y_test_pred_fe = lgb_model_fe.predict(test_scaled_fe)

submission_fe = pd.DataFrame(y_test_pred_fe, columns=[f"BlendProperty{i}" for i in range(1, 11)])
submission_fe.insert(0, 'ID', test['ID'])
submission_fe.to_csv("final_submission_fe.csv", index=False)

print("Final engineered submission saved as 'final_submission_fe.csv'")


Final engineered submission saved as 'final_submission_fe.csv'


till this its 53.78 and the final name is final_submission fe.csv

In [None]:
!pip install optuna --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/395.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/242.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import optuna
from sklearn.model_selection import cross_val_score, KFold
from sklearn.multioutput import MultiOutputRegressor

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 2000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.1),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'random_state': 42,
        'n_jobs': -1
    }

    model = MultiOutputRegressor(lgb.LGBMRegressor(**params))
    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_scaled_fe, y_full, cv=cv,
                              scoring='neg_mean_absolute_percentage_error', n_jobs=-1)

    return -scores.mean()  # minimize MAPE

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

print("Best trial:", study.best_trial.params)


[I 2025-07-05 15:41:21,169] A new study created in memory with name: no-name-da7c5814-c51b-43f9-a907-ab0f9063da9d
[I 2025-07-05 15:42:50,767] Trial 0 finished with value: 1.0462551232755068 and parameters: {'n_estimators': 556, 'learning_rate': 0.04133245537311469, 'max_depth': 4, 'num_leaves': 137, 'subsample': 0.7190403469849446, 'colsample_bytree': 0.6033252286593556}. Best is trial 0 with value: 1.0462551232755068.
[I 2025-07-05 15:52:11,934] Trial 1 finished with value: 1.136907803285009 and parameters: {'n_estimators': 1007, 'learning_rate': 0.010492483057288211, 'max_depth': 8, 'num_leaves': 168, 'subsample': 0.7485220443981764, 'colsample_bytree': 0.8518168026979879}. Best is trial 0 with value: 1.0462551232755068.
[I 2025-07-05 16:13:12,436] Trial 2 finished with value: 1.1738762797485394 and parameters: {'n_estimators': 1857, 'learning_rate': 0.005222681864789779, 'max_depth': 12, 'num_leaves': 189, 'subsample': 0.5668018418055949, 'colsample_bytree': 0.8350384343015018}. Bes

KeyboardInterrupt: 

In [None]:
best_params = study.best_trial.params
model_tuned = MultiOutputRegressor(lgb.LGBMRegressor(**best_params))
model_tuned.fit(X_scaled_fe, y_full)

# Predict on test
y_test_pred_tuned = model_tuned.predict(test_scaled_fe)

# Save submission
submission_tuned = pd.DataFrame(y_test_pred_tuned, columns=[f"BlendProperty{i}" for i in range(1, 11)])
submission_tuned.insert(0, 'ID', test['ID'])
submission_tuned.to_csv("submission_lgb_optuna.csv", index=False)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001973 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12982
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 55
[LightGBM] [Info] Start training from score -0.014351
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12982
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 55
[LightGBM] [Info] Start training from score -0.006068
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12982
[LightGBM] [Info] Number of data points in the train

In [None]:
best_params = {
    'n_estimators': 556,
    'learning_rate': 0.0413,
    'max_depth': 4,
    'num_leaves': 137,
    'subsample': 0.719,
    'colsample_bytree': 0.603,
    'random_state': 42,
    'n_jobs': -1
}

final_model = MultiOutputRegressor(lgb.LGBMRegressor(**best_params))
final_model.fit(X_scaled_fe, y_full)

y_test_pred_final = final_model.predict(test_scaled_fe)

final_submission = pd.DataFrame(y_test_pred_final, columns=[f"BlendProperty{i}" for i in range(1, 11)])
final_submission.insert(0, 'ID', test['ID'])
final_submission.to_csv("submission_optuna_final.csv", index=False)

print("Final Optuna-tuned submission saved as 'submission_optuna_final.csv'")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000867 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12982
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 55
[LightGBM] [Info] Start training from score -0.014351
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000914 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12982
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 55
[LightGBM] [Info] Start training from score -0.006068
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000894 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12982
[LightGBM] [Info] Number of data points in the train

1.04 and submission optuna final csv


In [None]:

lgb_model = MultiOutputRegressor(lgb.LGBMRegressor(**best_params))
lgb_model.fit(X_scaled_fe, y_full)

from catboost import CatBoostRegressor
cat_model = MultiOutputRegressor(
    CatBoostRegressor(
        iterations=1500,
        learning_rate=0.01,
        depth=6,
        loss_function='MAPE',
        verbose=0,
        random_seed=42
    )
)
cat_model.fit(X_scaled_fe, y_full)

y_val_pred_lgb = lgb_model.predict(X_val_fe)
y_val_pred_cat = cat_model.predict(X_val_fe)

y_val_ensemble = (0.7 * y_val_pred_lgb) + (0.3 * y_val_pred_cat)

ensemble_mape = mean_absolute_percentage_error(y_val_fe, y_val_ensemble)
print(f"Ensemble MAPE: {ensemble_mape:.4f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001601 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12982
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 55
[LightGBM] [Info] Start training from score -0.014351
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12982
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 55
[LightGBM] [Info] Start training from score -0.006068
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000922 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12982
[LightGBM] [Info] Number of data points in the train

In [None]:
y_test_pred_lgb = lgb_model.predict(test_scaled_fe)
y_test_pred_cat = cat_model.predict(test_scaled_fe)

y_test_pred_ensemble = (0.7 * y_test_pred_lgb) + (0.3 * y_test_pred_cat)

ensemble_submission = pd.DataFrame(y_test_pred_ensemble, columns=[f"BlendProperty{i}" for i in range(1, 11)])
ensemble_submission.insert(0, 'ID', test['ID'])
ensemble_submission.to_csv("submission_ensemble.csv", index=False)

print("Ensemble submission saved as 'submission_ensemble.csv'")


Ensemble submission saved as 'submission_ensemble.csv'


0.67 and submission_ensemble.csv

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

X_stack = X_scaled_fe
y_stack = y_full
test_stack = test_scaled_fe

oof_preds = np.zeros((X_stack.shape[0], 10, 3))  # 3 base models
test_preds = np.zeros((test_stack.shape[0], 10, 3))

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_stack)):
    print(f"Fold {fold + 1}")

    X_tr, X_val_f = X_stack[train_idx], X_stack[val_idx]
    y_tr, y_val_f = y_stack.iloc[train_idx], y_stack.iloc[val_idx]

    lgb_fold = MultiOutputRegressor(lgb.LGBMRegressor(**best_params))
    lgb_fold.fit(X_tr, y_tr)
    oof_preds[val_idx, :, 0] = lgb_fold.predict(X_val_f)
    test_preds[:, :, 0] += lgb_fold.predict(test_stack) / 5

    cat_fold = MultiOutputRegressor(CatBoostRegressor(iterations=1000, learning_rate=0.01, depth=6, verbose=0))
    cat_fold.fit(X_tr, y_tr)
    oof_preds[val_idx, :, 1] = cat_fold.predict(X_val_f)
    test_preds[:, :, 1] += cat_fold.predict(test_stack) / 5

    ridge_fold = MultiOutputRegressor(Ridge(alpha=1.0))
    ridge_fold.fit(X_tr, y_tr)
    oof_preds[val_idx, :, 2] = ridge_fold.predict(X_val_f)
    test_preds[:, :, 2] += ridge_fold.predict(test_stack) / 5


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000737 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.019191
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000752 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score 0.005438
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000747 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train 

In [None]:
X_meta = oof_preds.reshape(X_stack.shape[0], -1)
X_test_meta = test_preds.reshape(test_stack.shape[0], -1)

meta_model = MultiOutputRegressor(Ridge(alpha=0.1))
meta_model.fit(X_meta, y_stack)

final_val_preds = meta_model.predict(X_meta)
final_mape = mean_absolute_percentage_error(y_stack, final_val_preds)
print(f"STACKED MAPE: {final_mape:.4f}")


STACKED MAPE: 0.4897


In [None]:
X_test_meta = test_preds.reshape(test_stack.shape[0], -1)

final_test_preds = meta_model.predict(X_test_meta)


In [None]:
submission_stack = pd.DataFrame(final_test_preds, columns=[f"BlendProperty{i}" for i in range(1, 11)])
submission_stack.insert(0, 'ID', test['ID'])
submission_stack.to_csv("submission_stacked.csv", index=False)

print("✅ Final stacked submission saved as 'submission_stacked.csv'")


NameError: name 'pd' is not defined

MAPE = 0.4897 and submission_stacked

In [None]:
from sklearn.linear_model import LinearRegression

correction_model = MultiOutputRegressor(LinearRegression())
correction_model.fit(final_val_preds, y_stack)

corrected_val_preds = correction_model.predict(final_val_preds)
corrected_mape = mean_absolute_percentage_error(y_stack, corrected_val_preds)
print(f"Corrected MAPE: {corrected_mape:.4f}")


Corrected MAPE: 0.4896


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

model_names = ["lgbm", "cat", "ridge", "xgb", "knn"]
oof_preds = np.zeros((X_stack.shape[0], 10, len(model_names)))
test_preds = np.zeros((test_stack.shape[0], 10, len(model_names)))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_stack)):
    print(f"Fold {fold+1}")
    X_tr, X_val_f = X_stack[train_idx], X_stack[val_idx]
    y_tr, y_val_f = y_stack.iloc[train_idx], y_stack.iloc[val_idx]


    model = MultiOutputRegressor(lgb.LGBMRegressor(**best_params)).fit(X_tr, y_tr)
    oof_preds[val_idx, :, 0] = model.predict(X_val_f)
    test_preds[:, :, 0] += model.predict(test_stack) / 5

    model = MultiOutputRegressor(CatBoostRegressor(iterations=1000, learning_rate=0.01, depth=6, verbose=0)).fit(X_tr, y_tr)
    oof_preds[val_idx, :, 1] = model.predict(X_val_f)
    test_preds[:, :, 1] += model.predict(test_stack) / 5

    model = MultiOutputRegressor(Ridge(alpha=1.0)).fit(X_tr, y_tr)
    oof_preds[val_idx, :, 2] = model.predict(X_val_f)
    test_preds[:, :, 2] += model.predict(test_stack) / 5

    model = MultiOutputRegressor(XGBRegressor(n_estimators=600, max_depth=6, learning_rate=0.03)).fit(X_tr, y_tr)
    oof_preds[val_idx, :, 3] = model.predict(X_val_f)
    test_preds[:, :, 3] += model.predict(test_stack) / 5

    model = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=5)).fit(X_tr, y_tr)
    oof_preds[val_idx, :, 4] = model.predict(X_val_f)
    test_preds[:, :, 4] += model.predict(test_stack) / 5


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000871 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.019191
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000797 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score 0.005438
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000711 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train 

KeyboardInterrupt: 

In [None]:
X_meta_full = oof_preds.reshape(X_stack.shape[0], -1)
X_test_meta = test_preds.reshape(test_stack.shape[0], -1)

meta_model = MultiOutputRegressor(Ridge(alpha=0.05))
meta_model.fit(X_meta_full, y_stack)

meta_val_preds = meta_model.predict(X_meta_full)
mape_stack = mean_absolute_percentage_error(y_stack, meta_val_preds)
print(f"MegaStack MAPE: {mape_stack:.4f}")


MegaStack MAPE: 0.7976


In [None]:
residuals = y_stack.values - meta_val_preds

residual_model = MultiOutputRegressor(Ridge(alpha=0.01))
residual_model.fit(X_meta_full, residuals)

final_val_preds = meta_val_preds + residual_model.predict(X_meta_full)
final_mape = mean_absolute_percentage_error(y_stack, final_val_preds)
print(f"MegaStack + Residual Correction MAPE: {final_mape:.4f}")


MegaStack + Residual Correction MAPE: 0.7991


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, concatenate

input_dim = X_scaled_fe.shape[1]
inputs = Input(shape=(input_dim,))

x1 = Dense(512, activation='relu')(inputs)
x1 = BatchNormalization()(x1)
x1 = Dropout(0.3)(x1)

x2 = Dense(256, activation='relu')(x1)
x2 = Dropout(0.2)(x2)
x2 = Dense(128, activation='relu')(x2)
x2 = Dropout(0.1)(x2)

x3 = Dense(64, activation='relu')(x2)
x3 = Dense(32, activation='relu')(x3)

merged = concatenate([x1, x2, x3])
out = Dense(10)(merged)

model = Model(inputs=inputs, outputs=out)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss='mean_absolute_percentage_error')


In [None]:
X_train_dl, X_val_dl, y_train_dl, y_val_dl = train_test_split(X_scaled_fe, y_full, test_size=0.2, random_state=42)

history = model.fit(X_train_dl, y_train_dl,
                    validation_data=(X_val_dl, y_val_dl),
                    epochs=400, batch_size=64, verbose=0)

y_val_pred_dl = model.predict(X_val_dl)
mape_dl = mean_absolute_percentage_error(y_val_dl, y_val_pred_dl)
print(f"FuelNet MAPE: {mape_dl:.4f}")


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
FuelNet MAPE: 1.4945


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_percentage_error
import lightgbm as lgb
import numpy as np
import pandas as pd

blend_cols = X_full.columns[:5].tolist()
component_cols = X_full.columns[5:].tolist()

X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

scaler_blend = StandardScaler()
scaler_comp = StandardScaler()

X_train_blend = scaler_blend.fit_transform(X_train[blend_cols])
X_val_blend   = scaler_blend.transform(X_val[blend_cols])

X_train_comp = scaler_comp.fit_transform(X_train[component_cols])
X_val_comp   = scaler_comp.transform(X_val[component_cols])

best_params = {
    'n_estimators': 556,
    'learning_rate': 0.0413,
    'max_depth': 4,
    'num_leaves': 137,
    'subsample': 0.719,
    'colsample_bytree': 0.603
}
base_model = MultiOutputRegressor(lgb.LGBMRegressor(**best_params))
base_model.fit(X_train_blend, y_train)
y_base_pred_val = base_model.predict(X_val_blend)

residuals_train = y_train.values - base_model.predict(X_train_blend)
residual_model = MultiOutputRegressor(Ridge(alpha=0.01))
residual_model.fit(X_train_comp, residuals_train)

residual_pred_val = residual_model.predict(X_val_comp)
final_pred_val = y_base_pred_val + residual_pred_val

mape_2stage = mean_absolute_percentage_error(y_val, final_pred_val)
print(f"✅ Final Two-Stage Residual MAPE: {mape_2stage:.4f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 5
[LightGBM] [Info] Start training from score -0.030361
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 5
[LightGBM] [Info] Start training from score -0.013032
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set: 16

In [None]:
from lightgbm import LGBMRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

mape_scores = []
test_preds = []

# Use your final scaled features
X = X_scaled_fe
X_test = X_test_scaled_fe
y = y_full.copy()

for i in range(10):
    y_target = y.iloc[:, i]

    model = LGBMRegressor(n_estimators=1000, learning_rate=0.01)
    model.fit(X, y_target)

    selector = SelectFromModel(model, threshold="median", prefit=True)
    X_selected = selector.transform(X)
    X_test_selected = selector.transform(X_test)

    X_train, X_val, y_train_i, y_val_i = train_test_split(X_selected, y_target, test_size=0.2, random_state=42)

    final_model = LGBMRegressor(n_estimators=2000, learning_rate=0.01)
    final_model.fit(X_train, y_train_i)

    y_val_pred = final_model.predict(X_val)
    mape = mean_absolute_percentage_error(y_val_i, y_val_pred)
    mape_scores.append(mape)

    y_test_pred = final_model.predict(X_test_selected)
    test_preds.append(y_test_pred)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001831 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20636
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 85
[LightGBM] [Info] Start training from score -0.016879
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000749 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10435
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 45
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001775 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20636
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 85
[LightGBM] [Info] Start t

In [None]:
avg_mape = sum(mape_scores) / len(mape_scores)
print(f"Average MAPE across all blend properties: {avg_mape:.4f}")


Average MAPE across all blend properties: 1.3004


In [None]:
!pip install tabpfn
!pip install lightgbm optuna


Collecting tabpfn
  Downloading tabpfn-2.0.9-py3-none-any.whl.metadata (25 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch<3,>=2.1->tabpfn)
  Downloadin

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load files
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")

# Separate targets
target_cols = [f"BlendProperty{i}" for i in range(1, 11)]
X = train.drop(columns=target_cols)
y = train[target_cols]

# Normalize (TabPFN expects standardized data)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test.drop(columns=['ID']))

# Keep ID for submission later
test_IDs = test['ID'].values


In [None]:
from tabpfn import TabPFNClassifier
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
import torch

tabpfn_preds_train = np.zeros((X.shape[0], 10))
tabpfn_preds_test = np.zeros((test_scaled.shape[0], 10))

for i, target in enumerate(target_cols):
    print(f"🧠 Training TabPFN for: {target}")

    y_target = y[target]

    # Bin target for classification-based regression
    kbinner = KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='quantile')
    y_binned = kbinner.fit_transform(y_target.values.reshape(-1, 1)).ravel().astype(int)

    # Train TabPFN
    clf = TabPFNClassifier(device='cuda' if torch.cuda.is_available() else 'cpu')
    clf.fit(X_scaled, y_binned)

    # Predict on train
    probs_train = clf.predict_proba(X_scaled)
    bin_mids = (kbinner.bin_edges_[0][:-1] + kbinner.bin_edges_[0][1:]) / 2
    preds_train = (probs_train * bin_mids).sum(axis=1)
    tabpfn_preds_train[:, i] = preds_train

    # Predict on test
    probs_test = clf.predict_proba(test_scaled)
    preds_test = (probs_test * bin_mids).sum(axis=1)
    tabpfn_preds_test[:, i] = preds_test


🧠 Training TabPFN for: BlendProperty1
🧠 Training TabPFN for: BlendProperty2
🧠 Training TabPFN for: BlendProperty3
🧠 Training TabPFN for: BlendProperty4
🧠 Training TabPFN for: BlendProperty5
🧠 Training TabPFN for: BlendProperty6
🧠 Training TabPFN for: BlendProperty7
🧠 Training TabPFN for: BlendProperty8
🧠 Training TabPFN for: BlendProperty9
🧠 Training TabPFN for: BlendProperty10


In [None]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_percentage_error

# Train meta model using TabPFN output as feature
stack_X_train, stack_X_val, stack_y_train, stack_y_val = train_test_split(
    tabpfn_preds_train, y, test_size=0.2, random_state=42
)

lgb_model = MultiOutputRegressor(LGBMRegressor(n_estimators=800, learning_rate=0.03))
lgb_model.fit(stack_X_train, stack_y_train)

val_preds = lgb_model.predict(stack_X_val)
mape = mean_absolute_percentage_error(stack_y_val, val_preds)
print(f"📉 Final Stacked MAPE: {mape:.4f}")




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000360 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 10
[LightGBM] [Info] Start training from score -0.007867




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 10
[LightGBM] [Info] Start training from score -0.004643




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000272 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 10
[LightGBM] [Info] Start training from score -0.030361




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000255 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 10
[LightGBM] [Info] Start training from score -0.013032




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 10
[LightGBM] [Info] Start training from score -0.033121




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 10
[LightGBM] [Info] Start training from score -0.021294




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 10
[LightGBM] [Info] Start training from score -0.029208




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 10
[LightGBM] [Info] Start training from score -0.010603




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 10
[LightGBM] [Info] Start training from score -0.018851




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 10
[LightGBM] [Info] Start training from score 0.004861




📉 Final Stacked MAPE: 0.9410


In [None]:
# Predict on test set using TabPFN + LightGBM
final_preds_test = lgb_model.predict(tabpfn_preds_test)

# Save submission
submission = pd.DataFrame(final_preds_test, columns=target_cols)
submission.insert(0, 'ID', test_IDs)
submission.to_csv("submission_tabpfn_stack.csv", index=False)




In [None]:
import numpy as np

# Concatenate original features with TabPFN predictions
X_full_aug = np.concatenate([X_scaled, tabpfn_preds_train], axis=1)
X_test_aug = np.concatenate([test_scaled, tabpfn_preds_test], axis=1)

# Train/val split
X_train, X_val, y_train_split, y_val_split = train_test_split(X_full_aug, y, test_size=0.2, random_state=42)


In [None]:
model = MultiOutputRegressor(LGBMRegressor(n_estimators=1200, learning_rate=0.02))
model.fit(X_train, y_train_split)
preds_val = model.predict(X_val)

from sklearn.metrics import mean_absolute_percentage_error
mape = mean_absolute_percentage_error(y_val_split, preds_val)
print(f"🧠 Augmented Stacked MAPE: {mape:.4f}")




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001048 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15535
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.007867




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001022 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15535
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.004643




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000988 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15535
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.030361




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15535
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.013032




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15535
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.033121




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001622 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15535
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.021294




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15535
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.029208




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001177 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15535
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.010603




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15535
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.018851




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15535
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score 0.004861




🧠 Augmented Stacked MAPE: 0.5097




In [None]:
# Predict on the test set
final_test_preds = model.predict(X_test_aug)

# Create submission DataFrame
import pandas as pd
target_cols = [f"BlendProperty{i}" for i in range(1, 11)]
submission = pd.DataFrame(final_test_preds, columns=target_cols)
submission.insert(0, 'ID', test_IDs)

# Save to CSV
submission.to_csv("submission_tabpfn_augmented.csv", index=False)
print("✅ Submission file saved: submission_tabpfn_augmented.csv")




✅ Submission file saved: submission_tabpfn_augmented.csv


In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

def get_top_k_features(X, y_target, k=40):
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y_target)
    mask = selector.get_support()
    return X.columns[mask]


In [None]:
top_features = get_top_k_features(pd.DataFrame(X_scaled, columns=X.columns), y['BlendProperty3'], k=40)


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

def train_best_model(X, y_target):
    X_train, X_val, y_train, y_val = train_test_split(X, y_target, test_size=0.2, random_state=42)

    models = {
        'lgbm': LGBMRegressor(n_estimators=800, learning_rate=0.02),
        'catboost': CatBoostRegressor(verbose=0, iterations=1000, learning_rate=0.03),
        'ridge': Ridge(alpha=1.0)
    }

    best_model, best_mape = None, float('inf')
    for name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        mape = mean_absolute_percentage_error(y_val, preds)
        print(f"{name} MAPE: {mape:.4f}")
        if mape < best_mape:
            best_mape = mape
            best_model = model

    return best_model, best_mape


In [None]:
def augment_with_tabpfn(X_df, tabpfn_preds, property_index):
    tab_feature = tabpfn_preds[:, property_index].reshape(-1, 1)
    return np.concatenate([X_df.values, tab_feature], axis=1)


In [None]:
# Step 1: Convert scaled array to DataFrame
X_df = pd.DataFrame(X_scaled, columns=X.columns)

# Step 2: Select top-k features
top_feats = get_top_k_features(X_df, y['BlendProperty3'], k=40)

# Step 3: Augment selected features with TabPFN prediction
augmented_X = augment_with_tabpfn(X_df[top_feats], tabpfn_preds_train, 2)  # index 2 for BlendProperty3


In [None]:
trained_models = []
for i, target in enumerate(target_cols):
    print(f"\n🔧 Training for {target}")

    # Select top features
    top_feats = get_top_k_features(pd.DataFrame(X_scaled, columns=X.columns), y[target], k=40)

    # Augment with TabPFN prediction
    X_feat = pd.DataFrame(X_scaled, columns=X.columns)[top_feats]
    X_aug = augment_with_tabpfn(X_feat, tabpfn_preds_train, i)

    # Train and save best model
    model, mape = train_best_model(X_aug, y[target])
    trained_models.append((model, top_feats))



🔧 Training for BlendProperty1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000998 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9618
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 41
[LightGBM] [Info] Start training from score -0.007867




lgbm MAPE: 4.2119
catboost MAPE: 0.8411
ridge MAPE: 7.6340

🔧 Training for BlendProperty2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9416
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 41
[LightGBM] [Info] Start training from score -0.004643




lgbm MAPE: 0.4326
catboost MAPE: 0.4012
ridge MAPE: 0.6245

🔧 Training for BlendProperty3
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9416
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 41
[LightGBM] [Info] Start training from score -0.030361




lgbm MAPE: 0.3988
catboost MAPE: 0.4664
ridge MAPE: 0.5667

🔧 Training for BlendProperty4
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001023 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9415
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 41
[LightGBM] [Info] Start training from score -0.013032




lgbm MAPE: 0.3413
catboost MAPE: 0.3837
ridge MAPE: 0.5706

🔧 Training for BlendProperty5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9415
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 41
[LightGBM] [Info] Start training from score -0.033121




lgbm MAPE: 0.1020
catboost MAPE: 0.1414
ridge MAPE: 0.7094

🔧 Training for BlendProperty6
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001011 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9416
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 41
[LightGBM] [Info] Start training from score -0.021294




lgbm MAPE: 0.4190
catboost MAPE: 0.7373
ridge MAPE: 0.8448

🔧 Training for BlendProperty7
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001014 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9416
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 41
[LightGBM] [Info] Start training from score -0.029208




lgbm MAPE: 0.3382
catboost MAPE: 0.3656
ridge MAPE: 0.6163

🔧 Training for BlendProperty8
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001019 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9417
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 41
[LightGBM] [Info] Start training from score -0.010603




lgbm MAPE: 0.9858
catboost MAPE: 0.6965
ridge MAPE: 0.5218

🔧 Training for BlendProperty9
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001031 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9416
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 41
[LightGBM] [Info] Start training from score -0.018851




lgbm MAPE: 0.4990
catboost MAPE: 0.3784
ridge MAPE: 0.7184

🔧 Training for BlendProperty10
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001004 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9416
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 41
[LightGBM] [Info] Start training from score 0.004861




lgbm MAPE: 0.2728
catboost MAPE: 0.2615
ridge MAPE: 0.4544


In [None]:
# Predict test set using trained models
final_preds_test = np.zeros((test_scaled.shape[0], 10))

for i, (model, top_feats) in enumerate(trained_models):
    X_feat_test = pd.DataFrame(test_scaled, columns=X.columns)[top_feats]
    X_aug_test = augment_with_tabpfn(X_feat_test, tabpfn_preds_test, i)
    final_preds_test[:, i] = model.predict(X_aug_test)




In [None]:
submission = pd.DataFrame(final_preds_test, columns=target_cols)
submission.insert(0, 'ID', test_IDs)
submission.to_csv("submission_precision_stack.csv", index=False)


In [None]:
from sklearn.preprocessing import PolynomialFeatures

def add_interactions(X, degree=2, interaction_only=True):
    poly = PolynomialFeatures(degree=degree, interaction_only=interaction_only, include_bias=False)
    X_poly = poly.fit_transform(X)
    return X_poly


In [None]:
# Use top features only
X_feat = pd.DataFrame(X_scaled, columns=X.columns)[top_feats]
X_aug = augment_with_tabpfn(X_feat, tabpfn_preds_train, i)
X_interact = add_interactions(pd.DataFrame(X_aug))


In [None]:
from sklearn.linear_model import RidgeCV

def train_residual_model(base_model, X_train, y_train):
    base_preds = base_model.predict(X_train)
    residuals = y_train - base_preds
    meta_model = RidgeCV(alphas=[0.1, 1.0, 10.0])
    meta_model.fit(X_train, residuals)
    return meta_model


In [None]:
from lightgbm import LGBMRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

# Example target
target_col = "BlendProperty3"

# Use selected features (e.g., from previous top-k selection or augmented features)
X_feat = pd.DataFrame(X_scaled, columns=X.columns)[top_feats]
X_aug = augment_with_tabpfn(X_feat, tabpfn_preds_train, target_cols.index(target_col))

# Train/val split
X_train, X_val, y_train, y_val = train_test_split(X_aug, y[target_col], test_size=0.2, random_state=42)

# ✅ Define the base model
base_model = LGBMRegressor(n_estimators=800, learning_rate=0.02)
base_model.fit(X_train, y_train)

# ✅ Define the residual learning function
def train_residual_model(base_model, X_train, y_train):
    base_preds = base_model.predict(X_train)
    residuals = y_train - base_preds
    meta_model = RidgeCV(alphas=[0.1, 1.0, 10.0])
    meta_model.fit(X_train, residuals)
    return meta_model

# ✅ Train residual model
meta_model = train_residual_model(base_model, X_train, y_train)

# ✅ Final predictions = base + residual correction
final_preds = base_model.predict(X_val) + meta_model.predict(X_val)

# ✅ Evaluate MAPE
mape = mean_absolute_percentage_error(y_val, final_preds)
print(f"📉 Corrected Residual Model MAPE: {mape:.4f}")




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9416
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 41
[LightGBM] [Info] Start training from score -0.030361




📉 Corrected Residual Model MAPE: 0.3994


In [None]:
from sklearn.linear_model import RidgeCV

def blend_models(preds_list, y_true):
    stacked_preds = np.column_stack(preds_list)
    blender = RidgeCV()
    blender.fit(stacked_preds, y_true)
    return blender


In [None]:
def mape_loss_lgbm(preds, data):
    y = data.get_label()
    err = np.abs((y - preds) / np.clip(np.abs(y), 1e-6, None))
    grad = np.sign(preds - y) / np.clip(np.abs(y), 1e-6, None)
    hess = 1 / np.clip(np.abs(y), 1e-6, None)
    return 'mape', err.mean(), False


In [None]:
import pandas as pd

# `final_preds_test` must be shape (500, 10)
# `test_IDs` must be your test IDs from the test.csv

submission_df = pd.DataFrame(final_preds_test, columns=[f'BlendProperty{i}' for i in range(1, 11)])
submission_df.insert(0, 'ID', test_IDs)

# Round to 6 decimals (recommended for leaderboard stability)
submission_df = submission_df.round(6)

# Save the file
submission_df.to_csv('submission_corrected_residual_model.csv', index=False)
print("✅ Submission file saved: submission_corrected_residual_model.csv")


✅ Submission file saved: submission_corrected_residual_model.csv
