After completing HPO for every model (LGBM, XGBoost, Catboost), a submission for every single model is created in this notebook for comparison to the ensemble.

First for LGBM:

In [None]:
import pandas as pd
import lightgbm as lgb
from creditutils.path_utils import get_project_root

# Pfade
proj_root = get_project_root(levels_up=1)
output_dir = proj_root / "outputs"
train_path = output_dir / "03_train_features_autosearch_baseline.parquet"
test_path  = output_dir / "03_test_features_autosearch_baseline.parquet"
sub_path   = proj_root / "submissions" / "01_submission_ligthgbm_auto_and_handcrafted_baseline.csv"

# Daten laden
train_df = pd.read_parquet(train_path)
test_df  = pd.read_parquet(test_path)

# Zielspalte & Features
y_train = train_df["TARGET"]
X_train = train_df.drop(columns=["SK_ID_CURR", "TARGET"])
X_test  = test_df.drop(columns=["SK_ID_CURR"])

# Gleiche Kategorien erzwingen bei object/categorical Columns
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
for col in cat_cols:
    X_train[col] = X_train[col].astype("category")
    X_test[col] = pd.Categorical(X_test[col], categories=X_train[col].cat.categories)

# Modell
model = lgb.LGBMClassifier(
    boosting_type="goss",
    learning_rate=0.0031335727235880005,
    max_depth=13,
    num_leaves=100,
    min_child_samples=191,
    reg_alpha=0.037036865048284115,
    reg_lambda=0.0021849671286405664,
    subsample=0.8476213315586094,
    colsample_bytree=0.444071532621635,
    n_estimators=5339,
    random_state=42,
    n_jobs=-1
)

# Training
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict_proba(X_test)[:, 1]

# Submission
submission = pd.DataFrame({
    "SK_ID_CURR": test_df["SK_ID_CURR"],
    "TARGET": y_pred
})
submission.to_csv(sub_path, index=False)
print(f"Submission saved to: {sub_path}")

Catboost:

In [3]:
import pandas as pd
from catboost import CatBoostClassifier
from creditutils.path_utils import get_project_root

# Pfade
proj_root = get_project_root(levels_up=1)
output_dir = proj_root / "outputs"
train_path = output_dir / "03_train_features_autosearch_baseline.parquet"
test_path  = output_dir / "03_test_features_autosearch_baseline.parquet"
sub_path   = proj_root / "submissions" / "01_submission_catboost_baseline.csv"

# Daten laden
train_df = pd.read_parquet(train_path)
test_df  = pd.read_parquet(test_path)

# Ziel & Features
y_train = train_df["TARGET"]
X_train = train_df.drop(columns=["SK_ID_CURR", "TARGET"])
X_test  = test_df.drop(columns=["SK_ID_CURR"])

# Automatisch kategorische Spalten finden (object oder category)
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns
cat_features_idx = [X_train.columns.get_loc(col) for col in cat_cols]

# CatBoost benötigt Strings bei kategorischen Features
X_train[cat_cols] = X_train[cat_cols].astype(str)
X_test[cat_cols]  = X_test[cat_cols].astype(str)

# Modell
model = CatBoostClassifier(
    iterations=1383,
    learning_rate=0.042326510977740595,
    depth=5,
    l2_leaf_reg=5.551455325485743,
    bagging_temperature=0.6584052675829963,
    border_count=72,
    auto_class_weights='Balanced',
    eval_metric='AUC',
    random_seed=42,
    verbose=0,
    task_type='CPU'
)

model.fit(X_train, y_train, cat_features=cat_features_idx)

# Prediction & Submission
y_pred = model.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({
    "SK_ID_CURR": test_df["SK_ID_CURR"],
    "TARGET": y_pred
})
submission.to_csv(sub_path, index=False)
print(f"Submission saved to: {sub_path}")

Submission saved to: C:\Users\tgruenecker\OneDrive\Desktop\Master_Studium\3. Semester\Home_Credit_Projekt\Home_Credit_Project\submissions\01_submission_catboost_baseline.csv


XGBoost

In [3]:
import pandas as pd
from xgboost import XGBClassifier
from creditutils.path_utils import get_project_root

# Pfade
proj_root = get_project_root(levels_up=1)
output_dir = proj_root / "outputs"
train_path = output_dir / "03_train_features_autosearch_baseline.parquet"
test_path  = output_dir / "03_test_features_autosearch_baseline.parquet"
sub_path   = proj_root / "submissions" / "01_submission_xgboost_baseline.csv"

# Daten laden
train_df = pd.read_parquet(train_path)
test_df = pd.read_parquet(test_path)

# Ziel & Features
y_train = train_df["TARGET"]
X_train = train_df.drop(columns=["SK_ID_CURR", "TARGET"])
X_test = test_df.drop(columns=["SK_ID_CURR"])

# Alle nicht-numerischen Spalten in category umwandeln (XGBoost-kompatibel)
non_numeric_cols = X_train.select_dtypes(exclude=["int", "float", "bool"]).columns
for col in non_numeric_cols:
    X_train[col] = X_train[col].astype("category")
    X_test[col] = X_test[col].astype("category")

# Modell
model = XGBClassifier(
    n_estimators=1042,
    learning_rate=0.05558810799284791,
    max_depth=3,
    subsample=0.8787759145726666,
    colsample_bytree=0.9105365550107795,
    gamma=3.633742017324177,
    reg_alpha=3.5602493930649466,
    reg_lambda=2.183731116122563,
    scale_pos_weight=11.387150050352467,
    use_label_encoder=False,
    enable_categorical=True,
    eval_metric='auc',
    tree_method='hist',
    random_state=42,
    n_jobs=-1
)

# Trainieren
model.fit(X_train, y_train)

# Vorhersage & Submission
y_pred = model.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({
    "SK_ID_CURR": test_df["SK_ID_CURR"],
    "TARGET": y_pred
})
submission.to_csv(sub_path, index=False)
print(f"Submission saved to: {sub_path}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Submission saved to: C:\Users\tgruenecker\OneDrive\Desktop\Master_Studium\3. Semester\Home_Credit_Projekt\Home_Credit_Project\submissions\01_submission_xgboost_baseline.csv
