In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pathlib import Path
import os

RAW = Path("/content/drive/MyDrive/home_credit/raw/raw")
sorted(os.listdir(RAW))


['.DS_Store',
 'HomeCredit_columns_description.csv',
 'POS_CASH_balance.csv',
 'application_test.csv',
 'application_train.csv',
 'bureau.csv',
 'bureau_balance.csv',
 'credit_card_balance.csv',
 'installments_payments.csv',
 'previous_application.csv',
 'sample_submission.csv']

In [None]:
import pandas as pd

train = pd.read_csv(RAW / "application_train.csv")
test  = pd.read_csv(RAW / "application_test.csv")

print(train.shape)
print(test.shape)
print(train["TARGET"].mean())



(307511, 122)
(48744, 121)
0.08072881945686496


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

X_ext = train[["EXT_SOURCE_1","EXT_SOURCE_2","EXT_SOURCE_3"]]
y = train["TARGET"].astype(int)

pipe_ext = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("lr", LogisticRegression(max_iter=1000))
])

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
auc = cross_val_score(pipe_ext, X_ext, y, cv=cv, scoring="roc_auc", n_jobs=1)

print("EXT_SOURCE AUC:", np.round(auc,5), "Mean:", auc.mean())


EXT_SOURCE AUC: [0.7194  0.71975 0.71394] Mean: 0.7176972027883837


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Drop ID (important)
X = train.drop(columns=["TARGET", "SK_ID_CURR"])
y = train["TARGET"].astype(int)

# 50k stratified sample (fast CV)
sss = StratifiedShuffleSplit(n_splits=1, train_size=50000, random_state=42)
idx, _ = next(sss.split(X, y))
X_small = X.iloc[idx]
y_small = y.iloc[idx]

num_cols = X_small.select_dtypes(include=["int64","float64"]).columns
cat_cols = X_small.select_dtypes(include=["object"]).columns

# OneHot: reduce rare categories (big speed + stability)
try:
    ohe = OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=50)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore")  # older sklearn fallback

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler(with_mean=False))
        ]), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("ohe", ohe)
        ]), cat_cols),
    ],
    remainder="drop"
)

pipe = Pipeline([
    ("prep", preprocess),
    ("lr", LogisticRegression(
        solver="saga",
        max_iter=2000,
        tol=1e-3
    ))
])

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
auc = cross_val_score(pipe, X_small, y_small, cv=cv, scoring="roc_auc", n_jobs=1)

print("AUC:", np.round(auc, 5))
print("Mean:", auc.mean(), "Std:", auc.std())


AUC: [0.72868 0.74214 0.74308]
Mean: 0.7379668415370543 Std: 0.006578819935672576


In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Build X/y (drop ID)
X = train.drop(columns=["TARGET", "SK_ID_CURR"])
y = train["TARGET"].astype(int)
X_test = test.drop(columns=["SK_ID_CURR"])

num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# OHE compression (big speed win)
try:
    ohe = OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=200)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore")  # fallback if older sklearn

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler(with_mean=False))
        ]), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("ohe", ohe)
        ]), cat_cols),
    ],
    remainder="drop"
)

pipe = Pipeline([
    ("prep", preprocess),
    ("lr", LogisticRegression(solver="saga", max_iter=800, tol=1e-2))
])


In [None]:
pipe.fit(X, y)
pred = pipe.predict_proba(X_test)[:, 1]

sub = pd.DataFrame({"SK_ID_CURR": test["SK_ID_CURR"], "TARGET": pred})
sub_path = "/content/submission_baseline_logreg.csv"
sub.to_csv(sub_path, index=False)
print("Saved:", sub_path)
sub.head()


Saved: /content/submission_baseline_logreg.csv


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.052449
1,100005,0.160146
2,100013,0.03312
3,100028,0.039007
4,100038,0.148576


In [None]:
from pathlib import Path
out_dir = Path("/content/drive/MyDrive/home_credit/outputs")
out_dir.mkdir(parents=True, exist_ok=True)

drive_path = out_dir / "submission_baseline_logreg.csv"
sub.to_csv(drive_path, index=False)
print("Saved to Drive:", drive_path)


Saved to Drive: /content/drive/MyDrive/home_credit/outputs/submission_baseline_logreg.csv


In [None]:
from pathlib import Path
RAW = Path("/content/drive/MyDrive/home_credit/raw/raw")


In [None]:
import pandas as pd
import numpy as np

# Read only the columns we need (safer + faster)
cols_available = pd.read_csv(RAW / "previous_application.csv", nrows=1).columns.tolist()

need = [
    "SK_ID_CURR","SK_ID_PREV","NAME_CONTRACT_STATUS",
    "AMT_APPLICATION","AMT_CREDIT","AMT_ANNUITY",
    "AMT_DOWN_PAYMENT","RATE_DOWN_PAYMENT",
    "DAYS_DECISION","CNT_PAYMENT"
]
usecols = [c for c in need if c in cols_available]

prev = pd.read_csv(RAW / "previous_application.csv", usecols=usecols)

print(prev.shape)
prev.head()


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/home_credit/raw/raw/previous_application.csv'

In [None]:
from pathlib import Path

BASE = Path("/content/drive/MyDrive/home_credit")

candidates = [BASE/"raw"/"raw", BASE/"raw", BASE]
RAW = None

for p in candidates:
    if (p/"application_train.csv").exists():
        RAW = p
        break

print("RAW folder selected:", RAW)
print("Exists?", RAW.exists() if RAW else None)


RAW folder selected: None
Exists? None


In [None]:
prev_files = sorted(RAW.glob("*previous*"))
print("Possible previous_application files:")
for f in prev_files:
    print(" -", f.name)


AttributeError: 'NoneType' object has no attribute 'glob'

In [None]:
sorted([f.name for f in RAW.glob("*.csv")])[:50]


AttributeError: 'NoneType' object has no attribute 'glob'

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
from pathlib import Path

MYDRIVE = Path("/content/drive/MyDrive")

# look for either name (some people have HC_ prefix)
candidates = list(MYDRIVE.rglob("application_train.csv")) + list(MYDRIVE.rglob("HC_application_train.csv"))

print("Found:", len(candidates))
for p in candidates[:10]:
    print(p)

# set RAW if found
if len(candidates) > 0:
    RAW = candidates[0].parent
    print("\n✅ RAW folder set to:", RAW)
    print("\nSome CSVs in RAW:")
    print(sorted([f.name for f in RAW.glob("*.csv")])[:30])
else:
    RAW = None
    print("\n❌ Did not find application_train.csv anywhere in MyDrive")


Found: 1
/content/drive/MyDrive/home_credit/raw/raw/application_train.csv

✅ RAW folder set to: /content/drive/MyDrive/home_credit/raw/raw

Some CSVs in RAW:
['HomeCredit_columns_description.csv', 'POS_CASH_balance.csv', 'application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'previous_application.csv', 'sample_submission.csv']


In [None]:
import pandas as pd
import numpy as np

prev = pd.read_csv(RAW / "previous_application.csv")
print(prev.shape)
prev[["SK_ID_CURR","SK_ID_PREV","NAME_CONTRACT_STATUS","AMT_APPLICATION","AMT_CREDIT","DAYS_DECISION"]].head()


(1670214, 37)


Unnamed: 0,SK_ID_CURR,SK_ID_PREV,NAME_CONTRACT_STATUS,AMT_APPLICATION,AMT_CREDIT,DAYS_DECISION
0,271877,2030495,Approved,17145.0,17145.0,-73
1,108129,2802425,Approved,607500.0,679671.0,-164
2,122040,2523466,Approved,112500.0,136444.5,-301
3,176158,2819243,Approved,450000.0,470790.0,-512
4,202054,1784265,Refused,337500.0,404055.0,-781


In [None]:
# Flags
prev["PREV_APPROVED"] = (prev["NAME_CONTRACT_STATUS"] == "Approved").astype(np.int8)
prev["PREV_REFUSED"]  = (prev["NAME_CONTRACT_STATUS"] == "Refused").astype(np.int8)

# Ratios (avoid div0)
prev["APP_CREDIT_RATIO"] = prev["AMT_APPLICATION"] / (prev["AMT_CREDIT"] + 1e-9)

# Aggregate to customer level
prev_agg = prev.groupby("SK_ID_CURR").agg(
    prev_count=("SK_ID_PREV", "nunique"),
    prev_approved_rate=("PREV_APPROVED", "mean"),
    prev_refused_rate=("PREV_REFUSED", "mean"),
    prev_approved_sum=("PREV_APPROVED", "sum"),
    prev_refused_sum=("PREV_REFUSED", "sum"),
    prev_amt_app_mean=("AMT_APPLICATION", "mean"),
    prev_amt_credit_mean=("AMT_CREDIT", "mean"),
    prev_amt_annuity_mean=("AMT_ANNUITY", "mean"),
    prev_app_credit_ratio_mean=("APP_CREDIT_RATIO", "mean"),
    prev_days_decision_min=("DAYS_DECISION", "min"),
    prev_days_decision_max=("DAYS_DECISION", "max"),
).reset_index()

# Time since last application (days)
prev_agg["prev_time_since_last_app"] = -prev_agg["prev_days_decision_max"]

print(prev_agg.shape)
prev_agg.head()


(338857, 13)


Unnamed: 0,SK_ID_CURR,prev_count,prev_approved_rate,prev_refused_rate,prev_approved_sum,prev_refused_sum,prev_amt_app_mean,prev_amt_credit_mean,prev_amt_annuity_mean,prev_app_credit_ratio_mean,prev_days_decision_min,prev_days_decision_max,prev_time_since_last_app
0,100001,1,1.0,0.0,1,0,24835.5,23787.0,3951.0,1.044079,-1740,-1740,1740
1,100002,1,1.0,0.0,1,0,179055.0,179055.0,9251.775,1.0,-606,-606,606
2,100003,3,1.0,0.0,3,0,435436.5,484191.0,56553.99,0.949329,-2341,-746,746
3,100004,1,1.0,0.0,1,0,24282.0,20106.0,5357.25,1.207699,-815,-815,815
4,100005,2,0.5,0.0,1,0,22308.75,20076.75,4813.2,0.555587,-757,-315,315


In [None]:
train2 = train.merge(prev_agg, on="SK_ID_CURR", how="left")
test2  = test.merge(prev_agg, on="SK_ID_CURR", how="left")

print(train2.shape, test2.shape)


NameError: name 'train' is not defined

In [None]:
import pandas as pd

train = pd.read_csv(RAW / "application_train.csv")
test  = pd.read_csv(RAW / "application_test.csv")

print("train:", train.shape, "test:", test.shape)
print("TARGET mean:", train["TARGET"].mean())



train: (307511, 122) test: (48744, 121)
TARGET mean: 0.08072881945686496


In [None]:
print("prev_agg exists?", "prev_agg" in globals())


prev_agg exists? True


In [None]:
train2 = train.merge(prev_agg, on="SK_ID_CURR", how="left")
test2  = test.merge(prev_agg, on="SK_ID_CURR", how="left")

print(train2.shape, test2.shape)


(307511, 134) (48744, 133)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

X = train2.drop(columns=["TARGET","SK_ID_CURR"])
y = train2["TARGET"].astype(int)
X_test = test2.drop(columns=["SK_ID_CURR"])

num_cols = X.select_dtypes(include=["int64","float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

try:
    ohe = OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=200)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler(with_mean=False))
        ]), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("ohe", ohe)
        ]), cat_cols),
    ],
    remainder="drop"
)

pipe_prev = Pipeline([
    ("prep", preprocess),
    ("lr", LogisticRegression(solver="saga", max_iter=800, tol=1e-2))
])

pipe_prev.fit(X, y)
pred = pipe_prev.predict_proba(X_test)[:, 1]

sub_prev = pd.DataFrame({"SK_ID_CURR": test2["SK_ID_CURR"], "TARGET": pred})
sub_path = "/content/submission_prevagg_logreg.csv"
sub_prev.to_csv(sub_path, index=False)
print("Saved:", sub_path)
sub_prev.head()


Saved: /content/submission_prevagg_logreg.csv


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.053069
1,100005,0.167421
2,100013,0.029586
3,100028,0.041811
4,100038,0.158832


In [None]:
from pathlib import Path
out_dir = Path("/content/drive/MyDrive/home_credit/outputs")
out_dir.mkdir(parents=True, exist_ok=True)

drive_path = out_dir / "submission_prevagg_logreg.csv"
sub_prev.to_csv(drive_path, index=False)
print("Saved to Drive:", drive_path)


Saved to Drive: /content/drive/MyDrive/home_credit/outputs/submission_prevagg_logreg.csv


In [None]:
import pandas as pd
import numpy as np

bureau = pd.read_csv(RAW / "bureau.csv")
bb = pd.read_csv(RAW / "bureau_balance.csv")

print("bureau:", bureau.shape, "bb:", bb.shape)
bureau.head()


bureau: (1716428, 17) bb: (27299925, 3)


Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [None]:
# Convert STATUS to "badness" score (higher = worse)
status_map = {"C":0, "0":0, "1":1, "2":2, "3":3, "4":4, "5":5, "X":0}
bb["STATUS_NUM"] = bb["STATUS"].map(status_map).fillna(0).astype(int)

bb_agg = bb.groupby("SK_ID_BUREAU").agg(
    bb_months_count=("MONTHS_BALANCE", "count"),
    bb_months_min=("MONTHS_BALANCE", "min"),
    bb_months_max=("MONTHS_BALANCE", "max"),
    bb_status_worst=("STATUS_NUM", "max"),
    bb_status_mean=("STATUS_NUM", "mean"),
).reset_index()

print(bb_agg.shape)
bb_agg.head()


(817395, 6)


Unnamed: 0,SK_ID_BUREAU,bb_months_count,bb_months_min,bb_months_max,bb_status_worst,bb_status_mean
0,5001709,97,-96,0,0,0.0
1,5001710,83,-82,0,0,0.0
2,5001711,4,-3,0,0,0.0
3,5001712,19,-18,0,0,0.0
4,5001713,22,-21,0,0,0.0


In [None]:
bureau2 = bureau.merge(bb_agg, on="SK_ID_BUREAU", how="left")

# Flags
bureau2["BUREAU_ACTIVE"] = (bureau2["CREDIT_ACTIVE"] == "Active").astype(np.int8)
bureau2["BUREAU_CLOSED"] = (bureau2["CREDIT_ACTIVE"] == "Closed").astype(np.int8)

# Simple ratio (avoid div0)
bureau2["DEBT_CREDIT_RATIO"] = bureau2["AMT_CREDIT_SUM_DEBT"] / (bureau2["AMT_CREDIT_SUM"] + 1e-9)

bureau_agg = bureau2.groupby("SK_ID_CURR").agg(
    bureau_count=("SK_ID_BUREAU", "nunique"),
    bureau_active_rate=("BUREAU_ACTIVE", "mean"),
    bureau_closed_rate=("BUREAU_CLOSED", "mean"),

    bureau_credit_sum=("AMT_CREDIT_SUM", "sum"),
    bureau_debt_sum=("AMT_CREDIT_SUM_DEBT", "sum"),
    bureau_overdue_sum=("AMT_CREDIT_SUM_OVERDUE", "sum"),
    bureau_debt_credit_ratio_mean=("DEBT_CREDIT_RATIO", "mean"),

    bureau_days_credit_min=("DAYS_CREDIT", "min"),
    bureau_days_credit_max=("DAYS_CREDIT", "max"),

    bb_worst_mean=("bb_status_worst", "mean"),
    bb_worst_max=("bb_status_worst", "max"),
    bb_months_count_sum=("bb_months_count", "sum"),
).reset_index()

print(bureau_agg.shape)
bureau_agg.head()


(305811, 13)


Unnamed: 0,SK_ID_CURR,bureau_count,bureau_active_rate,bureau_closed_rate,bureau_credit_sum,bureau_debt_sum,bureau_overdue_sum,bureau_debt_credit_ratio_mean,bureau_days_credit_min,bureau_days_credit_max,bb_worst_mean,bb_worst_max,bb_months_count_sum
0,100001,7,0.428571,0.571429,1453365.0,596686.5,0.0,0.282518,-1572,-49,0.142857,1.0,172.0
1,100002,8,0.25,0.75,865055.565,245781.0,0.0,0.109236,-1437,-103,0.75,1.0,110.0
2,100003,4,0.25,0.75,1017400.5,0.0,0.0,0.0,-2586,-606,,,0.0
3,100004,2,0.0,1.0,189037.8,0.0,0.0,0.0,-1326,-408,,,0.0
4,100005,3,0.666667,0.333333,657126.0,568408.5,0.0,0.601256,-373,-62,0.0,0.0,21.0


In [None]:
train3 = train2.merge(bureau_agg, on="SK_ID_CURR", how="left")
test3  = test2.merge(bureau_agg, on="SK_ID_CURR", how="left")

print(train3.shape, test3.shape)


(307511, 146) (48744, 145)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

X = train3.drop(columns=["TARGET","SK_ID_CURR"])
y = train3["TARGET"].astype(int)
X_test = test3.drop(columns=["SK_ID_CURR"])

num_cols = X.select_dtypes(include=["int64","float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

try:
    ohe = OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=200)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler(with_mean=False))
        ]), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("ohe", ohe)
        ]), cat_cols),
    ],
    remainder="drop"
)

pipe_bureau = Pipeline([
    ("prep", preprocess),
    ("lr", LogisticRegression(solver="saga", max_iter=800, tol=1e-2))
])

pipe_bureau.fit(X, y)
pred = pipe_bureau.predict_proba(X_test)[:, 1]

sub_bureau = pd.DataFrame({"SK_ID_CURR": test3["SK_ID_CURR"], "TARGET": pred})
sub_path = "/content/submission_prev_bureau_logreg.csv"
sub_bureau.to_csv(sub_path, index=False)
print("Saved:", sub_path)
sub_bureau.head()


Saved: /content/submission_prev_bureau_logreg.csv


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.048938
1,100005,0.176911
2,100013,0.026511
3,100028,0.036085
4,100038,0.160603


In [None]:
from pathlib import Path
out_dir = Path("/content/drive/MyDrive/home_credit/outputs")
out_dir.mkdir(parents=True, exist_ok=True)
drive_path = out_dir / "submission_prev_bureau_logreg.csv"
sub_bureau.to_csv(drive_path, index=False)
print("Saved to Drive:", drive_path)


Saved to Drive: /content/drive/MyDrive/home_credit/outputs/submission_prev_bureau_logreg.csv


In [None]:
print("train3 exists?", "train3" in globals(), "test3 exists?", "test3" in globals())


train3 exists? True test3 exists? True


In [None]:
import pandas as pd
import numpy as np

prev_map = pd.read_csv(RAW / "previous_application.csv", usecols=["SK_ID_PREV","SK_ID_CURR"])
print(prev_map.shape)
prev_map.head()


(1670214, 2)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR
0,2030495,271877
1,2802425,108129
2,2523466,122040
3,2819243,176158
4,1784265,202054


In [None]:
usecols = ["SK_ID_PREV","AMT_INSTALMENT","AMT_PAYMENT","DAYS_INSTALMENT","DAYS_ENTRY_PAYMENT"]
dtypes = {
    "SK_ID_PREV": "int32",
    "AMT_INSTALMENT": "float32",
    "AMT_PAYMENT": "float32",
    "DAYS_INSTALMENT": "int32",
    "DAYS_ENTRY_PAYMENT": "int32"
}

inst = pd.read_csv(RAW / "installments_payments.csv", usecols=usecols, dtype=dtypes)
print(inst.shape)
inst.head()


  chunks = self._reader.read_low_memory(nrows)


ValueError: cannot safely convert passed user dtype of int32 for float64 dtyped data in column 5

In [None]:
# Row-level behavior
inst["PAYMENT_PERC"] = inst["AMT_PAYMENT"] / (inst["AMT_INSTALMENT"] + 1e-9)
inst["PAYMENT_DIFF"] = inst["AMT_INSTALMENT"] - inst["AMT_PAYMENT"]
inst["DAYS_LATE"] = inst["DAYS_ENTRY_PAYMENT"] - inst["DAYS_INSTALMENT"]  # >0 late
inst["LATE_FLAG"] = (inst["DAYS_LATE"] > 0).astype(np.int8)

inst_prev = inst.groupby("SK_ID_PREV").agg(
    inst_count=("AMT_PAYMENT", "count"),
    pay_perc_mean=("PAYMENT_PERC", "mean"),
    pay_perc_max=("PAYMENT_PERC", "max"),
    pay_perc_min=("PAYMENT_PERC", "min"),
    pay_diff_mean=("PAYMENT_DIFF", "mean"),
    pay_diff_max=("PAYMENT_DIFF", "max"),
    days_late_mean=("DAYS_LATE", "mean"),
    days_late_max=("DAYS_LATE", "max"),
    late_rate=("LATE_FLAG", "mean"),
).reset_index()

print(inst_prev.shape)
inst_prev.head()


NameError: name 'inst' is not defined

In [None]:
inst_prev = inst_prev.merge(prev_map, on="SK_ID_PREV", how="left")

inst_agg = inst_prev.groupby("SK_ID_CURR").agg(
    inst_prev_loans=("SK_ID_PREV", "nunique"),
    inst_count_sum=("inst_count", "sum"),
    inst_pay_perc_mean=("pay_perc_mean", "mean"),
    inst_pay_perc_min=("pay_perc_min", "mean"),
    inst_pay_perc_max=("pay_perc_max", "mean"),
    inst_pay_diff_mean=("pay_diff_mean", "mean"),
    inst_pay_diff_max=("pay_diff_max", "max"),
    inst_days_late_mean=("days_late_mean", "mean"),
    inst_days_late_max=("days_late_max", "max"),
    inst_late_rate_mean=("late_rate", "mean"),
).reset_index()

print(inst_agg.shape)
inst_agg.head()


NameError: name 'inst_prev' is not defined

In [None]:
usecols = ["SK_ID_PREV","AMT_INSTALMENT","AMT_PAYMENT","DAYS_INSTALMENT","DAYS_ENTRY_PAYMENT"]

dtypes = {
    "SK_ID_PREV": "int32",
    "AMT_INSTALMENT": "float32",
    "AMT_PAYMENT": "float32",
    "DAYS_INSTALMENT": "float32",      # <-- change to float
    "DAYS_ENTRY_PAYMENT": "float32"    # <-- change to float
}

inst = pd.read_csv(RAW / "installments_payments.csv", usecols=usecols, dtype=dtypes)
print(inst.shape)
inst.head()


(13605401, 5)


Unnamed: 0,SK_ID_PREV,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,-1180.0,-1187.0,6948.359863,6948.359863
1,1330831,-2156.0,-2156.0,1716.525024,1716.525024
2,2085231,-63.0,-63.0,25425.0,25425.0
3,2452527,-2418.0,-2426.0,24350.130859,24350.130859
4,2714724,-1383.0,-1366.0,2165.040039,2160.584961


In [None]:
usecols = ["SK_ID_PREV","AMT_INSTALMENT","AMT_PAYMENT","DAYS_INSTALMENT","DAYS_ENTRY_PAYMENT"]

dtypes = {
    "SK_ID_PREV": "int32",
    "AMT_INSTALMENT": "float32",
    "AMT_PAYMENT": "float32",
    "DAYS_INSTALMENT": "float32",      # <-- change to float
    "DAYS_ENTRY_PAYMENT": "float32"    # <-- change to float
}

inst = pd.read_csv(RAW / "installments_payments.csv", usecols=usecols, dtype=dtypes)
print(inst.shape)
inst.head()


(13605401, 5)


Unnamed: 0,SK_ID_PREV,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,-1180.0,-1187.0,6948.359863,6948.359863
1,1330831,-2156.0,-2156.0,1716.525024,1716.525024
2,2085231,-63.0,-63.0,25425.0,25425.0
3,2452527,-2418.0,-2426.0,24350.130859,24350.130859
4,2714724,-1383.0,-1366.0,2165.040039,2160.584961


In [None]:
import numpy as np

inst["PAYMENT_PERC"] = inst["AMT_PAYMENT"] / (inst["AMT_INSTALMENT"] + 1e-9)
inst["PAYMENT_DIFF"] = inst["AMT_INSTALMENT"] - inst["AMT_PAYMENT"]
inst["DAYS_LATE"] = inst["DAYS_ENTRY_PAYMENT"] - inst["DAYS_INSTALMENT"]
inst["LATE_FLAG"] = (inst["DAYS_LATE"] > 0).astype(np.int8)


In [None]:
# Row-level behavior
inst["PAYMENT_PERC"] = inst["AMT_PAYMENT"] / (inst["AMT_INSTALMENT"] + 1e-9)
inst["PAYMENT_DIFF"] = inst["AMT_INSTALMENT"] - inst["AMT_PAYMENT"]
inst["DAYS_LATE"] = inst["DAYS_ENTRY_PAYMENT"] - inst["DAYS_INSTALMENT"]  # >0 late
inst["LATE_FLAG"] = (inst["DAYS_LATE"] > 0).astype(np.int8)

inst_prev = inst.groupby("SK_ID_PREV").agg(
    inst_count=("AMT_PAYMENT", "count"),
    pay_perc_mean=("PAYMENT_PERC", "mean"),
    pay_perc_max=("PAYMENT_PERC", "max"),
    pay_perc_min=("PAYMENT_PERC", "min"),
    pay_diff_mean=("PAYMENT_DIFF", "mean"),
    pay_diff_max=("PAYMENT_DIFF", "max"),
    days_late_mean=("DAYS_LATE", "mean"),
    days_late_max=("DAYS_LATE", "max"),
    late_rate=("LATE_FLAG", "mean"),
).reset_index()

print(inst_prev.shape)
inst_prev.head()


(997752, 10)


Unnamed: 0,SK_ID_PREV,inst_count,pay_perc_mean,pay_perc_max,pay_perc_min,pay_diff_mean,pay_diff_max,days_late_mean,days_late_max,late_rate
0,1000001,2,1.0,1.0,1.0,0.0,0.0,-16.0,-6.0,0.0
1,1000002,4,1.0,1.0,1.0,0.0,0.0,-19.75,-5.0,0.0
2,1000003,3,1.0,1.0,1.0,0.0,0.0,-15.333333,-14.0,0.0
3,1000004,7,1.0,1.0,1.0,0.0,0.0,-26.714285,-10.0,0.0
4,1000005,11,0.909091,1.0,0.00019,1337.600464,14710.81543,-8.454545,3.0,0.181818


In [None]:
inst_prev = inst_prev.merge(prev_map, on="SK_ID_PREV", how="left")

inst_agg = inst_prev.groupby("SK_ID_CURR").agg(
    inst_prev_loans=("SK_ID_PREV", "nunique"),
    inst_count_sum=("inst_count", "sum"),
    inst_pay_perc_mean=("pay_perc_mean", "mean"),
    inst_pay_perc_min=("pay_perc_min", "mean"),
    inst_pay_perc_max=("pay_perc_max", "mean"),
    inst_pay_diff_mean=("pay_diff_mean", "mean"),
    inst_pay_diff_max=("pay_diff_max", "max"),
    inst_days_late_mean=("days_late_mean", "mean"),
    inst_days_late_max=("days_late_max", "max"),
    inst_late_rate_mean=("late_rate", "mean"),
).reset_index()

print(inst_agg.shape)
inst_agg.head()


(336935, 11)


Unnamed: 0,SK_ID_CURR,inst_prev_loans,inst_count_sum,inst_pay_perc_mean,inst_pay_perc_min,inst_pay_perc_max,inst_pay_diff_mean,inst_pay_diff_max,inst_days_late_mean,inst_days_late_max,inst_late_rate_mean
0,100001.0,1,4,1.0,1.0,1.0,0.0,0.0,-15.5,-6.0,0.0
1,100002.0,1,19,1.0,1.0,1.0,0.0,0.0,-20.421053,-12.0,0.0
2,100003.0,3,25,1.0,1.0,1.0,0.0,0.0,-7.448412,-1.0,0.0
3,100004.0,1,3,1.0,1.0,1.0,0.0,0.0,-7.666667,-3.0,0.0
4,100005.0,1,9,1.0,1.0,1.0,0.0,0.0,-23.555555,1.0,0.111111


In [None]:
train4 = train3.merge(inst_agg, on="SK_ID_CURR", how="left")
test4  = test3.merge(inst_agg, on="SK_ID_CURR", how="left")

print(train4.shape, test4.shape)


(307511, 156) (48744, 155)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

X = train4.drop(columns=["TARGET","SK_ID_CURR"])
y = train4["TARGET"].astype(int)
X_test = test4.drop(columns=["SK_ID_CURR"])

num_cols = X.select_dtypes(include=["int64","float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

try:
    ohe = OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=200)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler(with_mean=False))
        ]), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("ohe", ohe)
        ]), cat_cols),
    ],
    remainder="drop"
)

pipe_inst = Pipeline([
    ("prep", preprocess),
    ("lr", LogisticRegression(solver="saga", max_iter=800, tol=1e-2))
])

pipe_inst.fit(X, y)
pred = pipe_inst.predict_proba(X_test)[:, 1]

sub_inst = pd.DataFrame({"SK_ID_CURR": test4["SK_ID_CURR"], "TARGET": pred})
sub_path = "/content/submission_prev_bureau_inst_logreg.csv"
sub_inst.to_csv(sub_path, index=False)
print("Saved:", sub_path)
sub_inst.head()


Saved: /content/submission_prev_bureau_inst_logreg.csv


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.042434
1,100005,0.175869
2,100013,0.033174
3,100028,0.038236
4,100038,0.141466


In [None]:
from pathlib import Path
out_dir = Path("/content/drive/MyDrive/home_credit/outputs")
out_dir.mkdir(parents=True, exist_ok=True)

drive_path = out_dir / "submission_prev_bureau_inst_logreg.csv"
sub_inst.to_csv(drive_path, index=False)
print("Saved to Drive:", drive_path)


Saved to Drive: /content/drive/MyDrive/home_credit/outputs/submission_prev_bureau_inst_logreg.csv


In [None]:
print("train4 exists?", "train4" in globals(), "test4 exists?", "test4" in globals())


train4 exists? True test4 exists? True


In [None]:
import pandas as pd
import numpy as np

prev_map = pd.read_csv(RAW / "previous_application.csv", usecols=["SK_ID_PREV","SK_ID_CURR"])
print(prev_map.shape)


(1670214, 2)


In [None]:
usecols = [
    "SK_ID_PREV","MONTHS_BALANCE",
    "AMT_BALANCE","AMT_CREDIT_LIMIT_ACTUAL",
    "AMT_DRAWINGS_ATM_CURRENT","AMT_DRAWINGS_CURRENT",
    "AMT_PAYMENT_TOTAL_CURRENT","AMT_INST_MIN_REGULARITY",
    "SK_DPD","SK_DPD_DEF"
]

cc = pd.read_csv(RAW / "credit_card_balance.csv", usecols=usecols)
print(cc.shape)
cc.head()


(3840312, 10)


Unnamed: 0,SK_ID_PREV,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_TOTAL_CURRENT,SK_DPD,SK_DPD_DEF
0,2562384,-6,56.97,135000,0.0,877.5,1700.325,1800.0,0,0
1,2582071,-1,63975.555,45000,2250.0,2250.0,2250.0,2250.0,0,0
2,1740877,-7,31815.225,450000,0.0,0.0,2250.0,2250.0,0,0
3,1389973,-4,236572.11,225000,2250.0,2250.0,11795.76,11925.0,0,0
4,1891521,-1,453919.455,450000,0.0,11547.0,22924.89,27000.0,0,0


In [None]:
cc["UTILIZATION"] = cc["AMT_BALANCE"] / (cc["AMT_CREDIT_LIMIT_ACTUAL"] + 1e-9)
cc["ATM_SHARE"] = cc["AMT_DRAWINGS_ATM_CURRENT"] / (cc["AMT_DRAWINGS_CURRENT"] + 1e-9)
cc["PAYMENT_TO_MIN"] = cc["AMT_PAYMENT_TOTAL_CURRENT"] / (cc["AMT_INST_MIN_REGULARITY"] + 1e-9)

cc_prev = cc.groupby("SK_ID_PREV").agg(
    cc_months=("MONTHS_BALANCE", "count"),
    cc_util_mean=("UTILIZATION", "mean"),
    cc_util_max=("UTILIZATION", "max"),
    cc_dpd_mean=("SK_DPD", "mean"),
    cc_dpd_max=("SK_DPD", "max"),
    cc_dpd_def_mean=("SK_DPD_DEF", "mean"),
    cc_atm_share_mean=("ATM_SHARE", "mean"),
    cc_pay_to_min_mean=("PAYMENT_TO_MIN", "mean"),
    cc_balance_mean=("AMT_BALANCE", "mean"),
    cc_limit_mean=("AMT_CREDIT_LIMIT_ACTUAL", "mean"),
).reset_index()

print(cc_prev.shape)
cc_prev.head()


(104307, 11)


Unnamed: 0,SK_ID_PREV,cc_months,cc_util_mean,cc_util_max,cc_dpd_mean,cc_dpd_max,cc_dpd_def_mean,cc_atm_share_mean,cc_pay_to_min_mean,cc_balance_mean,cc_limit_mean
0,1000018,5,0.92308,1.012559,0.0,0,0.0,0.091939,1800000000000.0,74946.285,81000.0
1,1000030,8,0.630494,0.915798,0.0,0,0.0,0.020667,2048113000000.0,55991.064375,81562.5
2,1000031,16,0.327366,0.97095,0.0,0,0.0,0.222815,49258130000.0,52394.439375,149625.0
3,1000035,5,0.0,0.0,0.0,0,0.0,,0.0,0.0,225000.0
4,1000077,11,0.0,0.0,0.0,0,0.0,,0.0,0.0,94090.909091


In [None]:
cc_prev = cc_prev.merge(prev_map, on="SK_ID_PREV", how="left")

cc_agg = cc_prev.groupby("SK_ID_CURR").agg(
    cc_prev_loans=("SK_ID_PREV", "nunique"),
    cc_months_sum=("cc_months", "sum"),
    cc_util_mean=("cc_util_mean", "mean"),
    cc_util_max=("cc_util_max", "max"),
    cc_dpd_mean=("cc_dpd_mean", "mean"),
    cc_dpd_max=("cc_dpd_max", "max"),
    cc_dpd_def_mean=("cc_dpd_def_mean", "mean"),
    cc_atm_share_mean=("cc_atm_share_mean", "mean"),
    cc_pay_to_min_mean=("cc_pay_to_min_mean", "mean"),
    cc_balance_mean=("cc_balance_mean", "mean"),
    cc_limit_mean=("cc_limit_mean", "mean"),
).reset_index()

print(cc_agg.shape)
cc_agg.head()


(92447, 12)


Unnamed: 0,SK_ID_CURR,cc_prev_loans,cc_months_sum,cc_util_mean,cc_util_max,cc_dpd_mean,cc_dpd_max,cc_dpd_def_mean,cc_atm_share_mean,cc_pay_to_min_mean,cc_balance_mean,cc_limit_mean
0,100006.0,1,6,0.0,0.0,0.0,0,0.0,,0.0,0.0,270000.0
1,100011.0,1,74,0.302678,1.05,0.0,0,0.0,0.013514,0.5091096,54482.111149,164189.189189
2,100021.0,1,17,0.0,0.0,0.0,0,0.0,,0.0,0.0,675000.0
3,100023.0,1,8,0.0,0.0,0.0,0,0.0,,0.0,0.0,135000.0
4,100028.0,1,49,0.035934,0.165937,0.0,0,0.0,0.045455,4128214000.0,8085.058163,225000.0


In [None]:
train5 = train4.merge(cc_agg, on="SK_ID_CURR", how="left")
test5  = test4.merge(cc_agg, on="SK_ID_CURR", how="left")
print(train5.shape, test5.shape)


(307511, 167) (48744, 166)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

X = train5.drop(columns=["TARGET","SK_ID_CURR"])
y = train5["TARGET"].astype(int)
X_test = test5.drop(columns=["SK_ID_CURR"])

num_cols = X.select_dtypes(include=["int64","float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

try:
    ohe = OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=200)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler(with_mean=False))
        ]), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("ohe", ohe)
        ]), cat_cols),
    ],
    remainder="drop"
)

pipe_cc = Pipeline([
    ("prep", preprocess),
    ("lr", LogisticRegression(solver="saga", max_iter=800, tol=1e-2))
])

pipe_cc.fit(X, y)
pred = pipe_cc.predict_proba(X_test)[:, 1]

sub_cc = pd.DataFrame({"SK_ID_CURR": test5["SK_ID_CURR"], "TARGET": pred})
sub_path = "/content/submission_prev_bureau_inst_cc_logreg.csv"
sub_cc.to_csv(sub_path, index=False)
print("Saved:", sub_path)
sub_cc.head()


Saved: /content/submission_prev_bureau_inst_cc_logreg.csv


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.042442
1,100005,0.172991
2,100013,0.032501
3,100028,0.033883
4,100038,0.138835


In [None]:
from pathlib import Path
out_dir = Path("/content/drive/MyDrive/home_credit/outputs")
out_dir.mkdir(parents=True, exist_ok=True)
drive_path = out_dir / "submission_prev_bureau_inst_cc_logreg.csv"
sub_cc.to_csv(drive_path, index=False)
print("Saved to Drive:", drive_path)


Saved to Drive: /content/drive/MyDrive/home_credit/outputs/submission_prev_bureau_inst_cc_logreg.csv


In [1]:
!pip -q install lightgbm


In [2]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

X = train5.drop(columns=["TARGET","SK_ID_CURR"])
y = train5["TARGET"].astype(int).values
X_test = test5.drop(columns=["SK_ID_CURR"])

# One-hot encode everything once (OK for LGBM)
X_all = pd.get_dummies(pd.concat([X, X_test], axis=0), dummy_na=True)
X_enc = X_all.iloc[:len(X)]
X_test_enc = X_all.iloc[len(X):]

print(X_enc.shape, X_test_enc.shape)


NameError: name 'train5' is not defined

In [3]:
[v for v in ["train","test","train3","test3","train4","test4","train5","test5"] if v in globals()]


[]

In [4]:
import pandas as pd
import numpy as np

train = pd.read_csv(RAW / "application_train.csv")
test  = pd.read_csv(RAW / "application_test.csv")
print(train.shape, test.shape)


NameError: name 'RAW' is not defined

In [5]:
from google.colab import drive
drive.mount("/content/drive")



Mounted at /content/drive


In [6]:
from pathlib import Path

MYDRIVE = Path("/content/drive/MyDrive")
candidates = list(MYDRIVE.rglob("application_train.csv"))
print("Found:", len(candidates))
for p in candidates[:5]:
    print(p)

RAW = candidates[0].parent
print("\n✅ RAW set to:", RAW)
print("CSVs:", sorted([f.name for f in RAW.glob("*.csv")]))


Found: 1
/content/drive/MyDrive/home_credit/raw/raw/application_train.csv

✅ RAW set to: /content/drive/MyDrive/home_credit/raw/raw
CSVs: ['HomeCredit_columns_description.csv', 'POS_CASH_balance.csv', 'application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'previous_application.csv', 'sample_submission.csv']


In [7]:
import pandas as pd
train = pd.read_csv(RAW / "application_train.csv")
test  = pd.read_csv(RAW / "application_test.csv")
print(train.shape, test.shape)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/home_credit/raw/raw/application_train.csv'

In [8]:
from pathlib import Path
import os

BASE = Path("/content/drive/MyDrive")

# Find the actual location of application_train.csv
matches = list(BASE.rglob("application_train.csv"))
print("Found:", len(matches))
for m in matches[:10]:
    print(m)

assert len(matches) > 0, "application_train.csv not found in MyDrive"

RAW = matches[0].parent
print("\n✅ RAW set to:", RAW)
print("Files:", sorted([p.name for p in RAW.glob("*.csv")])[:20])


Found: 1
/content/drive/MyDrive/home_credit/raw/raw/application_train.csv

✅ RAW set to: /content/drive/MyDrive/home_credit/raw/raw
Files: ['HomeCredit_columns_description.csv', 'POS_CASH_balance.csv', 'application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'previous_application.csv', 'sample_submission.csv']


In [9]:
import pandas as pd

train = pd.read_csv(RAW / "application_train.csv")
test  = pd.read_csv(RAW / "application_test.csv")

print(train.shape, test.shape)


(307511, 122) (48744, 121)


In [10]:
from google.colab import drive
drive.mount("/content/drive")

from pathlib import Path
BASE = Path("/content/drive/MyDrive")

matches = list(BASE.rglob("application_train.csv"))
print("Found:", len(matches))
for m in matches[:5]:
    print(m)

RAW = matches[0].parent
print("\n✅ RAW set to:", RAW)
print("Some files:", sorted([p.name for p in RAW.glob("*.csv")])[:15])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found: 1
/content/drive/MyDrive/home_credit/raw/raw/application_train.csv

✅ RAW set to: /content/drive/MyDrive/home_credit/raw/raw
Some files: ['HomeCredit_columns_description.csv', 'POS_CASH_balance.csv', 'application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'previous_application.csv', 'sample_submission.csv']


In [11]:
import pandas as pd

train = pd.read_csv(RAW / "application_train.csv")
test  = pd.read_csv(RAW / "application_test.csv")
print(train.shape, test.shape)


(307511, 122) (48744, 121)


In [12]:
from google.colab import drive
drive.mount("/content/drive")

from pathlib import Path
BASE = Path("/content/drive/MyDrive")

# Find where application_train.csv actually is
hits = list(BASE.rglob("application_train.csv"))
print("Found:", len(hits))
for h in hits[:5]:
    print(h)

RAW = hits[0].parent
print("\n✅ RAW =", RAW)
print("Some files:", sorted([p.name for p in RAW.glob("*.csv")])[:20])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found: 1
/content/drive/MyDrive/home_credit/raw/raw/application_train.csv

✅ RAW = /content/drive/MyDrive/home_credit/raw/raw
Some files: ['HomeCredit_columns_description.csv', 'POS_CASH_balance.csv', 'application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'previous_application.csv', 'sample_submission.csv']


In [13]:
HOME = Path("/content/drive/MyDrive/home_credit")

# Look for anything that sounds like saved features/aggregates
patterns = [
    "*prev*agg*.*", "*bureau*agg*.*", "*inst*agg*.*", "*cc*agg*.*",
    "*train*features*.*", "*test*features*.*", "*.parquet"
]

found = []
for pat in patterns:
    found += list(HOME.rglob(pat))

found = sorted(set(found))
print("Found files:", len(found))
for f in found[:50]:
    print(f)


Found files: 1
/content/drive/MyDrive/home_credit/outputs/submission_prevagg_logreg.csv


In [14]:
import pandas as pd
import numpy as np

train = pd.read_csv(RAW / "application_train.csv")
test  = pd.read_csv(RAW / "application_test.csv")
print(train.shape, test.shape)


(307511, 122) (48744, 121)


In [15]:
prev = pd.read_csv(
    RAW / "previous_application.csv",
    usecols=["SK_ID_CURR","SK_ID_PREV","NAME_CONTRACT_STATUS","AMT_APPLICATION","AMT_CREDIT","AMT_ANNUITY","DAYS_DECISION"]
)

prev["PREV_APPROVED"] = (prev["NAME_CONTRACT_STATUS"] == "Approved").astype(np.int8)
prev["PREV_REFUSED"]  = (prev["NAME_CONTRACT_STATUS"] == "Refused").astype(np.int8)
prev["APP_CREDIT_RATIO"] = prev["AMT_APPLICATION"] / (prev["AMT_CREDIT"] + 1e-9)

prev_agg = prev.groupby("SK_ID_CURR").agg(
    prev_count=("SK_ID_PREV","nunique"),
    prev_approved_rate=("PREV_APPROVED","mean"),
    prev_refused_rate=("PREV_REFUSED","mean"),
    prev_amt_credit_mean=("AMT_CREDIT","mean"),
    prev_amt_annuity_mean=("AMT_ANNUITY","mean"),
    prev_app_credit_ratio_mean=("APP_CREDIT_RATIO","mean"),
    prev_days_decision_max=("DAYS_DECISION","max"),
).reset_index()

prev_agg["prev_time_since_last_app"] = -prev_agg["prev_days_decision_max"]
print(prev_agg.shape)


(338857, 9)


In [16]:
bureau = pd.read_csv(
    RAW / "bureau.csv",
    usecols=["SK_ID_CURR","SK_ID_BUREAU","CREDIT_ACTIVE","AMT_CREDIT_SUM","AMT_CREDIT_SUM_DEBT","AMT_CREDIT_SUM_OVERDUE","DAYS_CREDIT"]
)

bb = pd.read_csv(
    RAW / "bureau_balance.csv",
    usecols=["SK_ID_BUREAU","MONTHS_BALANCE","STATUS"]
)

status_map = {"C":0,"0":0,"1":1,"2":2,"3":3,"4":4,"5":5,"X":0}
bb["STATUS_NUM"] = bb["STATUS"].map(status_map).fillna(0).astype(np.int8)

bb_agg = bb.groupby("SK_ID_BUREAU").agg(
    bb_months=("MONTHS_BALANCE","count"),
    bb_status_worst=("STATUS_NUM","max"),
    bb_status_mean=("STATUS_NUM","mean"),
).reset_index()

bureau2 = bureau.merge(bb_agg, on="SK_ID_BUREAU", how="left")
bureau2["BUREAU_ACTIVE"] = (bureau2["CREDIT_ACTIVE"] == "Active").astype(np.int8)
bureau2["DEBT_CREDIT_RATIO"] = bureau2["AMT_CREDIT_SUM_DEBT"] / (bureau2["AMT_CREDIT_SUM"] + 1e-9)

bureau_agg = bureau2.groupby("SK_ID_CURR").agg(
    bureau_count=("SK_ID_BUREAU","nunique"),
    bureau_active_rate=("BUREAU_ACTIVE","mean"),
    bureau_credit_sum=("AMT_CREDIT_SUM","sum"),
    bureau_debt_sum=("AMT_CREDIT_SUM_DEBT","sum"),
    bureau_overdue_sum=("AMT_CREDIT_SUM_OVERDUE","sum"),
    bureau_debt_credit_ratio_mean=("DEBT_CREDIT_RATIO","mean"),
    bb_worst_max=("bb_status_worst","max"),
    bb_worst_mean=("bb_status_worst","mean"),
    bb_months_sum=("bb_months","sum"),
    bureau_days_credit_min=("DAYS_CREDIT","min"),
    bureau_days_credit_max=("DAYS_CREDIT","max"),
).reset_index()

print(bureau_agg.shape)


(305811, 12)


In [17]:
inst = pd.read_csv(
    RAW / "installments_payments.csv",
    usecols=["SK_ID_PREV","AMT_INSTALMENT","AMT_PAYMENT","DAYS_INSTALMENT","DAYS_ENTRY_PAYMENT"],
    dtype={
        "SK_ID_PREV":"int32",
        "AMT_INSTALMENT":"float32",
        "AMT_PAYMENT":"float32",
        "DAYS_INSTALMENT":"float32",
        "DAYS_ENTRY_PAYMENT":"float32",
    }
)

inst["PAYMENT_PERC"] = inst["AMT_PAYMENT"] / (inst["AMT_INSTALMENT"] + 1e-9)
inst["PAYMENT_DIFF"] = inst["AMT_INSTALMENT"] - inst["AMT_PAYMENT"]
inst["DAYS_LATE"] = inst["DAYS_ENTRY_PAYMENT"] - inst["DAYS_INSTALMENT"]
inst["LATE_FLAG"] = (inst["DAYS_LATE"] > 0).astype(np.int8)

inst_prev = inst.groupby("SK_ID_PREV").agg(
    inst_count=("AMT_PAYMENT","count"),
    pay_perc_mean=("PAYMENT_PERC","mean"),
    pay_diff_mean=("PAYMENT_DIFF","mean"),
    days_late_mean=("DAYS_LATE","mean"),
    days_late_max=("DAYS_LATE","max"),
    late_rate=("LATE_FLAG","mean"),
).reset_index()

prev_map = prev[["SK_ID_PREV","SK_ID_CURR"]]
inst_prev = inst_prev.merge(prev_map, on="SK_ID_PREV", how="left")

inst_agg = inst_prev.groupby("SK_ID_CURR").agg(
    inst_prev_loans=("SK_ID_PREV","nunique"),
    inst_count_sum=("inst_count","sum"),
    inst_pay_perc_mean=("pay_perc_mean","mean"),
    inst_pay_diff_mean=("pay_diff_mean","mean"),
    inst_days_late_mean=("days_late_mean","mean"),
    inst_days_late_max=("days_late_max","max"),
    inst_late_rate_mean=("late_rate","mean"),
).reset_index()

print(inst_agg.shape)


(336935, 8)


In [18]:
cc = pd.read_csv(
    RAW / "credit_card_balance.csv",
    usecols=[
        "SK_ID_PREV","MONTHS_BALANCE","AMT_BALANCE","AMT_CREDIT_LIMIT_ACTUAL",
        "AMT_DRAWINGS_ATM_CURRENT","AMT_DRAWINGS_CURRENT",
        "AMT_PAYMENT_TOTAL_CURRENT","AMT_INST_MIN_REGULARITY",
        "SK_DPD","SK_DPD_DEF"
    ]
)

cc["UTILIZATION"] = cc["AMT_BALANCE"] / (cc["AMT_CREDIT_LIMIT_ACTUAL"] + 1e-9)
cc["ATM_SHARE"] = cc["AMT_DRAWINGS_ATM_CURRENT"] / (cc["AMT_DRAWINGS_CURRENT"] + 1e-9)
cc["PAYMENT_TO_MIN"] = cc["AMT_PAYMENT_TOTAL_CURRENT"] / (cc["AMT_INST_MIN_REGULARITY"] + 1e-9)

cc_prev = cc.groupby("SK_ID_PREV").agg(
    cc_months=("MONTHS_BALANCE","count"),
    cc_util_mean=("UTILIZATION","mean"),
    cc_util_max=("UTILIZATION","max"),
    cc_dpd_mean=("SK_DPD","mean"),
    cc_dpd_max=("SK_DPD","max"),
    cc_dpd_def_mean=("SK_DPD_DEF","mean"),
    cc_atm_share_mean=("ATM_SHARE","mean"),
    cc_pay_to_min_mean=("PAYMENT_TO_MIN","mean"),
).reset_index()

cc_prev = cc_prev.merge(prev_map, on="SK_ID_PREV", how="left")

cc_agg = cc_prev.groupby("SK_ID_CURR").agg(
    cc_prev_loans=("SK_ID_PREV","nunique"),
    cc_months_sum=("cc_months","sum"),
    cc_util_mean=("cc_util_mean","mean"),
    cc_util_max=("cc_util_max","max"),
    cc_dpd_mean=("cc_dpd_mean","mean"),
    cc_dpd_max=("cc_dpd_max","max"),
    cc_dpd_def_mean=("cc_dpd_def_mean","mean"),
    cc_atm_share_mean=("cc_atm_share_mean","mean"),
    cc_pay_to_min_mean=("cc_pay_to_min_mean","mean"),
).reset_index()

print(cc_agg.shape)


(92447, 10)


In [19]:
train5 = train.merge(prev_agg, on="SK_ID_CURR", how="left") \
             .merge(bureau_agg, on="SK_ID_CURR", how="left") \
             .merge(inst_agg, on="SK_ID_CURR", how="left") \
             .merge(cc_agg, on="SK_ID_CURR", how="left")

test5  = test.merge(prev_agg, on="SK_ID_CURR", how="left") \
            .merge(bureau_agg, on="SK_ID_CURR", how="left") \
            .merge(inst_agg, on="SK_ID_CURR", how="left") \
            .merge(cc_agg, on="SK_ID_CURR", how="left")

print(train5.shape, test5.shape)


(307511, 157) (48744, 156)


In [20]:
OUT = Path("/content/drive/MyDrive/home_credit/processed")
OUT.mkdir(parents=True, exist_ok=True)

prev_agg.to_parquet(OUT/"prev_agg.parquet", index=False)
bureau_agg.to_parquet(OUT/"bureau_agg.parquet", index=False)
inst_agg.to_parquet(OUT/"inst_agg.parquet", index=False)
cc_agg.to_parquet(OUT/"cc_agg.parquet", index=False)

print("Saved aggs to:", OUT)


Saved aggs to: /content/drive/MyDrive/home_credit/processed


In [21]:
OUT = Path("/content/drive/MyDrive/home_credit/processed")
OUT.mkdir(parents=True, exist_ok=True)

prev_agg.to_parquet(OUT/"prev_agg.parquet", index=False)
bureau_agg.to_parquet(OUT/"bureau_agg.parquet", index=False)
inst_agg.to_parquet(OUT/"inst_agg.parquet", index=False)
cc_agg.to_parquet(OUT/"cc_agg.parquet", index=False)

print("Saved aggs to:", OUT)


Saved aggs to: /content/drive/MyDrive/home_credit/processed


In [22]:
from google.colab import drive
drive.mount("/content/drive")

from pathlib import Path
BASE = Path("/content/drive/MyDrive")
RAW = list(BASE.rglob("application_train.csv"))[0].parent
print("✅ RAW =", RAW)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ RAW = /content/drive/MyDrive/home_credit/raw/raw


In [23]:
import pandas as pd
train = pd.read_csv(RAW / "application_train.csv")
test  = pd.read_csv(RAW / "application_test.csv")
print(train.shape, test.shape)


(307511, 122) (48744, 121)


In [24]:
import numpy as np

# ---- prev_agg ----
prev = pd.read_csv(RAW / "previous_application.csv",
                   usecols=["SK_ID_CURR","SK_ID_PREV","NAME_CONTRACT_STATUS","AMT_APPLICATION","AMT_CREDIT","AMT_ANNUITY","DAYS_DECISION"])
prev["PREV_APPROVED"] = (prev["NAME_CONTRACT_STATUS"] == "Approved").astype(np.int8)
prev["PREV_REFUSED"]  = (prev["NAME_CONTRACT_STATUS"] == "Refused").astype(np.int8)
prev["APP_CREDIT_RATIO"] = prev["AMT_APPLICATION"]/(prev["AMT_CREDIT"]+1e-9)

prev_agg = prev.groupby("SK_ID_CURR").agg(
    prev_count=("SK_ID_PREV","nunique"),
    prev_approved_rate=("PREV_APPROVED","mean"),
    prev_refused_rate=("PREV_REFUSED","mean"),
    prev_amt_credit_mean=("AMT_CREDIT","mean"),
    prev_amt_annuity_mean=("AMT_ANNUITY","mean"),
    prev_app_credit_ratio_mean=("APP_CREDIT_RATIO","mean"),
    prev_days_decision_max=("DAYS_DECISION","max"),
).reset_index()
prev_agg["prev_time_since_last_app"] = -prev_agg["prev_days_decision_max"]

# ---- bureau_agg ----
bureau = pd.read_csv(RAW / "bureau.csv",
                     usecols=["SK_ID_CURR","SK_ID_BUREAU","CREDIT_ACTIVE","AMT_CREDIT_SUM","AMT_CREDIT_SUM_DEBT","AMT_CREDIT_SUM_OVERDUE","DAYS_CREDIT"])
bb = pd.read_csv(RAW / "bureau_balance.csv", usecols=["SK_ID_BUREAU","MONTHS_BALANCE","STATUS"])
status_map = {"C":0,"0":0,"1":1,"2":2,"3":3,"4":4,"5":5,"X":0}
bb["STATUS_NUM"] = bb["STATUS"].map(status_map).fillna(0).astype(np.int8)

bb_agg = bb.groupby("SK_ID_BUREAU").agg(
    bb_months=("MONTHS_BALANCE","count"),
    bb_status_worst=("STATUS_NUM","max"),
).reset_index()

bureau2 = bureau.merge(bb_agg, on="SK_ID_BUREAU", how="left")
bureau2["BUREAU_ACTIVE"] = (bureau2["CREDIT_ACTIVE"] == "Active").astype(np.int8)
bureau2["DEBT_CREDIT_RATIO"] = bureau2["AMT_CREDIT_SUM_DEBT"]/(bureau2["AMT_CREDIT_SUM"]+1e-9)

bureau_agg = bureau2.groupby("SK_ID_CURR").agg(
    bureau_count=("SK_ID_BUREAU","nunique"),
    bureau_active_rate=("BUREAU_ACTIVE","mean"),
    bureau_credit_sum=("AMT_CREDIT_SUM","sum"),
    bureau_debt_sum=("AMT_CREDIT_SUM_DEBT","sum"),
    bureau_overdue_sum=("AMT_CREDIT_SUM_OVERDUE","sum"),
    bureau_debt_credit_ratio_mean=("DEBT_CREDIT_RATIO","mean"),
    bb_worst_max=("bb_status_worst","max"),
    bb_months_sum=("bb_months","sum"),
).reset_index()

# ---- installments -> inst_agg ----
inst = pd.read_csv(RAW / "installments_payments.csv",
                   usecols=["SK_ID_PREV","AMT_INSTALMENT","AMT_PAYMENT","DAYS_INSTALMENT","DAYS_ENTRY_PAYMENT"],
                   dtype={"SK_ID_PREV":"int32","AMT_INSTALMENT":"float32","AMT_PAYMENT":"float32","DAYS_INSTALMENT":"float32","DAYS_ENTRY_PAYMENT":"float32"})
inst["PAYMENT_PERC"] = inst["AMT_PAYMENT"]/(inst["AMT_INSTALMENT"]+1e-9)
inst["PAYMENT_DIFF"] = inst["AMT_INSTALMENT"]-inst["AMT_PAYMENT"]
inst["DAYS_LATE"] = inst["DAYS_ENTRY_PAYMENT"]-inst["DAYS_INSTALMENT"]
inst["LATE_FLAG"] = (inst["DAYS_LATE"]>0).astype(np.int8)

inst_prev = inst.groupby("SK_ID_PREV").agg(
    inst_count=("AMT_PAYMENT","count"),
    pay_perc_mean=("PAYMENT_PERC","mean"),
    pay_diff_mean=("PAYMENT_DIFF","mean"),
    days_late_mean=("DAYS_LATE","mean"),
    days_late_max=("DAYS_LATE","max"),
    late_rate=("LATE_FLAG","mean"),
).reset_index()

prev_map = prev[["SK_ID_PREV","SK_ID_CURR"]]
inst_prev = inst_prev.merge(prev_map, on="SK_ID_PREV", how="left")

inst_agg = inst_prev.groupby("SK_ID_CURR").agg(
    inst_prev_loans=("SK_ID_PREV","nunique"),
    inst_count_sum=("inst_count","sum"),
    inst_pay_perc_mean=("pay_perc_mean","mean"),
    inst_pay_diff_mean=("pay_diff_mean","mean"),
    inst_days_late_mean=("days_late_mean","mean"),
    inst_days_late_max=("days_late_max","max"),
    inst_late_rate_mean=("late_rate","mean"),
).reset_index()

# ---- credit card -> cc_agg ----
cc = pd.read_csv(RAW / "credit_card_balance.csv",
                 usecols=["SK_ID_PREV","MONTHS_BALANCE","AMT_BALANCE","AMT_CREDIT_LIMIT_ACTUAL",
                          "AMT_DRAWINGS_ATM_CURRENT","AMT_DRAWINGS_CURRENT",
                          "AMT_PAYMENT_TOTAL_CURRENT","AMT_INST_MIN_REGULARITY","SK_DPD","SK_DPD_DEF"])
cc["UTILIZATION"] = cc["AMT_BALANCE"]/(cc["AMT_CREDIT_LIMIT_ACTUAL"]+1e-9)
cc["ATM_SHARE"] = cc["AMT_DRAWINGS_ATM_CURRENT"]/(cc["AMT_DRAWINGS_CURRENT"]+1e-9)
cc["PAYMENT_TO_MIN"] = cc["AMT_PAYMENT_TOTAL_CURRENT"]/(cc["AMT_INST_MIN_REGULARITY"]+1e-9)

cc_prev = cc.groupby("SK_ID_PREV").agg(
    cc_months=("MONTHS_BALANCE","count"),
    cc_util_mean=("UTILIZATION","mean"),
    cc_util_max=("UTILIZATION","max"),
    cc_dpd_mean=("SK_DPD","mean"),
    cc_dpd_max=("SK_DPD","max"),
    cc_dpd_def_mean=("SK_DPD_DEF","mean"),
    cc_atm_share_mean=("ATM_SHARE","mean"),
    cc_pay_to_min_mean=("PAYMENT_TO_MIN","mean"),
).reset_index()

cc_prev = cc_prev.merge(prev_map, on="SK_ID_PREV", how="left")
cc_agg = cc_prev.groupby("SK_ID_CURR").agg(
    cc_prev_loans=("SK_ID_PREV","nunique"),
    cc_months_sum=("cc_months","sum"),
    cc_util_mean=("cc_util_mean","mean"),
    cc_util_max=("cc_util_max","max"),
    cc_dpd_mean=("cc_dpd_mean","mean"),
    cc_dpd_max=("cc_dpd_max","max"),
    cc_dpd_def_mean=("cc_dpd_def_mean","mean"),
    cc_atm_share_mean=("cc_atm_share_mean","mean"),
    cc_pay_to_min_mean=("cc_pay_to_min_mean","mean"),
).reset_index()

# ---- merge final ----
train5 = train.merge(prev_agg, on="SK_ID_CURR", how="left") \
             .merge(bureau_agg, on="SK_ID_CURR", how="left") \
             .merge(inst_agg, on="SK_ID_CURR", how="left") \
             .merge(cc_agg, on="SK_ID_CURR", how="left")

test5  = test.merge(prev_agg, on="SK_ID_CURR", how="left") \
            .merge(bureau_agg, on="SK_ID_CURR", how="left") \
            .merge(inst_agg, on="SK_ID_CURR", how="left") \
            .merge(cc_agg, on="SK_ID_CURR", how="left")

print("✅ train5/test5:", train5.shape, test5.shape)


✅ train5/test5: (307511, 154) (48744, 153)


In [25]:
!pip -q install lightgbm


In [26]:
import numpy as np
import pandas as pd

X = train5.drop(columns=["TARGET","SK_ID_CURR"])
y = train5["TARGET"].astype(int).values
X_test = test5.drop(columns=["SK_ID_CURR"])

# One-hot encode all at once so columns match
X_all = pd.get_dummies(pd.concat([X, X_test], axis=0), dummy_na=True)
X_enc = X_all.iloc[:len(X)]
X_test_enc = X_all.iloc[len(X):]

print("Encoded shapes:", X_enc.shape, X_test_enc.shape)


Encoded shapes: (307511, 292) (48744, 292)


In [27]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

params = {
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.03,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "min_data_in_leaf": 60,
    "verbosity": -1,
}

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(X_enc))
test_pred = np.zeros(len(X_test_enc))

for fold, (tr_idx, va_idx) in enumerate(folds.split(X_enc, y), 1):
    X_tr, X_va = X_enc.iloc[tr_idx], X_enc.iloc[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dvalid = lgb.Dataset(X_va, label=y_va)

    model = lgb.train(
        params,
        dtrain,
        num_boost_round=4000,
        valid_sets=[dvalid],
        callbacks=[lgb.early_stopping(150), lgb.log_evaluation(200)]
    )

    oof[va_idx] = model.predict(X_va, num_iteration=model.best_iteration)
    test_pred += model.predict(X_test_enc, num_iteration=model.best_iteration) / folds.n_splits

    print(f"Fold {fold} AUC:", round(roc_auc_score(y_va, oof[va_idx]), 5))

print("CV AUC:", round(roc_auc_score(y, oof), 6))


LightGBMError: Do not support special JSON characters in feature name.

In [28]:
import re
import pandas as pd

# One-hot encode all at once so columns match
X = train5.drop(columns=["TARGET","SK_ID_CURR"])
y = train5["TARGET"].astype(int).values
X_test = test5.drop(columns=["SK_ID_CURR"])

X_all = pd.get_dummies(pd.concat([X, X_test], axis=0), dummy_na=True)

# --- CLEAN FEATURE NAMES FOR LIGHTGBM ---
def make_unique(names):
    seen = {}
    out = []
    for n in names:
        if n not in seen:
            seen[n] = 0
            out.append(n)
        else:
            seen[n] += 1
            out.append(f"{n}_{seen[n]}")
    return out

clean = [re.sub(r"[^0-9A-Za-z_]+", "_", str(c)) for c in X_all.columns]
clean = make_unique(clean)
X_all.columns = clean

# split back
X_enc = X_all.iloc[:len(X)].astype("float32")
X_test_enc = X_all.iloc[len(X):].astype("float32")

print("Encoded shapes:", X_enc.shape, X_test_enc.shape)
print("Example cols:", X_enc.columns[:10].tolist())


Encoded shapes: (307511, 292) (48744, 292)
Example cols: ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH']


In [29]:
import re
import pandas as pd
import numpy as np

X = train5.drop(columns=["TARGET","SK_ID_CURR"])
y = train5["TARGET"].astype(int).values
X_test = test5.drop(columns=["SK_ID_CURR"])

X_all = pd.get_dummies(pd.concat([X, X_test], axis=0), dummy_na=True)

def make_unique(names):
    seen = {}
    out = []
    for n in names:
        if n not in seen:
            seen[n] = 0
            out.append(n)
        else:
            seen[n] += 1
            out.append(f"{n}_{seen[n]}")
    return out


In [30]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

params = {
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.03,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "min_data_in_leaf": 60,
    "verbosity": -1,
}

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(X_enc))
test_pred = np.zeros(len(X_test_enc))

for fold, (tr_idx, va_idx) in enumerate(folds.split(X_enc, y), 1):
    X_tr, X_va = X_enc.iloc[tr_idx], X_enc.iloc[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dvalid = lgb.Dataset(X_va, label=y_va)

    model = lgb.train(
        params,
        dtrain,
        num_boost_round=4000,
        valid_sets=[dvalid],
        callbacks=[lgb.early_stopping(150), lgb.log_evaluation(200)]
    )

    oof[va_idx] = model.predict(X_va, num_iteration=model.best_iteration)
    test_pred += model.predict(X_test_enc, num_iteration=model.best_iteration) / folds.n_splits

    print(f"Fold {fold} AUC:", round(roc_auc_score(y_va, oof[va_idx]), 5))

print("CV AUC:", round(roc_auc_score(y, oof), 6))


Training until validation scores don't improve for 150 rounds
[200]	valid_0's auc: 0.770836
[400]	valid_0's auc: 0.774635
[600]	valid_0's auc: 0.775273
Early stopping, best iteration is:
[513]	valid_0's auc: 0.775516
Fold 1 AUC: 0.77552
Training until validation scores don't improve for 150 rounds
[200]	valid_0's auc: 0.77874
[400]	valid_0's auc: 0.783176
Early stopping, best iteration is:
[430]	valid_0's auc: 0.783477
Fold 2 AUC: 0.78348
Training until validation scores don't improve for 150 rounds
[200]	valid_0's auc: 0.772409
[400]	valid_0's auc: 0.776467
[600]	valid_0's auc: 0.776906
Early stopping, best iteration is:
[645]	valid_0's auc: 0.777116
Fold 3 AUC: 0.77712
Training until validation scores don't improve for 150 rounds
[200]	valid_0's auc: 0.777801
[400]	valid_0's auc: 0.781523
[600]	valid_0's auc: 0.78221
Early stopping, best iteration is:
[608]	valid_0's auc: 0.78232
Fold 4 AUC: 0.78232
Training until validation scores don't improve for 150 rounds
[200]	valid_0's auc: 0.

In [31]:
print("Rows, cols:", X_enc.shape)


Rows, cols: (307511, 292)


In [32]:
import pandas as pd
from pathlib import Path

sub_lgb = pd.DataFrame({
    "SK_ID_CURR": test5["SK_ID_CURR"],
    "TARGET": test_pred
})

sub_path = "/content/submission_lgbm_final.csv"
sub_lgb.to_csv(sub_path, index=False)
print("Saved:", sub_path)
sub_lgb.head()


Saved: /content/submission_lgbm_final.csv


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.026931
1,100005,0.144264
2,100013,0.03373
3,100028,0.024793
4,100038,0.138358


In [33]:
out_dir = Path("/content/drive/MyDrive/home_credit/outputs")
out_dir.mkdir(parents=True, exist_ok=True)

drive_path = out_dir / "submission_lgbm_final.csv"
sub_lgb.to_csv(drive_path, index=False)
print("Saved to Drive:", drive_path)


Saved to Drive: /content/drive/MyDrive/home_credit/outputs/submission_lgbm_final.csv


In [34]:
from pathlib import Path

proc = Path("/content/drive/MyDrive/home_credit/processed")
proc.mkdir(parents=True, exist_ok=True)

train5.to_parquet(proc / "train_features.parquet", index=False)
test5.to_parquet(proc / "test_features.parquet", index=False)

print("Saved to:", proc)


Saved to: /content/drive/MyDrive/home_credit/processed


In [35]:
import pandas as pd

results = pd.DataFrame([
    ["LogReg (application only)", 0.72766],
    ["LogReg (+ prev agg)",       0.73830],
    ["LogReg (+ prev+bureau)",    0.74118],
    ["LogReg (+ prev+bureau+inst+cc)", 0.74968],
    ["LightGBM (final)",          0.78213],
], columns=["Model / Feature Block", "Kaggle Public AUC"])

results


Unnamed: 0,Model / Feature Block,Kaggle Public AUC
0,LogReg (application only),0.72766
1,LogReg (+ prev agg),0.7383
2,LogReg (+ prev+bureau),0.74118
3,LogReg (+ prev+bureau+inst+cc),0.74968
4,LightGBM (final),0.78213


In [36]:
results.to_csv("/content/drive/MyDrive/home_credit/outputs/results_log.csv", index=False)
print("Saved results_log.csv")


Saved results_log.csv


In [37]:
imp = pd.DataFrame({
    "feature": X_enc.columns,
    "gain": model.feature_importance(importance_type="gain")
}).sort_values("gain", ascending=False)

imp.head(30)
imp.to_csv("/content/drive/MyDrive/home_credit/outputs/lgbm_feature_importance.csv", index=False)
print("Saved feature importance")


Saved feature importance


In [38]:
from pathlib import Path
import pandas as pd

base = Path("/content/drive/MyDrive/home_credit")
(base / "outputs").mkdir(parents=True, exist_ok=True)
(base / "processed").mkdir(parents=True, exist_ok=True)

# Save final feature tables
train5.to_parquet(base / "processed/train_features.parquet", index=False)
test5.to_parquet(base / "processed/test_features.parquet", index=False)

# Save Kaggle submission file too (optional but nice)
sub_lgb.to_csv(base / "outputs/submission_lgbm_final.csv", index=False)

print("Saved to:", base)


Saved to: /content/drive/MyDrive/home_credit


In [39]:
results = pd.DataFrame([
    ["LogReg (application only)", 0.72766],
    ["LogReg (+ previous_application agg)", 0.73830],
    ["LogReg (+ bureau agg)", 0.74118],
    ["LogReg (+ installments + credit_card agg)", 0.74968],
    ["LightGBM (final)", 0.78213],
], columns=["Model / Feature Block", "Kaggle Public AUC"])

results.to_csv(base / "outputs/results_log.csv", index=False)
results


Unnamed: 0,Model / Feature Block,Kaggle Public AUC
0,LogReg (application only),0.72766
1,LogReg (+ previous_application agg),0.7383
2,LogReg (+ bureau agg),0.74118
3,LogReg (+ installments + credit_card agg),0.74968
4,LightGBM (final),0.78213


In [40]:
imp = pd.DataFrame({
    "feature": X_enc.columns,
    "gain": model.feature_importance(importance_type="gain")
}).sort_values("gain", ascending=False)

imp.head(30).to_csv(base / "outputs/top30_features.csv", index=False)
imp.head(30)


Unnamed: 0,feature,gain
29,EXT_SOURCE_3,93575.071725
28,EXT_SOURCE_2,90477.848548
27,EXT_SOURCE_1,38475.599513
117,bureau_debt_credit_ratio_mean,27494.087523
6,DAYS_BIRTH,27198.016232
2,AMT_CREDIT,21132.107211
108,prev_amt_annuity_mean,20876.886577
3,AMT_ANNUITY,20664.905138
109,prev_app_credit_ratio_mean,20330.548557
107,prev_amt_credit_mean,20101.829954


In [41]:
from pathlib import Path

PORT = Path("/content/drive/MyDrive/home_credit_portfolio")
(PORT / "notebooks").mkdir(parents=True, exist_ok=True)
(PORT / "outputs").mkdir(parents=True, exist_ok=True)
(PORT / "processed").mkdir(parents=True, exist_ok=True)

print("Created:", PORT)


Created: /content/drive/MyDrive/home_credit_portfolio


In [42]:
import pandas as pd
from pathlib import Path

PORT = Path("/content/drive/MyDrive/home_credit_portfolio")

# Save submission (your best one)
sub_lgb.to_csv(PORT / "outputs/submission_lgbm_final.csv", index=False)

# Save results story (edit scores if yours differ)
results = pd.DataFrame([
    ["LogReg (application only)", 0.72766],
    ["LogReg (+ prev agg)", 0.73830],
    ["LogReg (+ prev+bureau+inst+cc)", 0.74968],
    ["LightGBM (final)", 0.78213],
], columns=["Model / Feature Block", "Kaggle Public AUC"])
results.to_csv(PORT / "outputs/results_log.csv", index=False)

print("Saved outputs to:", PORT / "outputs")


Saved outputs to: /content/drive/MyDrive/home_credit_portfolio/outputs
