In [18]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input tree.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mai-ml-lab-2-308/test_c.csv
/kaggle/input/mai-ml-lab-2-308/train_c.csv
/kaggle/input/mai-ml-lab-2-308/ex_c.csv


In [19]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Competition inputs: the labelled training table and the table to be scored.
train_csv = "/kaggle/input/mai-ml-lab-2-308/train_c.csv"
test_csv  = "/kaggle/input/mai-ml-lab-2-308/test_c.csv"

# Training rows whose |target| exceeds this value are dropped at load time; None disables the filter.
abs_cap = 1000.0
# How many features (ranked by |correlation with the target|) receive polynomial and derived expansions.
top_k = 8
# Degree passed to PolynomialFeatures; it also gates the __pow3 / __pow4 derived features.
degree = 4
# NOTE(review): no cell visible in this notebook reads `alpha` — the Ridge penalty is chosen elsewhere. Confirm before relying on or removing it.
alpha = 6
# Symmetric clipping bound handed to make_safe for most derived features.
clip_val = 10.0
# Small positive constant used to keep denominators away from zero.
eps = 1e-6


In [20]:
# --- Load the training table and keep only rows with a usable target ---
target_col = "LoanApproved"
cat_cols = [
    "MaritalStatus",
    "HomeOwnershipStatus",
    "LoanPurpose",
    "EmploymentStatus",
    "EducationLevel",
]

df = pd.read_csv(train_csv)
# to_numeric runs with its default error handling, so a non-numeric label raises here.
df[target_col] = pd.to_numeric(df[target_col])

# Keep rows whose target is present, finite and (when abs_cap is set) within the cap.
target_values = df[target_col]
usable_rows = target_values.notna() & np.isfinite(target_values)
if abs_cap is not None:
    usable_rows = usable_rows & (target_values.abs() <= abs_cap)
df = df[usable_rows]

# Split into numeric predictors (target removed), the listed categorical columns, and the target vector.
X_numeric = df.select_dtypes(include=[np.number]).drop(columns=[target_col])
X_categorical = df[cat_cols].copy()
y = df[target_col].to_numpy(dtype=float, copy=True)

In [21]:
# Treat infinities as missing and force a float dtype before computing any statistics.
X_numeric = X_numeric.replace([np.inf, -np.inf], np.nan).astype(float)

# Per-column 1st / 99th percentiles of the training data; reused later to clip other data.
lower_q, upper_q = 0.01, 0.99
lower_bounds = X_numeric.quantile(lower_q)
upper_bounds = X_numeric.quantile(upper_q)

# Winsorize: pull finite values into [lower bound, upper bound]; NaNs are left for the imputer.
X_winsorized = X_numeric.copy()
for name in X_winsorized.columns:
    column = X_winsorized[name].to_numpy(dtype=float, copy=True)
    observed = np.isfinite(column)
    floor = lower_bounds.get(name, np.nan)
    ceiling = upper_bounds.get(name, np.nan)
    # A non-finite bound (e.g. from an all-NaN column) means "do not clip on that side".
    if np.isfinite(floor):
        column[observed] = np.maximum(column[observed], floor)
    if np.isfinite(ceiling):
        column[observed] = np.minimum(column[observed], ceiling)
    X_winsorized[name] = column

# Median-impute what is still missing, keeping the original labels.
imputer = SimpleImputer(strategy="median")
imputer.fit(X_winsorized)
X_imputed = pd.DataFrame(
    imputer.transform(X_winsorized),
    index=X_winsorized.index,
    columns=X_winsorized.columns,
)

# Standardize every numeric column using training statistics.
scaler = StandardScaler()
scaler.fit(X_imputed)
X_scaled = pd.DataFrame(
    scaler.transform(X_imputed),
    index=X_imputed.index,
    columns=X_imputed.columns,
)

X_scaled.head(2)

Unnamed: 0,Age,AnnualIncome,CreditScore,LoanAmount,LoanDuration,NumberOfDependents,MonthlyDebtPayments,CreditCardUtilizationRate,NumberOfOpenCreditLines,NumberOfCreditInquiries,...,TotalLiabilities,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,Experience,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio
0,-1.113466,-0.564809,-0.766337,-0.488753,0.267843,-0.400618,1.278852,-0.837264,-0.596791,0.024253,...,-0.412038,-0.56149,1.017308,-0.439187,-1.210193,-0.342669,0.643131,0.543748,-0.491378,-0.29162
1,1.320547,0.369619,0.991736,-0.537888,-0.712053,-0.400618,-0.727581,-0.686917,-0.016598,-0.997202,...,0.027228,0.373625,0.102751,-1.364223,1.372561,-0.371166,-1.242181,-1.121284,-0.486119,-0.624373


In [22]:
# Rank every scaled numeric feature by the absolute value of its correlation with the target.
correlation_matrix = X_scaled.assign(target=y).corr(numeric_only=True)
corr_with_target = (
    correlation_matrix["target"]
    .drop("target")
    .abs()
    .sort_values(ascending=False)
)

# The top_k strongest features get expanded; everything else enters the model unchanged.
top_features = list(corr_with_target.index[:top_k])
expanded_names = set(top_features)
base_features = [name for name in X_scaled.columns if name not in expanded_names]

# Full polynomial expansion (bias column included) of the selected features.
poly_transformer = PolynomialFeatures(degree=degree, include_bias=True)
poly_values = poly_transformer.fit_transform(X_scaled[top_features])
X_poly = pd.DataFrame(
    poly_values,
    index=X_scaled.index,
    columns=poly_transformer.get_feature_names_out(top_features),
)

X_poly.head(2)

Unnamed: 0,1,MonthlyIncome,AnnualIncome,InterestRate,BaseInterestRate,CreditScore,TotalDebtToIncomeRatio,MonthlyLoanPayment,LoanAmount,MonthlyIncome^2,...,TotalDebtToIncomeRatio^2 LoanAmount^2,TotalDebtToIncomeRatio MonthlyLoanPayment^3,TotalDebtToIncomeRatio MonthlyLoanPayment^2 LoanAmount,TotalDebtToIncomeRatio MonthlyLoanPayment LoanAmount^2,TotalDebtToIncomeRatio LoanAmount^3,MonthlyLoanPayment^4,MonthlyLoanPayment^3 LoanAmount,MonthlyLoanPayment^2 LoanAmount^2,MonthlyLoanPayment LoanAmount^3,LoanAmount^4
0,1.0,-0.56149,-0.564809,0.543748,0.643131,-0.766337,-0.29162,-0.491378,-0.488753,0.315271,...,0.020315,0.034599,0.034414,0.03423,0.034048,0.058299,0.057988,0.057678,0.05737,0.057063
1,1.0,0.373625,0.369619,-1.121284,-1.242181,0.991736,-0.624373,-0.486119,-0.537888,0.139596,...,0.112791,0.071725,0.079364,0.087816,0.097167,0.055843,0.06179,0.068371,0.075652,0.083708


In [23]:
def make_safe(arr, clip_limit=None):
    """Return a float copy of ``arr`` with non-finite entries zeroed and values optionally clipped.

    Parameters
    ----------
    arr : array-like
        Values to sanitize. The input is never modified, so float arrays owned by
        the caller (including read-only ones) can be passed safely.
    clip_limit : float or None, optional
        When given, the result is clipped to ``[-clip_limit, clip_limit]``.

    Returns
    -------
    numpy.ndarray
        New float array in which NaN, +inf and -inf are replaced by 0.0.
    """
    values = np.asarray(arr, dtype=float)
    # np.where allocates a new array; assigning through a boolean mask would instead
    # write into the caller's buffer whenever asarray returned the input unchanged.
    cleaned = np.where(np.isfinite(values), values, 0.0)
    if clip_limit is not None:
        cleaned = np.clip(cleaned, -clip_limit, clip_limit)
    return cleaned

# Aliases of the configured clip bound and division guard used by the feature code below.
clip_limit = clip_val
epsilon = eps

# Per-feature (mean, std) of the imputed training values; a std that is not > 0 becomes epsilon.
feature_stats = {}
for name in top_features:
    column = X_imputed[name].to_numpy(dtype=float)
    spread = float(np.nanstd(column))
    feature_stats[name] = (float(np.nanmean(column)), spread if spread > 0 else epsilon)

feature_list = list(top_features)
derived_features = {}

# --- Single-feature transforms (insertion order determines the column order of derived_df) ---
for name in feature_list:
    raw = X_imputed[name].to_numpy(dtype=float)
    center, spread = feature_stats[name]
    if not spread > 0:
        spread = epsilon
    scale = spread + epsilon
    positive_part = np.clip(raw, 0, None)

    # The square is always produced; cube and fourth power only when `degree` allows it.
    for exponent in (2, 3, 4):
        if exponent == 2 or degree >= exponent:
            derived_features[f"{name}__pow{exponent}"] = make_safe(np.power(raw, exponent), clip_limit)

    derived_features[f"{name}__abs"] = make_safe(np.abs(raw), clip_limit)
    derived_features[f"{name}__sqrt"] = make_safe(np.sqrt(positive_part), clip_limit)
    derived_features[f"{name}__log1p_pos"] = make_safe(np.log1p(positive_part), clip_limit)
    derived_features[f"{name}__slog1p"] = make_safe(np.sign(raw) * np.log1p(np.abs(raw)), clip_limit)
    # Exact zeros are swapped for epsilon so the reciprocal stays finite before clipping.
    derived_features[f"{name}__recip"] = make_safe(1.0 / np.where(raw == 0.0, epsilon, raw), clip_limit)

    deviation = make_safe(raw - center, clip_limit)
    derived_features[f"{name}__z"] = make_safe((raw - center) / scale, clip_limit)
    derived_features[f"{name}__ctr"] = deviation
    derived_features[f"{name}__ctr2"] = make_safe(deviation**2, clip_limit)
    derived_features[f"{name}__tanh"] = np.tanh(raw / scale)

# --- Pairwise interactions for every unordered pair of the selected features ---
for first, second in combinations(feature_list, 2):
    left = X_imputed[first].to_numpy(dtype=float)
    right = X_imputed[second].to_numpy(dtype=float)
    z_left = derived_features[f"{first}__z"]
    z_right = derived_features[f"{second}__z"]
    log_left = derived_features[f"{first}__log1p_pos"]
    log_right = derived_features[f"{second}__log1p_pos"]
    pos_left = np.clip(left, 0, None)
    pos_right = np.clip(right, 0, None)

    derived_features.update({
        # Arithmetic on the raw (imputed) values.
        f"{first}__x__{second}": make_safe(left * right, clip_limit),
        f"{first}__plus__{second}": make_safe(left + right, clip_limit),
        f"{first}__minus__{second}": make_safe(left - right, clip_limit),
        f"{second}__minus__{first}": make_safe(right - left, clip_limit),
        f"{first}__div__{second}": make_safe(left / np.where(right == 0.0, epsilon, right), clip_limit),
        f"{second}__div__{first}": make_safe(right / np.where(left == 0.0, epsilon, left), clip_limit),
        # Interactions on the z-scores, plus bounded and order-statistic variants.
        f"{first}__z_x__{second}__z": make_safe(z_left * z_right, clip_limit),
        f"{first}__z_div__{second}__z_tanh": np.tanh(z_left / (np.abs(z_right) + epsilon)),
        f"{first}__rel_diff__{second}": make_safe(
            (left - right) / (np.abs(left) + np.abs(right) + epsilon), 1.0
        ),
        f"{first}__min__{second}": np.minimum(left, right),
        f"{first}__max__{second}": np.maximum(left, right),
        f"{first}__log_x__{second}__log": make_safe(log_left * log_right, clip_limit),
        # Geometric / harmonic means of the non-negative parts.
        f"{first}__gmean__{second}": make_safe(np.sqrt(pos_left * pos_right), clip_limit),
        f"{first}__hmean__{second}": make_safe(
            2.0 * pos_left * pos_right / (pos_left + pos_right + epsilon), clip_limit
        ),
        # Third-order z-score interactions.
        f"{first}2__x__{second}__z": make_safe((z_left * z_left) * z_right, clip_limit),
        f"{first}__x__{second}2__z": make_safe(z_left * (z_right * z_right), clip_limit),
    })

# Collect everything into a frame and standardize it; the column list and the fitted
# scaler are kept as module-level names for reuse.
derived_df = pd.DataFrame(derived_features, index=X_imputed.index)
derived_cols = list(derived_df.columns)

derived_scaler = StandardScaler()
derived_scaled = pd.DataFrame(
    derived_scaler.fit_transform(derived_df),
    index=derived_df.index,
    columns=derived_cols,
)

In [24]:
from sklearn.linear_model import Ridge
from sklearn.metrics import (
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    average_precision_score
)
from sklearn.model_selection import GridSearchCV

# One-hot encode the categorical columns; the resulting column list is kept as the dummy schema.
ohe_train = pd.get_dummies(X_categorical, drop_first=True)
ohe_cols = list(ohe_train.columns)

# Training design matrix: untouched scaled features, polynomial terms, scaled derived
# features and dummies, each re-indexed positionally before being joined side by side.
train_blocks = (X_scaled[base_features], X_poly, derived_scaled, ohe_train)
design_X = pd.concat(
    [block.reset_index(drop=True) for block in train_blocks],
    axis=1,
)

# Tune the Ridge penalty with 5-fold cross-validation, scored by ROC-AUC.
# NOTE(review): the winsorizing, imputation, scaling and feature ranking that produced
# design_X were fitted on all training rows before this split, so fold scores may be
# optimistic — confirm whether that matters for this task.
ridge = Ridge(random_state=42)
param_grid = {"alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]}
grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(design_X, y)

print("Лучшие параметры:", grid_search.best_params_)
print(f"Лучший ROC-AUC (CV): {grid_search.best_score_:.6f}")

# Score the training rows with the refitted best estimator; regression outputs are
# clipped into [0, 1] and thresholded at 0.5 to obtain hard labels.
best_model = grid_search.best_estimator_
y_scores = best_model.predict(design_X)
y_proba = np.clip(y_scores, 0.0, 1.0)
y_pred = (y_proba >= 0.5).astype(int)

# These metrics are computed on the same rows the model was fitted on.
roc_auc = roc_auc_score(y, y_proba)
pr_auc = average_precision_score(y, y_proba)
precision = precision_score(y, y_pred, zero_division=0)
recall = recall_score(y, y_pred, zero_division=0)
f1 = f1_score(y, y_pred, zero_division=0)

metric_report = (
    ("ROC-AUC", roc_auc),
    ("PR-AUC", pr_auc),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
)
for label, value in metric_report:
    print(f"{label}: {value:.6f}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Лучшие параметры: {'alpha': 100.0}
Лучший ROC-AUC (CV): 0.980005
ROC-AUC: 0.982617
PR-AUC: 0.984045
Precision: 0.928301
Recall: 0.914291
F1-score: 0.921243


In [25]:
df_test = pd.read_csv(test_csv)

# Align the numeric test columns with the training numeric columns (absent ones become NaN),
# then treat infinities as missing and force a float dtype.
X_numeric_test = (
    df_test.select_dtypes(include=[np.number])
    .reindex(columns=X_numeric.columns)
    .replace([np.inf, -np.inf], np.nan)
    .astype(float)
)
X_categorical_test = df_test[cat_cols].copy()

# Clip finite values with the percentile bounds learned from the training data.
X_winsorized_test = X_numeric_test.copy()
for name in X_winsorized_test.columns:
    column = X_winsorized_test[name].to_numpy(dtype=float, copy=True)
    observed = np.isfinite(column)
    floor = lower_bounds.get(name, np.nan)
    ceiling = upper_bounds.get(name, np.nan)
    # A non-finite bound means "do not clip on that side".
    if np.isfinite(floor):
        column[observed] = np.maximum(column[observed], floor)
    if np.isfinite(ceiling):
        column[observed] = np.minimum(column[observed], ceiling)
    X_winsorized_test[name] = column

# Apply the already-fitted imputer and scaler; nothing is refitted on the test rows.
X_imputed_test = pd.DataFrame(
    imputer.transform(X_winsorized_test),
    index=X_winsorized_test.index,
    columns=X_winsorized_test.columns,
)

X_scaled_test = pd.DataFrame(
    scaler.transform(X_imputed_test),
    index=X_imputed_test.index,
    columns=X_imputed_test.columns,
)

In [26]:
# Polynomial expansion of the test rows using the transformer fitted on the training data.
poly_values_test = poly_transformer.transform(X_scaled_test[top_features])
X_poly_test = pd.DataFrame(
    poly_values_test,
    index=X_scaled_test.index,
    columns=poly_transformer.get_feature_names_out(top_features),
)

feature_list = list(top_features)
derived_test = {}

# --- Single-feature transforms, centred and scaled with the stored training statistics ---
for name in feature_list:
    raw = X_imputed_test[name].to_numpy(dtype=float)
    center, spread = feature_stats[name]
    if not spread > 0:
        spread = epsilon
    scale = spread + epsilon
    positive_part = np.clip(raw, 0, None)

    # The square is always produced; cube and fourth power only when `degree` allows it.
    for exponent in (2, 3, 4):
        if exponent == 2 or degree >= exponent:
            derived_test[f"{name}__pow{exponent}"] = make_safe(np.power(raw, exponent), clip_limit)

    derived_test[f"{name}__abs"] = make_safe(np.abs(raw), clip_limit)
    derived_test[f"{name}__sqrt"] = make_safe(np.sqrt(positive_part), clip_limit)
    derived_test[f"{name}__log1p_pos"] = make_safe(np.log1p(positive_part), clip_limit)
    derived_test[f"{name}__slog1p"] = make_safe(np.sign(raw) * np.log1p(np.abs(raw)), clip_limit)
    # Exact zeros are swapped for epsilon so the reciprocal stays finite before clipping.
    derived_test[f"{name}__recip"] = make_safe(1.0 / np.where(raw == 0.0, epsilon, raw), clip_limit)

    deviation = make_safe(raw - center, clip_limit)
    derived_test[f"{name}__z"] = make_safe((raw - center) / scale, clip_limit)
    derived_test[f"{name}__ctr"] = deviation
    derived_test[f"{name}__ctr2"] = make_safe(deviation**2, clip_limit)
    derived_test[f"{name}__tanh"] = np.tanh(raw / scale)

# --- Pairwise interactions for every unordered pair of the selected features ---
for first, second in combinations(feature_list, 2):
    left = X_imputed_test[first].to_numpy(dtype=float)
    right = X_imputed_test[second].to_numpy(dtype=float)
    z_left = derived_test[f"{first}__z"]
    z_right = derived_test[f"{second}__z"]
    log_left = derived_test[f"{first}__log1p_pos"]
    log_right = derived_test[f"{second}__log1p_pos"]
    pos_left = np.clip(left, 0, None)
    pos_right = np.clip(right, 0, None)

    derived_test.update({
        # Arithmetic on the raw (imputed) values.
        f"{first}__x__{second}": make_safe(left * right, clip_limit),
        f"{first}__plus__{second}": make_safe(left + right, clip_limit),
        f"{first}__minus__{second}": make_safe(left - right, clip_limit),
        f"{second}__minus__{first}": make_safe(right - left, clip_limit),
        f"{first}__div__{second}": make_safe(left / np.where(right == 0.0, epsilon, right), clip_limit),
        f"{second}__div__{first}": make_safe(right / np.where(left == 0.0, epsilon, left), clip_limit),
        # Interactions on the z-scores, plus bounded and order-statistic variants.
        f"{first}__z_x__{second}__z": make_safe(z_left * z_right, clip_limit),
        f"{first}__z_div__{second}__z_tanh": np.tanh(z_left / (np.abs(z_right) + epsilon)),
        f"{first}__rel_diff__{second}": make_safe(
            (left - right) / (np.abs(left) + np.abs(right) + epsilon), 1.0
        ),
        f"{first}__min__{second}": np.minimum(left, right),
        f"{first}__max__{second}": np.maximum(left, right),
        f"{first}__log_x__{second}__log": make_safe(log_left * log_right, clip_limit),
        # Geometric / harmonic means of the non-negative parts.
        f"{first}__gmean__{second}": make_safe(np.sqrt(pos_left * pos_right), clip_limit),
        f"{first}__hmean__{second}": make_safe(
            2.0 * pos_left * pos_right / (pos_left + pos_right + epsilon), clip_limit
        ),
        # Third-order z-score interactions.
        f"{first}2__x__{second}__z": make_safe((z_left * z_left) * z_right, clip_limit),
        f"{first}__x__{second}2__z": make_safe(z_left * (z_right * z_right), clip_limit),
    })

# Force the training column layout (any column missing here is filled with 0.0),
# then standardize with the scaler fitted on the training derived features.
derived_test_df = pd.DataFrame(derived_test, index=X_imputed_test.index)
derived_test_df = derived_test_df.reindex(columns=derived_cols, fill_value=0.0)
derived_test_scaled = pd.DataFrame(
    derived_scaler.transform(derived_test_df),
    index=derived_test_df.index,
    columns=derived_cols,
)


In [27]:
from sklearn.linear_model import Ridge

# Take the penalty straight from the grid search rather than a hand-copied literal, so this
# cell cannot drift out of sync with the search when the data or the parameter grid changes.
alpha_final = float(grid_search.best_params_["alpha"])

# Refit on the full training design matrix with the selected penalty.
ridge_final = Ridge(alpha=alpha_final, random_state=42)
ridge_final.fit(design_X, y)

# Encode the test categoricals and force the training dummy layout: categories unseen in
# training are dropped, and dummies absent from the test rows are filled with 0.
ohe_test = pd.get_dummies(X_categorical_test, drop_first=True).reindex(columns=ohe_cols, fill_value=0)

# Assemble the test design matrix in the same block order as the training one.
design_X_test = pd.concat(
    [
        X_scaled_test[base_features].reset_index(drop=True),
        X_poly_test.reset_index(drop=True),
        derived_test_scaled.reset_index(drop=True),
        ohe_test.reset_index(drop=True),
    ],
    axis=1
)

y_pred_test_scores = ridge_final.predict(design_X_test)

# Ridge is a regressor, so its outputs are clipped into [0, 1] before being written out.
y_pred_test = np.clip(y_pred_test_scores, 0.0, 1.0)

submission = pd.DataFrame({"prediction": y_pred_test})

# Carry the row identifier through when the test file provides one.
if "ID" in df_test.columns:
    submission.insert(0, "ID", df_test["ID"].values)

submission.to_csv("predictions.csv", index=False)
submission.head()


Unnamed: 0,ID,prediction
0,0,0.948366
1,1,0.04768
2,2,0.967371
3,3,1.0
4,4,1.0
