In [12]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Input locations (Kaggle dataset mount).
train_csv = "/kaggle/input/mai-ml-lab-1-308/train.csv"
test_csv  = "/kaggle/input/mai-ml-lab-1-308/test.csv"

# Tunable settings read by the cells below.
abs_cap = 1000.0   # training rows with |RiskScore| above this are dropped; None disables the filter
top_k = 8          # number of features (ranked by |correlation| with the target) that get expanded
degree = 4         # PolynomialFeatures degree; also gates the __pow3 / __pow4 derived features
alpha = 6          # Ridge regularization strength
clip_val = 10.0    # symmetric clip limit passed to make_safe for most derived features
eps = 1e-6         # small constant used to guard divisions and zero standard deviations


In [13]:
# Load the training table and keep only rows with a usable target.
df = pd.read_csv(train_csv)

target_col = "RiskScore"

# errors="coerce" turns unparsable target values into NaN so that the dropna
# below removes them; without it a single malformed value raises and aborts
# the cell before the cleaning step can run.
df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
df = df.dropna(subset=[target_col])

# Drop +/-inf targets (NaN was already removed above).
df = df[np.isfinite(df[target_col])]

# Optional outlier filter on the target magnitude.
if abs_cap is not None:
    df = df[df[target_col].abs() <= abs_cap]

# Categorical columns that are one-hot encoded later.
cat_cols = [
    "MaritalStatus",
    "HomeOwnershipStatus",
    "LoanPurpose",
    "EmploymentStatus",
    "EducationLevel",
]

# Numeric predictors (target excluded) and the raw categorical predictors.
X_numeric = df.select_dtypes(include=[np.number]).drop(columns=[target_col])
X_categorical = df[cat_cols].copy()

# Target as a plain float array, positionally aligned with the rows of X_numeric.
y = df[target_col].astype(float).values

In [14]:
# Treat infinities as missing so they are imputed together with the NaNs.
X_numeric = X_numeric.replace([np.inf, -np.inf], np.nan).astype(float)

# Per-column winsorization bounds, learned on the training data.
lower_q, upper_q = 0.01, 0.99
lower_bounds = X_numeric.quantile(lower_q)
upper_bounds = X_numeric.quantile(upper_q)

# Clamp every finite value into [lower bound, upper bound]; missing values stay missing.
X_winsorized = X_numeric.copy()
for name in X_winsorized.columns:
    clamped = X_winsorized[name].to_numpy(dtype=float, copy=True)
    finite_mask = np.isfinite(clamped)
    floor = lower_bounds.get(name, np.nan)
    ceiling = upper_bounds.get(name, np.nan)
    if np.isfinite(floor):
        clamped[finite_mask] = np.maximum(clamped[finite_mask], floor)
    if np.isfinite(ceiling):
        clamped[finite_mask] = np.minimum(clamped[finite_mask], ceiling)
    X_winsorized[name] = clamped

# Median imputation fitted on the winsorized training data.
imputer = SimpleImputer(strategy="median")
imputer.fit(X_winsorized)
X_imputed = pd.DataFrame(
    imputer.transform(X_winsorized),
    index=X_winsorized.index,
    columns=X_winsorized.columns,
)

# Standardization fitted on the imputed training data.
scaler = StandardScaler()
scaler.fit(X_imputed)
X_scaled = pd.DataFrame(
    scaler.transform(X_imputed),
    index=X_imputed.index,
    columns=X_imputed.columns,
)

X_scaled.head(2)

Unnamed: 0,Age,AnnualIncome,CreditScore,LoanAmount,LoanDuration,NumberOfDependents,MonthlyDebtPayments,CreditCardUtilizationRate,NumberOfOpenCreditLines,NumberOfCreditInquiries,...,TotalLiabilities,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,Experience,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio
0,-1.115495,-0.564239,-0.764119,-0.489267,0.268131,-0.400573,1.275944,-0.837349,-0.594805,0.025596,...,-0.411881,-0.560983,1.016079,-0.439947,-1.212027,-0.342247,0.640669,0.542226,-0.490248,-0.292058
1,1.320715,0.370108,0.991696,-0.538266,-0.710632,-0.400573,-0.726445,-0.687179,-0.013967,-0.997442,...,0.02737,0.37389,0.101237,-1.364107,1.372705,-0.370877,-1.240983,-1.120743,-0.485037,-0.624207


In [15]:
# Rank the scaled features by absolute Pearson correlation with the target.
corr_matrix = X_scaled.assign(target=y).corr(numeric_only=True)
corr_with_target = (
    corr_matrix["target"]
    .drop("target")
    .abs()
    .sort_values(ascending=False)
)

# The strongest top_k features get a polynomial expansion; the rest enter the model as-is.
n_selected = min(top_k, len(corr_with_target))
top_features = corr_with_target.index[:n_selected].tolist()
selected = set(top_features)
base_features = [name for name in X_scaled.columns if name not in selected]

# Full polynomial basis (bias column included) over the selected features.
poly_transformer = PolynomialFeatures(degree=degree, include_bias=True)
poly_values = poly_transformer.fit_transform(X_scaled[top_features])

X_poly = pd.DataFrame(
    poly_values,
    index=X_scaled.index,
    columns=poly_transformer.get_feature_names_out(top_features),
)

X_poly.head(2)

Unnamed: 0,1,MonthlyIncome,AnnualIncome,CreditScore,InterestRate,BaseInterestRate,TotalDebtToIncomeRatio,BankruptcyHistory,NetWorth,MonthlyIncome^2,...,TotalDebtToIncomeRatio^2 NetWorth^2,TotalDebtToIncomeRatio BankruptcyHistory^3,TotalDebtToIncomeRatio BankruptcyHistory^2 NetWorth,TotalDebtToIncomeRatio BankruptcyHistory NetWorth^2,TotalDebtToIncomeRatio NetWorth^3,BankruptcyHistory^4,BankruptcyHistory^3 NetWorth,BankruptcyHistory^2 NetWorth^2,BankruptcyHistory NetWorth^3,NetWorth^4
0,1.0,-0.560983,-0.564239,-0.764119,0.542226,0.640669,-0.292058,-0.229745,-0.342247,0.314702,...,0.009991,0.003542,0.005276,0.007859,0.011708,0.002786,0.00415,0.006183,0.00921,0.01372
1,1.0,0.37389,0.370108,0.991696,-1.120743,-1.240983,-0.624207,-0.229745,-0.370877,0.139794,...,0.053594,0.007569,0.012219,0.019726,0.031843,0.002786,0.004497,0.00726,0.01172,0.01892


In [16]:
def make_safe(arr, clip_limit=None):
    """Return a float copy of ``arr`` with non-finite entries zeroed and optionally clipped.

    Parameters
    ----------
    arr : array-like
        Values to sanitize. The input is never modified.
    clip_limit : float or None, default None
        When given, the result is clipped to ``[-clip_limit, clip_limit]``.

    Returns
    -------
    numpy.ndarray
        New float array in which NaN, +inf and -inf are replaced by 0.0
        (before clipping, so infinities become 0.0 rather than the clip limit).
    """
    # np.array always copies, whereas np.asarray returns the caller's own
    # buffer for float ndarrays; the in-place assignment below would then
    # silently overwrite the caller's data (or fail on read-only arrays).
    safe = np.array(arr, dtype=float)
    safe[~np.isfinite(safe)] = 0.0
    if clip_limit is not None:
        safe = np.clip(safe, -clip_limit, clip_limit)
    return safe

# Hand-crafted non-linear features for the top_features columns (training set).
# Inputs are read from X_imputed, i.e. winsorized + median-imputed values that
# have NOT been standardized.
# NOTE(review): because the inputs are unstandardized, columns with large raw
# magnitudes may sit at the +/-clip_limit bound for most rows after make_safe;
# confirm this is intended rather than using X_scaled.
clip_limit = clip_val
epsilon = eps

# Per-feature (mean, std) of the training values; a non-positive std is replaced
# by epsilon. The test cell reads these same statistics.
feature_stats = {}
for col in top_features:
    values = X_imputed[col].astype(float).values
    mean_val = float(np.nanmean(values))
    std_val = float(np.nanstd(values))
    feature_stats[col] = (mean_val, std_val if std_val > 0 else epsilon)

feature_list = list(top_features)
# Insertion order of this dict becomes the column order of derived_df.
derived_features = {}

# --- Single-feature transforms -------------------------------------------------
for col in feature_list:
    x = X_imputed[col].astype(float).values
    mean_val, std_val = feature_stats[col]
    # Redundant guard: feature_stats already stores epsilon for a non-positive std.
    std_val = std_val if std_val > 0 else epsilon

    # Element-wise powers; the cubic and quartic terms are gated by `degree`.
    derived_features[f"{col}__pow2"] = make_safe(np.power(x, 2), clip_limit)
    if degree >= 3:
        derived_features[f"{col}__pow3"] = make_safe(np.power(x, 3), clip_limit)
    if degree >= 4:
        derived_features[f"{col}__pow4"] = make_safe(np.power(x, 4), clip_limit)

    derived_features[f"{col}__abs"] = make_safe(np.abs(x), clip_limit)
    # Negative values are clipped to 0 before sqrt / log1p.
    derived_features[f"{col}__sqrt"] = make_safe(np.sqrt(np.clip(x, 0, None)), clip_limit)

    log1p_pos = make_safe(np.log1p(np.clip(x, 0, None)), clip_limit)
    derived_features[f"{col}__log1p_pos"] = log1p_pos
    # Signed log: sign(x) * log(1 + |x|).
    derived_features[f"{col}__slog1p"] = make_safe(np.sign(x) * np.log1p(np.abs(x)), clip_limit)
    # Exact zeros are replaced by epsilon to avoid division by zero.
    derived_features[f"{col}__recip"] = make_safe(1.0 / (np.where(x == 0.0, epsilon, x)), clip_limit)

    # z-score against the training mean/std (epsilon added to the denominator).
    z_score = make_safe((x - mean_val) / (std_val + epsilon), clip_limit)
    derived_features[f"{col}__z"] = z_score
    # `centered` is clipped first, so __ctr2 is the square of the clipped value.
    centered = make_safe(x - mean_val, clip_limit)
    derived_features[f"{col}__ctr"]  = centered
    derived_features[f"{col}__ctr2"] = make_safe(centered**2, clip_limit)
    # tanh output is bounded, so it is not passed through make_safe.
    derived_features[f"{col}__tanh"] = np.tanh(x / (std_val + epsilon))

# --- Pairwise interactions over every unordered pair of top features --------------
for fa, fb in combinations(feature_list, 2):
    xa = X_imputed[fa].astype(float).values
    xb = X_imputed[fb].astype(float).values

    # Clipped z-scores computed in the loop above.
    za = derived_features[f"{fa}__z"]
    zb = derived_features[f"{fb}__z"]

    # Product, sum, both differences and both ratios of the raw values.
    derived_features[f"{fa}__x__{fb}"] = make_safe(xa * xb, clip_limit)
    derived_features[f"{fa}__plus__{fb}"] = make_safe(xa + xb, clip_limit)
    derived_features[f"{fa}__minus__{fb}"] = make_safe(xa - xb, clip_limit)
    derived_features[f"{fb}__minus__{fa}"] = make_safe(xb - xa, clip_limit)
    # Denominators that are exactly zero are replaced by epsilon.
    derived_features[f"{fa}__div__{fb}"] = make_safe(xa / (np.where(xb == 0.0, epsilon, xb)), clip_limit)
    derived_features[f"{fb}__div__{fa}"] = make_safe(xb / (np.where(xa == 0.0, epsilon, xa)), clip_limit)

    derived_features[f"{fa}__z_x__{fb}__z"] = make_safe(za * zb, clip_limit)
    derived_features[f"{fa}__z_div__{fb}__z_tanh"] = np.tanh(za / (np.abs(zb) + epsilon))
    # Relative difference, clipped to [-1, 1] instead of clip_limit.
    derived_features[f"{fa}__rel_diff__{fb}"] = make_safe((xa - xb) / (np.abs(xa) + np.abs(xb) + epsilon), 1.0)
    # min / max are stored as-is (no make_safe, hence no clipping).
    derived_features[f"{fa}__min__{fb}"] = np.minimum(xa, xb)
    derived_features[f"{fa}__max__{fb}"] = np.maximum(xa, xb)

    # Product of the per-feature log1p transforms.
    la = derived_features[f"{fa}__log1p_pos"]
    lb = derived_features[f"{fb}__log1p_pos"]
    derived_features[f"{fa}__log_x__{fb}__log"] = make_safe(la * lb, clip_limit)

    # Geometric and harmonic means of the non-negative parts.
    pa = np.clip(xa, 0, None)
    pb = np.clip(xb, 0, None)
    derived_features[f"{fa}__gmean__{fb}"] = make_safe(np.sqrt(pa * pb), clip_limit)
    derived_features[f"{fa}__hmean__{fb}"] = make_safe(2.0 * pa * pb / (pa + pb + epsilon), clip_limit)

    # Third-order cross terms of the z-scores: za^2 * zb and za * zb^2.
    derived_features[f"{fa}2__x__{fb}__z"] = make_safe((za * za) * zb, clip_limit)
    derived_features[f"{fa}__x__{fb}2__z"] = make_safe(za * (zb * zb), clip_limit)

derived_df = pd.DataFrame(derived_features, index=X_imputed.index)
# Column list that the test cell uses to align its own derived frame.
derived_cols = derived_df.columns.tolist()

# Standardize the derived features; the scaler is fitted on the training rows.
derived_scaler = StandardScaler().fit(derived_df)
derived_scaled = pd.DataFrame(
    derived_scaler.transform(derived_df),
    columns=derived_cols,
    index=derived_df.index
)

In [17]:
# One-hot encode the categorical predictors; the first level of each column is the dropped baseline.
ohe_train = pd.get_dummies(X_categorical, drop_first=True)
ohe_cols = list(ohe_train.columns)

# Training design matrix: untouched scaled features, polynomial block,
# standardized derived features and dummies, joined positionally.
train_blocks = (X_scaled[base_features], X_poly, derived_scaled, ohe_train)
design_X = pd.concat(
    [block.reset_index(drop=True) for block in train_blocks],
    axis=1,
)

# Fit the ridge regression and report its in-sample error.
ridge = Ridge(alpha=alpha, random_state=42)
ridge.fit(design_X, y)
y_pred = ridge.predict(design_X)
train_mse = mean_squared_error(y, y_pred)
print(f"train MSE: {train_mse:.6f}")

train MSE: 22.392784


In [18]:
df_test = pd.read_csv(test_csv)

X_categorical_test = df_test[cat_cols].copy()

# Numeric test columns aligned to the training layout, with infinities treated as missing.
X_numeric_test = (
    df_test.select_dtypes(include=[np.number])
    .reindex(columns=X_numeric.columns)
    .replace([np.inf, -np.inf], np.nan)
    .astype(float)
)

# Winsorize with the bounds learned on the training data; missing values stay missing.
X_winsorized_test = X_numeric_test.copy()
for name in X_winsorized_test.columns:
    clamped = X_winsorized_test[name].to_numpy(dtype=float, copy=True)
    finite_mask = np.isfinite(clamped)
    floor = lower_bounds.get(name, np.nan)
    ceiling = upper_bounds.get(name, np.nan)
    if np.isfinite(floor):
        clamped[finite_mask] = np.maximum(clamped[finite_mask], floor)
    if np.isfinite(ceiling):
        clamped[finite_mask] = np.minimum(clamped[finite_mask], ceiling)
    X_winsorized_test[name] = clamped

# Apply the imputer and scaler fitted on the training data.
X_imputed_test = pd.DataFrame(
    imputer.transform(X_winsorized_test),
    index=X_winsorized_test.index,
    columns=X_winsorized_test.columns,
)

X_scaled_test = pd.DataFrame(
    scaler.transform(X_imputed_test),
    index=X_imputed_test.index,
    columns=X_imputed_test.columns,
)

In [19]:
# Polynomial block for the test set, produced by the transformer fitted on training data.
X_poly_test = pd.DataFrame(
    poly_transformer.transform(X_scaled_test[top_features]),
    columns=poly_transformer.get_feature_names_out(top_features),
    index=X_scaled_test.index,
)

# Derived features for the test set. The formulas and key names below repeat the
# training cell's construction; both cells must be edited together or the
# train/test feature definitions will diverge.
feature_list = list(top_features)
derived_test = {}

# --- Single-feature transforms -------------------------------------------------
for col in feature_list:
    x = X_imputed_test[col].astype(float).values
    # Training-set statistics (not recomputed on the test data).
    mean_val, std_val = feature_stats[col]
    std_val = std_val if std_val > 0 else epsilon

    # Element-wise powers; the cubic and quartic terms are gated by `degree`.
    derived_test[f"{col}__pow2"] = make_safe(np.power(x, 2), clip_limit)
    if degree >= 3:
        derived_test[f"{col}__pow3"] = make_safe(np.power(x, 3), clip_limit)
    if degree >= 4:
        derived_test[f"{col}__pow4"] = make_safe(np.power(x, 4), clip_limit)

    derived_test[f"{col}__abs"] = make_safe(np.abs(x), clip_limit)
    # Negative values are clipped to 0 before sqrt / log1p.
    derived_test[f"{col}__sqrt"] = make_safe(np.sqrt(np.clip(x, 0, None)), clip_limit)

    log1p_pos = make_safe(np.log1p(np.clip(x, 0, None)), clip_limit)
    derived_test[f"{col}__log1p_pos"] = log1p_pos
    derived_test[f"{col}__slog1p"] = make_safe(np.sign(x) * np.log1p(np.abs(x)), clip_limit)
    # Exact zeros are replaced by epsilon to avoid division by zero.
    derived_test[f"{col}__recip"] = make_safe(1.0 / (np.where(x == 0.0, epsilon, x)), clip_limit)

    # z-score against the training mean/std (epsilon added to the denominator).
    z_score = make_safe((x - mean_val) / (std_val + epsilon), clip_limit)
    derived_test[f"{col}__z"] = z_score
    # `centered` is clipped first, so __ctr2 is the square of the clipped value.
    centered = make_safe(x - mean_val, clip_limit)
    derived_test[f"{col}__ctr"] = centered
    derived_test[f"{col}__ctr2"] = make_safe(centered**2, clip_limit)
    derived_test[f"{col}__tanh"] = np.tanh(x / (std_val + epsilon))

# --- Pairwise interactions over every unordered pair of top features --------------
for fa, fb in combinations(feature_list, 2):
    xa = X_imputed_test[fa].astype(float).values
    xb = X_imputed_test[fb].astype(float).values

    # Clipped z-scores computed in the loop above.
    za = derived_test[f"{fa}__z"]
    zb = derived_test[f"{fb}__z"]

    # Product, sum, both differences and both ratios of the raw values.
    derived_test[f"{fa}__x__{fb}"] = make_safe(xa * xb, clip_limit)
    derived_test[f"{fa}__plus__{fb}"] = make_safe(xa + xb, clip_limit)
    derived_test[f"{fa}__minus__{fb}"] = make_safe(xa - xb, clip_limit)
    derived_test[f"{fb}__minus__{fa}"] = make_safe(xb - xa, clip_limit)
    # Denominators that are exactly zero are replaced by epsilon.
    derived_test[f"{fa}__div__{fb}"] = make_safe(xa / (np.where(xb == 0.0, epsilon, xb)), clip_limit)
    derived_test[f"{fb}__div__{fa}"] = make_safe(xb / (np.where(xa == 0.0, epsilon, xa)), clip_limit)

    derived_test[f"{fa}__z_x__{fb}__z"] = make_safe(za * zb, clip_limit)
    derived_test[f"{fa}__z_div__{fb}__z_tanh"] = np.tanh(za / (np.abs(zb) + epsilon))
    # Relative difference, clipped to [-1, 1] instead of clip_limit.
    derived_test[f"{fa}__rel_diff__{fb}"] = make_safe((xa - xb) / (np.abs(xa) + np.abs(xb) + epsilon), 1.0)
    # min / max are stored as-is (no make_safe, hence no clipping).
    derived_test[f"{fa}__min__{fb}"] = np.minimum(xa, xb)
    derived_test[f"{fa}__max__{fb}"] = np.maximum(xa, xb)

    # Product of the per-feature log1p transforms.
    la = derived_test[f"{fa}__log1p_pos"]
    lb = derived_test[f"{fb}__log1p_pos"]
    derived_test[f"{fa}__log_x__{fb}__log"] = make_safe(la * lb, clip_limit)

    # Geometric and harmonic means of the non-negative parts.
    pa = np.clip(xa, 0, None)
    pb = np.clip(xb, 0, None)
    derived_test[f"{fa}__gmean__{fb}"] = make_safe(np.sqrt(pa * pb), clip_limit)
    derived_test[f"{fa}__hmean__{fb}"] = make_safe(2.0 * pa * pb / (pa + pb + epsilon), clip_limit)

    # Third-order cross terms of the z-scores: za^2 * zb and za * zb^2.
    derived_test[f"{fa}2__x__{fb}__z"] = make_safe((za * za) * zb, clip_limit)
    derived_test[f"{fa}__x__{fb}2__z"] = make_safe(za * (zb * zb), clip_limit)

# Align to the training column list/order; a column absent here would be filled with 0.0.
derived_test_df = pd.DataFrame(derived_test, index = X_imputed_test.index).reindex(columns = derived_cols, fill_value = 0.0)
# Standardize with the scaler fitted on the training derived features.
derived_test_scaled = pd.DataFrame(
    derived_scaler.transform(derived_test_df),
    columns = derived_cols,
    index = derived_test_df.index,
)


In [20]:
# One-hot encode the test categoricals and align them to the training dummy columns.
# Every level is encoded here (drop_first=False) and the baseline is removed by the
# reindex against ohe_cols. With drop_first=True pandas would drop the first level
# *present in the test data*; if that differs from the training baseline, a real
# training column is removed and then re-created as all zeros, silently encoding
# those rows as the baseline category.
ohe_test = (
    pd.get_dummies(X_categorical_test, drop_first=False)
    .reindex(columns=ohe_cols, fill_value=0)
)

# Test design matrix, assembled in the same block order as the training matrix.
design_X_test = pd.concat(
    [
        X_scaled_test[base_features].reset_index(drop=True),
        X_poly_test.reset_index(drop=True),
        derived_test_scaled.reset_index(drop=True),
        ohe_test.reset_index(drop=True),
    ],
    axis=1,
)

y_pred_test = ridge.predict(design_X_test)
submission = pd.DataFrame({"prediction": y_pred_test})

# Carry the row identifier through when the test file provides one.
if "ID" in df_test.columns:
    submission.insert(0, "ID", df_test["ID"].values)

submission.to_csv("predictions.csv", index=False)
submission.head()

Unnamed: 0,ID,prediction
0,0,34.050984
1,1,52.889298
2,2,29.172603
3,3,35.887713
4,4,33.324889
