In [1]:
import warnings
import re
from datetime import datetime

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error

warnings.filterwarnings("ignore", category=RuntimeWarning)
pd.set_option("compute.use_numexpr", False)


In [2]:
TRAIN_PATH = '/kaggle/input/mldatasetlab/train.csv'
TEST_PATH  = '/kaggle/input/mldatasetlab/test.csv'

train_raw = pd.read_csv(TRAIN_PATH, engine='python')
test = pd.read_csv(TEST_PATH, engine='python')

print(train_raw.shape, test.shape)
train_raw.head(2)


(11017, 35) (5000, 35)


Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,LoanAmount,LoanDuration,MaritalStatus,NumberOfDependents,HomeOwnershipStatus,MonthlyDebtPayments,...,JobTenure,EmploymentStatus,EducationLevel,Experience,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,RiskScore
0,2010-06-26,27.0,66829.0,549.0,17290.0,60.0,Divorced,1.0,Rent,1095.0,...,4.0,Employed,Associate,4.0,35067.0,0.25779,0.251465,508.97023,0.288013,66.1765
1,1996-09-23,55.0,172147.0,850.0,16110.0,36.0,Widowed,1.0,Mortgage,211.0,...,2.0,Employed,High School,33.0,27001.0,0.08611,0.093173,514.675859,0.050585,28.495737


In [3]:
def parse_number(val):
    if pd.isna(val):
        return np.nan
    s = str(val).strip().replace("\u00a0", "").replace(" ", "")
    if "," in s and "." in s:
        if s.rfind(",") > s.rfind("."):
            s = s.replace(".", "")
            s = s.replace(",", ".")
        else:
            s = s.replace(",", "")
    elif "," in s and "." not in s:
        if s.count(",") == 1 and len(s.split(",")[-1]) <= 2:
            s = s.replace(",", ".")
        else:
            s = s.replace(",", "")
    s = re.sub(r"[^0-9eE\+\-\.]", "", s)
    try:
        return float(s)
    except:
        return np.nan

train = train_raw.copy()
if "RiskScore" not in train.columns:
    raise ValueError("Колонка RiskScore не найдена в train.csv")

y_raw = train["RiskScore"].apply(parse_number).astype(float)

mask_finite = np.isfinite(y_raw.values)
vals = y_raw[mask_finite]
mask_range = (vals >= 0) & (vals <= 100)
mask = np.zeros(len(train), dtype=bool)
mask[np.where(mask_finite)[0][mask_range.values]] = True

dropped = len(train) - int(mask.sum())
train = train.loc[mask].reset_index(drop=True)
train["RiskScore"] = y_raw.loc[mask].reset_index(drop=True)

print("Removed outliers/invalid RiskScore:", dropped)
print(train["RiskScore"].describe())
print("RiskScore min/max:", float(train["RiskScore"].min()), float(train["RiskScore"].max()))
print("Any NaN in RiskScore:", train["RiskScore"].isna().any())


Removed outliers/invalid RiskScore: 745
count    10272.000000
mean        48.358094
std         17.241456
min         14.841417
25%         32.722756
50%         44.195792
75%         65.124190
max         97.597249
Name: RiskScore, dtype: float64
RiskScore min/max: 14.841417296887238 97.59724939432462
Any NaN in RiskScore: False


In [4]:
df_train = train.copy()
df_test = test.copy()

TARGET_COL = "RiskScore"
ID_COL = "ID" if "ID" in df_train.columns else None

DATE_COL = "ApplicationDate"
for df_ in (df_train, df_test):
    if DATE_COL in df_.columns:
        df_[DATE_COL] = pd.to_datetime(df_[DATE_COL], errors="coerce")
        df_[DATE_COL + "_year"] = df_[DATE_COL].dt.year
        df_[DATE_COL + "_month"] = df_[DATE_COL].dt.month
        df_[DATE_COL + "_day"] = df_[DATE_COL].dt.day
        df_.drop(columns=[DATE_COL], inplace=True)

feature_cols = [c for c in df_train.columns if c not in [TARGET_COL, ID_COL]]

X_raw = df_train[feature_cols].copy()
y = df_train[TARGET_COL].values
X_test_raw = df_test[feature_cols].copy()

print("Кол-во объектов:", X_raw.shape[0])
print("Кол-во признаков:", X_raw.shape[1])
X_raw.head()


Кол-во объектов: 10272
Кол-во признаков: 36


Unnamed: 0,Age,AnnualIncome,CreditScore,LoanAmount,LoanDuration,MaritalStatus,NumberOfDependents,HomeOwnershipStatus,MonthlyDebtPayments,CreditCardUtilizationRate,...,EducationLevel,Experience,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,ApplicationDate_year,ApplicationDate_month,ApplicationDate_day
0,27.0,66829.0,549.0,17290.0,60.0,Divorced,1.0,Rent,1095.0,0.151985,...,Associate,4.0,35067.0,0.25779,0.251465,508.97023,0.288013,2010,6,26
1,55.0,172147.0,850.0,16110.0,36.0,Widowed,1.0,Mortgage,211.0,0.175693,...,High School,33.0,27001.0,0.08611,0.093173,514.675859,0.050585,1996,9,23
2,51.0,300000.0,850.0,38436.0,36.0,Married,0.0,Mortgage,546.0,0.444605,...,Bachelor,28.0,278382.0,0.108436,0.115443,1268.276385,0.072571,2015,1,19
3,25.0,34683.0,847.0,19186.0,48.0,Married,0.0,Other,153.0,0.188452,...,High School,0.0,9224.0,0.100686,0.112822,498.505187,0.225415,1981,5,12
4,55.0,300000.0,850.0,30437.0,48.0,Single,2.0,Rent,562.0,0.273431,...,Bachelor,31.0,4502.0,0.110437,0.089037,756.035156,0.052721,1995,5,7


In [5]:
from pandas.api.types import is_numeric_dtype

def convert_numeric_like_columns(df, min_fraction_numeric: float = 0.9):
    df = df.copy()
    numeric_cols = []
    for col in df.columns:
        s = df[col]
        if is_numeric_dtype(s):
            numeric_cols.append(col)
            continue
        sample = s.dropna().astype(str).head(500)
        if sample.empty:
            continue
        parsed = sample.apply(parse_number)
        frac_num = np.isfinite(parsed).mean()
        if frac_num >= min_fraction_numeric:
            df[col] = s.apply(parse_number).astype(float)
            numeric_cols.append(col)
    return df, numeric_cols

full = pd.concat([X_raw, X_test_raw], axis=0, ignore_index=True)
full_converted, numeric_cols = convert_numeric_like_columns(full)

X_all = full_converted.copy()

skew = X_all[numeric_cols].skew(numeric_only=True)
skewed = skew[skew.abs() > 1].index.tolist()

log_cols = []
for col in skewed:
    s = X_all[col]
    if (s > 0).mean() > 0.99:
        new_col = col + "_log1p"
        X_all[new_col] = np.log1p(s.clip(lower=0))
        log_cols.append(new_col)

numeric_cols = numeric_cols + log_cols

X = X_all.iloc[:len(X_raw)].reset_index(drop=True)
X_test = X_all.iloc[len(X_raw):].reset_index(drop=True)

cat_cols = [c for c in feature_cols if c not in numeric_cols]

print("Числовых признаков:", len(numeric_cols))
print("Категориальных признаков:", len(cat_cols))
print("Лог-признаки:", log_cols[:10])


Числовых признаков: 35
Категориальных признаков: 5
Лог-признаки: ['SavingsAccountBalance_log1p', 'TotalLiabilities_log1p', 'MonthlyLoanPayment_log1p', 'TotalDebtToIncomeRatio_log1p']


In [6]:
full_all = pd.concat([X, X_test], axis=0, ignore_index=True)

missing_frac = full_all.isna().mean()
nunique = full_all.nunique(dropna=False)

cols_to_drop = list(
    missing_frac[missing_frac > 0.8].index.union(
        nunique[nunique <= 1].index
    )
)

print("Колонок к удалению:", len(cols_to_drop))

if cols_to_drop:
    X = X.drop(columns=cols_to_drop)
    X_test = X_test.drop(columns=cols_to_drop)
    numeric_cols = [c for c in numeric_cols if c not in cols_to_drop]
    cat_cols = [c for c in cat_cols if c not in cols_to_drop]

for col in numeric_cols:
    s_train = X[col]
    s_test = X_test[col]
    mask_valid = np.isfinite(s_train.values)
    if mask_valid.sum() < 10:
        continue
    s_valid = s_train[mask_valid]
    q_low = s_valid.quantile(0.01)
    q_high = s_valid.quantile(0.99)
    X[col] = s_train.clip(q_low, q_high)
    X_test[col] = s_test.clip(q_low, q_high)

print("После очистки:")
print("Числовых признаков:", len(numeric_cols))
print("Категориальных признаков:", len(cat_cols))


Колонок к удалению: 0
После очистки:
Числовых признаков: 35
Категориальных признаков: 5


In [7]:
y_series = pd.Series(y, index=X.index)

te_cols = []
k = 3.0

for col in cat_cols:
    train_vals = X[col].astype(str)
    stats = y_series.groupby(train_vals).agg(["mean", "count"])
    global_mean = y_series.mean()
    smooth = (stats["mean"] * stats["count"] + k * global_mean) / (stats["count"] + k)
    X[col + "_te"] = train_vals.map(smooth).fillna(global_mean)
    X_test[col + "_te"] = X_test[col].astype(str).map(smooth).fillna(global_mean)
    te_cols.append(col + "_te")

numeric_cols = numeric_cols + te_cols

base_num_cols_all = [c for c in numeric_cols if not c.endswith("_te")]

print("После сглаженного target encoding:")
print("Новых TE-признаков:", len(te_cols))
print("Всего числовых признаков:", len(numeric_cols))
print("Базовых числовых (без TE):", len(base_num_cols_all))
print("Категориальных признаков:", len(cat_cols))


После сглаженного target encoding:
Новых TE-признаков: 5
Всего числовых признаков: 40
Базовых числовых (без TE): 35
Категориальных признаков: 5


In [8]:
numeric_corr = {}
for col in base_num_cols_all:
    s = X[col]
    mask = np.isfinite(s.values) & np.isfinite(y)
    if mask.sum() < 50:
        continue
    try:
        c = np.corrcoef(s.values[mask], y[mask])[0, 1]
        if np.isfinite(c):
            numeric_corr[col] = abs(c)
    except:
        continue

numeric_corr_sorted = sorted(numeric_corr.items(), key=lambda x: x[1], reverse=True)
top_k = min(12, len(numeric_corr_sorted))
top_poly_cols = [c for c, _ in numeric_corr_sorted[:top_k]]
basic_num_cols = [c for c in numeric_cols if c not in top_poly_cols]

top_bin_cols = top_poly_cols[:5]

for col in top_bin_cols:
    full_col = pd.concat([X[col], X_test[col]], axis=0)
    valid = full_col.dropna()
    if valid.nunique() < 5:
        continue
    try:
        _, bin_edges = pd.qcut(valid, q=10, retbins=True, duplicates="drop")
    except:
        continue
    X[col + "_bin"] = pd.cut(X[col], bins=bin_edges, include_lowest=True)
    X_test[col + "_bin"] = pd.cut(X_test[col], bins=bin_edges, include_lowest=True)
    cat_cols.append(col + "_bin")

print("Всего числовых признаков:", len(numeric_cols))
print("top_poly_cols:", top_poly_cols)
print("basic_num_cols:", len(basic_num_cols))
print("Добавлено бинов:", len(top_bin_cols))
print("Категориальных признаков (включая бины):", len(cat_cols))


Всего числовых признаков: 40
top_poly_cols: ['CreditScore', 'MonthlyIncome', 'AnnualIncome', 'BaseInterestRate', 'InterestRate', 'TotalDebtToIncomeRatio_log1p', 'TotalDebtToIncomeRatio', 'MonthlyLoanPayment_log1p', 'BankruptcyHistory', 'NetWorth', 'TotalAssets', 'MonthlyLoanPayment']
basic_num_cols: 28
Добавлено бинов: 5
Категориальных признаков (включая бины): 10


In [9]:
numeric_basic_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])

numeric_poly_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", min_frequency=0.01)),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num_basic", numeric_basic_pipeline, basic_num_cols),
        ("num_poly", numeric_poly_pipeline, top_poly_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)


In [10]:
def y_to_logit(y):
    eps = 1e-3
    y01 = np.clip(y / 100.0, eps, 1 - eps)
    return np.log(y01 / (1.0 - y01))

def logit_to_y(z):
    y01 = 1.0 / (1.0 + np.exp(-z))
    return np.clip(y01 * 100.0, 0.0, 100.0)


In [11]:
def cv_mse_for_alpha(alpha, X, y, cv):
    ridge = Ridge(alpha=alpha, random_state=42)
    pipe = Pipeline(steps=[
        ("preprocess", preprocess),
        ("reg", ridge),
    ])
    mses = []
    for tr_idx, val_idx in cv.split(X):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]
        y_tr_logit = y_to_logit(y_tr)
        pipe.fit(X_tr, y_tr_logit)
        z_pred = pipe.predict(X_val)
        y_pred = logit_to_y(z_pred)
        y_pred = np.clip(y_pred, 0, 100)
        mses.append(mean_squared_error(y_val, y_pred))
    return float(np.mean(mses))

kf = KFold(n_splits=5, shuffle=True, random_state=42)
alphas = np.logspace(-3, 2, 50)

best_mse = np.inf
best_alpha = None

for a in alphas:
    mse_cv = cv_mse_for_alpha(a, X, y, kf)
    print(f"alpha={a:.5f} -> CV MSE={mse_cv:.4f}")
    if mse_cv < best_mse:
        best_mse = mse_cv
        best_alpha = a

print("Лучший alpha по CV:", best_alpha, "CV MSE:", best_mse)


alpha=0.00100 -> CV MSE=26.2448
alpha=0.00126 -> CV MSE=26.2424
alpha=0.00160 -> CV MSE=26.2398
alpha=0.00202 -> CV MSE=26.2372
alpha=0.00256 -> CV MSE=26.2345
alpha=0.00324 -> CV MSE=26.2319
alpha=0.00409 -> CV MSE=26.2293
alpha=0.00518 -> CV MSE=26.2269
alpha=0.00655 -> CV MSE=26.2245
alpha=0.00829 -> CV MSE=26.2221
alpha=0.01048 -> CV MSE=26.2197
alpha=0.01326 -> CV MSE=26.2171
alpha=0.01677 -> CV MSE=26.2143
alpha=0.02121 -> CV MSE=26.2113
alpha=0.02683 -> CV MSE=26.2078
alpha=0.03393 -> CV MSE=26.2040
alpha=0.04292 -> CV MSE=26.1998
alpha=0.05429 -> CV MSE=26.1951
alpha=0.06866 -> CV MSE=26.1901
alpha=0.08685 -> CV MSE=26.1849
alpha=0.10985 -> CV MSE=26.1794
alpha=0.13895 -> CV MSE=26.1739
alpha=0.17575 -> CV MSE=26.1685
alpha=0.22230 -> CV MSE=26.1633
alpha=0.28118 -> CV MSE=26.1583
alpha=0.35565 -> CV MSE=26.1537
alpha=0.44984 -> CV MSE=26.1496
alpha=0.56899 -> CV MSE=26.1460
alpha=0.71969 -> CV MSE=26.1429
alpha=0.91030 -> CV MSE=26.1404
alpha=1.15140 -> CV MSE=26.1385
alpha=1.

In [12]:
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

ridge_final = Ridge(alpha=best_alpha, random_state=42)

model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("reg", ridge_final),
])

y_tr_logit = y_to_logit(y_tr)
model.fit(X_tr, y_tr_logit)
z_val_pred = model.predict(X_val)
val_pred = logit_to_y(z_val_pred)
val_pred = np.clip(val_pred, 0, 100)
mse_holdout = mean_squared_error(y_val, val_pred)
print("Holdout MSE:", mse_holdout)


Holdout MSE: 26.192872963163072


In [13]:
ridge_final_full = Ridge(alpha=best_alpha, random_state=42)

final_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("reg", ridge_final_full),
])

y_logit = y_to_logit(y)
final_model.fit(X, y_logit)

z_test_pred = final_model.predict(X_test)
test_pred = logit_to_y(z_test_pred)
test_pred = np.clip(test_pred, 0, 100)

submission = pd.DataFrame({
    "ID": test[ID_COL] if ID_COL is not None else np.arange(len(test_pred)),
    "RiskScore": test_pred,
})

submission.head()
submission.to_csv("submission.csv", index=False)
print("Файл submission.csv сохранён.")


Файл submission.csv сохранён.
