In [1]:
import re
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures

# Notebook-wide settings: silence RuntimeWarning noise and turn off pandas'
# numexpr-accelerated expression evaluation.
warnings.filterwarnings(action="ignore", category=RuntimeWarning)
pd.options.compute.use_numexpr = False


In [2]:
# Locations of the competition files inside the Kaggle input directory.
TRAIN_PATH = "/kaggle/input/mllab2/train_c.csv"
TEST_PATH = "/kaggle/input/mllab2/test_c.csv"

# Read both CSV files as-is; cleaning happens in later cells.
train_raw, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

print("train:", train_raw.shape, "test:", test.shape)
train_raw.head(3)


train: (11017, 35) test: (5000, 35)


Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,LoanAmount,LoanDuration,MaritalStatus,NumberOfDependents,HomeOwnershipStatus,MonthlyDebtPayments,...,JobTenure,EmploymentStatus,EducationLevel,Experience,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved
0,2010-06-26,27.0,66829.0,549.0,17290.0,60.0,Divorced,1.0,Rent,1095.0,...,4.0,Employed,Associate,4.0,35067.0,0.25779,0.251465,508.97023,0.288013,0.0
1,1996-09-23,55.0,172147.0,850.0,16110.0,36.0,Widowed,1.0,Mortgage,211.0,...,2.0,Employed,High School,33.0,27001.0,0.08611,0.093173,514.675859,0.050585,1.0
2,2015-01-19,51.0,300000.0,850.0,38436.0,36.0,Married,0.0,Mortgage,546.0,...,3.0,Employed,Bachelor,28.0,278382.0,0.108436,0.115443,1268.276385,0.072571,1.0


In [3]:
def parse_number(val):
    """Parse a loosely formatted numeric value into a float.

    The value is converted to ``str``, regular and non-breaking spaces are
    removed, and decimal/thousands separators are normalised:

    * both ``,`` and ``.`` present: whichever occurs last is treated as the
      decimal separator, the other one is dropped;
    * only ``,`` present: a single comma followed by at most two characters
      is treated as a decimal separator, otherwise commas are dropped.

    Any character other than digits, ``e``/``E``, ``+``, ``-`` and ``.`` is
    then stripped before the final ``float`` conversion.

    Parameters
    ----------
    val : Any
        Raw cell value (number, string, or missing value).

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when ``val`` is missing or the
        cleaned text is not a valid float literal.
    """
    if pd.isna(val):
        return np.nan
    s = str(val).strip().replace("\u00a0", "").replace(" ", "")
    if "," in s and "." in s:
        if s.rfind(",") > s.rfind("."):
            # European style "1.234,56": dots group thousands, comma is decimal.
            s = s.replace(".", "")
            s = s.replace(",", ".")
        else:
            # US style "1,234.56": commas group thousands.
            s = s.replace(",", "")
    elif "," in s and "." not in s:
        if s.count(",") == 1 and len(s.split(",")[-1]) <= 2:
            # "12,5" / "12,50": a lone comma with a short tail is a decimal mark.
            s = s.replace(",", ".")
        else:
            # "1,234" / "1,234,567": commas are thousands separators.
            s = s.replace(",", "")
    s = re.sub(r"[^0-9eE\+\-\.]", "", s)
    try:
        return float(s)
    except ValueError:
        # Only a malformed literal means "not a number"; a bare ``except``
        # would also swallow KeyboardInterrupt/SystemExit.
        return np.nan

TARGET_COL = "LoanApproved"

# Work on a copy so the raw training frame stays untouched.
train = train_raw.copy()
if TARGET_COL not in train.columns:
    raise ValueError("Колонка LoanApproved не найдена в train_c.csv")

# Store the target as float in the working frame.
y_raw = train[TARGET_COL].astype(float)
train[TARGET_COL] = y_raw

# Class balance; value_counts() leaves missing targets out of the proportions.
print("Распределение таргета:")
print(y_raw.value_counts(normalize=True))


Распределение таргета:
LoanApproved
1.0    0.511776
0.0    0.488224
Name: proportion, dtype: float64


In [4]:
DATE_COL = "ApplicationDate"

df_train = train.copy()
df_test = test.copy()

# The identifier column (if the test file has one) is not a feature.
ID_COL = "ID" if "ID" in df_test.columns else None

# Replace the application date by its year / month / day components.
# Unparseable dates become NaT, so their components end up missing.
for frame in (df_train, df_test):
    if DATE_COL not in frame.columns:
        continue
    parsed_dates = pd.to_datetime(frame[DATE_COL], errors="coerce")
    for part in ("year", "month", "day"):
        frame[DATE_COL + "_" + part] = getattr(parsed_dates.dt, part)
    frame.drop(columns=[DATE_COL], inplace=True)

# Everything except the target and the identifier is used as a feature.
feature_cols = [c for c in df_train.columns if c not in {TARGET_COL, ID_COL}]

X_raw = df_train[feature_cols].copy()
X_test_raw = df_test[feature_cols].copy()
y = df_train[TARGET_COL].values.astype(float)

print("Кол-во объектов:", X_raw.shape[0])
print("Кол-во признаков:", X_raw.shape[1])
X_raw.head(3)


Кол-во объектов: 11017
Кол-во признаков: 36


Unnamed: 0,Age,AnnualIncome,CreditScore,LoanAmount,LoanDuration,MaritalStatus,NumberOfDependents,HomeOwnershipStatus,MonthlyDebtPayments,CreditCardUtilizationRate,...,EducationLevel,Experience,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,ApplicationDate_year,ApplicationDate_month,ApplicationDate_day
0,27.0,66829.0,549.0,17290.0,60.0,Divorced,1.0,Rent,1095.0,0.151985,...,Associate,4.0,35067.0,0.25779,0.251465,508.97023,0.288013,2010.0,6.0,26.0
1,55.0,172147.0,850.0,16110.0,36.0,Widowed,1.0,Mortgage,211.0,0.175693,...,High School,33.0,27001.0,0.08611,0.093173,514.675859,0.050585,1996.0,9.0,23.0
2,51.0,300000.0,850.0,38436.0,36.0,Married,0.0,Mortgage,546.0,0.444605,...,Bachelor,28.0,278382.0,0.108436,0.115443,1268.276385,0.072571,2015.0,1.0,19.0


In [5]:
def convert_numeric_like_columns(df, min_fraction_numeric: float = 0.9):
    """Convert text columns that mostly hold numbers into float columns.

    Columns that already have a numeric dtype are kept as they are. For every
    other column, up to 500 non-missing values are parsed with
    ``parse_number``; when at least ``min_fraction_numeric`` of them parse to
    a finite number, the whole column is re-parsed and stored as float.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    min_fraction_numeric : float, default 0.9
        Minimum share of finite parsed values in the probe sample required to
        treat a column as numeric.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The converted copy and the names of all numeric columns (originally
        numeric plus converted ones), in column order.
    """
    converted = df.copy()
    detected = []
    for name in converted.columns:
        column = converted[name]
        if not is_numeric_dtype(column):
            # Decide from a small sample to avoid parsing obviously textual columns.
            probe = column.dropna().astype(str).head(500)
            if probe.empty:
                continue
            finite_share = np.isfinite(probe.apply(parse_number)).mean()
            if not finite_share >= min_fraction_numeric:
                continue
            converted[name] = column.apply(parse_number).astype(float)
        detected.append(name)
    return converted, detected

# Detect numeric columns on train and test stacked together, so both parts
# end up with the same column types.
full = pd.concat([X_raw, X_test_raw], axis=0, ignore_index=True)
full_converted, numeric_cols = convert_numeric_like_columns(full)
X_all = full_converted.copy()

# Numeric columns with |skewness| > 1 are candidates for a log1p copy.
skew = X_all[numeric_cols].skew(numeric_only=True)
skewed = [name for name, value in skew.items() if abs(value) > 1]

log_cols = []
for col in skewed:
    # Only transform columns that are positive in more than 99% of rows.
    positive_share = (X_all[col] > 0).mean()
    if not positive_share > 0.99:
        continue
    new_col = col + "_log1p"
    X_all[new_col] = np.log1p(X_all[col].clip(lower=0))
    log_cols.append(new_col)

numeric_cols = numeric_cols + log_cols

# Split the stacked frame back into its train and test parts.
n_train_rows = len(X_raw)
X = X_all.iloc[:n_train_rows].reset_index(drop=True)
X_test = X_all.iloc[n_train_rows:].reset_index(drop=True)

# Whatever was not recognised as numeric is treated as categorical.
cat_cols = [c for c in feature_cols if c not in numeric_cols]

print("Числовых признаков:", len(numeric_cols))
print("Категориальных признаков:", len(cat_cols))
print("Лог-признаки (первые 10):", log_cols[:10])


Числовых признаков: 31
Категориальных признаков: 5
Лог-признаки (первые 10): []


In [6]:
full_all = pd.concat([X, X_test], axis=0, ignore_index=True)

missing_frac = full_all.isna().mean()
nunique = full_all.nunique(dropna=False)

# Drop columns that are more than 80% missing or hold at most one distinct value.
mostly_missing = missing_frac[missing_frac > 0.8].index
constant = nunique[nunique <= 1].index
cols_to_drop = list(mostly_missing.union(constant))

print("Колонок к удалению:", len(cols_to_drop))

if cols_to_drop:
    dropped = set(cols_to_drop)
    X = X.drop(columns=cols_to_drop)
    X_test = X_test.drop(columns=cols_to_drop)
    numeric_cols = [c for c in numeric_cols if c not in dropped]
    cat_cols = [c for c in cat_cols if c not in dropped]

# Winsorise numeric columns to the 1st-99th percentile range. The bounds are
# computed on the training part and reused for the test part.
for col in numeric_cols:
    q_low, q_high = (X[col].quantile(q) for q in (0.01, 0.99))
    X[col] = X[col].clip(q_low, q_high)
    X_test[col] = X_test[col].clip(q_low, q_high)

print("После очистки:")
print("Числовых признаков:", len(numeric_cols))
print("Категориальных признаков:", len(cat_cols))


Колонок к удалению: 0
После очистки:
Числовых признаков: 31
Категориальных признаков: 5


In [7]:
# Smoothed target (mean) encoding of the categorical columns.
# NOTE(review): the encoding is fit on every labelled training row, including
# rows that later land in the validation split, so validation metrics can be
# slightly optimistic - consider fitting it inside the CV folds.
y_series = pd.Series(y, index=X.index)
global_mean = y_series.mean()

te_cols = []
k = 3.0  # smoothing strength: weight of the global mean, in pseudo-observations

for col in cat_cols:
    te_name = col + "_te"
    train_vals = X[col].astype(str)
    stats = y_series.groupby(train_vals).agg(["mean", "count"])
    # Shrink every category mean towards the global mean.
    weighted_sum = stats["mean"] * stats["count"] + k * global_mean
    smooth = weighted_sum / (stats["count"] + k)
    # Categories without an estimate fall back to the global mean.
    X[te_name] = train_vals.map(smooth).fillna(global_mean)
    X_test[te_name] = X_test[col].astype(str).map(smooth).fillna(global_mean)
    te_cols.append(te_name)

numeric_cols = numeric_cols + te_cols
base_num_cols_all = [c for c in numeric_cols if not c.endswith("_te")]

print("После target encoding:")
print("Новых TE-признаков:", len(te_cols))
print("Всего числовых признаков:", len(numeric_cols))
print("Базовых числовых (без TE):", len(base_num_cols_all))
print("Категориальных признаков:", len(cat_cols))


После target encoding:
Новых TE-признаков: 5
Всего числовых признаков: 36
Базовых числовых (без TE): 31
Категориальных признаков: 5


In [8]:
# Rank the base numeric features by |Pearson correlation| with the target,
# using only rows where both the feature and the target are finite.
numeric_corr = {}
for col in base_num_cols_all:
    values = X[col].values
    usable = np.isfinite(values) & np.isfinite(y)
    if usable.sum() < 50:
        continue
    try:
        corr = np.corrcoef(values[usable], y[usable])[0, 1]
    except Exception:
        continue
    if np.isfinite(corr):
        numeric_corr[col] = abs(corr)

numeric_corr_sorted = sorted(numeric_corr.items(), key=lambda item: item[1], reverse=True)
top_k = min(12, len(numeric_corr_sorted))
top_poly_cols = [name for name, _ in numeric_corr_sorted[:top_k]]

# The remaining numeric columns only get imputation and scaling.
basic_num_cols = [c for c in numeric_cols if c not in top_poly_cols]

# Add decile-bin versions of the top features as extra categorical columns.
# Bin edges come from the training part and are reused for the test part.
top_bin_cols = []
for col in top_poly_cols:
    finite_rows = np.isfinite(X[col].values)
    if finite_rows.sum() < 100:
        continue
    try:
        bin_edges = pd.qcut(X[col][finite_rows], q=10, retbins=True, duplicates="drop")[1]
    except Exception:
        continue
    bin_name = col + "_bin"
    X[bin_name] = pd.cut(X[col], bins=bin_edges, include_lowest=True)
    X_test[bin_name] = pd.cut(X_test[col], bins=bin_edges, include_lowest=True)
    cat_cols.append(bin_name)
    top_bin_cols.append(bin_name)

print("Всего числовых признаков:", len(numeric_cols))
print("top_poly_cols:", top_poly_cols)
print("basic_num_cols:", len(basic_num_cols))
print("Добавлено бинов:", len(top_bin_cols))
print("Категориальных признаков (включая бины):", len(cat_cols))


Всего числовых признаков: 36
top_poly_cols: ['MonthlyIncome', 'AnnualIncome', 'BaseInterestRate', 'InterestRate', 'CreditScore', 'TotalDebtToIncomeRatio', 'MonthlyLoanPayment', 'LoanAmount', 'NetWorth', 'TotalAssets', 'MonthlyDebtPayments', 'BankruptcyHistory']
basic_num_cols: 24
Добавлено бинов: 12
Категориальных признаков (включая бины): 17


In [9]:
def _impute_scale_steps():
    """Return fresh median-imputation and standard-scaling pipeline steps."""
    return [
        ("imputer", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ]


# Plain numeric features: impute, then scale.
numeric_basic_pipeline = Pipeline(steps=_impute_scale_steps())

# Top-correlated numeric features additionally get degree-2 polynomial terms.
numeric_poly_pipeline = Pipeline(
    steps=_impute_scale_steps()
    + [("poly", PolynomialFeatures(degree=2, include_bias=False))]
)

# Categorical features: fill gaps with the mode, then one-hot encode with a
# 1% minimum frequency; categories unseen at fit time are ignored.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", min_frequency=0.01)),
])

preprocess = ColumnTransformer(transformers=[
    ("num_basic", numeric_basic_pipeline, basic_num_cols),
    ("num_poly", numeric_poly_pipeline, top_poly_cols),
    ("cat", categorical_pipeline, cat_cols),
])


In [10]:
# Rows without a finite target cannot be used for supervised training.
mask = np.isfinite(y)
n_total, n_labeled = len(y), mask.sum()

print("Всего объектов:", n_total)
print("Ненулевых (без NaN) в y:", n_labeled)
print("NaN в y:", n_total - n_labeled)

# Keep features and target aligned after filtering.
X, y = X.loc[mask].reset_index(drop=True), y[mask]

print("После фильтрации:")
print("X:", X.shape)
print("y:", y.shape)
print("Есть ли теперь NaN в y:", np.isnan(y).any())


Всего объектов: 11017
Ненулевых (без NaN) в y: 10487
NaN в y: 530
После фильтрации:
X: (10487, 53)
y: (10487,)
Есть ли теперь NaN в y: False


In [11]:
RANDOM_STATE = 42

# Stratified 80/20 hold-out split with a fixed seed.
split_parts = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
X_train, X_valid, y_train, y_valid = split_parts

print("Train:", X_train.shape, "Valid:", X_valid.shape)
for title, target_part in (
    ("Train target distribution:", y_train),
    ("Valid target distribution:", y_valid),
):
    print(title)
    print(pd.Series(target_part).value_counts(normalize=True))


Train: (8389, 53) Valid: (2098, 53)
Train target distribution:
1.0    0.511742
0.0    0.488258
Name: proportion, dtype: float64
Valid target distribution:
1.0    0.511916
0.0    0.488084
Name: proportion, dtype: float64


In [12]:
# Baseline: logistic regression with default regularisation on top of the
# shared preprocessing.
log_reg_base = LogisticRegression(max_iter=5000, n_jobs=-1)
base_clf = Pipeline(steps=[("preprocess", preprocess), ("clf", log_reg_base)])
base_clf.fit(X_train, y_train)

# Hard labels for the report, second probability column for ROC-AUC.
y_valid_pred_base = base_clf.predict(X_valid)
y_valid_proba_base = base_clf.predict_proba(X_valid)[:, 1]

roc_auc_base = roc_auc_score(y_valid, y_valid_proba_base)
print("Базовая логистическая регрессия: ROC-AUC на валидации =", roc_auc_base)

print(classification_report(y_valid, y_valid_pred_base))


Базовая логистическая регрессия: ROC-AUC на валидации = 0.9824036894785847
              precision    recall  f1-score   support

         0.0       0.92      0.94      0.93      1024
         1.0       0.94      0.92      0.93      1074

    accuracy                           0.93      2098
   macro avg       0.93      0.93      0.93      2098
weighted avg       0.93      0.93      0.93      2098



In [13]:
# Hyperparameter grid: regularisation strength and class weighting for an
# L2-penalised logistic regression fitted with lbfgs.
param_grid = {
    "clf__C": [0.01, 0.1, 0.3, 1.0, 3.0, 10.0],
    "clf__class_weight": [None, "balanced"],
    "clf__penalty": ["l2"],
    "clf__solver": ["lbfgs"],
}

grid_clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=5000, n_jobs=-1)),
])

# Seeded, shuffled, stratified 5-fold CV, scored by ROC-AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
grid_search = GridSearchCV(
    estimator=grid_clf,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("Лучшие параметры:", grid_search.best_params_)
print("Лучший ROC-AUC по CV (train):", grid_search.best_score_)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Лучшие параметры: {'clf__C': 0.1, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}
Лучший ROC-AUC по CV (train): 0.9846673787689454


In [14]:
# Evaluate the refitted grid-search winner on the hold-out split.
best_model_val = grid_search.best_estimator_

y_valid_pred_best = best_model_val.predict(X_valid)
y_valid_proba_best = best_model_val.predict_proba(X_valid)[:, 1]

roc_auc_best = roc_auc_score(y_valid, y_valid_proba_best)
print("Лучшая модель после GridSearch: ROC-AUC на валидации =", roc_auc_best)

print(classification_report(y_valid, y_valid_pred_best))


Лучшая модель после GridSearch: ROC-AUC на валидации = 0.9816717222416201
              precision    recall  f1-score   support

         0.0       0.92      0.94      0.93      1024
         1.0       0.94      0.92      0.93      1074

    accuracy                           0.93      2098
   macro avg       0.93      0.93      0.93      2098
weighted avg       0.93      0.93      0.93      2098



In [15]:
best_params = grid_search.best_params_

# Build the final model as an unfitted clone of the grid-search winner. It
# carries the same hyperparameters (max_iter, n_jobs and the tuned C /
# class_weight / penalty / solver) and gets its own copy of the preprocessing.
# Reusing the shared `preprocess` object here would refit, in place, the
# transformer that the already-fitted `base_clf` depends on.
final_clf = clone(grid_search.best_estimator_)

# Refit on all labelled rows (train + validation) before predicting on test.
final_clf.fit(X, y)


In [16]:
# Keep the second predict_proba column as the submission score
# (presumably the probability of LoanApproved == 1 - verify via final_clf.classes_).
test_proba_matrix = final_clf.predict_proba(X_test)
test_proba = test_proba_matrix[:, 1]
test_proba[:10]


array([9.90887839e-01, 1.60886216e-02, 9.98829293e-01, 9.98854691e-01,
       9.98606730e-01, 3.48189515e-03, 9.99353258e-01, 9.98243042e-01,
       8.40403503e-01, 4.39989535e-05])

In [17]:
# Use the ID column of the test file when it exists; otherwise fall back to
# positional identifiers 0..n-1.
id_test = test[ID_COL].values if ID_COL is not None else np.arange(len(test_proba))

submission = pd.DataFrame({
    "ID": id_test,
    "LoanApproved": test_proba,
})

submission.to_csv("submission_c.csv", index=False)
print("Файл submission_c.csv сохранён.")

# Last expression of the cell, so the preview is actually rendered (a bare
# `.head()` in the middle of a cell is evaluated and discarded).
submission.head()


Файл submission_c.csv сохранён.
