In [2]:
import warnings
import re
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    StandardScaler,
    PolynomialFeatures,
    OneHotEncoder
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV
)
from sklearn.metrics import roc_auc_score, classification_report

In [3]:
warnings.filterwarnings("ignore")
pd.set_option("compute.use_numexpr", False)

TRAIN_PATH = "train_c.csv"
TEST_PATH  = "test_c.csv"

TARGET_COL = "LoanApproved"
ID_COL     = "ID"       
DATE_COL   = "ApplicationDate"

ABS_CAP           = 1000.0
TOP_K_CORR        = 12
DEGREE_POLY       = 2
SMOOTHING_K       = 3.0
MIN_FREQ_OHE       = 0.01
MISSING_frac_DROP  = 0.8
N_SPLITS_CV        = 5
RANDOM_STATE       = 42

def parse_number(val):
    if pd.isna(val):
        return np.nan
    s = str(val).strip().replace("\u00a0", "").replace(" ", "")
    if "," in s and "." in s:
        if s.rfind(",") > s.rfind("."):
            s= s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    elif "," in s and "." not in s:
        if s.count(",") == 1 and len(s.split(",")[-1]) <= 2:
            s = s.replace(",", ".")
        else:
            s = s.replace(",", "")
    s = re.sub(r"[^0-9eE\+\-\.]", "", s)
    try:
        return float(s)
    except:
        return np.nan

def convert_numeric_like_columns(df, min_fraction_numeric: float = 0.9):
    """Преобразует столбцы‑строки, которые могут быть числами."""
    df = df.copy()
    numeric_cols = []
    for col in df.columns:
        s = df[col]
        if is_numeric_dtype(s):
            numeric_cols.append(col)
            continue
        sample = s.dropna().astype(str).head(500)
        if sample.empty:
            continue
        parsed = sample.apply(parse_number)
        if np.isfinite(parsed).mean() >= min_fraction_numeric:
            df[col] = s.apply(parse_number).astype(float)
            numeric_cols.append(col)
    return df, numeric_cols

In [4]:
train_raw = pd.read_csv(TRAIN_PATH)
test      = pd.read_csv(TEST_PATH)

y_raw = train_raw[TARGET_COL].apply(parse_number).astype(float)
y_filtered = y_raw[np.isfinite(y_raw)]
if ABS_CAP is not None:
    y_filtered = y_filtered[y_filtered.abs() <= ABS_CAP]

train = train_raw.loc[y_filtered.index].copy()
train[TARGET_COL] = y_filtered

for df_ in (train, test):
    if DATE_COL in df_.columns:
        df_[DATE_COL] = pd.to_datetime(df_[DATE_COL], errors="coerce")
        df_[DATE_COL + "_year"]  = df_[DATE_COL].dt.year
        df_[DATE_COL + "_month"] = df_[DATE_COL].dt.month
        df_[DATE_COL + "_day"]   = df_[DATE_COL].dt.day
        df_.drop(columns=[DATE_COL], inplace=True)

feature_cols = [c for c in train.columns if c not in [TARGET_COL, ID_COL]]
X_raw  = train[feature_cols].copy()
y      = y_filtered.values.astype(float)
X_test_raw = test[feature_cols].copy()

full_data = pd.concat([X_raw, X_test_raw], axis=0, ignore_index=True)
full_converted, numeric_cols_initial = convert_numeric_like_columns(full_data)
full_processed = full_converted.replace([np.inf, -np.inf], np.nan)

numeric_cols  = list(numeric_cols_initial)
cat_cols_initial = [c for c in full_processed.columns
                    if c not in numeric_cols and c not in [TARGET_COL, ID_COL]]

skew = full_processed[numeric_cols].skew(numeric_only=True)
skewed_cols = skew[skew.abs() > 1].index.tolist()
log_cols = []
for col in skewed_cols:
    if col in numeric_cols and col in full_processed.columns:
        s = full_processed[col]
        if (s >= 0).mean() > 0.99:
            new_col = col + "_log1p"
            full_processed[new_col] = np.log1p(s.clip(lower=0))
            log_cols.append(new_col)
            numeric_cols.append(new_col)
numeric_cols = [c for c in numeric_cols if c in full_processed.columns]

In [5]:
missing_frac = full_processed.isna().mean()
nunique     = full_processed.nunique(dropna=False)
cols_to_drop = list(
    missing_frac[missing_frac > MISSING_frac_DROP].index.union(
        nunique[nunique <= 1].index
    )
)
if cols_to_drop:
    full_processed = full_processed.drop(columns=cols_to_drop)
    numeric_cols = [c for c in numeric_cols if c not in cols_to_drop]
    cat_cols_initial = [c for c in cat_cols_initial if c not in cols_to_drop]

winsorized_numeric_cols = [c for c in numeric_cols if c in full_processed.columns]
if winsorized_numeric_cols:
    lower_q, upper_q = 0.01, 0.99
    lower_bounds = full_processed[winsorized_numeric_cols].quantile(lower_q)
    upper_bounds = full_processed[winsorized_numeric_cols].quantile(upper_q)
    X_winsorized_part = full_processed[winsorized_numeric_cols].copy()

    for col in X_winsorized_part.columns:
        x = X_winsorized_part[col].to_numpy(dtype=float, copy=True)
        lo = lower_bounds.get(col, np.nan)
        hi = upper_bounds.get(col, np.nan)
        if np.isfinite(lo):
            x = np.where(np.isfinite(x), np.maximum(x, lo), x)
        if np.isfinite(hi):
            x = np.where(np.isfinite(x), np.minimum(x, hi), x)
        X_winsorized_part[col] = x
    full_processed[winsorized_numeric_cols] = X_winsorized_part

te_cols = []
y_series_for_te = pd.Series(y, index=range(len(X_raw)))
cat_cols_current = [c for c in full_processed.columns
                    if c not in numeric_cols and c not in [TARGET_COL, ID_COL]]

for col in cat_cols_current:
    if col in full_processed.columns:
        train_vals = full_processed[col].iloc[:len(X_raw)].astype(str)
        stats = y_series_for_te.groupby(train_vals).agg(["mean", "count"])
        global_mean = y_series_for_te.mean()
        smooth = (stats["mean"] * stats["count"] + SMOOTHING_K * global_mean) / (
            stats["count"] + SMOOTHING_K
        )
        te_feature_name = col + "_te"
        full_processed[te_feature_name] = (
            full_processed[col].astype(str).map(smooth).fillna(global_mean)
        )
        te_cols.append(te_feature_name)
        numeric_cols.append(te_feature_name)

numeric_corr = {}
base_num_cols_for_corr = [
    c for c in numeric_cols
    if not c.endswith(("_te", "_log1p", "_bin")) and c in X_raw.columns
]
for col in base_num_cols_for_corr:
    s = full_processed[col].iloc[:len(X_raw)]
    mask = np.isfinite(s.values) & np.isfinite(y)
    if mask.sum() < 50:
        continue
    try:
        c = np.corrcoef(s.values[mask], y[mask])[0, 1]
        if np.isfinite(c):
            numeric_corr[col] = abs(c)
    except:
        continue

numeric_corr_sorted = sorted(numeric_corr.items(),
                            key=lambda x: x[1], reverse=True)
top_poly_bin_cols_candidates = [c for c, _ in numeric_corr_sorted[:TOP_K_CORR]]

top_bin_cols = []
for col in top_poly_bin_cols_candidates:
    if col in numeric_cols and col in full_processed.columns:
        s_train = full_processed[col].iloc[:len(X_raw)]
        mask = np.isfinite(s_train.values)
        if mask.sum() < 100:
            continue
        valid_s = s_train[mask]
        try:
            num_bins = min(10, len(valid_s) // 5)
            if num_bins < 2:
                continue
            _, bin_edges = pd.qcut(valid_s, q=num_bins, retbins=True, duplicates="drop")
            if len(bin_edges) < 2:
                continue
            bin_col_name = col + "_bin"
            full_processed[bin_col_name] = pd.cut(
                full_processed[col], bins=bin_edges, include_lowest=True, duplicates="drop"
            )
            cat_cols_current.append(bin_col_name)
            top_bin_cols.append(bin_col_name)
        except:
            continue

In [6]:
poly_transformer = PolynomialFeatures(degree=DEGREE_POLY, include_bias=False)
poly_cols_to_transform = [
    c for c in top_poly_bin_cols_candidates
    if c in numeric_cols and c in full_processed.columns
]
poly_feature_names = []
if poly_cols_to_transform:
    poly_processing_pipeline = Pipeline(
        [("imputer", SimpleImputer(strategy="median")), ("poly", poly_transformer)]
    )
    poly_features_train = poly_processing_pipeline.fit(
        full_processed[poly_cols_to_transform].iloc[:len(X_raw)]
    ).transform(
        full_processed[poly_cols_to_transform].iloc[:len(X_raw)]
    )
    poly_feature_names = poly_transformer.get_feature_names_out(poly_cols_to_transform)

    linear_col_set = set(poly_cols_to_transform)
    keep_mask = [name not in linear_col_set for name in poly_feature_names]
    poly_features_train = poly_features_train[:, keep_mask]
    poly_feature_names = np.array(poly_feature_names)[keep_mask]

    poly_df = pd.DataFrame(
        poly_features_train,
        index=full_processed.iloc[:len(X_raw)].index,
        columns=poly_feature_names,
    )

    poly_features_test = poly_processing_pipeline.transform(
        full_processed[poly_cols_to_transform].iloc[len(X_raw):]
    )
    poly_features_test = poly_features_test[:, keep_mask]
    poly_df_test = pd.DataFrame(
        poly_features_test,
        index=full_processed.iloc[len(X_raw):].index,
        columns=poly_feature_names,
    )

    poly_df_full = pd.concat([poly_df, poly_df_test], axis=0).sort_index()

    full_processed = pd.concat([full_processed, poly_df_full], axis=1)
    numeric_cols.extend(poly_feature_names)

numeric_cols = [c for c in numeric_cols if c in full_processed.columns]
cat_cols_current = [c for c in cat_cols_current if c in full_processed.columns]
X   = full_processed.iloc[:len(X_raw)].reset_index(drop=True)
X_test = full_processed.iloc[len(X_raw):].reset_index(drop=True)

In [7]:
numeric_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ]
)

categorical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", min_frequency=MIN_FREQ_OHE)),
    ]
)

preprocessor = ColumnTransformer(
    [
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, cat_cols_current),
    ],
    remainder="drop",
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

print("\nРазмеры:")
print(f"train    {X_train.shape} {y_train.shape}")
print(f"validation {X_valid.shape} {y_valid.shape}")
print("\nРаспределение валидации:")
print(pd.Series(y_valid).value_counts(normalize=True))


Размеры:
train    (8389, 136) (8389,)
validation (2098, 136) (2098,)

Распределение валидации:
1.0    0.511916
0.0    0.488084
Name: proportion, dtype: float64


In [8]:
log_reg_pipe = Pipeline(
    [
        ("preprocess", preprocessor),
        (
            "clf",
            LogisticRegression(
                max_iter=2000,
                tol=1e-4,
                solver="saga",
                penalty="l2",
                n_jobs=4,
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)


sgd_pipe = Pipeline(
    [
        ("preprocess", preprocessor),
        (
            "clf",
            SGDClassifier(
                loss="log_loss",
                penalty="elasticnet",
                alpha=0.0005,
                l1_ratio=0.15,
                max_iter=1000,
                tol=1e-4,
                n_jobs=4,
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

svc_pipe_base = Pipeline(
    [
        ("preprocess", preprocessor),
        (
            "clf",
            LinearSVC(
                C=1.0,
                max_iter=10000,
                tol=1e-5,
                random_state=RANDOM_STATE,
                dual=False,          # <‑‑ ключ
            ),
        ),
    ]
)
svc_pipe = CalibratedClassifierCV(
    svc_pipe_base, cv=3, method="sigmoid", n_jobs=4
)

param_grid_lr = {
    "clf__C": [0.01, 0.1, 1.0],
    "clf__class_weight": [None, "balanced"],
}

print("\n=== GridSearchLR ===")
grid_search_lr = GridSearchCV(
    estimator=log_reg_pipe,
    param_grid=param_grid_lr,
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=1,
    verbose=1,
)

grid_search_lr.fit(X_train, y_train)

print("\nЛучшие параметры:", grid_search_lr.best_params_)
print("Лучший ROC‑AUC CV:", grid_search_lr.best_score_)

best_lr = grid_search_lr.best_estimator_
y_valid_proba_lr = best_lr.predict_proba(X_valid)[:, 1]
auc_lr = roc_auc_score(y_valid, y_valid_proba_lr)
print("\nLR ROC‑AUC на vaild:", auc_lr)
print(classification_report(y_valid, best_lr.predict(X_valid)))


=== GridSearchLR ===
Fitting 5 folds for each of 6 candidates, totalling 30 fits

Лучшие параметры: {'clf__C': 0.01, 'clf__class_weight': 'balanced'}
Лучший ROC‑AUC CV: 0.98511344836377

LR ROC‑AUC на vaild: 0.9814334919110801
              precision    recall  f1-score   support

         0.0       0.92      0.93      0.92      1024
         1.0       0.93      0.92      0.93      1074

    accuracy                           0.93      2098
   macro avg       0.93      0.93      0.93      2098
weighted avg       0.93      0.93      0.93      2098



In [9]:
print("\n=== Обучаем SGD ===")
sgd_pipe.fit(X_train, y_train)
y_valid_proba_sgd = sgd_pipe.predict_proba(X_valid)[:, 1]
auc_sgd = roc_auc_score(y_valid, y_valid_proba_sgd)
print("SGD ROC‑AUC:", auc_sgd)
print(classification_report(y_valid, sgd_pipe.predict(X_valid)))

print("\n=== Обучаем SVC (калибровка) ===")
svc_pipe.fit(X_train, y_train)
y_valid_proba_svc = svc_pipe.predict_proba(X_valid)[:, 1]
auc_svc = roc_auc_score(y_valid, y_valid_proba_svc)
print("SVC ROC‑AUC:", auc_svc)
print(classification_report(y_valid, svc_pipe.predict(X_valid)))

ensemble_valid_proba = (y_valid_proba_lr + y_valid_proba_sgd + y_valid_proba_svc) / 3
auc_ensemble = roc_auc_score(y_valid, ensemble_valid_proba)
print("\n=== Ансамбль ROC‑AUC ===", auc_ensemble)
print(classification_report(y_valid, (ensemble_valid_proba >= 0.5).astype(int)))

print("\n=== Финальное обучение на всей данных и предсказание теста ===")
best_lr.fit(X, y)
sgd_pipe.fit(X, y)
svc_pipe.fit(X, y)

test_p1 = best_lr.predict_proba(X_test)[:, 1]
test_p2 = sgd_pipe.predict_proba(X_test)[:, 1]
test_p3 = svc_pipe.predict_proba(X_test)[:, 1]
test_proba = (test_p1 + test_p2 + test_p3) / 3

submission = pd.DataFrame({
    "ID": test[ID_COL] if ID_COL is not None else np.arange(len(test_proba)),
    TARGET_COL: test_proba,
})
submission.to_csv("submission.csv", index=False)
print("\nФайл submission.csv сохранён")
print(submission.head())


=== Обучаем SGD ===
SGD ROC‑AUC: 0.9811407050162941
              precision    recall  f1-score   support

         0.0       0.92      0.93      0.92      1024
         1.0       0.93      0.92      0.93      1074

    accuracy                           0.93      2098
   macro avg       0.93      0.93      0.93      2098
weighted avg       0.93      0.93      0.93      2098


=== Обучаем SVC (калибровка) ===
SVC ROC‑AUC: 0.9818108414804471
              precision    recall  f1-score   support

         0.0       0.92      0.93      0.93      1024
         1.0       0.93      0.92      0.93      1074

    accuracy                           0.93      2098
   macro avg       0.93      0.93      0.93      2098
weighted avg       0.93      0.93      0.93      2098


=== Ансамбль ROC‑AUC === 0.9824727944599628
              precision    recall  f1-score   support

         0.0       0.92      0.93      0.92      1024
         1.0       0.93      0.92      0.93      1074

    accuracy      