In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split

In [8]:
# Input CSV files, given as relative paths.
train_csv = "train.csv"
test_csv  = "test.csv"
# Name of the regression target column in the training data.
TARGET_COLUMN = 'RiskScore'

# Training rows whose absolute target value exceeds this cap are dropped
# while loading; None disables the filter.
abs_cap = 900.0
# Number of features, ranked by absolute correlation with the target, that
# feed the polynomial expansion and the pairwise combination features.
top_k = 12
# Degree passed to PolynomialFeatures.
degree = 3
# Symmetric clip limit applied to the pairwise combination features.
clip_val = 13.0
# Constant added to the denominators of the pairwise ratio features.
# NOTE(review): 1e-21 is far below float64 resolution for values of order 1,
# so it presumably only has an effect when a denominator is (almost) exactly
# zero - confirm this is the intended magnitude.
eps = 1e-21

In [9]:
print("--- Этап 1: Загрузка и предварительная обработка данных ---")

# Load the raw training data and keep only rows with a usable target:
# present, finite and (optionally) within +/- abs_cap.
df = pd.read_csv(train_csv)

df[TARGET_COLUMN] = pd.to_numeric(df[TARGET_COLUMN])
df = df.dropna(subset=[TARGET_COLUMN])
df = df[np.isfinite(df[TARGET_COLUMN])]

if abs_cap is not None:
    df = df[df[TARGET_COLUMN].abs() <= abs_cap]

# The frame above is the result of boolean filtering. Take an explicit copy
# before writing a column so the assignment is unambiguous and pandas does not
# flag it as a write to a slice of another frame.
df = df.copy()
df[TARGET_COLUMN] = np.clip(df[TARGET_COLUMN], -200, 200)

# Identifier and date columns are not used as predictors; drop whichever exist.
df = df.drop(columns=['ID', 'ApplicationDate'], errors='ignore')

print("Исходный размер обучающей выборки:", df.shape)

# Columns treated as categorical (label-encoded later instead of scaled).
cat_cols = ["MaritalStatus", "HomeOwnershipStatus", "LoanPurpose", "EmploymentStatus", "EducationLevel"]

# Numeric predictors: every numeric column except the target, with +/-inf
# turned into NaN so that they are handled by the imputer.
X_numeric_initial = (
    df.select_dtypes(include=[np.number])
    .drop(columns=[TARGET_COLUMN], errors='ignore')
    .replace([np.inf, -np.inf], np.nan)
)
X_categorical_initial = df[cat_cols].copy()

# Winsorization limits: per-column 1st and 99th percentiles of the training data.
lower_q, upper_q = 0.01, 0.99
lower_bounds = X_numeric_initial.quantile(lower_q)
upper_bounds = X_numeric_initial.quantile(upper_q)

# Pull every finite value into [lower bound, upper bound]; non-finite entries
# (NaN) are left as they are, and a non-finite bound disables that side.
X_winsorized = X_numeric_initial.copy()
for feature_name in X_winsorized.columns:
    values = X_winsorized[feature_name].to_numpy(dtype=float, copy=True)
    finite_rows = np.isfinite(values)
    limits = (
        (lower_bounds.get(feature_name, np.nan), np.maximum),
        (upper_bounds.get(feature_name, np.nan), np.minimum),
    )
    for limit, keep_within in limits:
        if np.isfinite(limit):
            values[finite_rows] = keep_within(values[finite_rows], limit)
    X_winsorized[feature_name] = values

# Median imputation fitted on the winsorized training features; the fitted
# imputer is reused for the test data.
imputer = SimpleImputer(strategy="median")
imputed_values = imputer.fit_transform(X_winsorized)
X_imputed = pd.DataFrame(imputed_values, columns=X_winsorized.columns, index=X_winsorized.index)

# Standardization fitted on the imputed training features; also reused for test.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X_imputed)
X_scaled = pd.DataFrame(scaled_values, columns=X_imputed.columns, index=X_imputed.index)

# Label-encode every object-typed categorical column (values are compared as
# strings) and keep the fitted encoders so the test data can reuse them.
label_encoders = {}
object_columns = X_categorical_initial.select_dtypes(include=['object']).columns.tolist()
for column in object_columns:
    column_as_text = X_categorical_initial[column].astype(str)
    le = LabelEncoder().fit(column_as_text.unique())
    X_categorical_initial[column] = le.transform(column_as_text)
    label_encoders[column] = le

# Scaled numeric block + encoded categorical block + target, aligned on the index.
df_processed = pd.concat([X_scaled, X_categorical_initial], axis=1)
df_processed[TARGET_COLUMN] = df[TARGET_COLUMN]

print("Исходная предобработка завершена. df_processed инициализирован.")

--- Этап 1: Загрузка и предварительная обработка данных ---
Исходный размер обучающей выборки: (10272, 34)
Исходная предобработка завершена. df_processed инициализирован.


In [10]:
print("\n--- Этап 2: Создание новых признаков ---")

def create_financial_stability_features(df_input):
    """Return a copy of ``df_input`` extended with financial-stability ratios.

    The input frame is never modified. If any of the nine source columns is
    missing, the copy is returned without new columns. Otherwise seven columns
    are appended: FinancialIndependenceRatio, AssetDebtCoverage,
    FinancialCushion, MonthsOfSavings, LiquidityRisk, NetCashFlow and
    CashFlowAdequacy. Denominators are offset by +1.
    """
    enriched = df_input.copy()
    needed_columns = (
        'NetWorth', 'TotalLiabilities', 'TotalAssets', 'AnnualIncome',
        'SavingsAccountBalance', 'CheckingAccountBalance', 'MonthlyDebtPayments',
        'MonthlyLoanPayment', 'MonthlyIncome',
    )
    if any(name not in enriched.columns for name in needed_columns):
        return enriched

    # Shared building blocks.
    liabilities_plus_one = enriched['TotalLiabilities'] + 1
    liquid_balance = enriched['SavingsAccountBalance'] + enriched['CheckingAccountBalance']
    debt_payment = enriched['MonthlyDebtPayments']
    loan_payment = enriched['MonthlyLoanPayment']
    monthly_income = enriched['MonthlyIncome']

    enriched['FinancialIndependenceRatio'] = (enriched['NetWorth'] + 1) / liabilities_plus_one
    enriched['AssetDebtCoverage'] = enriched['TotalAssets'] / liabilities_plus_one
    enriched['FinancialCushion'] = liquid_balance / (enriched['AnnualIncome'] / 12 + 1)
    enriched['MonthsOfSavings'] = liquid_balance / (debt_payment + 1)
    enriched['LiquidityRisk'] = (debt_payment + loan_payment) / (monthly_income + 1)
    enriched['NetCashFlow'] = monthly_income - debt_payment - loan_payment
    enriched['CashFlowAdequacy'] = enriched['NetCashFlow'] / (loan_payment + 1)
    return enriched

def create_debt_risk_features(df_input):
    """Return a copy of ``df_input`` extended with debt-burden indicators.

    The input frame is never modified. If any of the nine source columns is
    missing, the copy is returned without new columns. Otherwise six columns
    are appended: TotalDebtBurden, TotalDebtToIncome, DebtTrapRisk,
    DefaultRiskScore, NewLoanBurden and LoanToIncomeAnnual.
    """
    enriched = df_input.copy()
    needed_columns = (
        'MonthlyDebtPayments', 'MonthlyLoanPayment', 'MonthlyIncome',
        'TotalLiabilities', 'TotalAssets', 'CreditScore', 'DebtToIncomeRatio',
        'LoanAmount', 'AnnualIncome',
    )
    if any(name not in enriched.columns for name in needed_columns):
        return enriched

    loan_payment = enriched['MonthlyLoanPayment']
    income_plus_one = enriched['MonthlyIncome'] + 1
    # Share of the 850-point scale that the credit score falls short by.
    credit_score_gap = (850 - enriched['CreditScore']) / 850

    enriched['TotalDebtBurden'] = enriched['MonthlyDebtPayments'] + loan_payment
    enriched['TotalDebtToIncome'] = enriched['TotalDebtBurden'] / income_plus_one
    enriched['DebtTrapRisk'] = enriched['TotalLiabilities'] / (enriched['TotalAssets'] + 1)
    enriched['DefaultRiskScore'] = credit_score_gap * enriched['DebtToIncomeRatio']
    enriched['NewLoanBurden'] = loan_payment / income_plus_one
    enriched['LoanToIncomeAnnual'] = enriched['LoanAmount'] / (enriched['AnnualIncome'] + 1)
    return enriched

def add_comprehensive_binning_features(df_input):
    """Return a copy of ``df_input`` extended with ordinal ``*_Binned`` scores.

    Steps, all applied to a copy of the input:
    1. If none of the trigger columns is present, return the copy unchanged.
    2. Every source column that is missing is created and filled with 0.
    3. Ten ``*_Binned`` columns are derived with ``pd.cut`` (right-closed
       intervals). Values outside the bin edges become NaN and are replaced by
       a per-feature default score; ``DebtToIncome_Binned`` has no default.
    4. Any column that still contains NaN is filled with its own median when
       numeric, otherwise with 0.
    """
    binned = df_input.copy()

    trigger_columns = (
        'DebtToIncomeRatio', 'LoanAmount', 'AnnualIncome', 'SavingsAccountBalance',
        'TotalAssets', 'TotalLiabilities', 'NetWorth', 'CreditCardUtilizationRate',
        'NumberOfOpenCreditLines', 'NumberOfCreditInquiries', 'LengthOfCreditHistory',
        'PaymentHistory', 'MonthlyIncome', 'CreditScore',
    )
    if not any(name in binned.columns for name in trigger_columns):
        return binned

    # Guarantee that every column read below exists (absent ones become 0).
    source_columns = (
        'AnnualIncome', 'MonthlyIncome', 'TotalLiabilities', 'TotalAssets',
        'LoanAmount', 'DebtToIncomeRatio', 'SavingsAccountBalance', 'NetWorth',
        'CreditCardUtilizationRate', 'NumberOfOpenCreditLines',
        'NumberOfCreditInquiries', 'LengthOfCreditHistory', 'PaymentHistory',
    )
    for name in source_columns:
        if name not in binned.columns:
            binned[name] = 0

    def _score(values, edges, labels, fallback=None):
        # Map values to ordinal float scores; out-of-range values get `fallback`
        # (or stay NaN when no fallback is given).
        scores = pd.cut(values, bins=edges, labels=labels).astype(float)
        return scores if fallback is None else scores.fillna(fallback)

    income_plus_one = binned['AnnualIncome'] + 1
    liabilities_plus_one = binned['TotalLiabilities'] + 1
    history_plus_one = binned['LengthOfCreditHistory'] + 1

    binned['DebtToIncome_Binned'] = _score(
        binned['DebtToIncomeRatio'], [0, 0.2, 0.4, 0.6, 0.8, 1.0], [1, 2, 3, 4, 5])
    binned['LoanToIncome_Binned'] = _score(
        binned['LoanAmount'] / income_plus_one, [0, 0.5, 1.0, 1.5, 2.0, 10], [1, 2, 3, 4, 5], 3)
    binned['SavingsRatio_Binned'] = _score(
        binned['SavingsAccountBalance'] / income_plus_one, [0, 0.1, 0.25, 0.5, 1.0, 100], [5, 4, 3, 2, 1], 3)
    binned['AssetCoverage_Binned'] = _score(
        binned['TotalAssets'] / liabilities_plus_one, [0, 0.5, 1.0, 2.0, 5.0, 1000], [5, 4, 3, 2, 1], 3)
    binned['NetWorthRatio_Binned'] = _score(
        binned['NetWorth'] / income_plus_one, [-1000, 0, 0.5, 1.0, 2.0, 1000], [5, 4, 3, 2, 1], 3)
    binned['CreditUtilization_Binned'] = _score(
        binned['CreditCardUtilizationRate'], [0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0], [1, 2, 3, 4, 5, 6], 3)
    binned['CreditLines_Binned'] = _score(
        binned['NumberOfOpenCreditLines'], [0, 1, 3, 5, 7, 10, 100], [1, 2, 3, 4, 5, 6], 3)
    binned['CreditInquiries_Binned'] = _score(
        binned['NumberOfCreditInquiries'], [-1, 0, 1, 2, 3, 5, 100], [1, 2, 3, 4, 5, 6], 2)
    binned['CreditHistoryLength_Binned'] = _score(
        binned['LengthOfCreditHistory'], [0, 2, 5, 7, 10, 15, 100], [6, 5, 4, 3, 2, 1], 4)
    binned['PaymentQuality_Binned'] = _score(
        binned['PaymentHistory'] / history_plus_one, [0, 0.5, 0.7, 0.85, 0.95, 1.0], [5, 4, 3, 2, 1], 3)

    # Final sweep: no column may keep missing values.
    for name in binned.columns:
        column = binned[name]
        if not column.isnull().any():
            continue
        replacement = column.median() if pd.api.types.is_numeric_dtype(column) else 0
        binned[name] = column.fillna(replacement)

    return binned


def add_log_transformations(df_input, cols_to_log, shifts=None):
    """Return a copy of ``df_input`` with a ``Log_<col>`` column per requested column.

    For a column whose observed (non-missing) values are all non-negative the
    new column is ``log1p(col)``. Otherwise the column is first shifted by
    ``abs(min) + 1`` so that its smallest value maps to 1, and the new column
    is ``log1p(col + shift)``. Missing values stay missing.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Source frame; it is not modified.
    cols_to_log : iterable of str
        Columns to transform. Names absent from ``df_input`` are skipped.
    shifts : dict, optional
        Mapping ``column -> shift`` used to apply the same transform to several
        frames (for example train and test). A column already present in the
        mapping reuses the stored shift, and shifted values are floored at 0 so
        that values below the original minimum cannot produce NaN. A column not
        yet present has its shift derived from ``df_input`` and stored in the
        mapping. With the default ``None`` every shift is derived from
        ``df_input``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_input`` with the ``Log_*`` columns appended.
    """
    df_log = df_input.copy()
    for col in cols_to_log:
        if col not in df_log.columns:
            continue

        column = df_log[col]
        reuse_stored_shift = shifts is not None and col in shifts
        if reuse_stored_shift:
            shift = shifts[col]
        else:
            # Ignore NaN when testing the sign: comparing NaN with 0 is False,
            # which would otherwise push a non-negative column with missing
            # values into the shifted branch.
            observed = column.dropna()
            if (observed >= 0).all():
                shift = 0
            else:
                shift = abs(observed.min()) + 1
            if shifts is not None:
                shifts[col] = shift

        values = column if shift == 0 else column + shift
        if reuse_stored_shift:
            # A stored shift may be too small for this frame; keep log1p defined.
            values = values.clip(lower=0)
        df_log[f'Log_{col}'] = np.log1p(values)
    return df_log

# Run the feature builders in order; each one returns a new, extended frame.
feature_builders = (
    create_financial_stability_features,
    create_debt_risk_features,
    add_comprehensive_binning_features,
)
for build_features in feature_builders:
    df_processed = build_features(df_processed)

# Every numeric column except the target gets a log-transformed companion.
all_numeric_cols_after_feature_eng = df_processed.select_dtypes(include=np.number).columns.tolist()
excluded_from_log = (TARGET_COLUMN, 'ApplicationDate')
numeric_cols_for_log = [
    name for name in all_numeric_cols_after_feature_eng if name not in excluded_from_log
]

df_processed = add_log_transformations(df_processed, numeric_cols_for_log)

print("Новые признаки добавлены.")


--- Этап 2: Создание новых признаков ---
Новые признаки добавлены.


In [11]:
print("\n--- Этап 3: Создание и отбор признаков ---")

# Separate predictors from the target.
y = df_processed[TARGET_COLUMN]
X = df_processed.drop(columns=[TARGET_COLUMN])

X_numeric_for_corr = X.select_dtypes(include=np.number)

# Rank numeric predictors by absolute correlation with the target and keep the
# top_k names as the basis for polynomial and pairwise features.
can_rank_features = (not X_numeric_for_corr.empty) and TARGET_COLUMN in df_processed.columns
if can_rank_features:
    correlation_with_target = (
        X_numeric_for_corr.corrwith(y).abs().sort_values(ascending=False)
    )
    top_features_for_poly = correlation_with_target.index[:top_k].tolist()
    print(f"Топ-{top_k} признаков по корреляции с {TARGET_COLUMN}: {top_features_for_poly}")
else:
    print(f"Предупреждение: Числовые признаки отсутствуют или целевая колонка '{TARGET_COLUMN}' отсутствует. Топ-признаки для полиномов не определены.")
    top_features_for_poly = []

# Polynomial expansion of the top features. X_poly stays an empty frame (index
# only) unless an expansion is actually built.
# NOTE(review): with include_bias=False the expansion also contains the
# degree-1 terms under the original feature names, so X_poly repeats labels
# that already exist in X - confirm that downstream code tolerates duplicate
# column labels.
X_poly = pd.DataFrame(index=X.index)
if top_features_for_poly:
    existing_top_features = [name for name in top_features_for_poly if name in X.columns]
    if existing_top_features:
        poly_transformer = PolynomialFeatures(degree=degree, include_bias=False)
        poly_transformer.fit(X[existing_top_features])
        poly_values = poly_transformer.transform(X[existing_top_features])
        poly_names = poly_transformer.get_feature_names_out(existing_top_features)
        X_poly = pd.DataFrame(poly_values, columns=poly_names, index=X.index)
        print(f"Создано {X_poly.shape[1]} полиномиальных признаков.")
    else:
        print("Предупреждение: Нет существующих признаков для создания полиномов.")

# Aliases of the configuration values used by the pairwise-combination code.
clip_limit, epsilon = clip_val, eps

def make_safe(arr, clip_limit=None, epsilon=1e-6):
    """Return a finite, optionally clipped float copy of ``arr``.

    Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 first; the result
    is then clipped to ``[-clip_limit, clip_limit]`` when ``clip_limit`` is
    given.

    Parameters
    ----------
    arr : array-like
        Input values; the caller's object is never modified.
    clip_limit : float, optional
        Symmetric clipping bound; ``None`` disables clipping.
    epsilon : float, optional
        Unused; kept so existing call sites keep working.

    Returns
    -------
    numpy.ndarray
        New float array with the same shape as ``arr``.
    """
    # np.array always copies. np.asarray would hand back the caller's own
    # buffer for float64 input, and the in-place zeroing below would then
    # silently overwrite the caller's data.
    safe = np.array(arr, dtype=float)
    safe[~np.isfinite(safe)] = 0.0
    if clip_limit is not None:
        safe = np.clip(safe, -clip_limit, clip_limit)
    return safe

# Pairwise interaction features between the top-ranked predictors: product,
# sum, difference and both ratios, each passed through make_safe.
features_for_combinations = [name for name in top_features_for_poly if name in X.columns]
new_features_list = []
feature_names_for_comb = []

if len(features_for_combinations) >= 2:
    for fa, fb in combinations(features_for_combinations, 2):
        if not (fa in X.columns and fb in X.columns):
            continue
        xa, xb = X[fa].values, X[fb].values

        # (column name, raw values) in the order the columns are emitted.
        pair_features = (
            (f"{fa}__x__{fb}", xa * xb),
            (f"{fa}__plus__{fb}", xa + xb),
            (f"{fa}__minus__{fb}", xa - xb),
            (f"{fa}__div__{fb}", xa / (xb + epsilon)),
            (f"{fb}__div__{fa}", xb / (xa + epsilon)),
        )
        for feature_name, raw_values in pair_features:
            new_features_list.append(make_safe(raw_values, clip_limit, epsilon))
            feature_names_for_comb.append(feature_name)

    combined_values = np.array(new_features_list).T
    X_combined = pd.DataFrame(combined_values, columns=feature_names_for_comb, index=X.index)
    print(f"Создано {X_combined.shape[1]} комбинированных признаков.")
else:
    X_combined = pd.DataFrame(index=X.index)

# Base features + polynomial terms + pairwise combinations, side by side.
X_all_features = pd.concat([X, X_poly, X_combined], axis=1)


--- Этап 3: Создание и отбор признаков ---
Топ-12 признаков по корреляции с RiskScore: ['Log_MonthlyIncome', 'Log_AnnualIncome', 'MonthlyIncome', 'AnnualIncome', 'CreditScore', 'Log_InterestRate', 'InterestRate', 'BaseInterestRate', 'Log_CreditScore', 'Log_BaseInterestRate', 'Log_TotalDebtToIncomeRatio', 'TotalDebtToIncomeRatio']
Создано 454 полиномиальных признаков.
Создано 330 комбинированных признаков.


In [12]:
print("\n--- Этап 4: Отбор признаков ---")

# Filter 1: drop numeric features whose absolute correlation with the target is
# below low_corr_threshold.
low_corr_threshold = 0.03
if not X_all_features.select_dtypes(include=np.number).empty and TARGET_COLUMN in df_processed.columns:
    X_all_features_numeric = X_all_features.select_dtypes(include=np.number)
    correlations = X_all_features_numeric.corrwith(df_processed[TARGET_COLUMN]).abs()
    # A NaN correlation compares False against the threshold, so such features
    # are NOT listed as low-correlation here and are kept.
    low_corr_features = correlations[correlations < low_corr_threshold].index.tolist()
    print(f"Удаление {len(low_corr_features)} признаков с корреляцией < {low_corr_threshold}.")
    # Index.difference returns unique labels, sorted when possible, so the
    # surviving numeric columns are re-ordered by name at this point.
    # NOTE(review): if X_all_features carries duplicate column labels, selecting
    # by these de-duplicated names presumably brings every copy back - confirm.
    features_to_keep_low_corr = X_all_features_numeric.columns.difference(low_corr_features).tolist()
    # Non-numeric columns are appended after the kept numeric ones.
    X_all_features = X_all_features[features_to_keep_low_corr + X_all_features.select_dtypes(exclude=np.number).columns.tolist()]
else:
    print(f"Предупреждение: Числовые признаки отсутствуют или целевая колонка '{TARGET_COLUMN}' отсутствует. Пропуск отбора по низкой корреляции.")


def get_highly_correlated_features_to_remove(df, corr_threshold=0.999, target_corr_threshold=0.22, target_column='RiskScore'):
    """Pick one feature to drop from every pair of highly correlated features.

    Works on the absolute correlation matrix of the numeric columns of ``df``
    (which must include ``target_column``). Every pair of non-target columns
    whose absolute correlation exceeds ``corr_threshold`` is examined once, and
    one member of the pair is marked for removal:

    * both target correlations below ``target_corr_threshold``: the member
      with the smaller target correlation is removed (on a tie, the member
      that comes earlier in column order);
    * exactly one below ``target_corr_threshold``: that member is removed;
    * both at or above ``target_corr_threshold``: the member whose name
      compares smaller under ``<`` is removed.

    If either target correlation is NaN none of the comparisons holds and
    nothing is removed for that pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Features together with the target column.
    corr_threshold : float
        Absolute feature-to-feature correlation above which a pair is treated
        as redundant.
    target_corr_threshold : float
        Absolute feature-to-target correlation separating the cases above.
    target_column : str
        Name of the target column inside ``df``.

    Returns
    -------
    list
        Column names to drop, in no particular order. Empty when the target
        column is missing or ``df`` has no numeric columns.
    """
    if target_column not in df.columns or df.select_dtypes(include=np.number).empty:
        return []

    df_numeric = df.select_dtypes(include=np.number)
    if df_numeric.empty:
        return []

    corr_matrix = df_numeric.corr().abs()

    # The target is absent from the matrix when it is not numeric.
    if target_column not in corr_matrix.columns:
        return []

    # Absolute correlation of every feature with the target.
    target_correlations = corr_matrix[target_column].drop(target_column, errors='ignore')

    # Keep only the entries strictly above the diagonal so that each pair is
    # visited once; everything else becomes NaN.
    upper_triangle = corr_matrix.where(np.triu(np.ones_like(corr_matrix, dtype=bool), k=1))
    columns_to_remove = set()

    for col in upper_triangle.columns:
        if col == target_column: continue

        col_target_corr_val = target_correlations.get(col, 0.0)
        # A lookup that does not return a plain number is treated as 0.0.
        # NOTE(review): this presumably happens when a label is duplicated and
        # the lookup returns a Series - confirm.
        if not isinstance(col_target_corr_val, (int, float)):
            col_target_corr_val = 0.0

        # Rows (earlier columns) whose correlation with `col` exceeds the threshold.
        high_corr_with_col = upper_triangle[col][upper_triangle[col] > corr_threshold]

        for correlated_col in high_corr_with_col.index:
            if correlated_col == target_column: continue

            correlated_col_target_corr_val = target_correlations.get(correlated_col, 0.0)
            if not isinstance(correlated_col_target_corr_val, (int, float)):
                correlated_col_target_corr_val = 0.0

            # Both weakly related to the target: drop the weaker one.
            if col_target_corr_val < target_corr_threshold and correlated_col_target_corr_val < target_corr_threshold:
                if col_target_corr_val < correlated_col_target_corr_val:
                    columns_to_remove.add(col)
                else:
                    columns_to_remove.add(correlated_col)
            # Exactly one weakly related to the target: drop that one.
            elif col_target_corr_val < target_corr_threshold and correlated_col_target_corr_val >= target_corr_threshold:
                columns_to_remove.add(col)
            elif col_target_corr_val >= target_corr_threshold and correlated_col_target_corr_val < target_corr_threshold:
                columns_to_remove.add(correlated_col)
            # Both strongly related to the target: break the tie by name.
            elif col_target_corr_val >= target_corr_threshold and correlated_col_target_corr_val >= target_corr_threshold:
                 if col < correlated_col:
                     columns_to_remove.add(col)
                 else:
                     columns_to_remove.add(correlated_col)
    return list(columns_to_remove)
    
# Filter 2: among feature pairs with |correlation| > 0.98, drop one member.
target_is_numeric = (
    TARGET_COLUMN in df_processed.columns
    and pd.api.types.is_numeric_dtype(df_processed[TARGET_COLUMN])
)
if not target_is_numeric:
    print(f"Предупреждение: Целевая колонка '{TARGET_COLUMN}' отсутствует или не является числовой. Пропуск отбора высоко коррелирующих признаков.")
else:
    # The helper needs the features and the target in a single frame.
    df_for_high_corr_check = pd.concat([X_all_features, df_processed[[TARGET_COLUMN]]], axis=1)
    highly_correlated_to_remove = get_highly_correlated_features_to_remove(
        df_for_high_corr_check,
        corr_threshold=0.98,
    )
    print(f"Удаление {len(highly_correlated_to_remove)} высоко коррелирующих признаков.")
    X_all_features = X_all_features.drop(columns=highly_correlated_to_remove, errors='ignore')


print(f"Финальное количество признаков: {X_all_features.shape[1]}")


--- Этап 4: Отбор признаков ---
Удаление 77 признаков с корреляцией < 0.03.
Удаление 558 высоко коррелирующих признаков.
Финальное количество признаков: 249


In [13]:
print("\n--- Этап 5: Масштабирование финальных признаков ---")
# Standardize the selected numeric features; non-numeric columns pass through.
X_all_features_numeric = X_all_features.select_dtypes(include=np.number)
X_all_features_categorical = X_all_features.select_dtypes(exclude=np.number)

if X_all_features_numeric.empty:
    X_scaled_final = X_all_features_categorical
    print("Нет числовых признаков для финального масштабирования.")
else:
    # The fitted scaler is kept so the test features can be transformed the same way.
    final_scaler = StandardScaler()
    scaled_train_values = final_scaler.fit_transform(X_all_features_numeric)
    X_scaled_final_numeric = pd.DataFrame(
        scaled_train_values,
        columns=X_all_features_numeric.columns,
        index=X_all_features_numeric.index,
    )
    X_scaled_final = pd.concat([X_scaled_final_numeric, X_all_features_categorical], axis=1)
    print("Финальные числовые признаки масштабированы StandardScaler.")


--- Этап 5: Масштабирование финальных признаков ---
Финальные числовые признаки масштабированы StandardScaler.


In [14]:
print("\n--- Этап 6: Подготовка тестовых данных ---")
df_test = pd.read_csv(test_csv)

# Remember the IDs for the submission file, then drop the non-predictor columns.
test_ids = df_test['ID'].copy() if 'ID' in df_test.columns else None
df_test = df_test.drop(columns=['ID', 'ApplicationDate'], errors='ignore')

df_test = df_test.replace([np.inf, -np.inf], np.nan)

X_numeric_test_initial = df_test.select_dtypes(include=[np.number])
X_categorical_test_initial = df_test[cat_cols].copy()

# Winsorize with the limits learned on the training data: finite values are
# pulled into [lower bound, upper bound], NaN is left for the imputer.
X_winsorized_test = X_numeric_test_initial.copy()
for feature_name in X_winsorized_test.columns:
    values = X_winsorized_test[feature_name].to_numpy(dtype=float, copy=True)
    finite_rows = np.isfinite(values)
    limits = (
        (lower_bounds.get(feature_name, np.nan), np.maximum),
        (upper_bounds.get(feature_name, np.nan), np.minimum),
    )
    for limit, keep_within in limits:
        if np.isfinite(limit):
            values[finite_rows] = keep_within(values[finite_rows], limit)
    X_winsorized_test[feature_name] = values

# Reuse the imputer and scaler fitted on the training data.
imputed_test_values = imputer.transform(X_winsorized_test)
X_imputed_test = pd.DataFrame(
    imputed_test_values,
    columns=X_winsorized_test.columns,
    index=X_winsorized_test.index,
)

scaled_test_values = scaler.transform(X_imputed_test)
X_scaled_test = pd.DataFrame(
    scaled_test_values,
    columns=X_imputed_test.columns,
    index=X_imputed_test.index,
)

# Encode the test categoricals with the encoders fitted on the training data.
# Categories that were never seen during training are encoded as -1.
for column in X_categorical_test_initial.columns.tolist():
    if column in label_encoders:
        le = label_encoders[column]
        test_column_str = X_categorical_test_initial[column].astype(str).to_numpy()

        unknown_mask = ~np.isin(test_column_str, le.classes_)

        # LabelEncoder.transform raises ValueError on unseen labels, so it must
        # only ever be given the known values; unknown rows keep the -1 default.
        processed_column = np.full(len(test_column_str), -1, dtype=np.int64)
        known_mask = ~unknown_mask
        if known_mask.any():
            processed_column[known_mask] = le.transform(test_column_str[known_mask])
        X_categorical_test_initial[column] = processed_column

X_test_processed = pd.concat([X_scaled_test, X_categorical_test_initial], axis=1)

# Same feature builders, in the same order, as for the training data.
df_test_processed = X_test_processed
for build_features in (
    create_financial_stability_features,
    create_debt_risk_features,
    add_comprehensive_binning_features,
):
    df_test_processed = build_features(df_test_processed)

df_test_processed = add_log_transformations(df_test_processed, numeric_cols_for_log)

# Polynomial terms from the transformer fitted on the training data. X_test_poly
# stays an empty frame when the top features or the transformer are unavailable.
X_test_poly = pd.DataFrame(index=df_test_processed.index)
top_features_available = bool(top_features_for_poly) and all(
    name in df_test_processed.columns for name in top_features_for_poly
)
if top_features_available and 'poly_transformer' in locals() and poly_transformer is not None:
    X_test_poly = pd.DataFrame(
        poly_transformer.transform(df_test_processed[top_features_for_poly]),
        columns=poly_transformer.get_feature_names_out(top_features_for_poly),
        index=df_test_processed.index,
    )

# Pairwise combination features, in the same pair and column order as for
# the training data.
X_test_combined = pd.DataFrame(index=df_test_processed.index)
if len(features_for_combinations) >= 2:
    new_features_test_list = []
    n_test_rows = len(df_test_processed)
    for fa, fb in combinations(features_for_combinations, 2):
        if fa not in df_test_processed.columns or fb not in df_test_processed.columns:
            # Keep the column layout aligned: five zero columns stand in for the pair.
            new_features_test_list.extend(np.zeros(n_test_rows) for _ in range(5))
            continue

        xa_test = df_test_processed[fa].values
        xb_test = df_test_processed[fb].values
        pair_values = (
            xa_test * xb_test,
            xa_test + xb_test,
            xa_test - xb_test,
            xa_test / (xb_test + epsilon),
            xb_test / (xa_test + epsilon),
        )
        new_features_test_list.extend(
            make_safe(raw_values, clip_limit, epsilon) for raw_values in pair_values
        )

    if new_features_test_list:
        X_test_combined = pd.DataFrame(
            np.array(new_features_test_list).T,
            columns=feature_names_for_comb,
            index=df_test_processed.index,
        )

X_test_all_features = pd.concat([df_test_processed, X_test_poly, X_test_combined], axis=1)

# Align with the training feature set: features absent from the test frame are
# filled with 0, then the columns are put in the training order.
train_feature_columns = X_all_features.columns
missing_cols_in_test = set(train_feature_columns) - set(X_test_all_features.columns)
for c in missing_cols_in_test:
    X_test_all_features[c] = 0

X_test_all_features = X_test_all_features[train_feature_columns]


# Apply the scaler fitted on the training features to the numeric test features.
X_test_all_features_numeric = X_test_all_features.select_dtypes(include=np.number)
X_test_all_features_categorical = X_test_all_features.select_dtypes(exclude=np.number)

if X_test_all_features_numeric.empty:
    X_test_scaled_final = X_test_all_features_categorical
    print("Нет числовых признаков для финального масштабирования тестовых данных.")
elif 'final_scaler' in locals() and final_scaler is not None:
    scaled_test_final_values = final_scaler.transform(X_test_all_features_numeric)
    X_test_scaled_final_numeric = pd.DataFrame(
        scaled_test_final_values,
        columns=X_test_all_features_numeric.columns,
        index=X_test_all_features_numeric.index,
    )
    X_test_scaled_final = pd.concat([X_test_scaled_final_numeric, X_test_all_features_categorical], axis=1)
    print("Финальные тестовые числовые признаки масштабированы StandardScaler.")
else:
    # No fitted scaler available: fall back to the unscaled features.
    print("Предупреждение: final_scaler не был обучен. Пропуск масштабирования тестовых признаков.")
    X_test_scaled_final = X_test_all_features



--- Этап 6: Подготовка тестовых данных ---
Финальные тестовые числовые признаки масштабированы StandardScaler.


In [15]:
print("\n--- Этап 7: Обучение модели LinearRegression ---")

X_train_final = X_scaled_final
y_train_final = y

if X_train_final.empty:
    print("Ошибка: Обучающие признаки пусты. Модель не может быть обучена.")
else:
    # Hold out 20% of the rows for validation (fixed seed).
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_final, y_train_final, test_size=0.2, random_state=42
    )

    model = LinearRegression()
    model.fit(X_train, y_train)
    print("Модель LinearRegression обучена.")
    print("\n--- Этап 8: Оценка качества модели ---")

    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    # (separator, header, true values, predictions) for each reported split.
    reported_splits = (
        (None, "КАЧЕСТВО МОДЕЛИ НА ОБУЧАЮЩИХ ДАННЫХ (train_test_split):", y_train, y_train_pred),
        ("\n" + "="*50, "КАЧЕСТВО МОДЕЛИ НА ВАЛИДАЦИОННЫХ ДАННЫХ (train_test_split):", y_val, y_val_pred),
    )
    for separator, header, y_true_split, y_pred_split in reported_splits:
        if separator is not None:
            print(separator)
        print(header)
        split_mse = mean_squared_error(y_true_split, y_pred_split)
        print(f"R² score: {r2_score(y_true_split, y_pred_split):.4f}")
        print(f"MSE: {split_mse:.4f}")
        print(f"RMSE: {np.sqrt(split_mse):.4f}")
        print(f"MAE: {mean_absolute_error(y_true_split, y_pred_split):.4f}")



--- Этап 7: Обучение модели LinearRegression ---
Модель LinearRegression обучена.

--- Этап 8: Оценка качества модели ---
КАЧЕСТВО МОДЕЛИ НА ОБУЧАЮЩИХ ДАННЫХ (train_test_split):
R² score: 0.9154
MSE: 25.0979
RMSE: 5.0098
MAE: 3.7249

КАЧЕСТВО МОДЕЛИ НА ВАЛИДАЦИОННЫХ ДАННЫХ (train_test_split):
R² score: 0.9087
MSE: 27.3001
RMSE: 5.2250
MAE: 3.8214


In [18]:
print("\n--- Этап 9: Предсказание и создание submission ---")

# Predict on the prepared test features and write the submission file. This
# needs non-empty test features, a trained model and the test IDs.
test_features_ready = X_test_scaled_final is not None and not X_test_scaled_final.empty

if not test_features_ready:
    print("Ошибка: Тестовые признаки пусты или не были подготовлены. Предсказание невозможно.")
elif 'model' not in locals():
    print("Ошибка: Модель не была обучена. Предсказание невозможно.")
else:
    y_test_pred = model.predict(X_test_scaled_final)

    if test_ids is None:
        print("Предупреждение: Колонка 'ID' не найдена в тестовых данных. Файл submission не создан.")
    else:
        submission = pd.DataFrame({'ID': test_ids, 'RiskScore': y_test_pred})
        submission.to_csv('predictions.csv', index=False)
        print("Файл 'predictions.csv' успешно создан.")
        print("Первые 5 предсказаний:")
        print(submission.head())


# Compare the submission with the reference values in ex.csv, joined on ID.
# A missing file or any other error is reported and does not stop the notebook.
try:
    print("\n--- Этап 10: Оценка на тестовых данных (из ex.csv) ---")
    ex_csv_path = "ex.csv"
    df_ex = pd.read_csv(ex_csv_path)

    submission_available = 'submission' in locals() and not submission.empty
    if not submission_available:
        print("Файл submission не был создан или пуст. Оценка на ex.csv невозможна.")
    else:
        evaluation_df = pd.merge(df_ex, submission, on='ID', suffixes=('_true', '_pred'))

        has_both_scores = (
            'RiskScore_pred' in evaluation_df.columns
            and 'RiskScore_true' in evaluation_df.columns
        )
        if not has_both_scores:
            print("Не удалось найти колонки 'RiskScore_pred' и 'RiskScore_true' после объединения. Оценка на ex.csv невозможна.")
        else:
            y_true_eval = evaluation_df['RiskScore_true']
            y_pred_eval = evaluation_df['RiskScore_pred']
            eval_mse = mean_squared_error(y_true_eval, y_pred_eval)

            print(f"R² score: {r2_score(y_true_eval, y_pred_eval):.4f}")
            print(f"MSE: {eval_mse:.4f}")
            print(f"RMSE: {np.sqrt(eval_mse):.4f}")
            print(f"MAE: {mean_absolute_error(y_true_eval, y_pred_eval):.4f}")

except FileNotFoundError:
    print("\nФайл 'ex.csv' не найден. Оценка на тестовых данных (ex.csv) невозможна.")
except Exception as e:
    print(f"\nПроизошла ошибка при оценке на ex.csv: {e}")

print("\n--- Работа скрипта завершена ---")


--- Этап 9: Предсказание и создание submission ---
Файл 'predictions.csv' успешно создан.
Первые 5 предсказаний:
   ID  RiskScore
0   0  33.825161
1   1  52.997754
2   2  30.463817
3   3  36.322889
4   4  30.825911

--- Этап 10: Оценка на тестовых данных (из ex.csv) ---
R² score: -0.3435
MSE: 1490.8414
RMSE: 38.6114
MAE: 32.1667

--- Работа скрипта завершена ---
