In [1]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import PolynomialFeatures
from preprocessor import Preprocessor
from evaluator import ModelEvaluator
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [2]:
'''
На основе сравнения гипотез выбрали наилучший набор среди взаимно неконфликтных: гипотезы 3,4,7,8,10
'''

'\nНа основе сравнения гипотез выбрали наилучший набор среди взаимно неконфликтных: гипотезы 3,4,7,8,10\n'

In [3]:
# Directory holding the input CSV files. Kept in one place so the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = "C:/Users/Роман/Chepenkov_dz2"

# Raw train / test tables as read from disk; later cells read `TARGET` and
# `SK_ID_CURR` from these frames.
df_train = pd.read_csv(f"{DATA_DIR}/application_train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/application_test.csv")

In [4]:
def hypothesis_2(df):
    """Mean-target encode the object/string columns of ``df``.

    Every selected column is replaced by the mean of ``df_train['TARGET']``
    observed for that category. The mapping is always learned from the
    global ``df_train``, so a frame other than the training one (e.g. the
    test set) is encoded with the training statistics instead of having the
    training target aligned to its own, unrelated rows by index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns replaced by floats.
        Categories that do not occur in the training column become NaN.
    """
    df_encoded = df.copy()
    cat_cols = df.select_dtypes(include=['object', 'string']).columns

    train_target = df_train['TARGET']  # global training target

    for col in cat_cols:
        # Learn the category -> mean(TARGET) map on the training column.
        # Fall back to the passed column only when the training frame has
        # no column of that name (this reproduces the previous behaviour).
        source = df_train[col] if col in df_train.columns else df[col]
        encoding_map = train_target.groupby(source).mean().to_dict()
        df_encoded[col] = df[col].map(encoding_map)

    return df_encoded

In [5]:
def hypothesis_3(df):
    """Apply a shifted ``log1p`` to every float64/int64 column.

    For each such column the minimum of the passed frame is taken. When that
    minimum is zero or negative the column is shifted by ``1 - min`` (so the
    smallest value becomes 1) before ``np.log1p`` is applied; otherwise no
    shift is used. Note that the shift is derived from the frame passed in,
    so two different frames may be shifted differently.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the numeric columns log-transformed.
    """
    transformed = df.copy()

    for name in transformed.select_dtypes(include=['float64', 'int64']).columns:
        lowest = transformed[name].min()
        if lowest <= 0:
            offset = 1 - lowest
        else:
            offset = 0
        transformed[name] = np.log1p(transformed[name] + offset)

    return transformed

In [6]:
def hypothesis_4(df):
    """Standardize the float64/int64 columns of ``df`` with ``StandardScaler``.

    A new scaler is fitted on every call, i.e. each frame passed in is
    standardized with its own statistics.
    NOTE(review): this means train and test are scaled independently --
    confirm that this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the numeric columns scaled. When there are no
        numeric columns or no rows the copy is returned unchanged.
    """
    df = df.copy()
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns

    # Nothing to scale: return the copy instead of letting the scaler fail
    # on an input with zero features or zero samples.
    if len(num_cols) == 0 or len(df) == 0:
        return df

    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])

    return df

In [7]:
# Global state: filled by the first (training) call of hypothesis_7 and
# reused by every later call so that the same features are produced.
POLY_FEATURES = {
    'columns': None,
    'numeric_cols': None
}

def hypothesis_7(df):
    """Append pairwise interaction features of the numeric columns.

    First call (``POLY_FEATURES['columns'] is None``) -- training mode: when
    more than 20 numeric columns exist, the 20 with the largest absolute
    correlation to ``df_train['TARGET']`` are kept; degree-2 interaction-only
    polynomial features are built from them and the chosen base columns and
    feature names are stored in the global ``POLY_FEATURES``.

    Later calls -- apply mode: the stored base columns and feature names are
    reused to build the same features for the passed frame.

    The polynomial expansion also contains the degree-1 terms, which are the
    input columns themselves. Those are dropped here so the result has no
    duplicated column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the new interaction columns appended.
    """
    global POLY_FEATURES

    max_base_features = 20     # cap on the number of columns fed to the expansion
    max_poly_features = 1000   # cap on the number of generated features kept

    df = df.copy()
    # fit() only records the number of input features, so a fresh instance
    # fitted on the current frame is sufficient in both modes.
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

    if POLY_FEATURES['columns'] is None:
        # Training mode
        numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
        if len(numeric_cols) > max_base_features:
            corr = df[numeric_cols].corrwith(df_train['TARGET']).abs()
            numeric_cols = corr.nlargest(max_base_features).index.tolist()

        features = poly.fit_transform(df[numeric_cols])
        cols = poly.get_feature_names_out(numeric_cols)[:max_poly_features]

        POLY_FEATURES = {
            'columns': cols,
            'numeric_cols': numeric_cols
        }
    else:
        # Apply mode
        numeric_cols = POLY_FEATURES['numeric_cols']
        cols = POLY_FEATURES['columns']
        features = poly.fit_transform(df[numeric_cols])

    poly_df = pd.DataFrame(features[:, :len(cols)], columns=cols, index=df.index)

    # Drop generated columns whose names already exist in the input (the
    # degree-1 terms) to avoid duplicated column labels after the concat.
    poly_df = poly_df.loc[:, ~poly_df.columns.isin(df.columns)]

    return pd.concat([df, poly_df], axis=1)

In [8]:
# Global state: column name -> {category: WOE}. Empty until the first
# (training) call of hypothesis_8 has filled it.
WOE_MAPPING = {}

def hypothesis_8(df):
    """Weight-of-evidence (WOE) encode the object/category columns of ``df``.

    First call (``WOE_MAPPING`` empty) -- training mode: for every
    categorical column a smoothed WOE per category is computed against the
    global ``df_train['TARGET']`` (aligned with ``df`` by index), stored in
    ``WOE_MAPPING`` and applied.

    Later calls -- apply mode: the stored mappings are applied; columns
    without a stored mapping are left untouched.

    Values without a mapping (missing values, categories unseen during
    training) are encoded as 0.0 in BOTH modes. 0.0 is the WOE of the 'All'
    margin row (its two distribution shares are both 1), which is what the
    training call already used. Using the same value on later calls keeps
    the encoding of a missing value identical for every frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the encoded columns replaced by floats.
    """
    global WOE_MAPPING

    alpha = 0.5        # additive smoothing applied to the event / non-event counts
    neutral_woe = 0.0  # WOE of the 'All' row: log(1 / 1)

    df = df.copy()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    if len(WOE_MAPPING) == 0:
        # Training mode
        target = df_train['TARGET']
        for col in cat_cols:
            # Put the column next to the target for the frequency table
            temp_df = pd.DataFrame({col: df[col], 'target': target})

            # Compute WOE
            freq_table = pd.crosstab(temp_df[col], temp_df['target'], margins=True)
            freq_table['non_event'] = freq_table[0] + alpha
            freq_table['event'] = freq_table[1] + alpha
            freq_table['distr_non_event'] = freq_table['non_event'] / freq_table.at['All', 'non_event']
            freq_table['distr_event'] = freq_table['event'] / freq_table.at['All', 'event']
            freq_table['woe'] = np.log(freq_table['distr_non_event'] / freq_table['distr_event'])

            # Store the mapping without the margin row
            woe_by_category = freq_table['woe'].to_dict()
            del woe_by_category['All']
            WOE_MAPPING[col] = woe_by_category

            # Apply; astype(object) keeps map/fillna valid for category dtype
            df[col] = df[col].astype(object).map(woe_by_category).fillna(neutral_woe)
    else:
        # Apply mode
        for col in cat_cols:
            if col in WOE_MAPPING:
                df[col] = df[col].astype(object).map(WOE_MAPPING[col]).fillna(neutral_woe)

    return df

In [9]:
def hypothesis_10(df):
    """Placeholder step: hand the input back unchanged (no copy is made)."""
    # Stub kept just in case, so the step list can reference hypothesis 10.
    result = df
    return result

In [10]:
# hypothesis_8 and hypothesis_7 decide between "fit" and "apply" by looking
# at their global state. Reset it here so that re-running this cell fits on
# the training frame again instead of silently running it in apply mode.
WOE_MAPPING.clear()
POLY_FEATURES = {'columns': None, 'numeric_cols': None}

preprocessor = Preprocessor(custom_steps=[hypothesis_8, hypothesis_3, hypothesis_4, hypothesis_7, hypothesis_10])

# The training frame must go through the pipeline first: that call fills the
# state which the test-frame call then reuses.
X_train_features = preprocessor.transform(df_train.drop(columns=['TARGET', 'SK_ID_CURR']))
X_test_features = preprocessor.transform(df_test.drop(columns=['SK_ID_CURR']))

# Fill remaining gaps with column means learned on the training frame only.
imputer = SimpleImputer(strategy='mean')
X_train_processed = pd.DataFrame(
    imputer.fit_transform(X_train_features),
    columns=X_train_features.columns,
    index=X_train_features.index
)

X_test_processed = pd.DataFrame(
    imputer.transform(X_test_features),
    columns=X_test_features.columns,
    index=X_test_features.index)

In [19]:
# Keep only the first occurrence of every column label in both frames.
train_first_seen = ~X_train_processed.columns.duplicated()
test_first_seen = ~X_test_processed.columns.duplicated()

X_train_processed = X_train_processed.loc[:, train_first_seen]
X_test_processed = X_test_processed.loc[:, test_first_seen]


In [20]:
# Positional indices of the columns that still have object dtype; they are
# handed to CatBoost as categorical features.
cat_features = [
    position
    for position, column_dtype in enumerate(X_train_processed.dtypes)
    if column_dtype == 'object'
]

# Model settings collected in one place.
model_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric='AUC',
    cat_features=cat_features,
    random_seed=42,
    verbose=100,
)
model = CatBoostClassifier(**model_params)

evaluator = ModelEvaluator(model)

# Score on the processed training frame and report the value.
roc_auc = evaluator.evaluate(X_train_processed, df_train['TARGET'])
print(f"ROC-AUC train: {roc_auc:.5f}")

# Write the predictions for the test frame, keyed by SK_ID_CURR.
evaluator.save_test_predictions(
    X_test=X_test_processed,
    ids=df_test['SK_ID_CURR'],
    filename='test_predictions_final.csv')

# Results recorded by the author:
#   ROC-AUC train:         0.79534
#   ROC-AUC test:          0.73864
#   ROC-AUC test baseline: 0.61204
#   ROC-AUC test surplus:  0.1266

0:	total: 476ms	remaining: 7m 55s
100:	total: 27.2s	remaining: 4m 1s
200:	total: 51s	remaining: 3m 22s
300:	total: 1m 13s	remaining: 2m 51s
400:	total: 1m 36s	remaining: 2m 24s
500:	total: 1m 58s	remaining: 1m 57s
600:	total: 2m 19s	remaining: 1m 32s
700:	total: 2m 41s	remaining: 1m 8s
800:	total: 3m 3s	remaining: 45.5s
900:	total: 3m 24s	remaining: 22.5s
999:	total: 3m 46s	remaining: 0us
ROC-AUC train: 0.79534
