In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from typing import List, Optional


import seaborn as sns
import scipy.stats as st
from scipy.stats import probplot, ks_2samp

from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
import missingno as msno
import xgboost as xgb
import catboost as cb
from scipy.stats import skew, kurtosis

%matplotlib inline


import warnings
warnings.simplefilter("ignore")

In [2]:
df_train = pd.read_csv('/Kaggle/input/geekbrains-competitive-data-analysis/train.csv')
df_test = pd.read_csv('/Kaggle/input/geekbrains-competitive-data-analysis/test.csv')
df_profile = pd.read_csv('/Kaggle/input/geekbrains-competitive-data-analysis/client_profile.csv')

In [3]:
df_train.head()

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE
0,123687442,0,Cash
1,123597908,1,Cash
2,123526683,0,Cash
3,123710391,1,Cash
4,123590329,1,Cash


In [4]:
df_test.head()

Unnamed: 0,APPLICATION_NUMBER,NAME_CONTRACT_TYPE
0,123724268,Cash
1,123456549,Cash
2,123428178,Credit Card
3,123619984,Cash
4,123671104,Cash


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110093 entries, 0 to 110092
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   APPLICATION_NUMBER  110093 non-null  int64 
 1   TARGET              110093 non-null  int64 
 2   NAME_CONTRACT_TYPE  110093 non-null  object
dtypes: int64(2), object(1)
memory usage: 2.5+ MB


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165141 entries, 0 to 165140
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   APPLICATION_NUMBER  165141 non-null  int64 
 1   NAME_CONTRACT_TYPE  165141 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.5+ MB


In [7]:
df_profile.head()

Unnamed: 0,APPLICATION_NUMBER,GENDER,CHILDRENS,TOTAL_SALARY,AMOUNT_CREDIT,AMOUNT_ANNUITY,EDUCATION_LEVEL,FAMILY_STATUS,REGION_POPULATION,AGE,...,FAMILY_SIZE,EXTERNAL_SCORING_RATING_1,EXTERNAL_SCORING_RATING_2,EXTERNAL_SCORING_RATING_3,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,...,2.0,0.329471,0.236315,0.678568,0.0,0.0,0.0,0.0,1.0,2.0
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,...,2.0,,0.442295,0.802745,0.0,0.0,0.0,0.0,1.0,1.0
2,123501780,M,1,427500.0,239850.0,23850.0,Incomplete higher,Married,0.072508,14387,...,3.0,0.409017,0.738159,,,,,,,
3,123588799,M,0,112500.0,254700.0,17149.5,Secondary / secondary special,Married,0.019101,14273,...,2.0,,0.308994,0.590233,0.0,0.0,0.0,0.0,0.0,3.0
4,123647485,M,0,130500.0,614574.0,19822.5,Lower secondary,Married,0.022625,22954,...,2.0,,0.739408,0.15664,0.0,0.0,1.0,0.0,0.0,6.0


In [8]:
df_profile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 24 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   APPLICATION_NUMBER          250000 non-null  int64  
 1   GENDER                      250000 non-null  object 
 2   CHILDRENS                   250000 non-null  int64  
 3   TOTAL_SALARY                250000 non-null  float64
 4   AMOUNT_CREDIT               250000 non-null  float64
 5   AMOUNT_ANNUITY              249989 non-null  float64
 6   EDUCATION_LEVEL             250000 non-null  object 
 7   FAMILY_STATUS               250000 non-null  object 
 8   REGION_POPULATION           250000 non-null  float64
 9   AGE                         250000 non-null  int64  
 10  DAYS_ON_LAST_JOB            250000 non-null  int64  
 11  OWN_CAR_AGE                 85041 non-null   float64
 12  FLAG_PHONE                  250000 non-null  int64  
 13  FLAG_EMAIL    

In [9]:
def calculate_feature_separating_ability(
    features: pd.DataFrame, target: pd.Series, fill_value: float = -9999) -> pd.DataFrame:
    """
    Оценка разделяющей способности признаков с помощью метрики GINI.

    Parameters
    ----------
    features: pandas.core.frame.DataFrame
        Матрица признаков.

    target: pandas.core.frame.Series
        Вектор целевой переменной.

    fill_value: float, optional, default = -9999
        Значение для заполнения пропусков в значении признаков.
        Опциональный параметр, по умолчанию, равен -9999;

    Returns
    -------
    scores: pandas.core.frame.DataFrame
        Матрица важности признаков.

    """
    scores = {}
    for feature in features:
        score = roc_auc_score(
            target, features[feature].fillna(fill_value)
        )
        scores[feature] = 2*score - 1

    scores = pd.Series(scores)
    scores = scores.sort_values(ascending=False)

    return scores

In [10]:
numerical = df_profile.select_dtypes(exclude=["object"])
numerical = numerical.merge(df_train["APPLICATION_NUMBER"], how="right", on="APPLICATION_NUMBER")

scores = calculate_feature_separating_ability(
    numerical, df_train["TARGET"]
)
scores.head(n=10)

CHILDRENS                     0.026050
FAMILY_SIZE                   0.011430
AMOUNT_ANNUITY               -0.002536
FLAG_EMAIL                   -0.003034
APPLICATION_NUMBER           -0.004743
OWN_CAR_AGE                  -0.017952
AMT_REQ_CREDIT_BUREAU_YEAR   -0.018639
FLAG_PHONE                   -0.021521
TOTAL_SALARY                 -0.026845
AMOUNT_CREDIT                -0.028179
dtype: float64

In [11]:
def create_numerical_aggs(data: pd.DataFrame,
                          groupby_id: str,
                          aggs: dict,
                          prefix: Optional[str] = None,
                          suffix: Optional[str] = None,
                          ) -> pd.DataFrame:
    """
    Построение агрегаций для числовых признаков.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Выборка для построения агрегаций.

    groupby_id: str
        Название ключа, по которому нужно произвести группировку.

    aggs: dict
        Словарь с названием признака и списка функций.
        Ключ словаря - название признака, который используется для
        вычисления агрегаций, значение словаря - список с названием
        функций для вычисления агрегаций.

    prefix: str, optional, default = None
        Префикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    suffix: str, optional, default = None
        Суффикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    stats: pandas.core.frame.DataFrame
        Выборка с рассчитанными агрегациями.

    """
    if not prefix:
        prefix = ""
    if not suffix:
        suffix = ""

    data_grouped = data.groupby(groupby_id)
    stats = data_grouped.agg(aggs)
    stats.columns = [f"{prefix}{feature}_{stat}{suffix}".upper() for feature, stat in stats]
    stats = stats.reset_index()

    return stats

In [12]:
aggs = {
    "TOTAL_SALARY": ["mean"],
    "AMOUNT_CREDIT": ["mean"],
}

stats = create_numerical_aggs(
    df_profile, groupby_id="EDUCATION_LEVEL", aggs=aggs, suffix="_BY_EDUCATION"
)
stats

Unnamed: 0,EDUCATION_LEVEL,TOTAL_SALARY_MEAN_BY_EDUCATION,AMOUNT_CREDIT_MEAN_BY_EDUCATION
0,Academic degree,244621.323529,729561.606618
1,Higher education,208989.672806,689809.957142
2,Incomplete higher,181446.844502,565921.004786
3,Lower secondary,130223.217137,491498.704966
4,Secondary / secondary special,155414.744584,571624.325064


In [13]:
profile_stats = df_profile[["APPLICATION_NUMBER", "EDUCATION_LEVEL", "TOTAL_SALARY", "AMOUNT_CREDIT"]]
profile_stats = profile_stats.merge(
    stats, how="left", on="EDUCATION_LEVEL"
)
profile_stats["TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION"] = profile_stats["TOTAL_SALARY"] / profile_stats["TOTAL_SALARY_MEAN_BY_EDUCATION"]
profile_stats["DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION"] = profile_stats["TOTAL_SALARY"] - profile_stats["TOTAL_SALARY_MEAN_BY_EDUCATION"]
profile_stats.head(n=3)

Unnamed: 0,APPLICATION_NUMBER,EDUCATION_LEVEL,TOTAL_SALARY,AMOUNT_CREDIT,TOTAL_SALARY_MEAN_BY_EDUCATION,AMOUNT_CREDIT_MEAN_BY_EDUCATION,TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION,DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION
0,123666076,Incomplete higher,157500.0,270000.0,181446.844502,565921.004786,0.868023,-23946.844502
1,123423688,Secondary / secondary special,270000.0,536917.5,155414.744584,571624.325064,1.737287,114585.255416
2,123501780,Incomplete higher,427500.0,239850.0,181446.844502,565921.004786,2.356062,246053.155498


In [14]:
df_train = df_train.merge(
    profile_stats, how="left", on="APPLICATION_NUMBER"
)
df_train.head(n=2)

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE,EDUCATION_LEVEL,TOTAL_SALARY,AMOUNT_CREDIT,TOTAL_SALARY_MEAN_BY_EDUCATION,AMOUNT_CREDIT_MEAN_BY_EDUCATION,TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION,DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION
0,123687442,0,Cash,Secondary / secondary special,157500.0,855000.0,155414.744584,571624.325064,1.013417,2085.255416
1,123597908,1,Cash,,,,,,,


In [15]:
df_test = df_test.merge(
    profile_stats, how="left", on="APPLICATION_NUMBER"
)
df_test.head(n=2)

Unnamed: 0,APPLICATION_NUMBER,NAME_CONTRACT_TYPE,EDUCATION_LEVEL,TOTAL_SALARY,AMOUNT_CREDIT,TOTAL_SALARY_MEAN_BY_EDUCATION,AMOUNT_CREDIT_MEAN_BY_EDUCATION,TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION,DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION
0,123724268,Cash,Secondary / secondary special,117000.0,1125000.0,155414.744584,571624.325064,0.752824,-38414.744584
1,123456549,Cash,Secondary / secondary special,81000.0,312768.0,155414.744584,571624.325064,0.521186,-74414.744584


In [16]:
# финансовые признаки
df_profile["RATIO_SALARY_TO_AMOUNT_CREDIT"] = df_profile["TOTAL_SALARY"] / df_profile["AMOUNT_CREDIT"]
df_profile["RATIO_AMOUNT_ANNUITY_TO_SALARY"] = df_profile["AMOUNT_ANNUITY"] / df_profile["TOTAL_SALARY"]

# семейные признаки
df_profile["RATIO_SALARY_TO_PER_FAMILY_SIZE"] = df_profile["TOTAL_SALARY"] / df_profile["FAMILY_SIZE"]

# флаги
df_profile["FLG_MORE_THAN_30PERCENT_FOR_CREDIT"] = np.where(
    df_profile["RATIO_AMOUNT_ANNUITY_TO_SALARY"] > 0.3, 1, 0
)

features = df_profile[["APPLICATION_NUMBER", "RATIO_SALARY_TO_AMOUNT_CREDIT", "RATIO_AMOUNT_ANNUITY_TO_SALARY", "RATIO_SALARY_TO_PER_FAMILY_SIZE", "FLG_MORE_THAN_30PERCENT_FOR_CREDIT"]]

In [17]:
df_train = df_train.merge(
    features, how="left", on="APPLICATION_NUMBER"
)
df_train.head(n=2)

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE,EDUCATION_LEVEL,TOTAL_SALARY,AMOUNT_CREDIT,TOTAL_SALARY_MEAN_BY_EDUCATION,AMOUNT_CREDIT_MEAN_BY_EDUCATION,TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION,DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION,RATIO_SALARY_TO_AMOUNT_CREDIT,RATIO_AMOUNT_ANNUITY_TO_SALARY,RATIO_SALARY_TO_PER_FAMILY_SIZE,FLG_MORE_THAN_30PERCENT_FOR_CREDIT
0,123687442,0,Cash,Secondary / secondary special,157500.0,855000.0,155414.744584,571624.325064,1.013417,2085.255416,0.184211,0.159543,52500.0,0.0
1,123597908,1,Cash,,,,,,,,,,,


In [18]:
df_test = df_test.merge(
    features, how="left", on="APPLICATION_NUMBER"
)
df_test.head(n=2)

Unnamed: 0,APPLICATION_NUMBER,NAME_CONTRACT_TYPE,EDUCATION_LEVEL,TOTAL_SALARY,AMOUNT_CREDIT,TOTAL_SALARY_MEAN_BY_EDUCATION,AMOUNT_CREDIT_MEAN_BY_EDUCATION,TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION,DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION,RATIO_SALARY_TO_AMOUNT_CREDIT,RATIO_AMOUNT_ANNUITY_TO_SALARY,RATIO_SALARY_TO_PER_FAMILY_SIZE,FLG_MORE_THAN_30PERCENT_FOR_CREDIT
0,123724268,Cash,Secondary / secondary special,117000.0,1125000.0,155414.744584,571624.325064,0.752824,-38414.744584,0.104,0.281154,58500.0,0.0
1,123456549,Cash,Secondary / secondary special,81000.0,312768.0,155414.744584,571624.325064,0.521186,-74414.744584,0.258978,0.211056,20250.0,0.0


In [19]:
categorical = df_profile[["APPLICATION_NUMBER", "EDUCATION_LEVEL", "FAMILY_STATUS"]]
categorical.head(n=2)

Unnamed: 0,APPLICATION_NUMBER,EDUCATION_LEVEL,FAMILY_STATUS
0,123666076,Incomplete higher,Civil marriage
1,123423688,Secondary / secondary special,Married


In [20]:
categorical["EDUCATION_FAMILY_STATUS"] = categorical["EDUCATION_LEVEL"] + " | " + categorical["FAMILY_STATUS"]
categorical.head(n=2)
categorical = categorical.drop(['EDUCATION_LEVEL'], axis=1)

In [21]:
df_train = df_train.merge(
    categorical, how="left", on="APPLICATION_NUMBER"
)
df_train.head(n=2)

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE,EDUCATION_LEVEL,TOTAL_SALARY,AMOUNT_CREDIT,TOTAL_SALARY_MEAN_BY_EDUCATION,AMOUNT_CREDIT_MEAN_BY_EDUCATION,TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION,DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION,RATIO_SALARY_TO_AMOUNT_CREDIT,RATIO_AMOUNT_ANNUITY_TO_SALARY,RATIO_SALARY_TO_PER_FAMILY_SIZE,FLG_MORE_THAN_30PERCENT_FOR_CREDIT,FAMILY_STATUS,EDUCATION_FAMILY_STATUS
0,123687442,0,Cash,Secondary / secondary special,157500.0,855000.0,155414.744584,571624.325064,1.013417,2085.255416,0.184211,0.159543,52500.0,0.0,Married,Secondary / secondary special | Married
1,123597908,1,Cash,,,,,,,,,,,,,


In [22]:
df_test = df_test.merge(
    categorical, how="left", on="APPLICATION_NUMBER"
)
df_test.head(n=2)

Unnamed: 0,APPLICATION_NUMBER,NAME_CONTRACT_TYPE,EDUCATION_LEVEL,TOTAL_SALARY,AMOUNT_CREDIT,TOTAL_SALARY_MEAN_BY_EDUCATION,AMOUNT_CREDIT_MEAN_BY_EDUCATION,TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION,DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION,RATIO_SALARY_TO_AMOUNT_CREDIT,RATIO_AMOUNT_ANNUITY_TO_SALARY,RATIO_SALARY_TO_PER_FAMILY_SIZE,FLG_MORE_THAN_30PERCENT_FOR_CREDIT,FAMILY_STATUS,EDUCATION_FAMILY_STATUS
0,123724268,Cash,Secondary / secondary special,117000.0,1125000.0,155414.744584,571624.325064,0.752824,-38414.744584,0.104,0.281154,58500.0,0.0,Married,Secondary / secondary special | Married
1,123456549,Cash,Secondary / secondary special,81000.0,312768.0,155414.744584,571624.325064,0.521186,-74414.744584,0.258978,0.211056,20250.0,0.0,Married,Secondary / secondary special | Married


In [23]:
gender_dummies = pd.get_dummies(df_profile["GENDER"])
family_status_dummies = pd.get_dummies(df_profile["FAMILY_STATUS"])

categorical2 = pd.concat([gender_dummies, family_status_dummies], axis=1)
categorical2['APPLICATION_NUMBER'] = df_profile['APPLICATION_NUMBER']
categorical2.head(n=2)


Unnamed: 0,F,M,XNA,Civil marriage,Married,Separated,Single / not married,Unknown,Widow,APPLICATION_NUMBER
0,1,0,0,1,0,0,0,0,0,123666076
1,1,0,0,0,1,0,0,0,0,123423688


In [24]:
df_train = df_train.merge(
    categorical2, how="left", on="APPLICATION_NUMBER"
)
df_train.head(n=2)

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE,EDUCATION_LEVEL,TOTAL_SALARY,AMOUNT_CREDIT,TOTAL_SALARY_MEAN_BY_EDUCATION,AMOUNT_CREDIT_MEAN_BY_EDUCATION,TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION,DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION,...,EDUCATION_FAMILY_STATUS,F,M,XNA,Civil marriage,Married,Separated,Single / not married,Unknown,Widow
0,123687442,0,Cash,Secondary / secondary special,157500.0,855000.0,155414.744584,571624.325064,1.013417,2085.255416,...,Secondary / secondary special | Married,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,123597908,1,Cash,,,,,,,,...,,,,,,,,,,


In [25]:
df_test = df_test.merge(
    categorical2, how="left", on="APPLICATION_NUMBER"
)
df_test.head(n=2)

Unnamed: 0,APPLICATION_NUMBER,NAME_CONTRACT_TYPE,EDUCATION_LEVEL,TOTAL_SALARY,AMOUNT_CREDIT,TOTAL_SALARY_MEAN_BY_EDUCATION,AMOUNT_CREDIT_MEAN_BY_EDUCATION,TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION,DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION,RATIO_SALARY_TO_AMOUNT_CREDIT,...,EDUCATION_FAMILY_STATUS,F,M,XNA,Civil marriage,Married,Separated,Single / not married,Unknown,Widow
0,123724268,Cash,Secondary / secondary special,117000.0,1125000.0,155414.744584,571624.325064,0.752824,-38414.744584,0.104,...,Secondary / secondary special | Married,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,123456549,Cash,Secondary / secondary special,81000.0,312768.0,155414.744584,571624.325064,0.521186,-74414.744584,0.258978,...,Secondary / secondary special | Married,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [26]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110093 entries, 0 to 110092
Data columns (total 25 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   APPLICATION_NUMBER                        110093 non-null  int64  
 1   TARGET                                    110093 non-null  int64  
 2   NAME_CONTRACT_TYPE                        110093 non-null  object 
 3   EDUCATION_LEVEL                           89539 non-null   object 
 4   TOTAL_SALARY                              89539 non-null   float64
 5   AMOUNT_CREDIT                             89539 non-null   float64
 6   TOTAL_SALARY_MEAN_BY_EDUCATION            89539 non-null   float64
 7   AMOUNT_CREDIT_MEAN_BY_EDUCATION           89539 non-null   float64
 8   TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION  89539 non-null   float64
 9   DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION  89539 non-null   float64
 10  RATIO_SALARY_TO_AMOU

In [27]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165141 entries, 0 to 165140
Data columns (total 24 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   APPLICATION_NUMBER                        165141 non-null  int64  
 1   NAME_CONTRACT_TYPE                        165141 non-null  object 
 2   EDUCATION_LEVEL                           134176 non-null  object 
 3   TOTAL_SALARY                              134176 non-null  float64
 4   AMOUNT_CREDIT                             134176 non-null  float64
 5   TOTAL_SALARY_MEAN_BY_EDUCATION            134176 non-null  float64
 6   AMOUNT_CREDIT_MEAN_BY_EDUCATION           134176 non-null  float64
 7   TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION  134176 non-null  float64
 8   DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION  134176 non-null  float64
 9   RATIO_SALARY_TO_AMOUNT_CREDIT             134176 non-null  float64
 10  RATIO_AMOUNT_ANNUITY

In [28]:
numerical_features = df_train.select_dtypes(include=[np.number])
cat_features = df_train.select_dtypes(include = ['object'])
for num in numerical_features:
    numerical_features.loc[(numerical_features[num].isnull(), [num])] = numerical_features[num].median()

for cat in cat_features:
    cat_features.loc[(cat_features[cat].isnull(), [cat])] = df_train[cat].mode()[0]
    
df_train = pd.concat([numerical_features, cat_features], axis=1)


In [29]:
numerical_features = df_test.select_dtypes(include=[np.number])
cat_features = df_test.select_dtypes(include = ['object'])
for num in numerical_features:
    numerical_features.loc[(numerical_features[num].isnull(), [num])] = numerical_features[num].median()

for cat in cat_features:
    cat_features.loc[(cat_features[cat].isnull(), [cat])] = df_test[cat].mode()[0]
    
df_test = pd.concat([numerical_features, cat_features], axis=1)

In [30]:
correlation = df_train.corr()
corr_with_target = correlation["TARGET"].sort_values(ascending = False)
corr_with_target

TARGET                                      1.000000
M                                           0.043532
Single / not married                        0.021638
Civil marriage                              0.019607
RATIO_AMOUNT_ANNUITY_TO_SALARY              0.013422
Separated                                   0.002136
Unknown                                    -0.000894
XNA                                        -0.000894
FLG_MORE_THAN_30PERCENT_FOR_CREDIT         -0.001063
APPLICATION_NUMBER                         -0.002239
RATIO_SALARY_TO_AMOUNT_CREDIT              -0.006672
TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION   -0.007063
DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION   -0.007504
RATIO_SALARY_TO_PER_FAMILY_SIZE            -0.016752
Widow                                      -0.017763
TOTAL_SALARY                               -0.019773
Married                                    -0.020237
AMOUNT_CREDIT                              -0.026684
F                                          -0.

In [31]:
def fit_catboost(x_train, y_train, model_params, categorical, *args):
    """
    Обучение модели CatBoostClassifier.

    Parameters
    ----------
    x_train: pandas.core.frame.DataFrame
        Матрица признаков для обучения модели.

    y_valid: pandas.core.frame.Series
        Вектор целевой переменной для валидации модели.

    model_params: dict
        Словарь со значением гиперпараметров модели.

    categorical: List[str]
        Список с названием категориальных признаков.

    Returns
    -------
    model: catboost.core.CatBoostClassifier
        Экземпляр обученного классификатора.

    """
    eval_set = [(x_train, y_train)]

    if args == 2:
        eval_set.append((args[0], args[1]))

    model = cb.CatBoostClassifier(**model_params)
    model.fit(
        X=x_train,
        y=y_train,
        eval_set=eval_set,
        cat_features=categorical
    )

    return model

def evaluate_model(model, *args):
    """
    Оценка качества модели.

    Parameters
    ----------
    model: catboost.core.CatBoostClassifier
        Экземпляр обученного классификатора.

    args: pandas.core.frame.DataFrame
        Пары из обучающей выборки и вектора истинных ответов.
        Опциональные параметры.

    """
    eval_data = [(args[i], args[i+1]) for i in range(0, len(args), 2)]
    if eval_data:
        for sample, target in eval_data:
            y_pred = model.predict_proba(sample)[:, 1]
            score = roc_auc_score(target, y_pred)
            print(f"score = {round(score, 6)}")

def prepare_data(X, categorical, to_drop):
    """
    Преобразование данных для передачи в модель.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков для передачи в модель.

    categorical: List[str]
        Список с названием категориальных признаков.

    drop_features: List[str]
        Список с названием признаков, которые не должны
        участвовать в обучении.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Матрица признаков, подготовленная для передачи в модель.

    """
    X_transformed = X.copy()
    to_drop = set(X.columns) & set(to_drop)

    if to_drop:
        X_transformed = X_transformed.drop(to_drop, axis=1)

    X_transformed[categorical] = X_transformed[categorical].astype(str)
    return X_transformed

In [32]:
cb_params_10000 = {
    "n_estimators": 10000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 300,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 100,
    "thread_count": 6,
    "random_seed": 27
}

to_drop = [
    "TARGET",
]

categorical = df_train.select_dtypes(include=["object"]).columns.tolist()

In [33]:
x_train, x_valid = train_test_split(
    df_train, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    df_train["TARGET"], train_size=0.7, random_state=27, shuffle=True
)


x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_test = prepare_data(df_test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_test.shape))


model = fit_catboost(
    x_train, y_train, cb_params_10000, categorical, x_valid, y_valid)

IndentationError: unexpected indent (<ipython-input-33-55fa5220b0d7>, line 19)

In [None]:
y_pred = model.predict_proba(df_test)[:, 1]

In [None]:
y_pred = pd.DataFrame(y_pred)

In [None]:
y_pred = y_pred.rename({0: 'TARGET'}, axis=1)

In [None]:
y_pred

In [None]:
df_result = pd.read_csv('/Kaggle/input/geekbrains-competitive-data-analysis/test.csv')
df_result = df_result.drop(['NAME_CONTRACT_TYPE'], axis=1)
df_result = pd.concat([df_result, y_pred], axis=1)

In [None]:
df_result

In [None]:
# 4.4. Экспорт результатов
import csv
filename = 'Kaldin_cb_prof_1.csv'
df_result.to_csv(filename, index=None)

In [None]:
#Сохранение таблиц

filename = 'train_prof_upd_1.csv'
df_train.to_csv(filename, index=None)
filename = 'test_prof_upd_1.csv'
df_test.to_csv(filename, index=None)

In [34]:
df_train_app = pd.read_csv('train_app_upd_1.csv')
df_test_app = pd.read_csv('test_app_upd_1.csv')
df_train_app = df_train_app.drop(['TARGET', 'NAME_CONTRACT_TYPE'], axis=1)
df_test_app = df_test_app.drop(['NAME_CONTRACT_TYPE'], axis=1)

In [35]:
df_train_merged = pd.merge(df_train, df_train_app, on=('APPLICATION_NUMBER'))
df_test_merged = pd.merge(df_test, df_test_app, on=('APPLICATION_NUMBER'))


In [36]:
df_train_merged

Unnamed: 0,APPLICATION_NUMBER,TARGET,TOTAL_SALARY,AMOUNT_CREDIT,TOTAL_SALARY_MEAN_BY_EDUCATION,AMOUNT_CREDIT_MEAN_BY_EDUCATION,TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION,DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION,RATIO_SALARY_TO_AMOUNT_CREDIT,RATIO_AMOUNT_ANNUITY_TO_SALARY,...,PREV_APP_AMT_APPLICATION_MAX_APPROVED_APPS,PREV_APP_CNT_PAYMENT_MEAN_APPROVED_APPS,NAME_CONTRACT_STATUS_APPROVED_RATIO,NAME_CONTRACT_STATUS_APPROVED_TOTAL,NAME_CONTRACT_STATUS_CANCELED_RATIO,NAME_CONTRACT_STATUS_CANCELED_TOTAL,NAME_CONTRACT_STATUS_REFUSED_RATIO,NAME_CONTRACT_STATUS_REFUSED_TOTAL,NAME_CONTRACT_STATUS_UNUSED OFFER_RATIO,NAME_CONTRACT_STATUS_UNUSED OFFER_TOTAL
0,123687442,0,157500.0,855000.0,155414.744584,571624.325064,1.013417,2085.255416,0.184211,0.159543,...,72634.140,10.666667,1.000000,3.0,0.000000,0.0,0.000000,0.0,0.0,0.0
1,123597908,1,148500.0,517788.0,155414.744584,571624.325064,0.868643,-20414.744584,0.306215,0.162671,...,495000.000,19.333333,0.750000,3.0,0.000000,0.0,0.250000,1.0,0.0,0.0
2,123526683,0,135000.0,1006920.0,208989.672806,689809.957142,0.645965,-73989.672806,0.134072,0.316000,...,1395000.000,18.000000,0.666667,4.0,0.166667,1.0,0.166667,1.0,0.0,0.0
3,123710391,1,180000.0,518562.0,155414.744584,571624.325064,1.158191,24585.255416,0.347114,0.127625,...,100858.500,14.000000,1.000000,2.0,0.000000,0.0,0.000000,0.0,0.0,0.0
4,123590329,1,148500.0,517788.0,155414.744584,571624.325064,0.868643,-20414.744584,0.306215,0.162671,...,105381.000,9.000000,0.222222,2.0,0.111111,1.0,0.666667,6.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110088,123458312,0,148500.0,517788.0,155414.744584,571624.325064,0.868643,-20414.744584,0.306215,0.162671,...,90000.000,6.500000,0.800000,4.0,0.000000,0.0,0.200000,1.0,0.0,0.0
110089,123672463,0,175500.0,269550.0,155414.744584,571624.325064,1.129236,20085.255416,0.651085,0.071897,...,270000.000,8.250000,0.615385,8.0,0.153846,2.0,0.230769,3.0,0.0,0.0
110090,123723001,0,148500.0,517788.0,155414.744584,571624.325064,0.868643,-20414.744584,0.306215,0.162671,...,34245.000,9.000000,1.000000,2.0,0.000000,0.0,0.000000,0.0,0.0,0.0
110091,123554358,0,270000.0,1024740.0,181446.844502,565921.004786,1.488039,88553.155498,0.263481,0.183067,...,261180.000,18.000000,0.500000,1.0,0.000000,0.0,0.500000,1.0,0.0,0.0


In [37]:
df_test_merged

Unnamed: 0,APPLICATION_NUMBER,TOTAL_SALARY,AMOUNT_CREDIT,TOTAL_SALARY_MEAN_BY_EDUCATION,AMOUNT_CREDIT_MEAN_BY_EDUCATION,TOTAL_SALARY_TO_MEAN_SALARY_BY_EDUCATION,DELTA_SALARY_TO_MEAN_SALARY_BY_EDUCATION,RATIO_SALARY_TO_AMOUNT_CREDIT,RATIO_AMOUNT_ANNUITY_TO_SALARY,RATIO_SALARY_TO_PER_FAMILY_SIZE,...,PREV_APP_AMT_APPLICATION_MAX_APPROVED_APPS,PREV_APP_CNT_PAYMENT_MEAN_APPROVED_APPS,NAME_CONTRACT_STATUS_APPROVED_RATIO,NAME_CONTRACT_STATUS_APPROVED_TOTAL,NAME_CONTRACT_STATUS_CANCELED_RATIO,NAME_CONTRACT_STATUS_CANCELED_TOTAL,NAME_CONTRACT_STATUS_REFUSED_RATIO,NAME_CONTRACT_STATUS_REFUSED_TOTAL,NAME_CONTRACT_STATUS_UNUSED OFFER_RATIO,NAME_CONTRACT_STATUS_UNUSED OFFER_TOTAL
0,123724268,117000.0,1125000.0,155414.744584,571624.325064,0.752824,-38414.744584,0.104000,0.281154,58500.0,...,450000.0,6.000000,0.500000,2.0,0.500000,2.0,0.00,0.0,0.0,0.0
1,123456549,81000.0,312768.0,155414.744584,571624.325064,0.521186,-74414.744584,0.258978,0.211056,20250.0,...,540000.0,24.000000,1.000000,2.0,0.000000,0.0,0.00,0.0,0.0,0.0
2,123428178,157500.0,450000.0,155414.744584,571624.325064,1.013417,2085.255416,0.350000,0.142857,39375.0,...,900000.0,16.400000,0.625000,5.0,0.125000,1.0,0.25,2.0,0.0,0.0
3,123619984,144900.0,514777.5,155414.744584,571624.325064,0.868643,-20414.744584,0.305853,0.163000,75000.0,...,397755.0,36.000000,1.000000,1.0,0.000000,0.0,0.00,0.0,0.0,0.0
4,123671104,90000.0,254700.0,208989.672806,689809.957142,0.430643,-118989.672806,0.353357,0.277100,30000.0,...,130545.0,9.333333,0.600000,3.0,0.000000,0.0,0.20,1.0,0.2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165136,123487967,144900.0,514777.5,155414.744584,571624.325064,0.868643,-20414.744584,0.305853,0.163000,75000.0,...,450000.0,24.000000,0.333333,2.0,0.666667,4.0,0.00,0.0,0.0,0.0
165137,123536402,135000.0,450000.0,155414.744584,571624.325064,0.868643,-20414.744584,0.300000,0.124500,135000.0,...,243000.0,10.000000,1.000000,2.0,0.000000,0.0,0.00,0.0,0.0,0.0
165138,123718238,144900.0,514777.5,155414.744584,571624.325064,0.868643,-20414.744584,0.305853,0.163000,75000.0,...,450000.0,18.000000,0.500000,5.0,0.000000,0.0,0.50,5.0,0.0,0.0
165139,123631557,112500.0,350181.0,155414.744584,571624.325064,0.723870,-42914.744584,0.321262,0.326840,56250.0,...,495000.0,23.142857,1.000000,7.0,0.000000,0.0,0.00,0.0,0.0,0.0


In [None]:
cb_params_10000 = {
    "n_estimators": 10000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 300,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 100,
    "thread_count": 6,
    "random_seed": 27
}

to_drop = [
    "TARGET",
]

categorical = df_test_merged.select_dtypes(include=["object"]).columns.tolist()

In [None]:
x_train, x_valid = train_test_split(
    df_train_merged, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    df_train_merged["TARGET"], train_size=0.7, random_state=27, shuffle=True
)


x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_test = prepare_data(df_test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_test.shape))


model = fit_catboost(
    x_train, y_train, cb_params_10000, categorical, x_valid, y_valid)

In [None]:
y_pred = model.predict_proba(df_test_merged)[:, 1]
y_pred = pd.DataFrame(y_pred)
y_pred = y_pred.rename({0: 'TARGET'}, axis=1)
y_pred

In [None]:
df_result = pd.read_csv('/Kaggle/input/geekbrains-competitive-data-analysis/test.csv')
df_result = df_result.drop(['NAME_CONTRACT_TYPE'], axis=1)
df_result = pd.concat([df_result, y_pred], axis=1)

In [None]:
# 4.4. Экспорт результатов
import csv
filename = 'Kaldin_cb_prof_1_app.csv'
df_result.to_csv(filename, index=None)