In [24]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
import scipy.stats as st
from scipy.stats import probplot, ks_2samp

from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
import missingno as msno
import xgboost as xgb
import catboost as cb

%matplotlib inline


import warnings
warnings.simplefilter("ignore")

In [2]:
df_train = pd.read_csv('/Kaggle/input/geekbrains-competitive-data-analysis/train.csv')
df_test = pd.read_csv('/Kaggle/input/geekbrains-competitive-data-analysis/test.csv')
df_application = pd.read_csv('/Kaggle/input/geekbrains-competitive-data-analysis/applications_history.csv')

In [3]:
df_train.head()

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE
0,123687442,0,Cash
1,123597908,1,Cash
2,123526683,0,Cash
3,123710391,1,Cash
4,123590329,1,Cash


In [4]:
df_test.head()

Unnamed: 0,APPLICATION_NUMBER,NAME_CONTRACT_TYPE
0,123724268,Cash
1,123456549,Cash
2,123428178,Credit Card
3,123619984,Cash
4,123671104,Cash


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110093 entries, 0 to 110092
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   APPLICATION_NUMBER  110093 non-null  int64 
 1   TARGET              110093 non-null  int64 
 2   NAME_CONTRACT_TYPE  110093 non-null  object
dtypes: int64(2), object(1)
memory usage: 2.5+ MB


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165141 entries, 0 to 165140
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   APPLICATION_NUMBER  165141 non-null  int64 
 1   NAME_CONTRACT_TYPE  165141 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.5+ MB


In [7]:
df_application.head()

Unnamed: 0,PREV_APPLICATION_NUMBER,APPLICATION_NUMBER,NAME_CONTRACT_TYPE,AMOUNT_ANNUITY,AMT_APPLICATION,AMOUNT_CREDIT,AMOUNT_PAYMENT,AMOUNT_GOODS_PAYMENT,NAME_CONTRACT_STATUS,DAYS_DECISION,...,NAME_PRODUCT_TYPE,SELLERPLACE_AREA,CNT_PAYMENT,NAME_YIELD_GROUP,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,49298709,123595216,,1730.43,17145.0,17145.0,0.0,17145.0,Approved,73,...,XNA,35,12.0,middle,365243.0,42.0,300.0,42.0,37.0,0.0
1,50070639,123431468,Cash,25188.615,607500.0,679671.0,,607500.0,Approved,164,...,x-sell,-1,36.0,low_action,365243.0,134.0,916.0,365243.0,365243.0,1.0
2,49791680,123445379,Cash,15060.735,112500.0,136444.5,,112500.0,Approved,301,...,x-sell,-1,12.0,high,365243.0,271.0,59.0,365243.0,365243.0,1.0
3,50087457,123499497,Cash,47041.335,450000.0,470790.0,,450000.0,Approved,512,...,x-sell,-1,12.0,middle,365243.0,482.0,152.0,182.0,177.0,1.0
4,49052479,123525393,Cash,31924.395,337500.0,404055.0,,337500.0,Refused,781,...,walk-in,-1,24.0,high,,,,,,


In [8]:
df_application.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 26 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   PREV_APPLICATION_NUMBER    1670214 non-null  int64  
 1   APPLICATION_NUMBER         1670214 non-null  int64  
 2   NAME_CONTRACT_TYPE         940717 non-null   object 
 3   AMOUNT_ANNUITY             1297979 non-null  float64
 4   AMT_APPLICATION            1670214 non-null  float64
 5   AMOUNT_CREDIT              1670213 non-null  float64
 6   AMOUNT_PAYMENT             774370 non-null   float64
 7   AMOUNT_GOODS_PAYMENT       1284699 non-null  float64
 8   NAME_CONTRACT_STATUS       1670214 non-null  object 
 9   DAYS_DECISION              1670214 non-null  int64  
 10  NAME_PAYMENT_TYPE          1670214 non-null  object 
 11  CODE_REJECT_REASON         1670214 non-null  object 
 12  NAME_TYPE_SUITE            849809 non-null   object 
 13  NAME_CLIENT_

In [9]:
df_application = df_application.groupby(['APPLICATION_NUMBER']).mean().reset_index()
df_train_merged = pd.merge(df_train, df_application, on=('APPLICATION_NUMBER'), how ='left')
df_test_merged = pd.merge(df_test, df_application, on=('APPLICATION_NUMBER'), how ='left')


In [10]:
df_train_merged

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE,PREV_APPLICATION_NUMBER,AMOUNT_ANNUITY,AMT_APPLICATION,AMOUNT_CREDIT,AMOUNT_PAYMENT,AMOUNT_GOODS_PAYMENT,DAYS_DECISION,SELLERPLACE_AREA,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,123687442,0,Cash,4.977809e+07,7703.80500,68787.180000,67592.100000,4813.0800,68787.180,1221.000000,86.666667,10.666667,365243.00,1168.00,1002.666667,122688.00,122675.333333,0.333333
1,123597908,1,Cash,4.947161e+07,27919.00125,331908.750000,434949.750000,0.0000,331908.750,659.250000,38.750000,26.500000,365243.00,774.50,474.500000,474.50,465.500000,0.000000
2,123526683,0,Cash,4.933968e+07,32538.47625,353857.500000,402818.250000,18814.5000,707715.000,1423.166667,16.833333,18.000000,274586.75,1880.50,92406.750000,1344.00,1219.250000,0.250000
3,123710391,1,Cash,4.893297e+07,4237.69500,61206.750000,59661.000000,2250.0000,61206.750,1151.500000,2058.500000,14.000000,365243.00,1120.50,872.500000,183423.00,183419.500000,0.000000
4,123590329,1,Cash,4.913631e+07,14583.62250,266842.000000,308073.500000,1462.5000,300197.250,741.000000,55.777778,30.750000,365243.00,1396.00,1156.000000,1171.00,1167.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110088,123458312,0,Cash,4.952828e+07,13051.19700,66922.866000,74327.400000,0.8325,66922.866,1308.200000,1126.600000,7.200000,365243.00,1391.75,1226.750000,1264.25,1256.250000,0.500000
110089,123672463,0,Cash,4.895905e+07,9349.53300,68150.526923,67411.384615,6643.6500,88595.685,584.846154,26.692308,10.200000,292246.00,654.60,73537.600000,73529.60,73492.600000,0.200000
110090,123723001,0,Cash,4.935806e+07,4000.29750,30363.750000,28244.250000,4281.7500,30363.750,464.500000,156.000000,9.000000,365243.00,434.00,194.000000,299.00,280.000000,0.500000
110091,123554358,0,Cash,4.930920e+07,24760.71000,299340.000000,327861.000000,0.0000,299340.000,789.000000,66.000000,21.000000,365243.00,815.00,305.000000,305.00,303.000000,0.000000


In [11]:
correlation = df_train_merged.corr()
corr_with_target = correlation["TARGET"].sort_values(ascending = False)
corr_with_target

TARGET                       1.000000
DAYS_FIRST_DUE               0.003687
DAYS_LAST_DUE_1ST_VERSION    0.003098
DAYS_LAST_DUE                0.000856
DAYS_DECISION                0.000718
DAYS_FIRST_DRAWING           0.000161
DAYS_TERMINATION            -0.000149
SELLERPLACE_AREA            -0.000428
PREV_APPLICATION_NUMBER     -0.001342
AMOUNT_ANNUITY              -0.001518
CNT_PAYMENT                 -0.001674
NFLAG_INSURED_ON_APPROVAL   -0.002045
APPLICATION_NUMBER          -0.002239
AMOUNT_GOODS_PAYMENT        -0.002547
AMOUNT_CREDIT               -0.003129
AMT_APPLICATION             -0.003282
AMOUNT_PAYMENT              -0.005465
Name: TARGET, dtype: float64

In [18]:
def fit_catboost(x_train, y_train, model_params, categorical, *args):
    """
    Обучение модели CatBoostClassifier.

    Parameters
    ----------
    x_train: pandas.core.frame.DataFrame
        Матрица признаков для обучения модели.

    y_valid: pandas.core.frame.Series
        Вектор целевой переменной для валидации модели.

    model_params: dict
        Словарь со значением гиперпараметров модели.

    categorical: List[str]
        Список с названием категориальных признаков.

    Returns
    -------
    model: catboost.core.CatBoostClassifier
        Экземпляр обученного классификатора.

    """
    eval_set = [(x_train, y_train)]

    if args == 2:
        eval_set.append((args[0], args[1]))

    model = cb.CatBoostClassifier(**model_params)
    model.fit(
        X=x_train,
        y=y_train,
        eval_set=eval_set,
        cat_features=categorical
    )

    return model

def evaluate_model(model, *args):
    """
    Оценка качества модели.

    Parameters
    ----------
    model: catboost.core.CatBoostClassifier
        Экземпляр обученного классификатора.

    args: pandas.core.frame.DataFrame
        Пары из обучающей выборки и вектора истинных ответов.
        Опциональные параметры.

    """
    eval_data = [(args[i], args[i+1]) for i in range(0, len(args), 2)]
    if eval_data:
        for sample, target in eval_data:
            y_pred = model.predict_proba(sample)[:, 1]
            score = roc_auc_score(target, y_pred)
            print(f"score = {round(score, 6)}")

def prepare_data(X, categorical, to_drop):
    """
    Преобразование данных для передачи в модель.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков для передачи в модель.

    categorical: List[str]
        Список с названием категориальных признаков.

    drop_features: List[str]
        Список с названием признаков, которые не должны
        участвовать в обучении.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Матрица признаков, подготовленная для передачи в модель.

    """
    X_transformed = X.copy()
    to_drop = set(X.columns) & set(to_drop)

    if to_drop:
        X_transformed = X_transformed.drop(to_drop, axis=1)

    X_transformed[categorical] = X_transformed[categorical].astype(str)
    return X_transformed

In [22]:
cb_params_10000 = {
    "n_estimators": 10000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 300,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 27
}

to_drop = [
    "PREV_APPLICATION_NUMBER",
    "TARGET",
]

categorical = df_train_merged.select_dtypes(include=["object"]).columns.tolist()

In [25]:
x_train, x_valid = train_test_split(
    df_train_merged, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    df_train_merged["TARGET"], train_size=0.7, random_state=27, shuffle=True
)


x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_test = prepare_data(df_test_merged, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_test.shape))


model = fit_catboost(
    x_train, y_train, cb_params_10000, categorical, x_valid, y_valid)

x_train.shape = 77065 rows, 16 cols
x_valid.shape = 33028 rows, 16 cols
x_test.shape = 165141 rows, 16 cols
0:	test: 0.5178868	best: 0.5178868 (0)	total: 172ms	remaining: 28m 39s
300:	test: 0.5980786	best: 0.5980786 (300)	total: 4.33s	remaining: 2m 19s
600:	test: 0.6344608	best: 0.6344608 (600)	total: 8.35s	remaining: 2m 10s
900:	test: 0.6564892	best: 0.6564892 (900)	total: 12.4s	remaining: 2m 5s
1200:	test: 0.6767542	best: 0.6767542 (1200)	total: 16.6s	remaining: 2m 1s
1500:	test: 0.6959606	best: 0.6959606 (1500)	total: 20.9s	remaining: 1m 58s
1800:	test: 0.7100924	best: 0.7100924 (1800)	total: 25.1s	remaining: 1m 54s
2100:	test: 0.7222824	best: 0.7222824 (2100)	total: 29.4s	remaining: 1m 50s
2400:	test: 0.7330347	best: 0.7330347 (2400)	total: 33.9s	remaining: 1m 47s
2700:	test: 0.7426558	best: 0.7426558 (2700)	total: 38.6s	remaining: 1m 44s
3000:	test: 0.7528028	best: 0.7528028 (3000)	total: 43.4s	remaining: 1m 41s
3300:	test: 0.7609957	best: 0.7609957 (3300)	total: 47.9s	remaining: 

In [26]:
y_pred = model.predict_proba(df_test_merged)[:, 1]

In [28]:
y_pred = pd.DataFrame(y_pred)

In [32]:
y_pred = y_pred.rename({0: 'TARGET'}, axis=1)

In [33]:
y_pred

Unnamed: 0,TARGET
0,0.054673
1,0.062248
2,0.033501
3,0.073959
4,0.073718
...,...
165136,0.116350
165137,0.125536
165138,0.056613
165139,0.080065


In [35]:
df_result = pd.read_csv('/Kaggle/input/geekbrains-competitive-data-analysis/test.csv')
df_result = df_result.drop(['NAME_CONTRACT_TYPE'], axis=1)
df_result = pd.concat([df_result, y_pred], axis=1)

In [36]:
df_result

Unnamed: 0,APPLICATION_NUMBER,TARGET
0,123724268,0.054673
1,123456549,0.062248
2,123428178,0.033501
3,123619984,0.073959
4,123671104,0.073718
...,...,...
165136,123487967,0.116350
165137,123536402,0.125536
165138,123718238,0.056613
165139,123631557,0.080065


In [38]:
# 4.4. Экспорт результатов
import csv
filename = 'Kaldin_cb_app_1.csv'
df_result.to_csv(filename, index=None)