In [2]:
import os
import time
from typing import Optional

import numpy as np
import pandas as pd
import catboost as cb
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from teacher_bot.risk_default_prediction import RiskDefaultPrediction
pd.set_option("display.max_columns", 150)

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, validation_curve, learning_curve

In [38]:
import lightgbm as lgt

In [3]:
def get_input(data_path: str) -> pd.DataFrame:
    """
    Считывание данных и вывод основной информации о наборе данных.

    Parameters
    ----------
    data_path: str
        Название файла.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        Загруженный набор данных в pandas.DataFrame

    """
    base_path = "./data/"
    data = pd.read_csv(f"{base_path}/{data_path}")
    data.columns = [col.lower() for col in data.columns]
    print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")

    return data

def check_missings(X: pd.DataFrame):
    """
    Вычисление количества пропусков и пропусков в данных;

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков для исследования.

    Returns
    -------
    na_stats: pandas.core.frame.DataFrame
        Матрица с со статистикой пропусков.
    
    """
    total = X.isnull().sum().sort_values(ascending = False)
    percent = (X.isnull().sum()/X.isnull().count()*100).sort_values(ascending = False)
    na_stats = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return na_stats.T


def plot_categorical(data, col, size=[8 ,4], xlabel_angle=0, title=''):
    """
    Визуализация распределения значений категориального признака.

    """
    plotdata = data[col].value_counts()
    plt.figure(figsize = size)
    sns.barplot(x = plotdata.index, y=plotdata.values, palette="viridis")
    plt.title(title)
    if xlabel_angle!=0: 
        plt.xticks(rotation=xlabel_angle)
        
        
def plot_kde_target(feature_name: str, data: pd.DataFrame):
    """
    Визуализация функции распределения признаков в зависимости
    от значения целевой переменной на обучающей выборке.
    Вывод коэффициента корреляции между значением признака
    и значением целевой переменной, вывод медианы
    значений признака в разрезе целевой переменной.

    Parameters
    ----------
    faeture_name: str
        Название анализируемого признака.

    data: pandas.core.frame.DataFrame
        Матрица признаков для обучения.

    """
    corr = data["target"].corr(data[feature_name])

    mask = data["target"] == 1
    avg_target = data.loc[mask, feature_name].median()
    avg_non_target = data.loc[~mask, feature_name].median()

    fig = plt.figure(figsize=(12, 6))
    plt.title(f"{feature_name} Distribution", size=14)
    sns.kdeplot(data.loc[mask, feature_name], linewidth=3, color="blue", label="TARGET = 1")
    sns.kdeplot(data.loc[~mask, feature_name], linewidth=3, color="green", label="TARGET = 0")
    plt.legend(loc="best", fontsize=14)
    plt.xlabel(feature_name, size=14)
    plt.ylabel("Density", size=14)

    print(f"The correlation between {feature_name} and target = {round(corr, 4)}")
    print(f"Median-value for default-loan = {round(avg_target, 4)}")
    print(f"Median-value for non default-loan = {round(avg_target, 4)}")


def catboost_hold_out_validation(params, X, y, split_params = [0.7, 0.2, 0.1], categorical = None):
    """
    Hold-Out валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    split_params: List[float], optional, default = [0.7, 0.2, 0.1]
        Параметры (доли) разбиения выборки.
        Опциональный параметр, по умолчанию, равен [0.7, 0.2, 0.1].
    
    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimator: catboost.core.CatBoostClassifier
        Обученный классификатор catboost.

    test_prediction: np.array, optional
        Вектор прогнозов для тестовой выборки.
        Опциональный объект, возвращается только, если split_params
        содержит 3 значения.

    """
    print(f"Starting at {time.ctime()}")
    numeric = list(set(X.columns) - set(categorical))
    if categorical:
        X[categorical] = X[categorical].astype(str)

    x_train, x_valid = train_test_split(
        X, train_size=split_params[0], random_state=27
    )
    y_train, y_valid = train_test_split(
        y, train_size=split_params[0], random_state=27
    )

    if len(split_params) == 3:
        test_size = int(split_params[2] * X.shape[0])

        x_valid, x_test = train_test_split(
            x_valid, test_size=test_size, random_state=72
        )
        y_valid, y_test = train_test_split(
            y_valid, test_size=test_size, random_state=72
        )

    model = cb.CatBoostClassifier(**params)
    model.fit(
        x_train, y_train, categorical,
        eval_set=[(x_train, y_train), (x_valid, y_valid)]
    )

    print("="*80)
    valid_score = roc_auc_score(y_valid, model.predict_proba(x_valid)[:, 1])
    print(f"Valid Score = {round(valid_score, 4)}")

    if len(split_params) == 3:

        test_prediction = model.predict_proba(x_test)[:, 1]
        test_score = roc_auc_score(y_test, test_prediction)
        print(f"Test Score = {round(test_score, 4)}")

        return model, test_prediction

    else:
        return model


def create_numerical_aggs(data: pd.DataFrame,
                          groupby_id: str,
                          aggs: dict,
                          prefix: Optional[str] = None,
                          suffix: Optional[str] = None,
                          ) -> pd.DataFrame:
    """
    Построение агрегаций для числовых признаков.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Выборка для построения агрегаций.

    groupby_id: str
        Название ключа, по которому нужно произвести группировку.

    aggs: dict
        Словарь с названием признака и списка функций.
        Ключ словаря - название признака, который используется для
        вычисления агрегаций, значение словаря - список с названием
        функций для вычисления агрегаций.

    prefix: str, optional, default = None
        Префикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    suffix: str, optional, default = None
        Суффикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    stats: pandas.core.frame.DataFrame
        Выборка с рассчитанными агрегациями.

    """
    if not prefix:
        prefix = ""
    if not suffix:
        suffix = ""

    data_grouped = data.groupby(groupby_id)
    stats = data_grouped.agg(aggs)
    stats.columns = [f"{prefix}{feature}_{stat}{suffix}".upper() for feature, stat in stats]
    stats = stats.reset_index()

    return stats

In [4]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.
    
    Parameters
    --------
        df (dataframe): 
            the dataframe to calculate the statistics on
        group_var (string): 
            the variable by which to group df
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with the statistics aggregated for 
            all numeric columns. Each instance of the grouping variable will have 
            the statistics (mean, min, max, sum; currently supported) calculated. 
            The columns are also renamed to keep track of features created.
    
    """
    # Remove id variables other than grouping variable
    for col in df:
        if col != group_var and 'sk_id' in col:
            df = df.drop(columns = col)
            
    group_ids = df[group_var]
    numeric_df = df.select_dtypes('number')
    numeric_df[group_var] = group_ids

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()

    # Need to create new column names
    columns = [group_var]

    # Iterate through the variables names
    for var in agg.columns.levels[0]:
        # Skip the grouping variable
        if var != group_var:
            # Iterate through the stat names
            for stat in agg.columns.levels[1][:-1]:
                # Make a new column name for the variable and stat
                columns.append('%s_%s_%s' % (df_name, var, stat))

    agg.columns = columns
    return agg

In [5]:
def count_categorical(df, group_var, df_name):
    """Computes counts and normalized counts for each observation
    of `group_var` of each unique category in every categorical variable
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    group_var : string
        The variable by which to group the dataframe. For each unique
        value of this variable, the final dataframe will have one row
        
    df_name : string
        Variable added to the front of column names to keep track of columns

    
    Return
    --------
    categorical : dataframe
        A dataframe with counts and normalized counts of each unique category in every categorical variable
        with one row for every unique value of the `group_var`.
        
    """
    
    # Select the categorical columns
    categorical = pd.get_dummies(df.select_dtypes('object'))

    # Make sure to put the identifying id on the column
    categorical[group_var] = df[group_var]

    # Groupby the group var and calculate the sum and mean
    categorical = categorical.groupby(group_var).agg(['sum', 'mean'])
    
    column_names = []
    
    # Iterate through the columns in level 0
    for var in categorical.columns.levels[0]:
        # Iterate through the stats in level 1
        for stat in ['count', 'count_norm']:
            # Make a new column name
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    
    categorical.columns = column_names
    
    return categorical

In [46]:
def agg_categorical(df, parent_var, df_name):
    """
    Aggregates the categorical features in a child dataframe
    for each observation of the parent variable.
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    parent_var : string
        The variable by which to group and aggregate the dataframe. For each unique
        value of this variable, the final dataframe will have one row
        
    df_name : string
        Variable added to the front of column names to keep track of columns

    
    Return
    --------
    categorical : dataframe
        A dataframe with aggregated statistics for each observation of the parent_var
        The columns are also renamed and columns with duplicate values are removed.
        
    """
    
    # Select the categorical columns
    categorical = pd.get_dummies(df.select_dtypes('category'))

    # Make sure to put the identifying id on the column
    categorical[parent_var] = df[parent_var]

    # Groupby the group var and calculate the sum and mean
    categorical = categorical.groupby(parent_var).agg(['sum', 'count', 'mean'])
    
    column_names = []
    
    # Iterate through the columns in level 0
    for var in categorical.columns.levels[0]:
        # Iterate through the stats in level 1
        for stat in ['sum', 'count', 'mean']:
            # Make a new column name
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    
    categorical.columns = column_names
    
    # Remove duplicate columns by values
    _, idx = np.unique(categorical, axis = 1, return_index = True)
    categorical = categorical.iloc[:, idx]
    
    return categorical

In [6]:
train = get_input("train.csv")
test = get_input("test.csv")
#applications = get_input("previous_application.csv")
#payments = get_input("installments_payments.csv")
bureau = get_input("bureau.csv")
bureau_balance = get_input("bureau_balance.csv")
#credit_card_balance = get_input("credit_card_balance.csv")
#balance = get_input("POS_CASH_balance.csv")

train.csv: shape = 215257 rows, 122 cols
test.csv: shape = 92254 rows, 121 cols
bureau.csv: shape = 1716428 rows, 17 cols
bureau_balance.csv: shape = 27299925 rows, 3 cols


In [60]:
del bureau

In [7]:
# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in train:
    if train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(train[col].unique())) <= 2:
            # Train on the training data
            le.fit(train[col])
            # Transform both training and testing data
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


In [8]:
# one-hot encoding of categorical variables
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [9]:
# Create an anomalous flag column
train['DAYS_EMPLOYED_ANOM'] = train["days_employed"] == 365243

# Replace the anomalous values with nan
train['days_employed'].replace({365243: np.nan}, inplace = True)

test['DAYS_EMPLOYED_ANOM'] = test["days_employed"] == 365243
test["days_employed"].replace({365243: np.nan}, inplace = True)

print('There are %d anomalies in the test data out of %d entries' % (test["DAYS_EMPLOYED_ANOM"].sum(), len(test)))

There are 16564 anomalies in the test data out of 92254 entries


In [10]:
# Groupby the client id (SK_ID_CURR), count the number of previous loans, and rename the column
previous_loan_counts = bureau.groupby('sk_id_curr', as_index=False)['sk_id_bureau'].count().rename(columns = {'sk_id_bureau': 'previous_loan_counts'})
previous_loan_counts.head()

Unnamed: 0,sk_id_curr,previous_loan_counts
0,100001,7
1,100002,8
2,100003,4
3,100004,2
4,100005,3


In [11]:
# Join to the training dataframe
train = train.merge(previous_loan_counts, on = 'sk_id_curr', how = 'left')
test = test.merge(previous_loan_counts, on = 'sk_id_curr', how = 'left')

# Fill the missing values with 0 
train['previous_loan_counts'] = train['previous_loan_counts'].fillna(0)
test['previous_loan_counts'] = test['previous_loan_counts'].fillna(0)
train.head()

Unnamed: 0,sk_id_curr,target,name_contract_type,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_birth,days_employed,days_registration,days_id_publish,own_car_age,flag_mobil,flag_emp_phone,flag_work_phone,flag_cont_mobile,flag_phone,flag_email,cnt_fam_members,region_rating_client,region_rating_client_w_city,hour_appr_process_start,reg_region_not_live_region,reg_region_not_work_region,live_region_not_work_region,reg_city_not_live_city,reg_city_not_work_city,live_city_not_work_city,ext_source_1,ext_source_2,ext_source_3,apartments_avg,basementarea_avg,years_beginexpluatation_avg,years_build_avg,commonarea_avg,elevators_avg,entrances_avg,floorsmax_avg,floorsmin_avg,landarea_avg,livingapartments_avg,livingarea_avg,nonlivingapartments_avg,nonlivingarea_avg,apartments_mode,basementarea_mode,years_beginexpluatation_mode,years_build_mode,commonarea_mode,elevators_mode,entrances_mode,floorsmax_mode,floorsmin_mode,landarea_mode,livingapartments_mode,livingarea_mode,nonlivingapartments_mode,nonlivingarea_mode,apartments_medi,basementarea_medi,years_beginexpluatation_medi,years_build_medi,commonarea_medi,elevators_medi,entrances_medi,floorsmax_medi,floorsmin_medi,landarea_medi,livingapartments_medi,livingarea_medi,...,organization_type_Agriculture,organization_type_Bank,organization_type_Business Entity Type 1,organization_type_Business Entity Type 2,organization_type_Business Entity Type 3,organization_type_Cleaning,organization_type_Construction,organization_type_Culture,organization_type_Electricity,organization_type_Emergency,organization_type_Government,organization_type_Hotel,organization_type_Housing,organization_type_Industry: type 1,organization_type_Industry: type 10,organization_type_Industry: type 11,organization_type_Industry: type 12,organization_type_Industry: type 13,organization_type_Industry: type 2,organization_type_Industry: type 3,organization_type_Industry: type 4,organization_type_Industry: type 5,organization_type_Industry: type 6,organization_type_Industry: type 7,organization_type_Industry: type 8,organization_type_Industry: type 9,organization_type_Insurance,organization_type_Kindergarten,organization_type_Legal Services,organization_type_Medicine,organization_type_Military,organization_type_Mobile,organization_type_Other,organization_type_Police,organization_type_Postal,organization_type_Realtor,organization_type_Religion,organization_type_Restaurant,organization_type_School,organization_type_Security,organization_type_Security Ministries,organization_type_Self-employed,organization_type_Services,organization_type_Telecom,organization_type_Trade: type 1,organization_type_Trade: type 2,organization_type_Trade: type 3,organization_type_Trade: type 4,organization_type_Trade: type 5,organization_type_Trade: type 6,organization_type_Trade: type 7,organization_type_Transport: type 1,organization_type_Transport: type 2,organization_type_Transport: type 3,organization_type_Transport: type 4,organization_type_University,organization_type_XNA,fondkapremont_mode_not specified,fondkapremont_mode_org spec account,fondkapremont_mode_reg oper account,fondkapremont_mode_reg oper spec account,housetype_mode_block of flats,housetype_mode_specific housing,housetype_mode_terraced house,wallsmaterial_mode_Block,wallsmaterial_mode_Mixed,wallsmaterial_mode_Monolithic,wallsmaterial_mode_Others,wallsmaterial_mode_Panel,"wallsmaterial_mode_Stone, brick",wallsmaterial_mode_Wooden,emergencystate_mode_No,emergencystate_mode_Yes,DAYS_EMPLOYED_ANOM,previous_loan_counts
0,342217,0,1,0,1,0,202500.0,585000.0,29250.0,585000.0,0.00712,-14937,-5026.0,-1.0,-4606,,1,1,0,1,0,0,2.0,2,2,10,0,0,0,0,0,0,,0.612914,0.408359,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,13.0
1,183133,0,0,0,1,0,112500.0,656811.0,30564.0,567000.0,0.016612,-22021,,-2979.0,-5036,,1,0,0,1,0,0,2.0,2,2,11,0,0,0,0,0,0,,0.703983,0.75574,0.3711,0.2556,0.9955,0.9388,,0.36,0.3103,0.375,,0.0,,0.3977,,0.0,0.3782,0.2653,0.9955,0.9412,,0.3625,0.3103,0.375,,0.0,,0.4144,,0.0,0.3747,0.2556,0.9955,0.9396,,0.36,0.3103,0.375,,0.0,,0.4049,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,True,11.0
2,449106,0,1,0,1,0,144000.0,450000.0,22500.0,450000.0,0.011703,-9989,-368.0,-356.0,-1635,,1,1,0,1,0,0,2.0,2,2,10,0,0,0,0,0,0,,0.631356,0.754406,0.1031,0.0,0.9752,0.66,,0.0,0.1724,0.1667,0.2083,0.0947,,0.0887,,0.0,0.105,0.0,0.9752,0.6733,,0.0,0.1724,0.1667,0.2083,0.0968,,0.0924,,0.0,0.1041,0.0,0.9752,0.6645,,0.0,0.1724,0.1667,0.2083,0.0963,,0.0903,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,False,5.0
3,351350,0,0,0,1,1,450000.0,1483231.5,51687.0,1354500.0,0.030755,-14216,-4475.0,-1051.0,-4977,,1,1,0,1,1,0,3.0,2,2,18,0,0,0,0,1,1,0.382578,0.775988,0.342529,0.1536,0.0744,0.9836,0.7756,0.0234,0.0,0.1724,0.1667,0.2083,0.0597,,0.092,,0.0016,0.1565,0.0772,0.9836,0.7844,0.0236,0.0,0.1724,0.1667,0.2083,0.0611,,0.0959,,0.0017,0.1551,0.0744,0.9836,0.7786,0.0235,0.0,0.1724,0.1667,0.2083,0.0608,,0.0937,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False,26.0
4,226525,0,0,1,1,0,202500.0,454500.0,19255.5,454500.0,0.072508,-17518,-5826.0,-5500.0,-1069,10.0,1,1,0,1,1,0,2.0,1,1,9,0,0,0,0,0,0,,0.743458,0.740799,0.0575,0.0715,0.9722,0.6192,0.0217,0.0,0.1034,0.1417,0.1833,0.0,0.0462,0.046,0.0031,0.0152,0.0231,0.05,0.9737,0.6537,0.0008,0.0,0.069,0.1667,0.2083,0.0,0.0193,0.0163,0.0039,0.0019,0.0593,0.061,0.9737,0.6444,0.0122,0.0,0.1034,0.1667,0.2083,0.0,0.0487,0.0493,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,False,6.0


In [25]:
# Group by the client id, calculate aggregation statistics
bureau_agg = bureau.drop(columns = ['sk_id_bureau']).groupby('sk_id_curr', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()

In [26]:
# List of column names
columns = ['sk_id_curr']

# Iterate through the variables names
for var in bureau_agg.columns.levels[0]:
    # Skip the id name
    if var != 'sk_id_curr':
        
        # Iterate through the stat names
        for stat in bureau_agg.columns.levels[1][:-1]:
            # Make a new column name for the variable and stat
            columns.append('bureau_%s_%s' % (var, stat))

In [27]:
# Assign the list of columns names as the dataframe column names
bureau_agg.columns = columns

In [28]:
# Merge with the training data
train = train.merge(bureau_agg, on = 'sk_id_curr', how = 'left')
test = test.merge(bureau_agg, on = 'sk_id_curr', how = 'left')

In [48]:
# Counts of each type of status for each previous loan
bureau_balance_counts = count_categorical(bureau_balance, group_var = 'sk_id_bureau', df_name = 'bureau_balance')

In [49]:
# Calculate value count statistics for each `SK_ID_CURR` 
bureau_balance_agg = agg_numeric(bureau_balance, group_var = 'sk_id_bureau', df_name = 'bureau_balance')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_df[group_var] = group_ids


In [50]:
# Dataframe grouped by the loan
bureau_by_loan = bureau_balance_agg.merge(bureau_balance_counts, right_index = True, left_on = 'sk_id_bureau', how = 'outer')

# Merge to include the SK_ID_CURR
bureau_by_loan = bureau_by_loan.merge(bureau[['sk_id_bureau', 'sk_id_curr']], on = 'sk_id_bureau', how = 'left')

In [51]:
bureau_balance_by_client = agg_numeric(bureau_by_loan.drop(columns = ['sk_id_bureau']), group_var = 'sk_id_curr', df_name = 'client')

In [52]:
train = train.merge(bureau_balance_by_client, on = 'sk_id_curr', how = 'left')
test = test.merge(bureau_balance_by_client, on = 'sk_id_curr', how = 'left')

In [84]:
train['app EXT_SOURCE mean'] = train[['ext_source_1', 'ext_source_2', 'ext_source_3']].mean(axis = 1)
train['app EXT_SOURCE std'] = train[['ext_source_1', 'ext_source_2', 'ext_source_3']].std(axis = 1)
train['app EXT_SOURCE prod'] = train['ext_source_1'] * train['ext_source_2'] * train['ext_source_3']
train['app EXT_SOURCE_1 * EXT_SOURCE_2'] = train['ext_source_1'] * train['ext_source_2']
train['app EXT_SOURCE_1 * EXT_SOURCE_3'] = train['ext_source_1'] * train['ext_source_3']
train['app EXT_SOURCE_2 * EXT_SOURCE_3'] = train['ext_source_2'] * train['ext_source_3']
train['app EXT_SOURCE_1 * DAYS_EMPLOYED'] = train['ext_source_1'] * train['days_employed']
train['app EXT_SOURCE_2 * DAYS_EMPLOYED'] = train['ext_source_2'] * train['days_employed']
train['app EXT_SOURCE_3 * DAYS_EMPLOYED'] = train['ext_source_3'] * train['days_employed']
train['app EXT_SOURCE_1 / DAYS_BIRTH'] = train['ext_source_1'] / train['days_birth']
train['app EXT_SOURCE_2 / DAYS_BIRTH'] = train['ext_source_2'] / train['days_birth']
train['app EXT_SOURCE_3 / DAYS_BIRTH'] = train['ext_source_3'] / train['days_birth']
    
train['app AMT_CREDIT / AMT_ANNUITY'] = train['amt_credit'] / train['amt_annuity']
train['app AMT_CREDIT / AMT_INCOME_TOTAL'] = train['amt_credit'] / train['amt_income_total']
    
train['app AMT_INCOME_TOTAL / 12 - AMT_ANNUITY'] = train['amt_income_total'] / 12. - train['amt_annuity']
train['app AMT_INCOME_TOTAL / AMT_ANNUITY'] = train['amt_income_total'] / train['amt_annuity']

train['app AMT_INCOME_TOTAL / CNT_CHILDREN'] = train['amt_income_total'] / (1 + train['cnt_children'])
    
train['app OWN_CAR_AGE / DAYS_BIRTH'] = train['own_car_age'] / train['days_birth']
train['app OWN_CAR_AGE / DAYS_EMPLOYED'] = train['own_car_age'] / train['days_employed']
    
train['app DAYS_LAST_PHONE_CHANGE / DAYS_BIRTH'] = train['days_last_phone_change'] / train['days_birth']
train['app DAYS_LAST_PHONE_CHANGE / DAYS_EMPLOYED'] = train['days_last_phone_change'] / train['days_employed']
train['app DAYS_EMPLOYED - DAYS_BIRTH'] = train['days_employed'] - train['days_birth']
train['app DAYS_EMPLOYED / DAYS_BIRTH'] = train['days_employed'] / train['days_birth']
    

In [86]:
test['app EXT_SOURCE mean'] = test[['ext_source_1', 'ext_source_2', 'ext_source_3']].mean(axis = 1)
test['app EXT_SOURCE std'] = test[['ext_source_1', 'ext_source_2', 'ext_source_3']].std(axis = 1)
test['app EXT_SOURCE prod'] = test['ext_source_1'] * test['ext_source_2'] * test['ext_source_3']
test['app EXT_SOURCE_1 * EXT_SOURCE_2'] = test['ext_source_1'] * test['ext_source_2']
test['app EXT_SOURCE_1 * EXT_SOURCE_3'] = test['ext_source_1'] * test['ext_source_3']
test['app EXT_SOURCE_2 * EXT_SOURCE_3'] = test['ext_source_2'] * test['ext_source_3']
test['app EXT_SOURCE_1 * DAYS_EMPLOYED'] = test['ext_source_1'] * test['days_employed']
test['app EXT_SOURCE_2 * DAYS_EMPLOYED'] = test['ext_source_2'] * test['days_employed']
test['app EXT_SOURCE_3 * DAYS_EMPLOYED'] = test['ext_source_3'] * test['days_employed']
test['app EXT_SOURCE_1 / DAYS_BIRTH'] = test['ext_source_1'] / test['days_birth']
test['app EXT_SOURCE_2 / DAYS_BIRTH'] = test['ext_source_2'] / test['days_birth']
test['app EXT_SOURCE_3 / DAYS_BIRTH'] = test['ext_source_3'] / test['days_birth']

test['app AMT_CREDIT / AMT_ANNUITY'] = test['amt_credit'] / test['amt_annuity']
test['app AMT_CREDIT / AMT_INCOME_TOTAL'] = test['amt_credit'] / test['amt_income_total']

test['app AMT_INCOME_TOTAL / 12 - AMT_ANNUITY'] = test['amt_income_total'] / 12. - test['amt_annuity']
test['app AMT_INCOME_TOTAL / AMT_ANNUITY'] = test['amt_income_total'] / test['amt_annuity']
test['app AMT_INCOME_TOTAL / CNT_CHILDREN'] = test['amt_income_total'] / (1 + test['cnt_children'])

test['app OWN_CAR_AGE / DAYS_BIRTH'] = test['own_car_age'] / test['days_birth']
test['app OWN_CAR_AGE / DAYS_EMPLOYED'] = test['own_car_age'] / test['days_employed']
    
test['app DAYS_LAST_PHONE_CHANGE / DAYS_BIRTH'] = test['days_last_phone_change'] / test['days_birth']
test['app DAYS_LAST_PHONE_CHANGE / DAYS_EMPLOYED'] = test['days_last_phone_change'] / test['days_employed']
test['app DAYS_EMPLOYED - DAYS_BIRTH'] = test['days_employed'] - test['days_birth']
test['app DAYS_EMPLOYED / DAYS_BIRTH'] = test['days_employed'] / test['days_birth']

In [76]:
train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

Series([], dtype: float64)

In [87]:
# Calculate all correlations in dataframe
corrs = train.corr()
corrs = corrs.sort_values('target', ascending = False)

# Ten most positive correlations
pd.DataFrame(corrs['target'].head(10))
# Set the threshold
threshold = 0.8

# Empty dictionary to hold correlated variables
above_threshold_vars = {}

# For each column, record the variables that are above the threshold
for col in corrs:
    above_threshold_vars[col] = list(corrs.index[corrs[col] > threshold])

In [88]:
# Track columns to remove and columns already examined
cols_to_remove = []
cols_seen = []
cols_to_remove_pair = []

# Iterate through columns and correlated columns
for key, value in above_threshold_vars.items():
    # Keep track of columns already examined
    cols_seen.append(key)
    for x in value:
        if x == key:
            next
        else:
            # Only want to remove one in a pair
            if x not in cols_seen:
                cols_to_remove.append(x)
                cols_to_remove_pair.append(key)
            
cols_to_remove = list(set(cols_to_remove))
print('Number of columns to remove: ', len(cols_to_remove))

Number of columns to remove:  8


In [89]:
train = train.drop(columns = cols_to_remove)
test = test.drop(columns = cols_to_remove)

In [90]:
y_train = train["target"]
x_train = train.drop(["target", "sk_id_curr"], axis=1)

In [35]:
model = xgb.XGBClassifier(random_state=27)
model.fit(x_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=27,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [40]:
import re
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [91]:
model = lgt.LGBMClassifier(random_state=27)
model.fit(x_train, y_train)

LGBMClassifier(random_state=27)

In [92]:
prediction = model.predict_proba(test.drop("sk_id_curr", axis=1))[:, 1]

submit = pd.DataFrame({
    "sk_id_curr": test["sk_id_curr"],
    "score": prediction
})
submit.head(n=3)

Unnamed: 0,sk_id_curr,score
0,174545,0.034612
1,209898,0.048347
2,454938,0.028125


In [93]:
bot = RiskDefaultPrediction()
bot.production_quality(answer=submit)

Привет! Приятно познакомиться!
Запускаю тестирование...
Проверяю метрики...
Твой результат: 0.7721653082416754
Ура! Мы получили удовлетворительную по качеству модельку! Финальная кодовая фраза 'Data Scientist’ы делают этот мир лучше!' Это правда, мы с тобой сделали мир лучше, позволив компании уверенее принимать решения на основе данных и увереннее развивать свой бизнес!
