In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
class Encoder():
    '''Apply label or one-hot encoding to columns of a wrapped DataFrame.

    The frame given to the constructor is stored on ``self.dataframe``.
    ``label_encoder`` overwrites a column of that frame in place, while
    ``hot_encoder`` rebinds ``self.dataframe`` to a new frame and returns it.
    '''

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def label_encoder(self, column):
        '''Replace ``column`` with the integer codes from ``LabelEncoder``.'''
        le = LabelEncoder()
        self.dataframe[column] = le.fit_transform(self.dataframe[column])

    def hot_encoder(self, column):
        '''One-hot encode ``column`` and return the updated frame.

        The source column is removed and one ``hot_<column>_<category>``
        column per observed category is appended. The new columns share the
        index of the wrapped frame, so rows stay aligned for any index.
        '''
        # scikit-learn >= 1.2 names the dense-output switch ``sparse_output``;
        # older releases only know ``sparse`` and reject the new keyword.
        try:
            ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
        encoded = ohe.fit_transform(self.dataframe[[column]])
        # Build the names from ``categories_`` instead of calling
        # get_feature_names / get_feature_names_out, whose availability and
        # input validation differ between scikit-learn versions.
        names = [f'hot_{column}_{category}' for category in ohe.categories_[0]]
        aux_df = pd.DataFrame(encoded,
                              columns=names,
                              index=self.dataframe.index)
        # Drop the column that was passed in (not a global loop variable).
        remaining = self.dataframe.drop(column, axis=1)
        self.dataframe = pd.concat([remaining, aux_df], axis=1)
        return self.dataframe

In [3]:
def missing_vars(data, column, random_proba=True):
    '''Fill the missing values of ``data[column]`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; it is mutated, nothing is returned.
    column : hashable
        Label of the column whose missing values are filled.
    random_proba : bool, default True
        If True, every missing entry is drawn at random from the observed
        values, each weighted by its relative frequency in the column. The
        draw uses NumPy's global RNG, so seed it for reproducible results.
        If False, missing entries get the most frequent observed value.

    A column with no missing values, or with no observed values at all,
    is left untouched.
    '''
    missing = data[column].isnull()  # flag of missing values
    # Observed values (index) with their relative frequencies (values),
    # ordered from most to least common.
    freq = data[column].value_counts(normalize=True)
    if not missing.any() or freq.empty:
        return

    if random_proba:
        # Sample replacements according to the observed distribution.
        data.loc[missing, column] = np.random.choice(freq.index.to_list(),
                                                     size=int(missing.sum()),
                                                     p=freq.values)
    else:
        # Use the most common value in the column.
        data[column] = data[column].fillna(freq.index[0])

In [4]:
# Read the three input tables from CSV files in the working directory.
data_1, model_, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in ('data.csv', 'model.csv', 'sample_submission.csv')
)

In [5]:
# Parse APPLICATION_MONTH once (vectorized) and derive the calendar month and
# year from the parsed timestamps instead of parsing every value twice.
application_ts = pd.to_datetime(data_1['APPLICATION_MONTH'])
data_1['Category_month'] = application_ts.dt.month
data_1['Category_year'] = application_ts.dt.year

In [6]:
# Remove the raw date column and the 'incident' column from the features.
data_1 = data_1.drop(columns=['APPLICATION_MONTH', 'incident'])

In [7]:
# Drop the features where more than half of the rows are missing.
na_per_column = data_1.isna().sum()
cols_to_drop = na_per_column[na_per_column > len(data_1) * 0.5].index
data_1 = data_1.drop(columns=cols_to_drop)

# Collect the remaining features that still contain gaps and need filling.
# NOTE(review): the [1:] slice skips the first such column — confirm intended.
na_remaining = data_1.isna().sum()
cols_to_fill = na_remaining[na_remaining > 0].index[1:]

In [8]:
# Fill the gaps with missing_vars. Its default mode samples from NumPy's
# global RNG, so seed it first to make the imputation reproducible.
np.random.seed(42)
# NOTE(review): keep the loop variable named `col` unless missing_vars is
# confirmed not to read a global of that name.
for col in cols_to_fill:
    missing_vars(data_1, col)

In [9]:
# Put the columns of model_ next to the cleaned features (rows are matched
# on the index labels of the two frames).
data_2 = pd.concat((data_1, model_), axis='columns')

In [10]:
# Regression target: absolute difference between PD and flg_90_12_add.
data_2['target'] = (data_2['PD'] - data_2['flg_90_12_add']).abs()

In [11]:
# Remove the two columns the target was computed from, plus 'incident'.
data_2 = data_2.drop(columns=['PD', 'incident', 'flg_90_12_add'])

In [12]:
# Categorical features: the first 19 columns plus the two derived date parts.
cat_cols = [*data_2.columns[:19], 'Category_month', 'Category_year']

In [13]:
# Columns from position 19 up to (not including) the last three.
# NOTE(review): num_cols is not referenced again in the visible cells —
# confirm it is still needed.
num_cols = data_2.columns[19:-3]

In [14]:
# Cast every column listed in cat_cols to int.
data_2[cat_cols] = data_2[cat_cols].astype(int)

In [15]:
# CatBoost regressor optimising MAE, with cat_cols declared as categorical.
cbr = CatBoostRegressor(
    loss_function='MAE',
    cat_features=cat_cols,
    random_seed=42,
)

In [16]:
# Inspect the rows whose target is known.
data_2[data_2['target'].notna()]

Unnamed: 0,Category_Feature_0,Category_Feature_1,Category_Feature_2,Category_Feature_3,Category_Feature_4,Category_Feature_5,Category_Feature_6,Category_Feature_7,Category_Feature_8,Category_Feature_9,...,Feature_1871,Feature_1875,Feature_1883,Feature_1884,Feature_1885,Feature_1886,Feature_1887,Category_month,Category_year,target
0,6,2,25,2,0,3,1,2,35000,0,...,0.016847,0.295133,0,0,0,1,20184,10,2018,0.022191
1,1,2,49,4,0,3,1,2,50000,0,...,0.043231,0.129693,0,1,1,1,20184,12,2018,0.019972
2,6,2,24,1,0,3,0,1,20000,0,...,0.106410,0.420321,0,1,0,1,20184,10,2018,0.043884
3,4,2,26,1,0,3,1,2,30000,0,...,0.079245,0.129872,0,1,1,1,20184,12,2018,0.013412
4,6,2,30,4,2,3,1,3,50000,0,...,0.011466,0.012740,0,1,0,1,20184,11,2018,0.033354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81612,1,2,56,2,0,3,0,4,66000,0,...,0.326544,0.129581,0,0,1,1,20201,2,2020,0.037867
81613,4,2,29,2,1,5,0,2,0,0,...,0.021559,0.065332,0,0,1,0,20201,2,2020,0.014266
81614,1,1,54,2,1,6,0,3,128615,0,...,0.018219,0.085771,0,0,1,0,20201,2,2020,0.103010
81615,1,2,55,1,0,3,0,3,34090,0,...,0.025393,0.157258,0,1,1,1,20201,2,2020,0.019119


In [18]:
# # Grid search for CatBoost
# param_grid = {'iterations': [400],
#               'learning_rate': [0.1, 0.05],
#               'l2_leaf_reg': [1, 2],
#               'depth': range(4,8),
#               'thread_count': [4],
#                 }

# cbr.grid_search(param_grid=param_grid,
#                 X=data_2[~data_2['target'].isna()],
#                 y=data_2[~data_2['target'].isna()]['target'],
#                 cv=5,
#                 partition_random_seed=42,
#                 calc_cv_statistics=True,
#                 search_by_train_test_split=True,
#                 refit=True,
#                 shuffle=True,
#                 stratified=True,
#                 train_size=0.8,
#                 verbose=False,
#                 plot=True,

#                )

In [None]:
import pickle

In [137]:
# Regressor with fixed hyperparameters; no cat_features are passed here.
cbr = CatBoostRegressor(depth=4,
                        l2_leaf_reg=1,
                        learning_rate=0.1,
                        iterations=400,
                        thread_count=4)

In [285]:
# Inspect the feature matrix of the labelled rows (target column removed).
data_2[data_2['target'].notna()].drop(columns=['target'])

Unnamed: 0,Category_Feature_0,Category_Feature_4,Category_Feature_5,Feature_0,Feature_2,Feature_3,Feature_12,Feature_14,Feature_22,Feature_24,...,Feature_1778,Feature_1780,Feature_1783,Feature_1789,Feature_1837,Feature_1862,Feature_1865,Feature_1871,Feature_1875,Feature_1883
0,6,0,3,3.173118,0.121951,0.225000,1,0,0,0,...,0.500000,0.000000,0.500000,0.0,0.500000,5.053696,5.053696,0.016847,0.295133,0
1,1,0,3,2.236364,0.975610,0.333333,4,0,2,0,...,0.250000,0.000000,0.125000,0.0,0.375000,0.526316,0.526316,0.043231,0.129693,0
2,6,0,3,39.541929,2.500000,0.253165,10,2,1,2,...,0.500000,0.111111,0.222222,0.0,0.733333,1.128571,0.087778,0.106410,0.420321,0
3,4,0,3,12.204299,2.034286,0.650000,0,0,0,0,...,0.000000,0.000000,0.000000,0.0,1.000000,3.623976,1.486302,0.079245,0.129872,0
4,6,2,3,0.685766,2.666667,0.900000,3,0,2,0,...,0.125000,0.000000,0.125000,0.0,0.571429,2.883465,0.188679,0.011466,0.012740,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81612,1,0,3,4.200000,0.080645,2.520000,12,0,2,0,...,0.352941,0.000000,0.176471,0.0,0.714286,0.386056,0.386056,0.326544,0.129581,0
81613,4,1,5,2.200000,0.322581,0.330000,6,0,2,0,...,0.142857,0.000000,0.142857,0.0,0.357143,3.659251,3.659251,0.021559,0.065332,0
81614,1,1,6,10.620500,2.419355,0.212410,5,0,2,0,...,0.266667,0.000000,0.200000,0.0,0.416667,1.037466,1.037466,0.018219,0.085771,0
81615,1,0,3,1.636343,0.322581,0.245451,0,0,0,0,...,0.000000,0.000000,0.000000,0.0,0.666667,4.174242,0.581013,0.025393,0.157258,0


In [208]:
# Train on the labelled rows; the regressor is fitted to log(target).
cbr.fit(
    data_2.loc[data_2['target'].notna()].drop(columns=['target']),
    np.log(data_2.loc[data_2['target'].notna(), 'target']),
)

0:	learn: 1.0542926	total: 60.5ms	remaining: 24.1s
1:	learn: 1.0394362	total: 95.2ms	remaining: 18.9s
2:	learn: 1.0266249	total: 127ms	remaining: 16.9s
3:	learn: 1.0156616	total: 163ms	remaining: 16.1s
4:	learn: 1.0054536	total: 197ms	remaining: 15.6s
5:	learn: 0.9969652	total: 229ms	remaining: 15s
6:	learn: 0.9889242	total: 265ms	remaining: 14.9s
7:	learn: 0.9816909	total: 304ms	remaining: 14.9s
8:	learn: 0.9746101	total: 344ms	remaining: 14.9s
9:	learn: 0.9680494	total: 383ms	remaining: 14.9s
10:	learn: 0.9625446	total: 416ms	remaining: 14.7s
11:	learn: 0.9572990	total: 452ms	remaining: 14.6s
12:	learn: 0.9528558	total: 486ms	remaining: 14.5s
13:	learn: 0.9487444	total: 522ms	remaining: 14.4s
14:	learn: 0.9437738	total: 563ms	remaining: 14.5s
15:	learn: 0.9399888	total: 599ms	remaining: 14.4s
16:	learn: 0.9365751	total: 633ms	remaining: 14.3s
17:	learn: 0.9332629	total: 668ms	remaining: 14.2s
18:	learn: 0.9301114	total: 701ms	remaining: 14s
19:	learn: 0.9272804	total: 729ms	remaining

162:	learn: 0.8537545	total: 5.57s	remaining: 8.1s
163:	learn: 0.8536029	total: 5.61s	remaining: 8.07s
164:	learn: 0.8533866	total: 5.65s	remaining: 8.04s
165:	learn: 0.8532755	total: 5.68s	remaining: 8.01s
166:	learn: 0.8531182	total: 5.72s	remaining: 7.98s
167:	learn: 0.8529717	total: 5.75s	remaining: 7.95s
168:	learn: 0.8528051	total: 5.79s	remaining: 7.92s
169:	learn: 0.8526834	total: 5.83s	remaining: 7.88s
170:	learn: 0.8525988	total: 5.85s	remaining: 7.84s
171:	learn: 0.8524531	total: 5.89s	remaining: 7.81s
172:	learn: 0.8523292	total: 5.93s	remaining: 7.78s
173:	learn: 0.8522015	total: 5.96s	remaining: 7.74s
174:	learn: 0.8520962	total: 5.99s	remaining: 7.7s
175:	learn: 0.8519788	total: 6.03s	remaining: 7.67s
176:	learn: 0.8519049	total: 6.06s	remaining: 7.63s
177:	learn: 0.8517833	total: 6.09s	remaining: 7.6s
178:	learn: 0.8516668	total: 6.12s	remaining: 7.55s
179:	learn: 0.8514895	total: 6.15s	remaining: 7.52s
180:	learn: 0.8513907	total: 6.18s	remaining: 7.48s
181:	learn: 0.8

326:	learn: 0.8377905	total: 10.8s	remaining: 2.4s
327:	learn: 0.8377168	total: 10.8s	remaining: 2.37s
328:	learn: 0.8376178	total: 10.8s	remaining: 2.34s
329:	learn: 0.8375199	total: 10.9s	remaining: 2.31s
330:	learn: 0.8374514	total: 10.9s	remaining: 2.27s
331:	learn: 0.8373650	total: 10.9s	remaining: 2.24s
332:	learn: 0.8373150	total: 11s	remaining: 2.2s
333:	learn: 0.8372382	total: 11s	remaining: 2.17s
334:	learn: 0.8371464	total: 11s	remaining: 2.14s
335:	learn: 0.8370756	total: 11.1s	remaining: 2.1s
336:	learn: 0.8370279	total: 11.1s	remaining: 2.07s
337:	learn: 0.8369445	total: 11.1s	remaining: 2.04s
338:	learn: 0.8368758	total: 11.1s	remaining: 2s
339:	learn: 0.8368125	total: 11.2s	remaining: 1.97s
340:	learn: 0.8367394	total: 11.2s	remaining: 1.94s
341:	learn: 0.8366668	total: 11.2s	remaining: 1.9s
342:	learn: 0.8365569	total: 11.3s	remaining: 1.87s
343:	learn: 0.8364770	total: 11.3s	remaining: 1.84s
344:	learn: 0.8363765	total: 11.3s	remaining: 1.8s
345:	learn: 0.8362888	tota

<catboost.core.CatBoostRegressor at 0x22b62db4fd0>

In [209]:
# Hard-coded list of integers, presumably positions of features judged
# uninformative — TODO confirm how it was produced.
# NOTE(review): useless_cols is not referenced in the visible cells.
useless_cols = [
    1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25,
    26, 27, 28, 29, 30, 32, 34, 35, 36, 37, 38, 39, 40, 42, 44, 45, 46, 49, 50,
    51, 52, 53, 54, 55, 57, 59, 60, 61, 62, 64, 65, 67, 69, 70, 72, 74, 75, 76,
    79, 80, 81, 83, 87, 89, 91, 92, 93, 94, 96, 97, 98, 99, 100, 101, 103, 104,
    106, 107, 108, 111, 113, 114, 116, 117, 118, 122, 123, 124, 127, 129, 130,
    131, 133, 134, 135, 136, 137, 139, 140, 144, 146, 147, 148, 149, 150, 154,
    155, 158, 159, 160, 161, 162, 163, 165, 166, 167, 168, 170, 171, 172, 174,
    175, 176, 177, 179, 181, 182, 184, 186, 187, 189, 191, 193, 194, 195, 197,
    198, 199, 200, 203, 205, 207, 208, 210, 211, 212, 213, 214, 216, 217, 218,
    220, 222, 223, 224, 226, 227, 228, 229, 230, 232, 233, 234, 235, 236, 237,
    238, 239, 240, 242, 243, 244, 246, 247, 248, 249, 250, 251, 254, 255, 256,
    258, 260, 261, 262, 263, 264, 266, 268, 271, 274, 275, 276, 277, 278, 279,
    280, 281, 283, 284, 285, 286, 287, 289, 294, 295, 296, 297, 298, 300, 301,
    302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316,
    317, 318, 320, 321, 324, 325, 326, 327, 328, 330, 331, 332, 334, 336, 337,
    340, 341, 342, 343, 344, 345, 346, 348, 349, 350, 351, 352, 353, 354, 356,
    357, 359, 361, 362, 363, 364, 365, 367, 368, 369, 370, 371, 372, 373, 374,
    375, 378, 380, 381, 383, 384, 385, 386, 387, 389, 390, 391, 392, 393, 394,
    395, 397, 398, 399, 400, 401, 402, 404, 405, 407, 408, 412, 413, 414, 415,
    416, 417
]

In [None]:
# First three columns of data_2; passed as cat_features to the CatBoost
# regressor constructed in the next cell.
cat_columns = data_2.columns[:3]

In [211]:
# Same hyperparameters as cbr, but with cat_columns declared as categorical.
cat = CatBoostRegressor(
    cat_features=cat_columns,
    depth=4,
    l2_leaf_reg=1,
    learning_rate=0.1,
    iterations=400,
    thread_count=4,
)

In [215]:
# Train on the labelled rows; the regressor is fitted to log(target).
cat.fit(
    data_2.loc[data_2['target'].notna()].drop(columns=['target']),
    np.log(data_2.loc[data_2['target'].notna(), 'target']),
)

0:	learn: 1.0542628	total: 95.8ms	remaining: 38.2s
1:	learn: 1.0391719	total: 174ms	remaining: 34.7s
2:	learn: 1.0269853	total: 275ms	remaining: 36.4s
3:	learn: 1.0159765	total: 377ms	remaining: 37.4s
4:	learn: 1.0062136	total: 467ms	remaining: 36.9s
5:	learn: 0.9973058	total: 544ms	remaining: 35.7s
6:	learn: 0.9889434	total: 635ms	remaining: 35.7s
7:	learn: 0.9820775	total: 721ms	remaining: 35.3s
8:	learn: 0.9747838	total: 811ms	remaining: 35.2s
9:	learn: 0.9683310	total: 905ms	remaining: 35.3s
10:	learn: 0.9626437	total: 989ms	remaining: 35s
11:	learn: 0.9573866	total: 1.06s	remaining: 34.3s
12:	learn: 0.9523244	total: 1.15s	remaining: 34.4s
13:	learn: 0.9478544	total: 1.23s	remaining: 34s
14:	learn: 0.9438689	total: 1.33s	remaining: 34.3s
15:	learn: 0.9398939	total: 1.43s	remaining: 34.4s
16:	learn: 0.9366107	total: 1.52s	remaining: 34.3s
17:	learn: 0.9335726	total: 1.6s	remaining: 33.9s
18:	learn: 0.9305031	total: 1.69s	remaining: 33.9s
19:	learn: 0.9274836	total: 1.78s	remaining: 

163:	learn: 0.8536612	total: 13.7s	remaining: 19.7s
164:	learn: 0.8535611	total: 13.8s	remaining: 19.7s
165:	learn: 0.8535112	total: 13.9s	remaining: 19.5s
166:	learn: 0.8534118	total: 13.9s	remaining: 19.5s
167:	learn: 0.8532736	total: 14s	remaining: 19.4s
168:	learn: 0.8531568	total: 14.1s	remaining: 19.3s
169:	learn: 0.8530053	total: 14.2s	remaining: 19.2s
170:	learn: 0.8528259	total: 14.3s	remaining: 19.1s
171:	learn: 0.8527190	total: 14.3s	remaining: 19s
172:	learn: 0.8525879	total: 14.4s	remaining: 18.9s
173:	learn: 0.8524739	total: 14.5s	remaining: 18.8s
174:	learn: 0.8523375	total: 14.6s	remaining: 18.8s
175:	learn: 0.8521895	total: 14.7s	remaining: 18.7s
176:	learn: 0.8521119	total: 14.8s	remaining: 18.6s
177:	learn: 0.8519605	total: 14.8s	remaining: 18.5s
178:	learn: 0.8518677	total: 14.9s	remaining: 18.4s
179:	learn: 0.8517903	total: 15s	remaining: 18.3s
180:	learn: 0.8516894	total: 15.1s	remaining: 18.2s
181:	learn: 0.8515720	total: 15.2s	remaining: 18.1s
182:	learn: 0.8514

322:	learn: 0.8387729	total: 26.4s	remaining: 6.3s
323:	learn: 0.8386896	total: 26.5s	remaining: 6.22s
324:	learn: 0.8386338	total: 26.6s	remaining: 6.14s
325:	learn: 0.8385468	total: 26.7s	remaining: 6.06s
326:	learn: 0.8384865	total: 26.8s	remaining: 5.98s
327:	learn: 0.8384138	total: 26.9s	remaining: 5.89s
328:	learn: 0.8383584	total: 26.9s	remaining: 5.81s
329:	learn: 0.8382844	total: 27s	remaining: 5.73s
330:	learn: 0.8382055	total: 27.1s	remaining: 5.65s
331:	learn: 0.8381299	total: 27.2s	remaining: 5.57s
332:	learn: 0.8380628	total: 27.3s	remaining: 5.49s
333:	learn: 0.8380328	total: 27.4s	remaining: 5.41s
334:	learn: 0.8379551	total: 27.4s	remaining: 5.33s
335:	learn: 0.8378938	total: 27.5s	remaining: 5.24s
336:	learn: 0.8377835	total: 27.6s	remaining: 5.16s
337:	learn: 0.8377023	total: 27.7s	remaining: 5.08s
338:	learn: 0.8376348	total: 27.8s	remaining: 5s
339:	learn: 0.8375508	total: 27.8s	remaining: 4.91s
340:	learn: 0.8374671	total: 27.9s	remaining: 4.83s
341:	learn: 0.8374

<catboost.core.CatBoostRegressor at 0x22b635fd580>

In [216]:
# Score the rows that have no target. The (all-NaN) 'target' column is dropped
# so the model receives the same feature columns it was fitted on, and np.exp
# undoes the log transform applied to the training target.
predictions = np.exp(
    cat.predict(data_2[data_2['target'].isna()].drop(columns=['target'])))

In [None]:
# Display the CatBoost predictions.
predictions

In [170]:
# xxx = pd.Series(predictions).values.reshape(1,-1)[0].reshape(1,-1)[0]

In [171]:
# Display every prediction except the first one.
predictions[1:]

array([0.00802764, 0.0080207 , 0.00805162, 0.00802939, 0.00804445,
       0.0079822 , 0.00805338, 0.00800581, 0.00797345, 0.00799482,
       0.00803394, 0.00805495, 0.00806302, 0.0080474 , 0.00802201,
       0.00803376, 0.00803043, 0.00798739, 0.00802917, 0.00802991,
       0.00798793, 0.00804056, 0.00803157, 0.00804751, 0.00802276,
       0.00803209, 0.00807226, 0.0080491 , 0.00801529, 0.00807692,
       0.00802959, 0.00801973, 0.00801109, 0.00802532, 0.00797232,
       0.0080733 , 0.0080327 , 0.0080449 , 0.00799027, 0.00805026,
       0.00794872, 0.00783794, 0.00800283, 0.00805807, 0.00805065,
       0.00806885, 0.00802574, 0.00805679, 0.0080234 , 0.00782295,
       0.00805925, 0.00805238, 0.0080428 , 0.00800809, 0.00800686,
       0.00802819, 0.00804154, 0.00803753, 0.0079991 , 0.00804595,
       0.00795908, 0.0079456 , 0.00801786, 0.00799767, 0.00804114,
       0.00803948, 0.00802344, 0.008036  , 0.00803027, 0.0080457 ,
       0.00806882, 0.00798078, 0.00804806, 0.00806068, 0.00802

In [217]:
# Lay the predictions out so that to_csv(index=False) produces one value per
# line with no real header: the first prediction becomes the column label and
# the remaining ones become the rows. The row count follows the number of
# predictions instead of a hard-coded 999, so nothing is truncated or padded
# with NaN when that number changes.
fin = pd.DataFrame({predictions[0]: predictions[1:]})

In [218]:
# Write fin to score.csv without the index column.
fin.to_csv('score.csv', index=False)

In [219]:
# Read the file back to check what was written.
pd.read_csv('score.csv')

Unnamed: 0,0.017166253887422586
0,0.032551
1,0.023580
2,0.058145
3,0.039274
4,0.103970
...,...
994,0.043013
995,0.015167
996,0.053416
997,0.020634


In [None]:
# save
# with open('model_cat_reg.pkl','wb') as f:
#     pickle.dump(cbr,f)

In [None]:
# # load
# with open('model_cat_reg.pkl', 'rb') as f:
#     clf2 = pickle.load(f)

In [220]:
# First three columns of data_2; these are one-hot encoded below.
cols_to_hot = data_2.columns[:3]

In [221]:
# Wrap data_2 in the Encoder helper defined at the top of the notebook.
encoder = Encoder(data_2)

In [222]:
# hot it out
# One-hot encode each selected column; after the loop data_2a is the frame
# returned by the last hot_encoder call.
# NOTE(review): keep the loop variable named `col` unless Encoder.hot_encoder
# is confirmed not to read a global of that name.
for col in cols_to_hot:
    data_2a = encoder.hot_encoder(col)



In [225]:
# Labelled rows of the encoded frame: features go to X, the target to y.
X = data_2a.loc[data_2a['target'].notna()].drop(columns=['target'])
y = data_2a.loc[data_2a['target'].notna(), 'target']

In [226]:
# Rows without a target are the ones to score; keep only their features.
val = data_2a.loc[data_2a['target'].isna()].drop(columns=['target'])

In [227]:
# Linear regression with scikit-learn's default settings.
lin = LinearRegression()

In [271]:
# Fit on the log of the target.
lin.fit(X, np.log(y))

LinearRegression()

In [273]:
# Predict for the unlabelled rows; np.exp undoes the log applied when fitting.
lin_predict = np.exp(lin.predict(val))

In [276]:
# Lay the predictions out so that to_csv(index=False) produces one value per
# line with no real header: the first prediction becomes the column label and
# the remaining ones become the rows. The row count follows the number of
# predictions instead of a hard-coded 999, so nothing is truncated or padded
# with NaN when that number changes.
lin_score = pd.DataFrame({lin_predict[0]: lin_predict[1:]})

In [278]:
# Write lin_score to lin_score.csv without the index column.
lin_score.to_csv('lin_score.csv', index=False)

In [279]:
# Blend the two models with equal weight (element-wise mean).
fin_predict = 0.5 * (predictions + lin_predict)

In [280]:
# Lay the predictions out so that to_csv(index=False) produces one value per
# line with no real header: the first prediction becomes the column label and
# the remaining ones become the rows. The row count follows the number of
# predictions instead of a hard-coded 999, so nothing is truncated or padded
# with NaN when that number changes.
final = pd.DataFrame({fin_predict[0]: fin_predict[1:]})

In [281]:
# Write the blended predictions without the index column.
# NOTE(review): unlike the other outputs, this file name has no .csv
# extension — confirm that is the expected name.
final.to_csv('FINAL_PREDICTION', index=False)

In [249]:
# Display the blended predictions.
final

Unnamed: 0,0.023621
0,0.052728
1,0.036258
2,0.098011
3,0.073846
4,0.128002
...,...
994,0.066596
995,0.026721
996,0.075336
997,0.041245
