In [99]:
import pandas as pd
import numpy as np

import gc

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, \
    recall_score, f1_score, log_loss, auc, classification_report, confusion_matrix, \
    precision_recall_curve, roc_curve

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

import pickle

In [2]:
cat_feat = ['B_30', 'B_38', 'D_114', 'D_116',
            'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_68']

In [55]:
columns = ['S_2', 'D_63', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3',
           'D_43', 'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48', 'B_6',
           'B_7', 'B_8', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3', 'B_10', 'S_5', 'S_6',
           'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'B_13', 'R_5', 'D_58', 'B_14',
           'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62',  'D_64', 'D_65', 'B_16',
           'B_18', 'B_19', 'B_20', 'D_68', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69',
           'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'P_4', 'D_74', 'B_24', 'R_7',
           'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'S_16', 'D_80', 'R_10', 'R_11',
           'B_27', 'D_81', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15',
           'D_84', 'R_16', 'B_30', 'S_18', 'D_86', 'R_17', 'R_18', 'B_31', 'S_19',
           'R_19', 'B_32', 'S_20', 'R_20', 'R_21', 'B_33', 'D_89', 'R_22', 'R_23',
           'D_91', 'D_92', 'D_93', 'D_94', 'R_24', 'R_25', 'D_96', 'S_22', 'S_23',
           'S_24', 'S_25', 'S_26', 'D_102', 'D_103', 'D_107', 'B_36', 'R_27', 'B_38',
           'D_109', 'D_112', 'B_40', 'S_27', 'D_113', 'D_114', 'D_115', 'D_116', 'D_117',
           'D_118', 'D_120', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126', 'D_127',
           'D_128', 'D_129', 'B_41', 'D_130', 'D_131', 'D_133', 'R_28', 'D_139', 'D_140',
           'D_144', 'D_145']

In [56]:
num_feat = ['B_36', 'S_5', 'B_24', 'R_7', 'S_26', 'D_139', 'D_43', 'B_18', 'B_7', 'R_2',
            'S_15', 'R_27', 'S_11', 'D_47', 'S_8', 'B_40', 'D_118', 'B_4', 'B_5', 'B_28',
            'R_3', 'D_72', 'D_129', 'B_1', 'B_9', 'B_20', 'D_39', 'P_3', 'S_12', 'D_41',
            'D_102', 'B_12', 'D_51', 'D_44', 'D_45', 'B_21', 'D_133', 'D_46', 'D_62',
            'S_24', 'B_3', 'D_121', 'D_60', 'R_5', 'S_23', 'D_52', 'D_79', 'S_3', 'B_10',
            'B_2', 'P_2', 'R_11', 'D_65', 'B_14', 'R_1', 'D_55', 'D_59', 'D_123', 'D_144']

In [57]:
diff_columns = ['B_1', 'D_43', 'B_4', 'D_48', 'B_9', 'S_5']

Так как тестовые данные имеют большой размер даже после преобразования в parquet, то читать и проводить предварительную обработку будем по столбцам.

In [58]:
def get_difference(data: pd.DataFrame, num_features: list) ->: pd.DataFrame:
    """
    Находим разницы в значение признака между месяцами
    params:
    - data: pd.DataFrame - набор данных
    - num_features: list - признаки, для которых нужно найти разницу
    return: pd.DataFrame
    """
    df1 = []
    customer_ids = []
    for customer_id, df in data.groupby(['customer_ID']):
        diff_df1 = df[num_features].diff(
            1).iloc[[-1]].values.astype(np.float32)
        df1.append(diff_df1)
        customer_ids.append(customer_id)
    df1 = np.concatenate(df1, axis=0)
    df1 = pd.DataFrame(
        df1, columns=[col + '_diff1' for col in df[num_features].columns])
    df1['customer_ID'] = customer_ids
    del diff_df1
    gc.collect()
    return df1

In [69]:
def read_file(path: str):
    """
    Чтение и предварительная обработка тестовый данных
    """
    test_data = pd.read_parquet(
        path, columns=['customer_ID']).groupby('customer_ID').tail(1)

    for c in columns:
        print(f'Read and preproc {c}')
        df = pd.read_parquet(path, columns=['customer_ID', c])
        if c == 'S_2':
            first_day = df.groupby('customer_ID')['S_2'].min()
            last_day = df.groupby('customer_ID')['S_2'].max()
            temp = last_day - first_day
            temp = temp.reset_index()
            temp.columns = ['customer_ID', 'S_2']
            temp['S_2'] = temp.S_2.dt.days
            test_data = test_data.merge(temp, how='inner', on='customer_ID')
        else:
            df.fillna(method="ffill", inplace=True)
            df.fillna(method="bfill", inplace=True)

            if c in num_feat:
                df = df.groupby("customer_ID")[c].agg(
                    ['mean', 'first', 'min', 'max', 'last'])
                df.columns = [c + '_' + x for x in df.columns]
                df[c+'_diff'] = df[c + '_last'] - df[c + '_first']
                df.drop(columns=[c+'_first'], inplace=True)
                df.reset_index(inplace=True)
                test_data = test_data.merge(df, how='inner', on='customer_ID')

            if c in cat_feat:
                df = df.groupby("customer_ID")[c].agg(
                    ['count', 'last', 'nunique'])
                df.columns = [c + '_' + x for x in df.columns]
                df.reset_index(inplace=True)
                test_data = test_data.merge(df, how='inner', on='customer_ID')

        del df
        gc.collect()

    print('shape of data:', test_data.shape)

    return test_data

In [70]:
test_data = read_file('test_data.parquet')

Read and preproc S_2
Read and preproc D_63
Read and preproc P_2
Read and preproc D_39
Read and preproc B_1
Read and preproc B_2
Read and preproc R_1
Read and preproc S_3
Read and preproc D_41
Read and preproc B_3
Read and preproc D_43
Read and preproc D_44
Read and preproc B_4
Read and preproc D_45
Read and preproc B_5
Read and preproc R_2
Read and preproc D_46
Read and preproc D_47
Read and preproc D_48
Read and preproc B_6
Read and preproc B_7
Read and preproc B_8
Read and preproc D_51
Read and preproc B_9
Read and preproc R_3
Read and preproc D_52
Read and preproc P_3
Read and preproc B_10
Read and preproc S_5
Read and preproc S_6
Read and preproc D_54
Read and preproc R_4
Read and preproc S_7
Read and preproc B_12
Read and preproc S_8
Read and preproc D_55
Read and preproc B_13
Read and preproc R_5
Read and preproc D_58
Read and preproc B_14
Read and preproc D_59
Read and preproc D_60
Read and preproc D_61
Read and preproc B_15
Read and preproc S_11
Read and preproc D_62
Read and p

In [31]:
df = pd.read_parquet('test_data.parquet', columns=[
                     'customer_ID'] + diff_columns)
df.fillna(method="ffill", inplace=True)
df.fillna(method="bfill", inplace=True)
df_diff = get_difference(df, diff_columns)

In [32]:
df_diff.head()

Unnamed: 0,B_1_diff1,D_43_diff1,B_4_diff1,D_48_diff1,B_9_diff1,S_5_diff1,customer_ID
0,0.007588,0.001244,-0.000406,-0.014788,-0.002645,-0.001144,-9223277493928322471
1,0.029376,-0.006746,0.029038,0.0,0.044008,0.004502,-9223220269070810982
2,-0.010553,-0.008076,0.006431,-0.001409,3.8e-05,-0.000982,-9223219380479694318
3,0.001282,-0.005443,-0.014822,-0.017232,-0.002894,0.005626,-9223202973368451495
4,-0.003382,0.0,-0.011262,-0.022798,-0.02035,0.095084,-9223190037945288673


In [72]:
test_data = test_data.merge(df_diff, how='inner', on='customer_ID')

In [73]:
test_data.head()

Unnamed: 0,customer_ID,S_2,D_63_count,D_63_last,D_63_nunique,P_2_mean,P_2_min,P_2_max,P_2_last,P_2_diff,...,D_144_min,D_144_max,D_144_last,D_144_diff,B_1_diff1,D_43_diff1,B_4_diff1,D_48_diff1,B_9_diff1,S_5_diff1
0,8717704911770597815,235,9,CR,1,0.601387,0.56893,0.631315,0.56893,-0.062385,...,0.002156,0.009667,0.00369,-0.004591,-0.006647,0.001126,-0.00799,-0.002172,0.003786,0.002261
1,4783907996972277493,358,13,CO,1,0.862166,0.794469,0.913501,0.841177,-0.053018,...,0.000247,0.009734,0.000247,-0.008189,-0.0374,0.0,-0.057167,0.037209,-0.112125,0.000668
2,4616129756878093544,378,13,CR,1,0.748955,0.673112,0.835114,0.697522,-0.076755,...,0.322121,0.457819,0.457819,0.135698,-0.005343,-0.012185,-0.066633,-0.093532,0.002275,0.003907
3,-1916505587365783916,382,13,CL,1,0.474728,0.428457,0.514222,0.513186,-0.001036,...,0.333893,0.508652,0.500924,0.163436,-0.125495,-0.005654,-0.129752,0.078476,-0.251865,0.001812
4,7583456031722841431,370,13,CO,1,0.3241,0.254478,0.425764,0.254478,-0.031825,...,0.000907,0.009656,0.001558,0.000337,0.054508,-0.059755,0.027623,0.010918,0.025947,0.01059


In [76]:
test_data = test_data.set_index('customer_ID')

In [78]:
test_data = test_data[['P_2_mean', 'P_2_last', 'D_39_max', 'D_39_last', 'B_1_max', 'B_1_last',
                       'B_2_mean', 'B_2_last', 'R_1_mean', 'R_1_last', 'S_3_mean', 'S_3_max',
                       'S_3_last', 'D_41_last', 'B_3_max', 'B_3_last', 'D_43_last', 'D_44_last',
                       'B_4_max', 'B_4_last', 'D_45_mean', 'D_45_max', 'B_5_min', 'B_5_last',
                       'R_2_last', 'D_46_last', 'D_47_mean', 'D_47_max', 'D_47_last', 'B_7_last',
                       'D_51_mean', 'B_9_last', 'R_3_mean', 'R_3_max', 'R_3_last', 'D_52_last',
                       'P_3_max', 'B_10_last', 'S_5_max', 'B_12_max', 'S_8_max', 'R_5_last',
                       'B_14_last', 'D_60_last', 'S_11_mean', 'S_11_max', 'D_62_min', 'D_65_mean',
                       'D_65_max', 'B_18_mean', 'B_20_last','S_12_mean', 'S_12_max', 'B_21_max',
                       'D_72_mean', 'S_15_mean', 'B_24_last', 'R_7_last', 'D_79_last', 'R_11_mean',
                       'R_11_last', 'B_28_min', 'S_23_max', 'S_23_last', 'S_24_max', 'S_24_last',
                       'S_26_mean', 'S_26_max', 'D_102_max', 'B_36_mean', 'R_27_max', 'B_40_max',
                       'D_118_mean', 'D_118_min', 'D_121_mean', 'D_121_min', 'D_121_last',
                       'D_129_mean', 'D_129_last', 'D_133_last', 'D_139_max', 'D_39_diff',
                       'D_41_diff', 'B_3_diff', 'D_44_diff', 'B_4_diff', 'D_55_diff', 'D_59_diff',
                       'D_79_diff', 'S_24_diff', 'D_123_diff', 'D_144_diff', 'D_63_last',
                       'B_1_diff1', 'D_43_diff1', 'B_4_diff1', 'D_48_diff1', 'B_9_diff1',
                       'S_5_diff1', 'S_2']]

In [91]:
test_data.columns = ['P_2_mean', 'P_2_last', 'D_39_max', 'D_39_last', 'B_1_max', 'B_1_last',
                     'B_2_mean', 'B_2_last', 'R_1_mean', 'R_1_last', 'S_3_mean', 'S_3_max',
                     'S_3_last', 'D_41_last', 'B_3_max', 'B_3_last', 'D_43_last', 'D_44_last',
                     'B_4_max', 'B_4_last', 'D_45_mean', 'D_45_max', 'B_5_min', 'B_5_last',
                     'R_2_last', 'D_46_last', 'D_47_mean', 'D_47_max', 'D_47_last', 'B_7_last',
                     'D_51_mean', 'B_9_last', 'R_3_mean', 'R_3_max', 'R_3_last', 'D_52_last',
                     'P_3_max', 'B_10_last', 'S_5_max', 'B_12_max', 'S_8_max', 'R_5_last',
                     'B_14_last', 'D_60_last', 'S_11_mean', 'S_11_max', 'D_62_min', 'D_65_mean',
                     'D_65_max', 'B_18_mean', 'B_20_last', 'S_12_mean', 'S_12_max', 'B_21_max',
                     'D_72_mean', 'S_15_mean', 'B_24_last', 'R_7_last', 'D_79_last', 'R_11_mean',
                     'R_11_last', 'B_28_min', 'S_23_max', 'S_23_last', 'S_24_max', 'S_24_last',
                     'S_26_mean', 'S_26_max', 'D_102_max', 'B_36_mean', 'R_27_max', 'B_40_max',
                     'D_118_mean', 'D_118_min', 'D_121_mean', 'D_121_min', 'D_121_last',
                     'D_129_mean', 'D_129_last', 'D_133_last', 'D_139_max', 'D_39_diff',
                     'D_41_diff', 'B_3_diff', 'D_44_diff', 'B_4_diff', 'D_55_diff', 'D_59_diff',
                     'D_79_diff', 'S_24_diff', 'D_123_diff', 'D_144_diff', 'D_63_last',
                     'B_1_diff1', 'D_43_diff1', 'B_4_diff1', 'D_48_diff1', 'B_9_diff1',
                     'S_5_diff1', 'S2']

In [92]:
test_data.to_parquet('./prepared_test.parquet')

In [5]:
test_data = pd.read_parquet('prepared_test.parquet')

In [87]:
class CatBoostEvalMetricCustom(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        # the larger metric value the better
        return True

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        preds = np.array(approxes[0])
        target = np.array(target)
        score = amex_metric(target, preds)
        return score, 0

In [88]:
cat_model = pickle.load(open('cat_model.sav', 'rb'))
lgb_model = pickle.load(open('lgb_model.sav', 'rb'))
final_model = pickle.load(open('final_model.sav', 'rb'))

In [89]:
meta_X_test = pd.DataFrame()

In [93]:
pred_score_val = cat_model.predict_proba(test_data)
meta_X_test['cat_01'] = pred_score_val[:, 1]

In [94]:
pred_score_val = lgb_model.predict_proba(test_data)
meta_X_test['lgb_01'] = pred_score_val[:, 1]

In [95]:
preds = final_model.predict_proba(meta_X_test)

In [96]:
submission = pd.read_csv("sample_submission.csv")

In [97]:
submission.loc[:, "prediction"] = preds[:, 1]

In [98]:
submission.to_csv("submission.csv", index=False)