In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [7]:
import pyarrow.parquet as pq
from sklearn import metrics
import scipy.stats
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier, Pool, CatBoostRanker
import torch
import time  # для оценки времени
import torch.utils.data as data_utils
import tqdm
from typing import List
import os
import torch
import numpy as np
import pandas as pd
import random
from sklearn.metrics import roc_auc_score
from torchvision.ops import sigmoid_focal_loss
from torch.nn.utils import clip_grad_norm_
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

In [10]:
# Locations of the raw partition folders, the target CSV and the feature output folders.
# NOTE(review): in the recorded run os.listdir(TRAIN_DATA_PATH) fails with FileNotFoundError,
# and later cells read the target from a /kaggle/input/... path instead of TRAIN_TARGET_PATH —
# confirm the intended data root and keep all paths consistent.
TRAIN_DATA_PATH = "/data/train_data"
TEST_DATA_PATH = "/data/test_data/test_data"
TRAIN_TARGET_PATH = "/data/train_target.csv"
TRAIN_FEATURES_PATH = "../data/train_features_gb/"
TEST_FEATURES_PATH = "../data/test_features_gb/"

In [11]:
# Build the alphabetically sorted list of full paths of every entry in the training data folder
# and display it as the cell output.
path_to_dataset = TRAIN_DATA_PATH
dataset_paths = sorted([os.path.join(path_to_dataset, filename) for filename in os.listdir(path_to_dataset)])
dataset_paths

FileNotFoundError: [WinError 3] Системе не удается найти указанный путь: '/data/train_data'

In [None]:
import os
import pandas as pd
import tqdm
def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0,
                                     num_parts_to_read: int = 2, columns=None, verbose=False) -> pd.DataFrame:
    """
    Read ``num_parts_to_read`` partitions, convert them to a pd.DataFrame and return it.

    Directory entries are ordered alphabetically by full path before the slice
    ``[start_from, start_from + num_parts_to_read)`` is taken.

    :param path_to_dataset: path to the directory that holds the partitions
    :param start_from: index of the first partition to read (negative values are treated as 0)
    :param num_parts_to_read: number of partitions to read
    :param columns: list of columns to read from each partition (passed to ``pd.read_parquet``)
    :param verbose: when true, print the paths of the partitions about to be read
    :return: pd.DataFrame with the selected partitions stacked and a fresh 0..n-1 index
    """
    partition_files = [os.path.join(path_to_dataset, entry) for entry in os.listdir(path_to_dataset)]
    partition_files.sort()

    # Clamp a negative offset to the first partition.
    first = start_from if start_from > 0 else 0
    selected = partition_files[first: first + num_parts_to_read]

    if verbose:
        print('Reading chunks:\n')
        for partition in selected:
            print(partition)

    frames = [pd.read_parquet(partition, columns=columns) for partition in selected]
    return pd.concat(frames).reset_index(drop=True)

In [None]:
class OHEAggregator(object):
    """Row-level one-hot encoder for the partitioned dataset.

    Every column except ``id`` and ``rn`` is expanded with ``pd.get_dummies``.
    The result keeps one row per input row, together with the ``id`` and ``rn``
    columns. ``fit_transform`` records the produced columns so that ``transform``
    returns a frame with exactly the same columns in the same order.
    """

    def __init__(self):
        # Output columns (everything except "id") recorded by fit_transform(); None until then.
        self.encoded_features = None

    def __extract_count_aggregations(self, data_frame: pd.DataFrame, mode: str) -> pd.DataFrame:
        """One-hot encode all feature columns of one chunk.

        :param data_frame: chunk with ``id``, ``rn`` and the feature columns
        :param mode: accepted for signature compatibility; not used here
        :return: ``data_frame`` without the original feature columns, followed by
                 the dummy columns produced by ``pd.get_dummies``
        """
        feature_columns = [col for col in data_frame.columns if col not in ("id", "rn")]
        dummies = pd.get_dummies(data_frame[feature_columns], columns=feature_columns)
        return pd.concat([data_frame.drop(columns=feature_columns), dummies], axis=1)

    def __transform_data(self, path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int=50,
                                     mode: str = "fit_transform", save_to_path=None, verbose: bool=False):
        """Encode the dataset chunk by chunk and stack the results.

        :param path_to_dataset: directory with the parquet partitions
        :param num_parts_to_preprocess_at_once: partitions read per step
        :param num_parts_total: upper bound (exclusive) of the partition offsets visited
        :param mode: ``"fit_transform"`` records the output columns, ``"transform"`` reuses them
        :param save_to_path: optional directory; each processed chunk is also written there
        :param verbose: forwarded to the partition reader
        :return: frame with ``id`` followed by the recorded feature columns
        :raises AssertionError: on an unknown mode, or on ``"transform"`` before fitting
        """
        assert mode in ["fit_transform", "transform"], f"Unrecognized mode: {mode}! Please use one of the following modes: \"fit_transform\", \"transform\""
        # Fail before any partition is read instead of after the whole dataset was processed.
        if mode == "transform" and self.encoded_features is None:
            raise AssertionError("Transformer not fitted")

        # A bare `import tqdm` does not load the `tqdm.notebook` submodule, so
        # `tqdm.notebook.tqdm` can raise AttributeError. tqdm.auto picks the
        # notebook widget when available and the console bar otherwise.
        from tqdm.auto import tqdm as progress_bar

        preprocessed_frames = []
        for step in progress_bar(range(0, num_parts_total, num_parts_to_preprocess_at_once),
                                 desc="Transforming sequential credit data"):
            data_frame = read_parquet_dataset_from_local(path_to_dataset, start_from=step,
                                                         num_parts_to_read=num_parts_to_preprocess_at_once,
                                                         verbose=verbose)
            features = self.__extract_count_aggregations(data_frame, mode=mode)
            if save_to_path:
                features.to_parquet(os.path.join(save_to_path, f"processed_chunk_{step}.pq"))
            preprocessed_frames.append(features)

        # A dummy column missing from some chunk shows up as NaN after concat; treat it as 0.
        features = pd.concat(preprocessed_frames).fillna(np.uint8(0))
        dummy_features = [col for col in features.columns if col != "id"]
        if mode == "fit_transform":
            self.encoded_features = dummy_features
        else:
            # Columns seen during fitting but absent from this data are added as all-zero.
            present = set(dummy_features)
            for col in self.encoded_features:
                if col not in present:
                    features[col] = np.uint8(0)
        return features[["id"]+self.encoded_features]

    def fit_transform(self, path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int = 50,
                      save_to_path=None, verbose: bool=False):
        """Encode the dataset and remember the resulting feature columns."""
        return self.__transform_data(path_to_dataset=path_to_dataset,
                                     num_parts_to_preprocess_at_once=num_parts_to_preprocess_at_once,
                                     num_parts_total=num_parts_total, mode="fit_transform",
                                     save_to_path=save_to_path, verbose=verbose)

    def transform(self, path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int=50,
                  save_to_path=None, verbose: bool=False):
        """Encode the dataset using the columns recorded by ``fit_transform``."""
        return self.__transform_data(path_to_dataset=path_to_dataset,
                                     num_parts_to_preprocess_at_once=num_parts_to_preprocess_at_once,
                                     num_parts_total=num_parts_total, mode="transform",
                                     save_to_path=save_to_path, verbose=verbose)

In [None]:
class CountAggregator(object):
    """Per-``id`` count features for the partitioned dataset.

    Every column except ``id`` and ``rn`` is one-hot encoded, then the dummy
    columns are summed within each ``id``: each output value is the number of
    rows of that ``id`` in which the feature took the given value.
    ``fit_transform`` records the produced columns so that ``transform``
    returns a frame with exactly the same columns in the same order.
    """

    def __init__(self):
        # Output columns (everything except "id") recorded by fit_transform(); None until then.
        self.encoded_features = None

    def __extract_count_aggregations(self, data_frame: pd.DataFrame, mode: str) -> pd.DataFrame:
        """One-hot encode one chunk and sum the dummy columns per ``id``.

        :param data_frame: chunk with ``id``, ``rn`` and the feature columns
        :param mode: accepted for signature compatibility; not used here
        :return: one row per ``id`` with the ``id`` column followed by the summed dummy columns
        """
        # one-hot-encoding
        feature_columns = [col for col in data_frame.columns if col not in ("id", "rn")]
        dummies = pd.get_dummies(data_frame[feature_columns], columns=feature_columns)

        # count aggregation
        ohe_features = pd.concat([data_frame[["id"]], dummies], axis=1)
        return ohe_features.groupby("id")[list(dummies.columns)].sum().reset_index(drop=False)

    def __transform_data(self, path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int=50,
                                     mode: str = "fit_transform", save_to_path=None, verbose: bool=False):
        """Aggregate the dataset chunk by chunk and stack the results.

        :param path_to_dataset: directory with the parquet partitions
        :param num_parts_to_preprocess_at_once: partitions read per step
        :param num_parts_total: upper bound (exclusive) of the partition offsets visited
        :param mode: ``"fit_transform"`` records the output columns, ``"transform"`` reuses them
        :param save_to_path: optional directory; each processed chunk is also written there
        :param verbose: forwarded to the partition reader
        :return: frame with ``id`` followed by the recorded feature columns
        :raises AssertionError: on an unknown mode, or on ``"transform"`` before fitting
        """
        assert mode in ["fit_transform", "transform"], f"Unrecognized mode: {mode}! Please use one of the following modes: \"fit_transform\", \"transform\""
        # Fail before any partition is read instead of after the whole dataset was processed.
        if mode == "transform" and self.encoded_features is None:
            raise AssertionError("Transformer not fitted")

        # A bare `import tqdm` does not load the `tqdm.notebook` submodule, so
        # `tqdm.notebook.tqdm` can raise AttributeError. tqdm.auto picks the
        # notebook widget when available and the console bar otherwise.
        from tqdm.auto import tqdm as progress_bar

        preprocessed_frames = []
        for step in progress_bar(range(0, num_parts_total, num_parts_to_preprocess_at_once),
                                 desc="Transforming sequential credit data"):
            data_frame = read_parquet_dataset_from_local(path_to_dataset, start_from=step,
                                                         num_parts_to_read=num_parts_to_preprocess_at_once,
                                                         verbose=verbose)
            features = self.__extract_count_aggregations(data_frame, mode=mode)
            if save_to_path:
                features.to_parquet(os.path.join(save_to_path, f"processed_chunk_{step}.pq"))
            preprocessed_frames.append(features)

        # A dummy column missing from some chunk shows up as NaN after concat; treat it as 0.
        features = pd.concat(preprocessed_frames).fillna(np.uint8(0))
        dummy_features = [col for col in features.columns if col != "id"]
        if mode == "fit_transform":
            self.encoded_features = dummy_features
        else:
            # Columns seen during fitting but absent from this data are added as all-zero.
            present = set(dummy_features)
            for col in self.encoded_features:
                if col not in present:
                    features[col] = np.uint8(0)
        return features[["id"]+self.encoded_features]

    def fit_transform(self, path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int = 50,
                      save_to_path=None, verbose: bool=False):
        """Aggregate the dataset and remember the resulting feature columns."""
        return self.__transform_data(path_to_dataset=path_to_dataset,
                                     num_parts_to_preprocess_at_once=num_parts_to_preprocess_at_once,
                                     num_parts_total=num_parts_total, mode="fit_transform",
                                     save_to_path=save_to_path, verbose=verbose)

    def transform(self, path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int=50,
                  save_to_path=None, verbose: bool=False):
        """Aggregate the dataset using the columns recorded by ``fit_transform``."""
        return self.__transform_data(path_to_dataset=path_to_dataset,
                                     num_parts_to_preprocess_at_once=num_parts_to_preprocess_at_once,
                                     num_parts_total=num_parts_total, mode="transform",
                                     save_to_path=save_to_path, verbose=verbose)

In [None]:
class WeightMeanAggregator(object):
    """Per-``id`` weighted-mean features for the partitioned dataset.

    Every column except ``id`` and ``rn`` is one-hot encoded. Within each ``id``
    a row gets the weight ``(rn / max(rn)) ** alpha``, normalised so that the
    weights of one ``id`` sum to 1; the dummy columns are then summed with these
    weights. The per-``id`` maximum of ``rn`` is returned as ``history_lenght``.
    ``fit_transform`` records the produced columns so that ``transform``
    returns a frame with exactly the same columns in the same order.
    """

    def __init__(self, alpha=1):
        # Output columns (everything except "id") recorded by fit_transform(); None until then.
        self.encoded_features = None
        # Exponent applied to rn / max(rn); larger values concentrate weight on rows with high rn.
        self.alpha = alpha

    def __extract_aggregations(self, data_frame: pd.DataFrame, mode: str) -> pd.DataFrame:
        """One-hot encode one chunk and take the weighted mean of the dummies per ``id``.

        :param data_frame: chunk with ``id``, ``rn`` and the feature columns
        :param mode: accepted for signature compatibility; not used here
        :return: one row per ``id``: ``id``, the weighted dummy columns, ``history_lenght``
        """
        # one-hot-encoding
        feature_columns = [col for col in data_frame.columns if col not in ("id", "rn")]
        dummies = pd.get_dummies(data_frame[feature_columns], columns=feature_columns)
        generated_features = list(dummies.columns)

        # length of history: the largest rn observed for each id
        ids = data_frame["id"]
        rn_by_id = data_frame.groupby("id")["rn"]
        history_lenght = rn_by_id.max().reset_index(drop=False).rename(columns={"rn": "history_lenght"})

        # row weights and their per-id totals, broadcast back to row level
        weight = (data_frame["rn"] / rn_by_id.transform("max")) ** self.alpha
        sum_weight = weight.groupby(ids).transform("sum")

        # weight aggregation: scale all dummy columns at once instead of looping over columns
        weighted = dummies.astype("float64").mul(weight, axis=0).div(sum_weight, axis=0)
        weighted.insert(0, "id", ids)
        features = weighted.groupby("id")[generated_features].sum().reset_index(drop=False)
        return features.merge(history_lenght, on="id")

    def __transform_data(self, path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int=50,
                                     mode: str = "fit_transform", save_to_path=None, verbose: bool=False):
        """Aggregate the dataset chunk by chunk and stack the results.

        :param path_to_dataset: directory with the parquet partitions
        :param num_parts_to_preprocess_at_once: partitions read per step
        :param num_parts_total: upper bound (exclusive) of the partition offsets visited
        :param mode: ``"fit_transform"`` records the output columns, ``"transform"`` reuses them
        :param save_to_path: optional directory; each processed chunk is also written there
        :param verbose: forwarded to the partition reader
        :return: frame with ``id`` followed by the recorded feature columns
        :raises AssertionError: on an unknown mode, or on ``"transform"`` before fitting
        """
        assert mode in ["fit_transform", "transform"], f"Unrecognized mode: {mode}! Please use one of the following modes: \"fit_transform\", \"transform\""
        # Fail before any partition is read instead of after the whole dataset was processed.
        if mode == "transform" and self.encoded_features is None:
            raise AssertionError("Transformer not fitted")

        # A bare `import tqdm` does not load the `tqdm.notebook` submodule, so
        # `tqdm.notebook.tqdm` can raise AttributeError. tqdm.auto picks the
        # notebook widget when available and the console bar otherwise.
        from tqdm.auto import tqdm as progress_bar

        preprocessed_frames = []
        for step in progress_bar(range(0, num_parts_total, num_parts_to_preprocess_at_once),
                                 desc="Transforming sequential credit data"):
            data_frame = read_parquet_dataset_from_local(path_to_dataset, start_from=step,
                                                         num_parts_to_read=num_parts_to_preprocess_at_once,
                                                         verbose=verbose)
            features = self.__extract_aggregations(data_frame, mode=mode)
            if save_to_path:
                features.to_parquet(os.path.join(save_to_path, f"processed_chunk_{step}.pq"))
            preprocessed_frames.append(features)

        # A dummy column missing from some chunk shows up as NaN after concat; treat it as 0.
        features = pd.concat(preprocessed_frames).fillna(np.uint8(0))
        dummy_features = [col for col in features.columns if col != "id"]
        if mode == "fit_transform":
            self.encoded_features = dummy_features
        else:
            # Columns seen during fitting but absent from this data are added as all-zero.
            present = set(dummy_features)
            for col in self.encoded_features:
                if col not in present:
                    features[col] = np.uint8(0)
        return features[["id"]+self.encoded_features]

    def fit_transform(self, path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int = 50,
                      save_to_path=None, verbose: bool=False):
        """Aggregate the dataset and remember the resulting feature columns."""
        return self.__transform_data(path_to_dataset=path_to_dataset,
                                     num_parts_to_preprocess_at_once=num_parts_to_preprocess_at_once,
                                     num_parts_total=num_parts_total, mode="fit_transform",
                                     save_to_path=save_to_path, verbose=verbose)

    def transform(self, path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int=50,
                  save_to_path=None, verbose: bool=False):
        """Aggregate the dataset using the columns recorded by ``fit_transform``."""
        return self.__transform_data(path_to_dataset=path_to_dataset,
                                     num_parts_to_preprocess_at_once=num_parts_to_preprocess_at_once,
                                     num_parts_total=num_parts_total, mode="transform",
                                     save_to_path=save_to_path, verbose=verbose)

In [None]:
# One instance of each feature builder; the weighted-mean builder uses exponent alpha=1.4.
aggregator_count = CountAggregator()
aggregator_weight_mean = WeightMeanAggregator(alpha=1.4)
aggregator_ohe = OHEAggregator()

In [None]:
%%time

# Row-level one-hot features built from partition offsets 0..2, one partition per step; nothing is written to disk.
train_data_ohe = aggregator_ohe.fit_transform(path_to_dataset = TRAIN_DATA_PATH, num_parts_to_preprocess_at_once=1, num_parts_total=3, 
                                      save_to_path=None, verbose=True)

In [None]:
%%time

# Per-id weighted-mean features built from partition offsets 0..2, one partition per step; nothing is written to disk.
train_data_weight_mean = aggregator_weight_mean.fit_transform(TRAIN_DATA_PATH, num_parts_to_preprocess_at_once=1, num_parts_total=3, 
                                      save_to_path=None, verbose=True)

In [None]:
train_data = train_data_weight_mean.merge(train_data_count,how='left', on='id')

In [None]:
train_data_weight_mean

In [None]:
train_data_count = aggregator_count.fit_transform(TRAIN_DATA_PATH, num_parts_to_preprocess_at_once=1, num_parts_total=3, 
                                      save_to_path=None, verbose=True)

In [None]:
# Load the target table (joined on 'id' and providing the 'flag' column in later cells).
# NOTE(review): this hard-coded Kaggle path bypasses the TRAIN_TARGET_PATH constant defined earlier — confirm which one is correct.
train_target = pd.read_csv('/kaggle/input/alfa-bank-pd-credit-history/data_for_competition/train_target.csv')

In [None]:
# Attach the target to the row-level one-hot features.
train_data = train_data_ohe.merge(train_target,how='left', on='id')

In [None]:
train_data

In [None]:
# Same join for the count features; this rebinds train_data and discards the frame built above.
train_data = train_data_count.merge(train_target,how='left', on='id')
train_data

In [None]:
# Same join for the weighted-mean features; being the last of the three assignments,
# this is the train_data that later cells see when the notebook is run top to bottom.
train_data = train_data_weight_mean.merge(train_target,how='left', on='id')
train_data

Подготовка данных закончена; далее начинается обучение моделей.

In [None]:
# Load the first three raw partitions (offset 0) into a single frame.
data_frame = read_parquet_dataset_from_local(path_to_dataset,num_parts_to_read=3)

In [None]:
# Every column of the raw frame is taken as a feature.
# NOTE(review): the aggregators above require 'id' and 'rn' columns in these partitions, so
# presumably both end up in feature_cols — confirm that this is intended.
feature_cols = list(data_frame.columns.values)
feature_cols

In [None]:
feature_names = feature_cols

# Target column name and its values taken from the aggregated train_data frame.
target = 'flag'  
targets = train_data["flag"].values

In [None]:
# Baseline logistic regression with class weights balanced by class frequency.
lr = LogisticRegression(class_weight='balanced')

In [None]:
# NOTE(review): the bare expression below is not the last line of the cell, so it displays nothing.
train_target
# Attach the target to every raw row. Re-running this cell merges again and, because 'flag'
# would then exist on both sides, pandas would suffix the columns instead of keeping 'flag'.
data_frame = data_frame.merge(train_target,how = 'left', on = 'id')
data_frame

In [None]:
# Hand-picked list of raw columns.
# NOTE(review): features_all is defined here but the fit below uses feature_cols — confirm which list was intended.
features_all = ['id', 'pre_since_opened', 'pre_since_confirmed', 'pre_pterm',
       'pre_fterm', 'pre_till_pclose', 'pre_till_fclose',
       'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
       'pre_loans_outstanding', 'pre_loans_total_overdue',
       'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 
       'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060',
       'is_zero_loans6090', 'is_zero_loans90', 'pre_util', 'pre_over2limit',
       'pre_maxover2limit', 'is_zero_util', 'is_zero_over2limit',
       'is_zero_maxover2limit', 
       'enc_loans_account_holder_type', 'enc_loans_credit_status',
       'enc_loans_credit_type', 'enc_loans_account_cur', 'pclose_flag',
       'fclose_flag']

# Fit on the row-level raw frame; the label of each row is the 'flag' merged in by id.
lr.fit(data_frame[feature_cols], data_frame['flag'])

In [None]:
# Hard class labels are kept in `y` as before.
y = lr.predict(data_frame[feature_cols])
# ROC AUC ranks observations by score, so it needs the positive-class probability;
# feeding 0/1 labels collapses the curve to a single operating point and understates the metric.
y_proba = lr.predict_proba(data_frame[feature_cols])[:, 1]
metrics.roc_auc_score(data_frame['flag'], y_proba)

In [None]:
# How many rows the logistic regression assigns to each class.
preds_class = lr.predict(data_frame[feature_cols])
unique, counts = np.unique(preds_class, return_counts=True)
print (unique, counts)

In [None]:
# Actual class counts in the target, for comparison with the predicted counts above.
unique, counts = np.unique( data_frame['flag'], return_counts=True)
print (unique, counts)

In [None]:
# Pool built from the SAME rows the model is fitted on below; both names are bound to it.
# NOTE(review): despite its name, test_data is training data, so predictions on it are in-sample.
test_data = catboost_pool = Pool(data_frame[feature_cols],
                                 data_frame['flag'])

# Small smoke-test model: 20 trees of depth 2 with learning_rate=1.
model = CatBoostClassifier(iterations=20,
                           depth=2,
                           learning_rate=1,
                           loss_function='Logloss',
                           verbose=True)
# train the model
model.fit(data_frame[feature_cols], data_frame['flag'])
# make the prediction using the resulting model
preds_class = model.predict(test_data)

In [None]:
# How many rows CatBoost assigns to each class.
unique, counts = np.unique(preds_class, return_counts=True)
print (unique, counts)

In [None]:
# Reload edited modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2
# Silence every warning from here on.
# NOTE(review): a blanket filter also hides convergence and deprecation warnings — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install catboost==1.2.5

In [None]:
# Features = every column of the aggregated train_data except the target.
# NOTE(review): only 'flag' is removed, so 'id' stays in the feature list — confirm that is intended.
feature_cols = list(train_data.columns.values)
feature_cols.remove("flag")
len(feature_cols)

In [None]:
feature_names = feature_cols
target = 'flag'  
targets = train_data["flag"].values

In [None]:
# 10-fold stratified cross-validation of a CatBoost classifier.
cv = StratifiedKFold(n_splits=10, random_state=596, shuffle=True)

# oof: prediction for each row made by the one model that did not train on it.
# train_preds: average prediction for each row over the models that did train on it.
oof = np.zeros(len(train_data))
train_preds = np.zeros(len(train_data))

models_catboost_base = []

for fold_, (train_idx, val_idx) in enumerate(cv.split(train_data, targets), 1):
    print(f"Training with fold {fold_} started")
    
    # Up to 50000 iterations; training stops once the eval AUC has not improved for 500 rounds.
    model_catboost = CatBoostClassifier(
                    verbose=3000,
                    loss_function='Logloss',
                    eval_metric='AUC',
                    early_stopping_rounds=500,
                    #task_type="GPU",
                    task_type="CPU",
                    iterations=50000,
                    learning_rate=0.01, 
                    auto_class_weights = 'Balanced',
                    depth=5, 
                    l2_leaf_reg= 1,
                    random_state=42
    )
      
    train, val = train_data.iloc[train_idx], train_data.iloc[val_idx]
    
    # NOTE(review): the validation fold also drives early stopping, so the OOF scores are
    # presumably slightly optimistic — confirm whether a separate hold-out is needed.
    model_catboost.fit(
                  train[feature_names], train[target], 
                  eval_set=(val[feature_names], val[target]),
                  plot=False
    )
    oof[val_idx] = model_catboost.predict_proba(val[feature_cols])[:, 1]
    # Each row belongs to the training part of n_splits-1 folds, hence the divisor.
    train_preds[train_idx] += model_catboost.predict_proba(train[feature_cols])[:, 1] / (cv.n_splits-1)
    models_catboost_base.append(model_catboost)
    print(f"Training with fold {fold_} completed")

In [None]:
# models_catboost_ranker
# Took an order of magnitude more time, with roughly the same results.

In [None]:
# Bucket ids into consecutive blocks of 187500; the bucket number is later passed to CatBoost as group_id.
train_data['Group_id'] = (train_data['id'] // 187500).astype('int64')

In [None]:
# Features = every column of train_data except the target.
# NOTE(review): after the previous cell this list also contains 'Group_id' (and still 'id') — confirm that is intended.
feature_cols = list(train_data.columns.values)
feature_cols.remove("flag")
len(feature_cols)

In [None]:
# Re-reads the same target file that was already loaded into train_target earlier.
train_target = pd.read_csv('/kaggle/input/alfa-bank-pd-credit-history/data_for_competition/train_target.csv')

In [None]:
# train_data already carries 'flag' from the earlier target merge. Merging the target a second
# time would make pandas suffix the clashing columns (flag_x / flag_y), and the following
# train_data["flag"] lookup would raise KeyError. Merge only when the target is still missing.
if "flag" not in train_data.columns:
    train_data = train_data.merge(train_target,how='left', on='id')
train_data

In [None]:
feature_names = feature_cols
target = 'flag'  
targets = train_data["flag"].values

In [None]:
# 10-fold stratified cross-validation of a CatBoost pairwise ranker.
cv = StratifiedKFold(n_splits=10, random_state=754, shuffle=True)

# oof holds the raw ranker output of predict() for each row, produced by the model that did not train on it.
oof = np.zeros(len(train_data))

models_catboost_ranker = []

for fold_, (train_idx, val_idx) in enumerate(cv.split(train_data, targets), 1):
    print(f"Training with fold {fold_} started")
    
    # PairLogit loss; up to 50000 iterations, stopping once the eval AUC has not improved for 500 rounds.
    model_catboost = CatBoostRanker(
                    verbose=3000,
                    loss_function='PairLogit:max_pairs=1000000',
                    eval_metric='AUC',
                    early_stopping_rounds=500,
                    task_type="CPU",
                    iterations=50000,
                    learning_rate=0.01,       
                    depth=7, 
                    l2_leaf_reg= 1,
                    random_state=42,
    )
    
    train, val = train_data.iloc[train_idx], train_data.iloc[val_idx]
    
    # Pools carry the id-bucket column as the ranking group of every row.
    # NOTE(review): presumably rows of one group must stay contiguous within a pool — verify
    # that train_data is ordered by id before relying on this.
    train_pool = Pool(
            data=train[feature_names],
            label=train[target].values
            ,group_id=train["Group_id"].values
            )

    val_pool = Pool(
           data=val[feature_names],
           label=val[target].values
           ,group_id=val["Group_id"].values
        )
    
    
    model_catboost.fit(
                  train_pool, 
                  eval_set=val_pool,
                  plot=False
    )
    oof[val_idx] = model_catboost.predict(val[feature_cols])

    models_catboost_ranker.append(model_catboost)
    print(f"Training with fold {fold_} completed")

In [None]:
# Summary statistics of df1.
# NOTE(review): df1 is not defined in any visible cell of this notebook — it presumably comes
# from an earlier session; define it explicitly so that "Restart & Run All" works.
df1.describe()

In [None]:
# Column dtypes and non-null counts of df1.
df1.info()

In [None]:
#corr = df1.corr()

In [None]:
plt.figure(figsize=(200,100))
sns.heatmap(corr,annot=True)
plt.show()

In [None]:
# Subset with the five pre_loans* columns.
subs = df1[['pre_loans5','pre_loans530','pre_loans3060','pre_loans6090','pre_loans90']]
subs

In [None]:
# Pairwise correlations between the five columns.
corr_subs = subs.corr()

In [None]:
plt.figure(figsize=(20,10))
sns.heatmap(corr_subs,annot=True)
plt.show()

In [None]:
# Frequency of every distinct combination of the five values.
subs.value_counts()

In [None]:
df1['pre_loans5'].value_counts()

In [None]:
df1['pre_loans530'].value_counts()

In [None]:
# There is no point in including the pre_loans columns in the features: each of them is encoded differently, and a value of 16 may be no better than 6.

In [None]:
# Subset with the 25 columns enc_paym_0 ... enc_paym_24, in ascending index order.
subs_enc = df1[[f'enc_paym_{month}' for month in range(25)]]

In [None]:
#corr_enc = subs_enc.corr()

In [None]:
#plt.figure(figsize=(20,10))
#sns.heatmap(corr_enc,annot=True)
#plt.show()

In [None]:
# Value distributions of selected enc_paym_* and is_zero_loans* columns.
# One loop replaces ten copy-pasted single-line cells; the columns are visited in the same order.
columns_to_inspect = ['enc_paym_0', 'enc_paym_24', 'enc_paym_1', 'enc_paym_23', 'enc_paym_22',
                      'enc_paym_21', 'enc_paym_20', 'enc_paym_19',
                      'is_zero_loans5', 'is_zero_loans530']
for column in columns_to_inspect:
    print(df1[column].value_counts(), end='\n\n')

In [None]:
# Correlation heatmap of the five is_zero_loans* columns.
# The name corr_zero is first bound to the column subset and then rebound to its correlation matrix.
corr_zero = df1[['is_zero_loans5','is_zero_loans530','is_zero_loans3060','is_zero_loans6090','is_zero_loans90']]
corr_zero = corr_zero.corr()
plt.figure(figsize=(20,10))
sns.heatmap(corr_zero,annot=True)
plt.show()

In [None]:
df1.columns

In [None]:
df1['pre_pterm'].value_counts()


In [None]:
# Hand-picked list of raw columns (same contents as the features_all list defined in an earlier cell).
features_all = ['id', 'pre_since_opened', 'pre_since_confirmed', 'pre_pterm',
       'pre_fterm', 'pre_till_pclose', 'pre_till_fclose',
       'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
       'pre_loans_outstanding', 'pre_loans_total_overdue',
       'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 
       'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060',
       'is_zero_loans6090', 'is_zero_loans90', 'pre_util', 'pre_over2limit',
       'pre_maxover2limit', 'is_zero_util', 'is_zero_over2limit',
       'is_zero_maxover2limit', 
       'enc_loans_account_holder_type', 'enc_loans_credit_status',
       'enc_loans_credit_type', 'enc_loans_account_cur', 'pclose_flag',
       'fclose_flag']

In [None]:
# Reduced column list used for the models fitted below.
# NOTE(review): 'id' and 'rn' are part of this list and therefore used as model inputs — confirm that is intended.
features = ['id','rn','pre_till_pclose','pre_loans_credit_limit',
'pre_loans_next_pay_summ',
'pre_loans_outstanding',
'pre_loans_total_overdue',
'pre_loans_max_overdue_sum',
'pre_loans_credit_cost_rate','pre_util','enc_loans_account_cur']

In [None]:
# Attach the target to the features_all columns of df1 by id.
# NOTE(review): df_target is not defined in any visible cell of this notebook.
df_train = df1[features_all].merge(df_target,how = 'left', on = 'id')
df_train

In [None]:
# Same values as the identically named constants defined near the top of the notebook.
TRAIN_FEATURES_PATH = "../data/train_features_gb/"
TEST_FEATURES_PATH = "../data/test_features_gb/"

In [None]:
# Plain assignment: df_train2 is a second name for the same DataFrame object, not a copy.
df_train2 = df_train

In [None]:
# Class balance of the target.
df_train['flag'].value_counts()

In [None]:
# Fresh logistic regression with class weights balanced by class frequency (rebinds lr).
lr = LogisticRegression(class_weight='balanced')

In [None]:
# Inputs come from df1 and labels from df_train; scikit-learn pairs them by position.
# NOTE(review): this assumes the left merge that produced df_train kept df1's row order and row count — verify.
lr.fit(df1[features], df_train['flag'])

In [None]:
# In-sample hard class predictions.
y = lr.predict(df1[features])

In [None]:
# How many rows are assigned to each class.
unique, counts = np.unique(y, return_counts=True)
print (unique, counts)

In [None]:
# Actual class counts, for comparison.
print(df_train['flag'].value_counts())

In [None]:
metrics.roc_auc_score(df_train['flag'], y)

In [None]:
metrics.classification_report(df_train['flag'], y)

In [None]:
lr2 = LogisticRegression(class_weight = {0:29,1:1})
lr2.fit(df1[features], df_train['flag'])
y2 = lr2.predict(df1[features])
unique, counts = np.unique(y2, return_counts=True)
print (unique, counts)

In [None]:
metrics.classification_report(df_train['flag'], y2)

In [None]:
# I cannot work out how to set the weights correctly: with class_weight='balanced' every prediction goes to "default",
# with the dictionary {0:29,1:1} every prediction goes to "non-default", and with {0:1,1:29} the result is similar to 'balanced'.
# NOTE(review): class_weight maps label -> weight, so {0:29,1:1} up-weights class 0 and {0:1,1:29} up-weights class 1;
# the latter resembling 'balanced' is expected if class 1 is the rare class (presumably "default" — confirm).
# The all-one-class outputs are presumably caused by the unscaled inputs (including 'id'), not by the weights alone — verify.

In [None]:
# Pool built from the SAME rows the model is fitted on below; both names are bound to it.
# NOTE(review): despite its name, test_data is training data, so predictions on it are in-sample.
test_data = catboost_pool = Pool(df1[features],
                                 df_train['flag'])

# Small smoke-test model: 5 trees of depth 2 with learning_rate=1.
model = CatBoostClassifier(iterations=5,
                           depth=2,
                           learning_rate=1,
                           loss_function='Logloss',
                           verbose=True)
# train the model
model.fit(df1[features], df_train['flag'])
# make the prediction using the resulting model
preds_class = model.predict(test_data)
#preds_proba = model.predict_proba(test_data)
# How many rows are assigned to each class.
unique, counts = np.unique(preds_class, return_counts=True)
print (unique, counts)

In [None]:
# ROC AUC ranks observations by score, so use the positive-class probability rather than the hard labels.
preds_proba = model.predict_proba(test_data)[:, 1]
print(metrics.roc_auc_score(df_train['flag'], preds_proba))
# classification_report returns a multi-line string; print it so the table is rendered
# instead of being shown as a quoted string with escaped newlines.
print(metrics.classification_report(df_train['flag'], preds_class))