In [None]:
!nvidia-smi

Mon Aug  1 15:46:57 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# ! git clone --recursive https://github.com/Microsoft/LightGBM
# ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;

Cloning into 'LightGBM'...
remote: Enumerating objects: 26980, done.[K
remote: Counting objects: 100% (4103/4103), done.[K
remote: Compressing objects: 100% (433/433), done.[K
remote: Total 26980 (delta 3866), reused 3752 (delta 3663), pack-reused 22877[K
Receiving objects: 100% (26980/26980), 19.38 MiB | 17.69 MiB/s, done.
Resolving deltas: 100% (19993/19993), done.
Submodule 'include/boost/compute' (https://github.com/boostorg/compute) registered for path 'external_libs/compute'
Submodule 'eigen' (https://gitlab.com/libeigen/eigen.git) registered for path 'external_libs/eigen'
Submodule 'external_libs/fast_double_parser' (https://github.com/lemire/fast_double_parser.git) registered for path 'external_libs/fast_double_parser'
Submodule 'external_libs/fmt' (https://github.com/fmtlib/fmt.git) registered for path 'external_libs/fmt'
Cloning into '/content/LightGBM/external_libs/compute'...
remote: Enumerating objects: 21733, done.        
remote: Counting objects: 100% (5/5), done.  

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to the smallest dtype that fits.

    Each column is handled according to its dtype:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned NumPy integer columns are cast to the narrowest
      integer type of the same signedness whose range contains the column's
      min and max (bounds are inclusive, so e.g. -128..127 fits ``int8``).
    * NumPy float columns are cast to ``float16`` (if ``use_float16``) or
      ``float32`` when the value range fits; otherwise they are left as is.
    * Everything else (bool, datetime, categorical, extension dtypes) is left
      untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    use_float16 : bool, default True
        Allow ``float16`` as a target. Only the value *range* is checked, so
        float16/float32 targets can lose precision; pass ``False`` to keep at
        least ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a meaningful numeric
        # range to test; min()/max() on e.g. unordered categoricals raises.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates = [(t, np.finfo(t)) for t in float_types]
        else:
            family = signed_types if col_type.kind == 'i' else unsigned_types
            candidates = [(t, np.iinfo(t)) for t in family]

        # Candidates are ordered narrowest first; take the first that fits.
        # NaN min/max (empty or all-NaN column) fails every comparison, so
        # such a column keeps its current dtype.
        for target, info in candidates:
            if c_min >= info.min and c_max <= info.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte frame.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


In [None]:
import gc
import os
import joblib
import random
import warnings
import itertools
import scipy as sp
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
from itertools import combinations
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from sklearn.preprocessing import LabelEncoder
import warnings; warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedKFold, train_test_split

def get_difference(data, num_features):
    """Return, per customer, the change between their last two rows.

    For every ``customer_ID`` in ``data`` the result holds
    ``last_row - previous_row`` for each column in ``num_features`` (NaN when
    the customer has a single row), using the row order of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``customer_ID`` column and every column in
        ``num_features``.
    num_features : list of str
        Columns to difference.

    Returns
    -------
    pandas.DataFrame
        One row per customer, sorted by ``customer_ID``, with float32 columns
        ``<col>_diff1`` followed by a ``customer_ID`` column.
    """
    num_features = list(num_features)

    # Only the last two rows of each customer contribute to the final diff,
    # so shrink the frame first instead of looping over every group.
    recent = data.groupby('customer_ID', sort=False).tail(2)
    # Rows with a missing key belong to no group; drop them explicitly.
    recent = recent[recent['customer_ID'].notna()]

    diffs = recent.groupby('customer_ID', sort=False)[num_features].diff(1)
    # Positional mask selecting the final row of every customer.
    is_last = (~recent['customer_ID'].duplicated(keep='last')).to_numpy()

    result = diffs[is_last].astype(np.float32)
    result.columns = [col + '_diff1' for col in num_features]
    result['customer_ID'] = recent['customer_ID'].to_numpy()[is_last]
    return result.sort_values('customer_ID').reset_index(drop=True)

def _build_customer_features(raw, num_features, cat_features, labels=None):
    """Collapse the per-row frame ``raw`` into one feature row per customer.

    Combines, in this column order: whole-history numeric aggregates (plus
    last-vs-first lag features), whole-history categorical aggregates, the
    last-row differences from ``get_difference``, the optional ``labels``
    columns, and aggregates over each customer's last three rows.
    """
    # Whole-history aggregates of the numeric columns.
    num_agg = raw.groupby("customer_ID")[num_features].agg(['first', 'mean', 'std', 'min', 'max', 'last', 'quantile'])
    num_agg.columns = ['_'.join(x) for x in num_agg.columns]
    num_agg.reset_index(inplace = True)

    # Lag features: last value minus / divided by first value.
    # Iterate over a snapshot because the loop adds columns.
    for col in list(num_agg.columns):
        first_col = col.replace('last', 'first')
        if 'last' in col and first_col in num_agg:
            num_agg[col + '_lag_sub'] = num_agg[col] - num_agg[first_col]
            num_agg[col + '_lag_div'] = num_agg[col] / num_agg[first_col]

    # Whole-history aggregates of the categorical columns.
    cat_agg = raw.groupby("customer_ID")[cat_features].agg(['first', 'last', 'nunique'])
    cat_agg.columns = ['_'.join(x) for x in cat_agg.columns]
    cat_agg.reset_index(inplace = True)

    # Transform float64 columns to float32
    cols = list(num_agg.dtypes[num_agg.dtypes == 'float64'].index)
    for col in tqdm(cols):
        num_agg[col] = num_agg[col].astype(np.float32)
    # Transform int64 columns to int32
    cols = list(cat_agg.dtypes[cat_agg.dtypes == 'int64'].index)
    for col in tqdm(cols):
        cat_agg[col] = cat_agg[col].astype(np.int32)

    # Difference between each customer's last two rows.
    diff = get_difference(raw, num_features)

    full_history = num_agg.merge(cat_agg, how = 'inner', on = 'customer_ID').merge(diff, how = 'inner', on = 'customer_ID')
    if labels is not None:
        full_history = full_history.merge(labels, how = 'inner', on = 'customer_ID')
    del num_agg, cat_agg, diff
    gc.collect()

    # Aggregates over each customer's last three rows. The '_L3M' join
    # yields names like '<col>_L3M<stat>'; kept as is so that the saved
    # feature names do not change.
    num_recent = raw[num_features + ['customer_ID']].groupby("customer_ID").tail(3).groupby("customer_ID").agg(['mean', 'min', 'max', 'std', 'quantile'])
    num_recent.columns = ['_L3M'.join(x) for x in num_recent.columns]
    num_recent.reset_index(inplace = True)
    cat_recent = raw[cat_features + ['customer_ID']].groupby("customer_ID").tail(3).groupby("customer_ID").agg(['min', 'max'])
    cat_recent.columns = ['_L3M'.join(x) for x in cat_recent.columns]
    cat_recent.reset_index(inplace = True)

    recent = num_recent.merge(cat_recent, how = 'inner', on = 'customer_ID')
    del num_recent, cat_recent
    combined = full_history.merge(recent, how = 'left', on = 'customer_ID')
    gc.collect()
    return combined


def read_preprocess_data(data_dir='/content/drive/MyDrive/Amex Kaggle'):
    """Build and save the per-customer feature tables for train and test.

    Reads ``train.parquet`` / ``test.parquet`` and the two slope CSVs from
    ``<data_dir>/Derived Features`` and ``train_labels.csv`` from
    ``data_dir``, engineers one row per ``customer_ID`` with
    ``_build_customer_features``, left-joins the slope features, prints both
    shapes, and writes ``train_fe_plus_plus1.parquet`` and
    ``test_fe_plus_plus1.parquet`` into ``<data_dir>/Derived Features``.

    Parameters
    ----------
    data_dir : str
        Root folder of the competition data. Defaults to the Drive location
        this notebook was written against.

    Returns
    -------
    None
    """
    derived_dir = os.path.join(data_dir, 'Derived Features')
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]

    # Train FE
    train_raw = pd.read_parquet(os.path.join(derived_dir, 'train.parquet'))
    features = train_raw.drop(['customer_ID', 'S_2'], axis = 1).columns.to_list()
    # The numeric feature list is derived from train and reused for test.
    num_features = [col for col in features if col not in cat_features]
    print('Starting training feature engineer...')
    train_labels = pd.read_csv(os.path.join(data_dir, 'train_labels.csv'))
    train = _build_customer_features(train_raw, num_features, cat_features, labels = train_labels)
    del train_raw, train_labels
    gc.collect()

    train_slope = pd.read_csv(os.path.join(derived_dir, 'cust_id_wise_slope.csv'))
    train = train.merge(train_slope, how = 'left', on = 'customer_ID')

    # Test FE
    test_raw = pd.read_parquet(os.path.join(derived_dir, 'test.parquet'))
    print('Starting test feature engineer...')
    test = _build_customer_features(test_raw, num_features, cat_features)
    del test_raw
    gc.collect()

    test_slope = pd.read_csv(os.path.join(derived_dir, 'cust_id_wise_slope_test.csv'))
    test = test.merge(test_slope, how = 'left', on = 'customer_ID')

    print(train.shape)
    print(test.shape)
    # Save files to disk
    train.to_parquet(os.path.join(derived_dir, 'train_fe_plus_plus1.parquet'))
    test.to_parquet(os.path.join(derived_dir, 'test_fe_plus_plus1.parquet'))
    
# Read & Preprocess Data
# Runs the whole feature-engineering pipeline once: reads the raw train/test
# parquet files and writes the engineered train/test parquet files.
# The call returns None; its results are the files on disk.
read_preprocess_data()

Starting training feature engineer...


100%|██████████| 522/522 [01:35<00:00,  5.47it/s] 
100%|██████████| 11/11 [00:00<00:00, 137.71it/s]
100%|██████████| 458913/458913 [11:19<00:00, 675.64it/s]


Starting test feature engineer...


100%|██████████| 522/522 [03:25<00:00,  2.54it/s] 
100%|██████████| 11/11 [00:00<00:00, 64.68it/s]
100%|██████████| 924621/924621 [23:01<00:00, 669.50it/s]


(458913, 2889)
(924621, 2888)


In [None]:
import gc
class CFG:
    """Notebook-wide settings, read as class attributes (e.g. ``CFG.seed``)."""
    # Seed passed to seed_everything, StratifiedKFold and the LightGBM params.
    seed = 13
    # Number of StratifiedKFold splits.
    n_folds = 10
    # Name of the label column in the training frame.
    target = 'target'
    # NOTE(review): not referenced by any code visible in this notebook.
    input_dir = '/content/data/'

def seed_everything(seed):
    """Seed Python's ``random`` and NumPy's global RNG with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def read_data(path='/content/drive/MyDrive/Amex Kaggle/Derived Features/train_fe_plus_plus1.parquet'):
    """Load the engineered training table from a parquet file.

    Parameters
    ----------
    path : str
        Parquet file to read. Defaults to the training file written by
        ``read_preprocess_data``.

    Returns
    -------
    pandas.DataFrame
        The contents of ``path``.
    """
    return pd.read_parquet(path)

def amex_metric(y_true, y_pred):
    """Score predictions as the mean of a normalised Gini and a top-4% capture rate.

    Rows with label 0 carry weight 20 and all other rows weight 1. The
    capture rate is the share of all positives found among the
    highest-scored rows whose cumulative weight stays within 4% of the total
    weight. The normalised Gini is the weighted Gini of the prediction
    ordering divided by that of the true-label ordering.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted scores, same length as ``y_true``.

    Returns
    -------
    float
        ``0.5 * (normalised_gini + top_four_capture)``.
    """
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _ordered_by(column):
        # Rows of ``pairs`` sorted by the given column, largest first.
        return pairs[pairs[:, column].argsort()[::-1]]

    def _weighted_gini(ordered):
        row_weight = np.where(ordered[:, 0] == 0, 20, 1)
        random_curve = np.cumsum(row_weight / np.sum(row_weight))
        positives_total = np.sum(ordered[:, 0] * row_weight)
        positives_found = np.cumsum(ordered[:, 0] * row_weight)
        lorentz_curve = positives_found / positives_total
        return np.sum((lorentz_curve - random_curve) * row_weight)

    by_prediction = _ordered_by(1)

    # Top-4% capture: positives inside the first 4% of cumulative weight.
    rank_weight = np.where(by_prediction[:, 0] == 0, 20, 1)
    inside_cut = by_prediction[np.cumsum(rank_weight) <= int(0.04 * np.sum(rank_weight))]
    top_four = np.sum(inside_cut[:, 0]) / np.sum(by_prediction[:, 0])

    gini_of_prediction = _weighted_gini(by_prediction)
    gini_of_truth = _weighted_gini(_ordered_by(0))
    return 0.5 * (gini_of_prediction / gini_of_truth + top_four)

def amex_metric_np(preds, target):
    """Array-only variant of the metric using a closed-form Gini normaliser.

    Parameters
    ----------
    preds : numpy.ndarray
        Predicted scores.
    target : numpy.ndarray
        Labels, same length as ``preds``; rows with target 0 weigh 20 and
        rows with target 1 weigh 1.

    Returns
    -------
    float
        ``0.5 * (gini / gini_max + d)`` where ``d`` is the share of positives
        inside the top 4% of cumulative normalised weight.
    """
    order = np.argsort(preds)[::-1]
    preds, target = preds[order], target[order]

    # Class counts feed the closed-form maximum Gini below.
    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos

    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()

    # Capture rate within the first 4% of cumulative normalised weight.
    within_four_pct = cum_norm_weight <= 0.04
    captured = np.sum(target[within_four_pct]) / np.sum(target)

    weighted_target = target * weight
    lorentz = (weighted_target / weighted_target.sum()).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)

    normalised_gini = gini / gini_max
    return 0.5 * (normalised_gini + captured)

In [None]:
# !mkdir '/content/drive/MyDrive/LGBModels_dart6'
# !mkdir '/content/drive/MyDrive/LGBModels_dart6/fold_0'
# !mkdir '/content/drive/MyDrive/LGBModels_dart6/fold_1'
# !mkdir '/content/drive/MyDrive/LGBModels_dart6/fold_2'
# !mkdir '/content/drive/MyDrive/LGBModels_dart6/fold_3'
# !mkdir '/content/drive/MyDrive/LGBModels_dart6/fold_4'
# !mkdir '/content/drive/MyDrive/LGBModels_dart6/fold_5'
# !mkdir '/content/drive/MyDrive/LGBModels_dart6/fold_6'
# !mkdir '/content/drive/MyDrive/LGBModels_dart6/fold_7'
# !mkdir '/content/drive/MyDrive/LGBModels_dart6/fold_8'
# !mkdir '/content/drive/MyDrive/LGBModels_dart6/fold_9'

In [None]:
import gc
import os
import joblib
import random
import warnings
import itertools
import scipy as sp
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
from itertools import combinations
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from sklearn.preprocessing import LabelEncoder
import warnings; warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedKFold, train_test_split

def lgb_amex_metric(y_pred, y_true):
    """Adapt ``amex_metric`` to the custom-eval signature passed as ``feval``.

    ``y_true`` is the evaluation dataset object; its labels are obtained via
    ``get_label()``. Returns ``(name, value, is_higher_better)``.
    """
    labels = y_true.get_label()
    score = amex_metric(labels, y_pred)
    return 'amex_metric', score, True

def _best_model_saver(fold_dir, initial_best=0.75, log_every=100):
    """Build a LightGBM callback that checkpoints the best-scoring booster.

    The callback reads the first entry of ``env.evaluation_result_list``,
    prints it every ``log_every`` iterations, and whenever it exceeds the best
    score seen so far (starting from ``initial_best``) dumps ``env.model`` to
    ``<fold_dir>/<score>.pkl`` and then removes every other ``.pkl`` there.
    """
    # Per-callback state, so folds and notebook re-runs never share a best score.
    best = {'score': initial_best}

    def callback(env):
        iteration = env.iteration
        score = env.evaluation_result_list[0][2]
        if iteration % log_every == 0:
            print('iteration {}, score= {:.05f}'.format(iteration, score))
        if score > best['score']:
            best['score'] = score
            new_name = f"{score}.pkl"
            # Write the new checkpoint first, so a failed dump cannot leave
            # the folder without any saved model.
            joblib.dump(env.model, os.path.join(fold_dir, new_name))
            for fname in os.listdir(fold_dir):
                if fname.endswith(".pkl") and fname != new_name:
                    os.remove(os.path.join(fold_dir, fname))

    callback.order = 0
    return callback


def train_and_evaluate(train, folds=(4, 5), model_dir='/content/drive/MyDrive/LGBModels_dart7'):
    """Train LightGBM DART models on selected CV folds, checkpointing the best.

    Steps:

    1. Label-encode the ``<cat>_last`` categorical columns.
    2. Add ratio / difference features between the ``last``, ``first``,
       ``mean`` and ``max`` aggregates of every ``*_last`` column.
    3. Cast float32/float64 columns to float16.
    4. For each requested fold of a ``StratifiedKFold`` (``CFG.n_folds``
       splits, seeded by ``CFG.seed``), train for 15000 rounds with
       ``lgb_amex_metric`` as the evaluation function and print the 20 most
       important features.

    No out-of-fold or test predictions are produced; the only artefacts are
    the checkpoints written under ``<model_dir>/fold_<k>``.

    Parameters
    ----------
    train : pandas.DataFrame
        Engineered training table containing ``customer_ID`` and
        ``CFG.target``. NOTE: it is modified in place (encoding, new columns,
        dtype changes).
    folds : iterable of int or None
        Fold indices to train. ``None`` trains every fold.
    model_dir : str
        Root folder for checkpoints; one ``fold_<k>`` subfolder per fold is
        created if missing.

    Returns
    -------
    None
    """
    # Label encode categorical features
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    cat_features = [f"{cf}_last" for cf in cat_features]
    for cat_col in cat_features:
        # The encoder is fit on this frame only and then discarded.
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col])

    # Relate each feature's last value to its other aggregates.
    # Only names ending in '_last' can yield a valid base name; other
    # 'last'-containing names (e.g. '..._last_lag_sub') never resolved.
    suffix = '_last'
    base_names = [
        col[:-len(suffix)]
        for col in train.columns
        if col.endswith(suffix) and 'round' not in col
    ]
    for col in base_names:
        try:
            train[f'{col}_last_first_R'] = train[f'{col}_last']/train[f'{col}_first']
            train[f'{col}_last_mean_diff'] = train[f'{col}_last'] - train[f'{col}_mean']
            train[f'{col}_last_mean_R'] = train[f'{col}_last']/train[f'{col}_mean']
            train[f'{col}_last_max_diff'] = train[f'{col}_last'] - train[f'{col}_max']
            train[f'{col}_last_max_R'] = train[f'{col}_last']/train[f'{col}_max']
            train[f'{col}_max_mean_R'] = train[f'{col}_max']/train[f'{col}_mean']
        except (KeyError, TypeError):
            # Best effort: categorical bases lack some aggregates (KeyError)
            # or hold non-numeric values (TypeError). Columns created before
            # the failure are kept, as before.
            continue

    # Transform float64 and float32 to float16
    num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
    for col in tqdm(num_cols):
        train[col] = train[col].astype(np.float16)

    # Get feature list
    features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]
    params = {
        'objective': 'binary',
        'metric': "amex_metric",
        'boosting': 'dart',
        'seed': CFG.seed,
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 2,
        'min_data_in_leaf': 40
        }

    selected_folds = None if folds is None else set(folds)
    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        if selected_folds is not None and fold not in selected_folds:
            continue
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature = cat_features)

        # The checkpoint callback lists this folder, so it must exist first.
        fold_dir = os.path.join(model_dir, f'fold_{fold}')
        os.makedirs(fold_dir, exist_ok = True)

        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 15000,
            valid_sets = [lgb_valid],
            feval = lgb_amex_metric,
            callbacks = [_best_model_saver(fold_dir)]
            )
        feat_imp = pd.DataFrame({'Variables': features, 'Importance': model.feature_importance()})
        feat_imp = feat_imp.sort_values(by = 'Importance', ascending = False)
        print(feat_imp.head(20))
        del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
        gc.collect()

# Seed the RNGs, load the engineered training table, then train the
# selected folds. train_and_evaluate modifies `train` in place.
seed_everything(CFG.seed)
train = read_data()
train_and_evaluate(train)

100%|██████████| 3233/3233 [11:55<00:00,  4.52it/s] 


 
--------------------------------------------------
Training fold 4 with 3960 features...
[LightGBM] [Info] Number of positive: 106946, number of negative: 306076
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 631981
[LightGBM] [Info] Number of data points in the train set: 413022, number of used features: 3943
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258935 -> initscore=-1.051509
[LightGBM] [Info] Start training from score -1.051509
iteration 0, score= 0.69055
iteration 100, score= 0.75980
iteration 200, score= 0.76070
iteration 300, score= 0.76305
iteration 400, score= 0.76548
iteration 500, score= 0.76636
iteration 600, score= 0.76911
iteration 700, score= 0.77116
iteration 800, score= 0.77289
iteration 900, score= 0.77500
iteration 1000, score= 0.77827
iteration 1100, score= 0.78059
iteration 1200, score= 0.78157
iteration 1300, score= 0.78346
iteration 1400, score= 0.78377
iteration 1500, score= 0.78540
iteration 1600, score= 0.786