In [1]:
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import sys
sys.path.append("../")
import time
import warnings
warnings.simplefilter("ignore")
from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from tqdm import tqdm

In [2]:
from utils.common import (
    sigmoid, reverse_sigmoid,
    pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)

In [3]:
%load_ext autoreload
%autoreload

### Read Data

In [4]:
train_score1 = read_file(f"{EXP_PATH}/1.lgbm_dart_923/train_scores_df.csv")
train_score2 = read_file(f"{EXP_PATH}/2.lgbm_dart_1020/train_scores_df.csv")
train_score3 = read_file(f"{EXP_PATH}/3.lgbm_dart_5197/train_scores_df.csv")
train_score4 = read_file(f"{EXP_PATH}/4.lgbm_dart_6666/train_scores_df.csv")
train_score4 = read_file(f"{EXP_PATH}/5.lgbm_dart_923_half_fix/train_scores_df.csv")

Shape of data: (458913, 8)
Shape of data: (458913, 8)
Shape of data: (458913, 8)
Shape of data: (458913, 8)
Shape of data: (458913, 8)


In [5]:
sum(train_score3["target"].values != train_score4["target"].values)

0

In [6]:
t1 = train_score1["cv_score"].values
t2 = train_score2["cv_score"].values
t3 = train_score3["cv_score"].values
t4 = train_score4["cv_score"].values
t5 = train_score4["cv_score"].values

In [8]:
## 1ST - 0.799 (medium)
# NOTE(review): the header presumably logs submission order and leaderboard score — TODO confirm.
# Scores run 1's cv_score against run 1's target column; the displayed tuple is
# (overall, gini, top4pct), the same order amex_metric is unpacked in further down.
amex_metric(train_score1["target"], t1)

(0.7986772206700369, 0.9259891065704908, 0.6713653347695829)

In [9]:
## 3RD - 0.800 (low)
# Scores t2 against train_score1's target column.
# NOTE(review): assumes every run's frame shares the same row order; only one pair of
# frames is compared explicitly above — TODO confirm for the others.
amex_metric(train_score1["target"], t2)

(0.7987062094874274, 0.9260807463051137, 0.6713316726697411)

In [10]:
## 5TH - 0.799 (low)
# Scores t3 against train_score1's target column (same row-order assumption as the other runs).
amex_metric(train_score1["target"], t3)

(0.797788577455728, 0.925642459385149, 0.6699346955263069)

In [11]:
### 6TH - 0.800 (very low)
# Scores t4 against train_score1's target column (same row-order assumption as the other runs).
amex_metric(train_score1["target"], t4)

(0.7987959566465116, 0.9262181629984798, 0.6713737502945434)

In [7]:
### 7TH - 0.798 (high)
# Scores t5 against train_score1's target column (same row-order assumption as the other runs).
amex_metric(train_score1["target"], t5)

(0.7981401067995282, 0.9259920660244106, 0.6702881475746457)

In [46]:
from collections import defaultdict
from itertools import product

In [47]:
# One scaler instance; it is re-fit from scratch for each score vector it is applied to.
m = MinMaxScaler()

In [48]:
# Rescale every run's scores independently: the scaler is re-fit per vector and
# wants 2-D input, hence the (n, 1) reshape followed by taking column 0 again.
t1_, t2_, t3_, t4_, t5_ = (
    m.fit_transform(scores.reshape(-1, 1))[:, 0]
    for scores in (t1, t2, t3, t4, t5)
)

In [54]:
# Vote on the sign: +1 where at least two of the four runs exceed their cut-off
# (20 for runs 1 and 2, 0 for runs 3 and 4), otherwise -1.
# NOTE(review): the reason for the differing cut-offs is not visible here — confirm they are intentional.
sign_series = np.where(
    np.sum([t1 > 20, t2 > 20, t3 > 0, t4 > 0], axis=0) >= 2,
    1,
    -1,
)

In [55]:
# Element-wise product of the four raw score vectors.
# NOTE(review): despite the name, no absolute value is taken, so entries may be negative — confirm intent.
abs_series = t1 * t2 * t3 * t4

In [56]:
# Apply the voted sign (+1 / -1) to the product term to get the blended score.
blend = abs_series * sign_series

In [57]:
# Metric of the sign/product blend, scored against train_score1's target column.
amex_metric(train_score1["target"], blend)

(0.7994422213643748, 0.9263072723648623, 0.6725771703638873)

In [58]:
# Candidate blend weights (start 0.1, step 0.1, stop below 0.6) shared by all models in the grid search.
x = np.arange(0.1, 0.6, 0.1)

In [59]:
# Exhaustive grid search over one weight per model, applied to the min-max scaled scores.
# Each combination appends one entry to every list in `d`: the five weights under the
# keys "i".."m" (kept so the result columns stay the same) plus the three metric components.
# The loop variables are named w1..w5 so that the loop no longer rebinds `m`, which
# holds the MinMaxScaler used by the scaling cell.
d = defaultdict(list)
for w1, w2, w3, w4, w5 in product(x, repeat=5):
    d["i"].append(w1)
    d["j"].append(w2)
    d["k"].append(w3)
    d["l"].append(w4)
    d["m"].append(w5)
    blend = w1 * t1_ + w2 * t2_ + w3 * t3_ + w4 * t4_ + w5 * t5_
    overall, gini, top4pct = amex_metric(train_score1["target"], blend)
    d["score"].append(overall)
    d["gini"].append(gini)
    d["top4pct"].append(top4pct)

In [60]:
# Tabulate the grid search: one row per weight combination, one column per key of `d`.
result = pd.DataFrame(d)
# result.nsmallest(50, "score")

In [61]:
# Ten weight combinations with the highest overall metric.
result.nlargest(10, "score")

Unnamed: 0,i,j,k,l,m,score,gini,top4pct
1900,0.4,0.1,0.2,0.1,0.1,0.799844,0.926446,0.673242
2877,0.5,0.4,0.1,0.1,0.3,0.799818,0.92652,0.673116
2881,0.5,0.4,0.1,0.2,0.2,0.799818,0.92652,0.673116
2885,0.5,0.4,0.1,0.3,0.1,0.799818,0.92652,0.673116
2626,0.5,0.2,0.1,0.1,0.2,0.799782,0.926482,0.673082
2630,0.5,0.2,0.1,0.2,0.1,0.799782,0.926482,0.673082
2651,0.5,0.2,0.2,0.1,0.2,0.799776,0.926479,0.673074
2655,0.5,0.2,0.2,0.2,0.1,0.799776,0.926479,0.673074
2752,0.5,0.3,0.1,0.1,0.3,0.799772,0.926505,0.67304
2756,0.5,0.3,0.1,0.2,0.2,0.799772,0.926505,0.67304


In [123]:
# Same query as the previous cell.
# NOTE(review): the saved output below has only four weight columns (i, j, k, l) and weights
# up to 0.9, so it appears to come from an earlier search rather than the current grid.
result.nlargest(10, "score")

Unnamed: 0,i,j,k,l,score,gini,top4pct
6341,0.9,0.7,0.3,0.6,0.800075,0.926604,0.673545
6342,0.9,0.7,0.3,0.7,0.800062,0.926613,0.673511
5531,0.8,0.6,0.3,0.6,0.800051,0.926608,0.673494
5367,0.8,0.4,0.3,0.4,0.800043,0.926575,0.673511
6331,0.9,0.7,0.2,0.5,0.800043,0.9266,0.673486
5521,0.8,0.6,0.2,0.5,0.800012,0.926606,0.673419
1540,0.3,0.2,0.1,0.2,0.800011,0.926604,0.673419
3900,0.6,0.4,0.2,0.4,0.800011,0.926604,0.673419
6260,0.9,0.6,0.3,0.6,0.800011,0.926604,0.673419
4711,0.7,0.5,0.2,0.5,0.800007,0.926612,0.673402


In [14]:
# Finer search over three weights, applied to the raw (unscaled) scores of runs 1-3.
# Every (i, j, k) combination contributes one entry to each list in `dd`.
dd = defaultdict(list)
for i, j, k in product(
    np.arange(0.46, 0.54, 0.01),
    np.arange(0.22, 0.28, 0.01),
    np.arange(0.22, 0.28, 0.01),
):
    dd["i"].append(i)
    dd["j"].append(j)
    dd["k"].append(k)
    blend = i*t1 + j*t2 + k*t3
    overall, gini, top4pct = amex_metric(train_score1["target"], blend)
    dd["score"].append(overall)
    dd["gini"].append(gini)
    dd["top4pct"].append(top4pct)

In [15]:
# Tabulate the three-weight search (this rebinds `result`) and show its five best rows.
result = pd.DataFrame(dd)
result.nlargest(5, "score")

Unnamed: 0,i,j,k,score,gini,top4pct
15,0.46,0.24,0.23,0.799824,0.926431,0.673217
72,0.47,0.25,0.24,0.799816,0.926431,0.6732
22,0.46,0.25,0.23,0.799812,0.926433,0.673192
128,0.48,0.26,0.24,0.799812,0.926433,0.673192
340,0.52,0.28,0.26,0.799812,0.926433,0.673192


In [19]:
# Write a fixed-weight blend of the four raw train score vectors into the
# `prediction` column of the frame read from empty.csv and save it.
# The blend is wrapped in a Series, so it is aligned to the frame's index on assignment.
# NOTE(review): the weight on t3 is 0.4 while the saved four-weight search output above
# lists 0.3 in its top row — confirm which is intended.
sub = pd.read_csv(f"{SUBMISSION_DATA_PATH}/empty.csv").assign(
    prediction=pd.Series((0.9 * t1) + (0.7 * t2) + (0.4 * t3) + (0.6 * t4))
)
sub.to_csv(f"{SUBMISSION_DATA_PATH}/blended_train_scores.csv", index=False)

### BLEND TEST

In [9]:
# Load the scores_df.csv table of each of the five experiment runs, one frame per run.
test_score1 = read_file(f"{EXP_PATH}/1.lgbm_dart_923/scores_df.csv")
test_score2 = read_file(f"{EXP_PATH}/2.lgbm_dart_1020/scores_df.csv")
test_score3 = read_file(f"{EXP_PATH}/3.lgbm_dart_5197/scores_df.csv")
test_score4 = read_file(f"{EXP_PATH}/4.lgbm_dart_6666/scores_df.csv")
test_score5 = read_file(f"{EXP_PATH}/5.lgbm_dart_923_half_fix/scores_df.csv")

Shape of data: (924621, 5)
Shape of data: (924621, 5)
Shape of data: (924621, 5)
Shape of data: (924621, 5)
Shape of data: (924621, 5)


In [20]:
# `prediction` column of an earlier submission file.
# NOTE(review): `s` is not referenced again in the visible cells — possibly leftover.
s = read_file(f"{SUBMISSION_DATA_PATH}/submission3_lgbm_dart_full_fix_1020.csv")["prediction"]

Shape of data: (924621, 2)


In [26]:
# Row-wise mean over the columns of run 2's test scores (unscaled).
# NOTE(review): `s2` is not referenced again in the visible cells.
s2 = test_score2.mean(axis=1)

In [64]:
# Fresh scaler for the test-score frames; it is re-fit on every frame it transforms.
m = MinMaxScaler()

In [33]:
# Min-max scale run 2's test scores; the scaler works column by column and returns a NumPy array.
test_score2_ = m.fit_transform(test_score2)

In [35]:
# Preview: per-row average of the scaled columns.
np.mean(test_score2_, axis=1)

array([0.32552628, 0.14096738, 0.35586654, ..., 0.52899043, 0.49217374,
       0.36775012])

In [36]:
# Submission from run 2 alone: per-row mean of its min-max scaled score columns,
# written into the `prediction` column of the frame read from empty.csv.
sub = pd.read_csv(f"{SUBMISSION_DATA_PATH}/empty.csv").assign(
    prediction=test_score2_.mean(axis=1)
)
sub.to_csv(f"{SUBMISSION_DATA_PATH}/submission_9TH_seed_1020_average_fold_after_minmaxscaler.csv", index=False)

In [67]:
# Five-run blend: each run's frame is min-max scaled (scaler re-fit per frame),
# averaged across its columns, weighted, and accumulated in run order.
# The divisor 0.9 is the nominal total of the weights (0.4 + 0.1 + 0.2 + 0.1 + 0.1).
sub = pd.read_csv(f"{SUBMISSION_DATA_PATH}/empty.csv")
sub["prediction"] = sum(
    weight * m.fit_transform(frame).mean(axis=1)
    for weight, frame in (
        (0.4, test_score1),
        (0.1, test_score2),
        (0.2, test_score3),
        (0.1, test_score4),
        (0.1, test_score5),
    )
) / 0.9
sub.to_csv(f"{SUBMISSION_DATA_PATH}/submission_10TH_blend_5_own_seeds_after_min_max_scaled.csv", index=False)

In [68]:
# Sanity check on the exported blend: count and distribution summary of the prediction column.
sub["prediction"].describe()

count    924621.000000
mean          0.385767
std           0.194353
min           0.000000
25%           0.220337
50%           0.344674
75%           0.538963
max           0.994977
Name: prediction, dtype: float64