In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

### Check file path

In [None]:
# hmdhmd model
# oof_1_path: CSV of predictions on the training rows (scored against y_train later).
# pred_1_path: CSV of predictions on the test rows (blended into the submission later).
oof_1_path = "/kaggle/input/hmd-no-d-agg-lb-09563/hmd_NO_D_agg_LB_0.9563/oof_hmd_NO_D_agg.csv"
pred_1_path = "/kaggle/input/hmd-no-d-agg-lb-09563/hmd_NO_D_agg_LB_0.9563/pred_hmd_NO_D_agg.csv"

In [None]:
# ML_Bear model
# oof_2_path: CSV of predictions on the training rows (scored against y_train later).
# pred_2_path: CSV of predictions on the test rows (blended into the submission later).
oof_2_path = "/kaggle/input/20191004-ml-bear-withdcolumns/20190925_all_uid_agg_stats_lr001_oof_features1381_oof0.95_pub0.984_pri0.989.csv"
pred_2_path = "/kaggle/input/20191004-ml-bear-withdcolumns/20190925_all_uid_agg_stats_lr001_pred_features1381_oof0.95_pub0.984_pri0.989.csv"

In [None]:
# hakubishin model
# Unlike the other models, the train-row predictions are stored as a .npy array
# (loaded with np.load, not pd.read_csv); the test predictions are a CSV.
oof_3_path = "/kaggle/input/model-23-include-d-agg/oof_preds.npy"    # numpy file, not CSV
pred_3_path = "/kaggle/input/model-23-include-d-agg/submission.csv"

In [None]:
# holygo model
# oof_4_path: CSV of predictions on the training rows (scored against y_train later).
# pred_4_path: CSV of predictions on the test rows (blended into the submission later).
oof_4_path = "/kaggle/input/holygo-best-lb-include-d-agg/20191003__holygo__oof__CV0.959106__LB0.9598.csv"
pred_4_path = "/kaggle/input/holygo-best-lb-include-d-agg/20191003__holygo__pred__CV0.959106__LB0.9598.csv"

### Load Data

In [None]:
def _read_sorted_oof(csv_path):
    """Return the isFraud column of an OOF CSV as an array ordered by TransactionID."""
    frame = pd.read_csv(csv_path)
    return frame.sort_values("TransactionID")["isFraud"].values


oof_1 = _read_sorted_oof(oof_1_path)
oof_2 = _read_sorted_oof(oof_2_path)
# The hakubishin OOF file is a bare array, so it is used in its saved order.
# NOTE(review): presumably already in TransactionID order — confirm with the model author.
oof_3 = np.load(oof_3_path)
oof_4 = _read_sorted_oof(oof_4_path)

# Display the four shapes side by side as a quick sanity check.
tuple(arr.shape for arr in (oof_1, oof_2, oof_3, oof_4))

In [None]:
def _read_sorted_pred(csv_path):
    """Load a test-prediction CSV ordered by TransactionID with a fresh 0..n-1 index."""
    return pd.read_csv(csv_path).sort_values("TransactionID").reset_index(drop=True)


pred_1 = _read_sorted_pred(pred_1_path)
pred_2 = _read_sorted_pred(pred_2_path)
pred_3 = _read_sorted_pred(pred_3_path)
pred_4 = _read_sorted_pred(pred_4_path)

# The blend further down adds the isFraud columns row by row (aligned on the
# reset index), so every file must hold the same TransactionIDs in the same
# order. Without this check a mismatch would silently mix predictions that
# belong to different transactions.
for _label, _frame in (("pred_2", pred_2), ("pred_3", pred_3), ("pred_4", pred_4)):
    assert np.array_equal(
        pred_1["TransactionID"].values, _frame["TransactionID"].values
    ), f"{_label} does not cover the same TransactionIDs as pred_1"

pred_1.shape, pred_2.shape, pred_3.shape, pred_4.shape

In [None]:
# Ground-truth labels for the training rows, used to score the OOF predictions.
train_csv = "/kaggle/input/ieee-fraud-detection/train_transaction.csv"
train = pd.read_csv(train_csv)
y_train = train.loc[:, "isFraud"].values

y_train.shape

### Check score

In [None]:
# Probing tables already read from disk, keyed by file path. calc_bear_score is
# called many times (once per model, once per optimizer step), so the CSV is
# read only once per path.
_PROBING_CACHE = {}


def _load_probing(path):
    """Return the probing table (TransactionID, data_type, Probing_isFraud) for `path`, cached."""
    if path not in _PROBING_CACHE:
        _PROBING_CACHE[path] = pd.read_csv(path).loc[:, ['TransactionID', 'data_type', 'Probing_isFraud']]
    return _PROBING_CACHE[path]


def _score_split(merged, split_name):
    """AUC of `isFraud` against `Probing_isFraud` on the rows whose data_type equals `split_name`."""
    part = merged[merged.data_type == split_name]
    if part['isFraud'].isnull().any():
        # roc_auc_score would reject the NaNs anyway; fail with a message that
        # names the actual cause instead.
        raise ValueError(
            f"predictions are missing (or NaN) for some {split_name} TransactionIDs"
        )
    return roc_auc_score(part['Probing_isFraud'], part['isFraud'])


def calc_bear_score(df, probing_path='/kaggle/input/20190929-probing-result/20190929_probing.csv'):
    """Score predictions against the probing labels.

    Parameters
    ----------
    df : DataFrame
        Must contain `TransactionID` and `isFraud` (the predicted score).
        Any other columns are ignored, so a frame that already carries
        `data_type` / `Probing_isFraud` columns can be passed without causing
        a column-name clash in the merge.
    probing_path : str
        CSV with `TransactionID`, `data_type` and `Probing_isFraud` columns.
        Defaults to the probing file used throughout this notebook.

    Returns
    -------
    (public_score, private_score) : tuple
        ROC AUC on the rows tagged `test_public` and `test_private`.

    Raises
    ------
    ValueError
        If a scored row has no prediction in `df`.
    """
    df_probing = _load_probing(probing_path)
    # Left merge keeps every probing row; the merge result is a new frame, so
    # the cached probing table is never modified.
    merged = pd.merge(df_probing, df[['TransactionID', 'isFraud']], on='TransactionID', how='left')

    public_score = _score_split(merged, "test_public")
    private_score = _score_split(merged, "test_private")
    return public_score, private_score

In [None]:
# For each base model, report AUC of its train-row predictions against y_train
# (cv) and the public/private scores of its test predictions.
model_outputs = [
    ("hmd", oof_1, pred_1),
    ("bear", oof_2, pred_2),
    ("hakubishin", oof_3, pred_3),
    ("holygo", oof_4, pred_4),
]
for model_name, oof_pred, test_pred in model_outputs:
    cv = roc_auc_score(y_train, oof_pred)
    pub, prv = calc_bear_score(test_pred)
    print(f"{model_name} model: cv{cv}, pub{pub}, prv{prv}")

### user info

In [None]:
# Upper bound on how many transactions a predicted user may have to be
# included in the "rare user" subset scored below (users with count <= thres).
thres = 2
print(f"user count thres: {thres}")

In [None]:
# Predicted user id per transaction, ordered by TransactionID with a 0..n-1 index.
user_ids_csv = '/kaggle/input/20190901-user-ids-share/20190901_user_ids_share.csv'
predicted_user = (
    pd.read_csv(user_ids_csv)
    .sort_values("TransactionID")
    .reset_index(drop=True)
)

# Users that appear at most `thres` times across the whole file.
user_count = predicted_user["predicted_user_id"].value_counts()
is_rare_user = user_count <= thres
target_user_id = user_count[is_rare_user].index.tolist()

# The first len(oof_3) rows are taken as the training part.
# NOTE(review): assumes every train TransactionID sorts before every test one — confirm.
n_train_rows = len(oof_3)
train_predicted_user = predicted_user.iloc[:n_train_rows]
belongs_to_rare_user = train_predicted_user["predicted_user_id"].isin(target_user_id)
train_target_df = train_predicted_user[belongs_to_rare_user]
# Index labels equal row positions here, so they can index y_train / oof arrays directly.
train_target_index = train_target_df.index

In [None]:
# AUC of each model's train-row predictions restricted to the rare-user rows.
y_target = y_train[train_target_index]
for model_name, oof_pred in (
    ("hmd", oof_1),
    ("bear", oof_2),
    ("hakubishin", oof_3),
    ("holygo", oof_4),
):
    cv = roc_auc_score(y_target, oof_pred[train_target_index])
    print(f"{model_name} model: cv{cv}")

### Hand Made

In [None]:
# Blend weights, in model order: hmdhmd, ML_Bear, hakubishin, holygo.
x_opt = [0.0542, 0.1064, 0.7588, 0.0806]
w_hmdhmd, w_ml_bear, w_hakubishin, w_holygo = x_opt
print(f"hmdhmd:{w_hmdhmd}, ml_bear:{w_ml_bear}, hakubishin:{w_hakubishin}, holygo:{w_holygo}")

In [None]:
# Display the total of the blend weights as a sanity check.
sum(x_opt)

In [None]:
# Weighted sum of the four models' train-row predictions, in x_opt order.
oof_inputs = (oof_1, oof_2, oof_3, oof_4)
oof = sum(model_oof * weight for weight, model_oof in zip(x_opt, oof_inputs))

# Score on all training rows, then on the rare-user subset only.
cv = roc_auc_score(y_train, oof)
print(f"ensemble model: cv{cv}")
cv = roc_auc_score(y_train[train_target_index], oof[train_target_index])
print(f"ensemble model count<={thres}: cv{cv}")

In [None]:
# Start from a copy of pred_3 so the submission keeps its columns, then
# overwrite isFraud with the weighted sum of the four models' test predictions.
pred_inputs = (pred_1, pred_2, pred_3, pred_4)
sub = pred_3.copy()
sub["isFraud"] = sum(
    model_pred["isFraud"] * weight for weight, model_pred in zip(x_opt, pred_inputs)
)
pub, prv = calc_bear_score(sub)
print(f"ensemble model: pub{pub}, prv{prv}")

### optimize

(Only the functions are implemented and left here; enable them when needed.)

In [None]:
# Switch for the optional weight search: when False, the install cell and the
# optimizer run below are both skipped.
optimize = False

In [None]:
# Install the optimizer packages only when the optional weight search is enabled.
if optimize:
    !pip install Gpy
    !pip install GpyOpt

In [None]:
# Search space: one continuous weight per model (x0..x3), each within [0.05, 1].
bounds = [
    {'name': f'x{idx}', 'type': 'continuous', 'domain': (0.05, 1)}
    for idx in range(4)
]

# Together the two expressions keep the weight sum within 0.001 of 1.
# NOTE(review): relies on GPyOpt treating an expression as feasible when it is <= 0 — confirm against the GPyOpt docs.
constraints = [
    # sum(x) <= 1 + 0.001
    {'name': 'constr_1', 'constraint': '(x[:,0] + x[:,1] + x[:,2] + x[:,3]) - 1 - 0.001'},
    # sum(x) >= 1 - 0.001
    {'name': 'constr_2', 'constraint': '1 - (x[:,0] + x[:,1] + x[:,2] + x[:,3]) - 0.001'},
]

def f(x):
    """Objective for the weight search: negated private score of a blend.

    `x` is indexed as a 2-D array; columns 0..3 are the weights applied to
    pred_1..pred_4 respectively. The blended predictions are scored with
    calc_bear_score and the private score is returned with its sign flipped.
    """
    blend_inputs = (pred_1, pred_2, pred_3, pred_4)

    candidate = pred_3.copy()
    candidate["isFraud"] = sum(
        model_pred["isFraud"] * x[:, col] for col, model_pred in enumerate(blend_inputs)
    )

    # Only the private score drives the search; the public score is discarded.
    _, private_score = calc_bear_score(candidate)
    return -1 * private_score

In [None]:
# Optional search for blend weights; skipped unless `optimize` is True.
if optimize:
    # Imported here because GPyOpt is only installed when `optimize` is True.
    import GPyOpt
    # f returns the negated private score.
    # NOTE(review): presumably GPyOpt minimizes f, i.e. maximizes the private score — confirm.
    myBopt = GPyOpt.methods.BayesianOptimization(f=f, domain=bounds, constraints=constraints)
    myBopt.run_optimization(max_iter=10)
    # Best weights found and the objective value at that point.
    print(f"rate: {myBopt.x_opt}") 
    print(f"value: {myBopt.fx_opt}")

### override probing_isfraud=1 of probing.csv

In [None]:
# Attach the probing labels to the blended submission.
probing_columns = ['TransactionID', 'data_type', 'Probing_isFraud']
df_probing = pd.read_csv('/kaggle/input/20190929-probing-result/20190929_probing.csv')[probing_columns]
sub = sub.merge(df_probing, on="TransactionID", how="left")

# Force the prediction to 1 only where the probing label is 1; all other rows
# keep their blended score.
probed_as_fraud = sub["Probing_isFraud"] == 1
sub.loc[probed_as_fraud, "isFraud"] = 1

# Drop the helper columns again before scoring.
sub = sub[["TransactionID", "isFraud"]]
pub, prv = calc_bear_score(sub)

print(f"ensemble model after override proving value: pub{pub}, prv{prv}")

### override probing_isfraud=1 of no_probing.csv

In [None]:
# Check that the override from 20190929_probing.csv has been applied:
# count the predictions that are exactly 0 and exactly 1.
fraud_scores = sub["isFraud"]
num_0 = fraud_scores.eq(0).sum()
num_1 = fraud_scores.eq(1).sum()
num_0, num_1

In [None]:
# Second probing file; keep only the columns used below.
no_prob_columns = ['TransactionID', 'data_type', 'Probing_isFraud']
no_prob = pd.read_csv("/kaggle/input/20190929-probing-result/20190929_no_probing.csv")[no_prob_columns]
print(no_prob.shape)
# Number of rows labelled as fraud in this file.
print(no_prob["Probing_isFraud"].eq(1).sum())

In [None]:
# Missing-value count per column of the second probing file.
no_prob.isna().sum()

In [None]:
# Attach the labels from the second probing file to the submission.
# Merge from the two submission columns only: if this cell is re-run, `sub`
# already carries `data_type` / `Probing_isFraud` from the previous run, and
# merging it as-is would produce `_x` / `_y` suffixed columns, breaking the
# `sub.Probing_isFraud` lookup in the next cell.
sub = pd.merge(sub[["TransactionID", "isFraud"]], no_prob, on="TransactionID", how="left")
sub.head()

In [None]:
# Force the prediction to 1 only where the second probing file labels the row as fraud.
flagged_fraud = sub["Probing_isFraud"] == 1
sub.loc[flagged_fraud, "isFraud"] = 1

# Re-count the predictions that are exactly 0 and exactly 1 after the override.
fraud_scores = sub["isFraud"]
num_0 = fraud_scores.eq(0).sum()
num_1 = fraud_scores.eq(1).sum()
num_0, num_1

### save

In [None]:
# Keep only the two submission columns and write the final CSV.
submission_columns = ["TransactionID", "isFraud"]
sub = sub.loc[:, submission_columns]
sub.to_csv("sub_avg_with_no_probing.csv", index=False, header=True)