## Preliminary Note 

We trained for only a limited number of iterations on the GPU, because the configuration used in this notebook is very slow to train with `DEVICE = "CPU"`.

The full training in CPU mode took about 12 hours.

**If you want to run the full training on the CPU:**
* set `ITERATION` to `10000`
* set `DEVICE` to `"CPU"`

## Imports

In [1]:
%pip install nb_black

import warnings
from tqdm import tqdm
from pathlib import Path
from itertools import repeat

import numpy as np
import pandas as pd
from scipy.stats import skew

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold

import lightgbm as lgbm
from catboost import CatBoostClassifier, Pool
from catboost.utils import get_roc_curve, select_threshold

%load_ext lab_black
warnings.filterwarnings("ignore")

pd.options.display.max_rows = 250
pd.options.display.max_columns = 250

Collecting nb_black
  Downloading nb_black-1.0.7.tar.gz (4.8 kB)
Building wheels for collected packages: nb-black
  Building wheel for nb-black (setup.py): started
  Building wheel for nb-black (setup.py): finished with status 'done'
  Created wheel for nb-black: filename=nb_black-1.0.7-py3-none-any.whl size=5298 sha256=3094c6aebb916e316a3a7db4fbc7d4eb6c4bda38ffe806d8e70663eecd83bc87
  Stored in directory: c:\users\batuhan\appdata\local\pip\cache\wheels\25\9d\fc\6312e3d8a66c164cc7f9d80bc66cc25de03a362a30b9f84aa3
Successfully built nb-black
Installing collected packages: nb-black
Successfully installed nb-black-1.0.7
Note: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'catboost'

## Definitions

In [2]:
# Seed shared by the CV splitter and the CatBoost models.
SEED = 42
# Number of stratified CV folds.
FOLD_CNT = 9
# Category values whose share of the training rows is below this value are
# merged into the "-1" bucket (applied to the columns in `sparse_categories`).
sparsity_threshold = 0.0003

# Passed to CatBoost as `iterations` and `task_type` respectively.
ITERATION = 1000
DEVICE = "GPU"

# Target column.
label = "ARTIS_DURUMU"
# A column whose name contains any of these substrings is treated as an identifier.
id_substr_list = ["ID"]
# Columns excluded from the feature set; detected identifier columns are added later.
drop_cols = pd.Index(["POLICY_ID", "BASLANGIC_TARIHI"])
# Object-typed columns that are excluded from the categorical column list.
cat_col_diff = ["BASLANGIC_TARIHI", "GELIR"]
# NOTE(review): not referenced by any cell visible in this notebook.
cat_col_union = []

# Columns on which rare values are merged (see `sparsity_threshold`).
sparse_categories = [
    "UYRUK",
    "MESLEK",
    "MESLEK_KIRILIM",
    "KAPSAM_TIPI",
    "DAGITIM_KANALI",
    "POLICE_SEHIR",
]

# NOTE(review): the name is a typo of "categorical" and the list is not
# referenced by any cell visible here; the model's categorical features are
# derived from column dtypes in the modelling section instead.
caterogical_cols = [
    "OFFICE_ID",
    "YATIRIM_KARAKTERI",
    "POLICE_SEHIR",
    "SIGORTA_TIP",
    "MUSTERI_SEGMENTI",
    "MESLEK_KIRILIM",
    "SOZLESME_KOKENI",
    "SOZLESME_KOKENI_DETAY",
    "KAPSAM_TIPI",
    "KAPSAM_GRUBU",
    "DAGITIM_KANALI",
    "uyruk_memleket",
    "dogumtarihi_cinsiyet",
    "meslek_medenihal_egitim",
    "dogumtarihi_meslek_medenihal",
    "sigortatip_kapsamtipi",
    "gelir_medenihal_egitim",
    "GELIR_qcut",
]

## Preprocess Utilities

In [3]:
def preprocess_dataset(df_):
    """Return a cleaned copy of ``df_`` ready for feature extraction.

    Steps:

    * ``GELIR`` is parsed from a decimal-comma string to float, and values
      ``>= 20000`` or ``<= 100`` are replaced with NaN.
    * Every remaining object-typed column that is not an identifier gets its
      missing values filled with ``"UNKNOWN"`` and is cast to ``category``.

    A column counts as an identifier when all of its values are distinct or
    when its name contains one of the substrings in the module-level
    ``id_substr_list``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Raw frame containing a string-typed ``GELIR`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df_`` itself is not modified.
    """
    df = df_.copy()

    df["GELIR"] = df["GELIR"].str.replace(",", ".").astype(float)
    df.loc[df["GELIR"] >= 20000, "GELIR"] = np.nan
    df.loc[df["GELIR"] <= 100, "GELIR"] = np.nan

    row_cnt = len(df)

    # NOTE(review): this lower-case name matches no column seen in this
    # notebook, so BASLANGIC_TARIHI is still handled as a categorical below.
    # Kept as-is to preserve the existing output; confirm whether the
    # upper-case column name was intended.
    cat_col_diff = ["baslangic_tarihi"]

    # Identifier columns are selected by *name*. The mask must index
    # df.columns, otherwise True/False values end up in the Index.
    all_unique_cols = df.columns[(df.nunique() == row_cnt).to_numpy()]
    id_named_cols = df.columns[df.columns.str.contains("|".join(id_substr_list))]
    id_cols = all_unique_cols.union(id_named_cols)

    cat_cols = df.dtypes[df.dtypes == "object"].index.difference(
        id_cols.union(cat_col_diff)
    )

    df.loc[:, cat_cols] = df.loc[:, cat_cols].fillna("UNKNOWN").astype("category")
    return df


def replace_non_recurred_categorical_values(cat_col, train, test, inplace=True):
    """Blank out values of ``cat_col`` that occur in only one of the two frames.

    Every value present in ``train[cat_col]`` but not in ``test[cat_col]``
    (or the other way round) is replaced with NaN in both frames.

    Parameters
    ----------
    cat_col : str
        Name of the column to clean; must exist in both frames.
    train, test : pandas.DataFrame
        Frames to compare.
    inplace : bool, default True
        If True, ``train`` and ``test`` are modified and ``None`` is returned.
        If False, the inputs are left untouched and cleaned copies are
        returned.

    Returns
    -------
    None or tuple of (pandas.DataFrame, pandas.DataFrame)
        ``None`` when ``inplace`` is True, otherwise ``(train, test)`` copies.
    """
    if not inplace:
        train, test = train.copy(), test.copy()

    train_vals = pd.Index(train[cat_col].unique())
    test_vals = pd.Index(test[cat_col].unique())

    # Values seen on exactly one side (symmetric difference of the two sets).
    exclude_vars = train_vals.union(test_vals).difference(
        train_vals.intersection(test_vals)
    )

    if len(exclude_vars):
        replace_mappings = dict.fromkeys(exclude_vars, np.nan)
        train[cat_col] = train[cat_col].replace(replace_mappings)
        test[cat_col] = test[cat_col].replace(replace_mappings)

    return None if inplace else (train, test)

## Feature Engineering (FE) Utilities

In [4]:
def extract_month_year(df):
    """Add ``baslangic_month`` and ``baslangic_year`` columns to ``df`` in place.

    Both columns are derived from ``BASLANGIC_TARIHI`` parsed as a datetime.
    An ``AssertionError`` is raised when that column is absent. Nothing is
    returned.
    """
    assert "BASLANGIC_TARIHI" in df.columns

    start_parts = pd.to_datetime(df["BASLANGIC_TARIHI"]).dt
    df["baslangic_month"], df["baslangic_year"] = start_parts.month, start_parts.year


def increase_features(df, cols, groupname=""):
    """Add per-row "increase" features over an ordered list of columns, in place.

    For consecutive columns ``cols[j]`` and ``cols[j + 1]``, step ``j`` is an
    increase when ``df[cols[j + 1]] > df[cols[j]]``. Comparisons involving
    NaN are never counted as an increase.

    Two float columns are written to ``df``:

    * ``<groupname>_increase_month_count`` - number of increasing steps;
    * ``<groupname>_increase_last_month`` - 0-based index of the last
      increasing step, or ``-1`` when the row has none.

    The computation is purely positional, so it gives the same result for
    any row index, not only for a default ``RangeIndex``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from and to extend.
    cols : list of str
        Column names in chronological order.
    groupname : str, default ""
        Prefix of the two created columns.
    """
    values = df[cols].to_numpy()
    increased = values[:, 1:] > values[:, :-1]
    n_steps = increased.shape[1]

    if n_steps:
        # argmax on the reversed row gives the distance of the last True from
        # the end; rows without any True are mapped to -1 explicitly.
        steps_from_end = np.argmax(increased[:, ::-1], axis=1)
        last_step = np.where(increased.any(axis=1), n_steps - 1 - steps_from_end, -1)
    else:
        last_step = np.full(len(df), -1)

    # Stored as float, the dtype the columns had when they were built by
    # filling missing group results.
    df[groupname + "_increase_month_count"] = increased.sum(axis=1).astype(float)
    df[groupname + "_increase_last_month"] = last_step.astype(float)


def extract_tutar_stats(df, cols=None, groupname="None"):
    """Add row-wise summary statistics of ``df[cols]`` to ``df`` in place.

    The created columns are named ``<groupname>_<stat>`` for the statistics
    max, std, var, mad, mean, sum, min, q05, q95, q25, q75, skew and kurt,
    in that order. ``mad`` is the mean absolute deviation around the row
    mean. It is computed explicitly because ``DataFrame.mad`` no longer
    exists in pandas 2.x.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from and to extend.
    cols : list of str, optional
        Columns to aggregate over; treated as an empty list when omitted.
    groupname : str, default "None"
        Prefix of the created columns.
    """
    cols = [] if cols is None else list(cols)

    # Slice once; the statistics below must not see the columns being added.
    amounts = df[cols]
    row_mean = amounts.mean(axis=1)

    df[groupname + "_max"] = amounts.max(axis=1)
    df[groupname + "_std"] = amounts.std(axis=1)
    df[groupname + "_var"] = amounts.var(axis=1)
    df[groupname + "_mad"] = amounts.sub(row_mean, axis=0).abs().mean(axis=1)
    df[groupname + "_mean"] = row_mean
    df[groupname + "_sum"] = amounts.sum(axis=1)
    df[groupname + "_min"] = amounts.min(axis=1)
    df[groupname + "_q05"] = amounts.quantile(0.05, axis=1)
    df[groupname + "_q95"] = amounts.quantile(0.95, axis=1)
    df[groupname + "_q25"] = amounts.quantile(0.25, axis=1)
    df[groupname + "_q75"] = amounts.quantile(0.75, axis=1)
    df[groupname + "_skew"] = amounts.skew(axis=1)
    df[groupname + "_kurt"] = amounts.kurt(axis=1)


## Definitions 

## FE

In [5]:
def aggregate_catcols(df, to_group, comb_size):
    """Build one concatenated string column per combination of ``to_group``.

    For every ``comb_size``-sized combination of the names in ``to_group``,
    the member columns are cast to ``str`` and concatenated row-wise without
    a separator. The resulting column is named after the members joined by
    ``" - "``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; not modified.
    to_group : sequence of str
        Column names to combine.
    comb_size : int
        Number of columns per combination.

    Returns
    -------
    pandas.DataFrame
        One column per combination, in ``itertools.combinations`` order.
    """
    from functools import reduce
    from itertools import combinations
    from operator import add

    # Iterate over the name tuples directly. Joining them into a label and
    # splitting the label back would break on column names that contain " - ".
    return pd.concat(
        [
            reduce(add, (df[col].astype(str) for col in combo)).to_frame(
                " - ".join(combo)
            )
            for combo in combinations(to_group, comb_size)
        ],
        axis=1,
    )


In [6]:
def extract_feature_expressions(df_):
    """Return a copy of ``df_`` with two derived difference columns.

    * ``AGE_ON_POLICY_START`` = ``baslangic_year`` - ``DOGUM_TARIHI``
    * ``SENE_DEGER_FARKI`` = ``SENE_SONU_HESAP_DEGERI`` - ``SENE_BASI_HESAP_DEGERI``

    An ``AssertionError`` is raised when any of the four input columns is
    missing. ``df_`` is not modified.
    """
    needed = (
        "baslangic_year",
        "DOGUM_TARIHI",
        "SENE_SONU_HESAP_DEGERI",
        "SENE_BASI_HESAP_DEGERI",
    )
    assert all(name in df_.columns for name in needed)

    result = df_.copy()
    result["AGE_ON_POLICY_START"] = result["baslangic_year"].sub(result["DOGUM_TARIHI"])
    result["SENE_DEGER_FARKI"] = result["SENE_SONU_HESAP_DEGERI"].sub(
        result["SENE_BASI_HESAP_DEGERI"]
    )
    return result

In [7]:
def _join_as_str(df, cols, sep="_"):
    """Concatenate ``df[cols]`` row-wise as strings, separated by ``sep``."""
    joined = df[cols[0]].astype(str)
    for col in cols[1:]:
        joined = joined + sep + df[col].astype(str)
    return joined


def extract_features(df_):
    """Return a copy of ``df_`` extended with engineered features.

    Added features:

    * ``baslangic_month`` / ``baslangic_year`` (via ``extract_month_year``);
    * ``GELIR_qcut`` - income bin code as a string (``"-1"`` for missing income);
    * string crosses of demographic / policy columns;
    * increase counters and row statistics over the ``*ODENEN_TU*`` and
      ``*VADE_TU*`` column groups (via ``increase_features`` and
      ``extract_tutar_stats``);
    * statistics of the element-wise difference vade - odenen;
    * sums over the October-December vade / odenen columns.

    The raw demographic columns, all vade columns and all but the last three
    odenen columns are dropped from the result.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Pre-processed frame. The vade and odenen column groups must have the
        same number of columns, because they are subtracted position by
        position.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df_`` is not modified.

    Raises
    ------
    AssertionError
        If a required input column is missing.
    """
    required_cols = [
        "UYRUK",
        "MEMLEKET",
        "CINSIYET",
        "DOGUM_TARIHI",
        "MESLEK",
        "MEDENI_HAL",
        "EGITIM_DURUM",
        "SIGORTA_TIP",
        "KAPSAM_TIPI",
        "GELIR",
        "EKIM_ODENEN_TUTAR",
        "KASIM_ODENEN_TUTAR",
        "ARALIK_ODENEN_TUTAR",
        "EKIM_VADE_TUTARI",
        "KASIM_VADE_TUTARI",
        "ARALIK_VADE_TUTARI",
    ]
    missing_cols = [col for col in required_cols if col not in df_.columns]
    assert not missing_cols, f"missing required columns: {missing_cols}"

    df = df_.copy()

    GELIR_bins = [
        -np.inf,
        1000.0,
        1200.0,
        1800.0,
        2000.0,
        2500.0,
        3000.0,
        3500.0,
        5000.0,
        6000.0,
        np.inf,
    ]

    # The column groups are fixed before any feature is added, so the
    # generated "odenen_*" / "vade_*" columns never leak into them.
    odenen_tutar_cols = [col for col in df_.columns if "ODENEN_TU" in col]
    vade_tutar_cols = [col for col in df_.columns if "VADE_TU" in col]

    extract_month_year(df)

    # `cat.codes` is -1 for rows whose income is missing, so after the string
    # cast those rows carry "-1"; no further fill is needed.
    df["GELIR_qcut"] = pd.cut(df["GELIR"], GELIR_bins).cat.codes.astype(str)

    df["uyruk_memleket"] = _join_as_str(df, ["UYRUK", "MEMLEKET"])
    df["dogumtarihi_cinsiyet"] = _join_as_str(df, ["DOGUM_TARIHI", "CINSIYET"])
    df["meslek_medenihal_egitim"] = _join_as_str(
        df, ["MESLEK", "MEDENI_HAL", "EGITIM_DURUM"]
    )
    df["dogumtarihi_meslek_medenihal"] = _join_as_str(
        df, ["DOGUM_TARIHI", "MESLEK", "MEDENI_HAL"]
    )
    df["sigortatip_kapsamtipi"] = _join_as_str(df, ["SIGORTA_TIP", "KAPSAM_TIPI"])
    df["gelir_medenihal_egitim"] = _join_as_str(
        df, ["GELIR_qcut", "MEDENI_HAL", "EGITIM_DURUM"]
    )

    increase_features(df, odenen_tutar_cols, "odenen")
    increase_features(df, vade_tutar_cols, "vade")

    extract_tutar_stats(df, odenen_tutar_cols, "odenen")
    extract_tutar_stats(df, vade_tutar_cols, "vade")

    df["odeme_existence_count"] = (df[odenen_tutar_cols] > 0).astype(int).sum(axis=1)

    # Element-wise vade - odenen, computed once and reused for all four
    # statistics. The std here is numpy's default (population, ddof=0).
    vade_minus_odenen = df[vade_tutar_cols].to_numpy() - df[odenen_tutar_cols].to_numpy()
    df["odeme_diff_vade_max"] = vade_minus_odenen.max(axis=1)
    df["odeme_diff_vade_min"] = vade_minus_odenen.min(axis=1)
    df["odeme_diff_vade_std"] = vade_minus_odenen.std(axis=1)
    df["odeme_diff_vade_sum"] = vade_minus_odenen.sum(axis=1)

    # Ratio of the row-wise sample std to the row-wise mean of the vade columns.
    # TODO: confirm the intended definition with the original author.
    df["vade_std_mean_rat"] = df["vade_std"] / df["vade_mean"]

    df["last_three_month_sum_vade"] = df[
        ["EKIM_VADE_TUTARI", "KASIM_VADE_TUTARI", "ARALIK_VADE_TUTARI"]
    ].sum(axis=1)
    df["last_three_month_sum_odenen"] = df[
        ["EKIM_ODENEN_TUTAR", "KASIM_ODENEN_TUTAR", "ARALIK_ODENEN_TUTAR"]
    ].sum(axis=1)

    # Only the last three odenen columns (in frame order) are kept as raw features.
    return df.drop(
        columns=["UYRUK", "MEMLEKET", "CINSIYET", "MESLEK", "MEDENI_HAL", "EGITIM_DURUM"]
        + odenen_tutar_cols[:-3]
        + vade_tutar_cols
    )

## I/O

In [8]:
# Load every dataset in the Kaggle competition folder into a notebook variable
# named after the file. For example,
# /kaggle/input/anadolu-hayat-emeklilik-datathon-coderspace/samplesubmission.csv --> samplesubmission

competition = "anadolu-hayat-emeklilik-datathon-coderspace"

# NOTE(review): this resolves to /kaggle/... only when the home directory is a
# direct child of "/" - confirm before running outside Kaggle.
INPUT_DIR = Path.home().parent / "kaggle" / "input"
OUTPUT_DIR = Path.home().parent / "kaggle" / "working"

for path in (INPUT_DIR / competition).iterdir():
    # stem / suffix also cope with file names that contain more than one dot.
    var_name, file_format = path.stem, path.suffix.lstrip(".")
    reader = getattr(pd, f"read_{file_format}", None)
    if not path.is_file() or reader is None:
        print(f"Skip {path}: no matching pandas reader")
        continue
    print(f"Assign data located in {path} to variable named {var_name}")
    # Bind the name directly instead of exec-ing generated source, which
    # breaks on paths containing backslashes or quotes.
    globals()[var_name] = reader(path)

Assign data located in /kaggle/input/anadolu-hayat-emeklilik-datathon-coderspace/samplesubmission.csv to variable named samplesubmission
Assign data located in /kaggle/input/anadolu-hayat-emeklilik-datathon-coderspace/train.csv to variable named train
Assign data located in /kaggle/input/anadolu-hayat-emeklilik-datathon-coderspace/test.csv to variable named test


## Check dtypes & id_cols

In [9]:
row_cnt, col_cnt = train.shape
print(row_cnt, col_cnt)

# Identifier columns: every value is distinct, or the name contains one of
# the substrings in `id_substr_list`.
id_cols = train.columns[(train.nunique() == row_cnt).to_numpy()].union(
    train.columns[train.columns.str.contains("|".join(id_substr_list))]
)
print(f"Identifier columns are {', '.join(id_cols)}")

# Categorical columns: object-typed columns that are neither identifiers nor
# listed in `cat_col_diff`.
cat_cols = train.columns[(train.dtypes == "object").to_numpy()].difference(
    id_cols.union(cat_col_diff)
)
print(f"Categorical columns are {', '.join(cat_cols)}")

# Identifier columns are never used as model features.
drop_cols = drop_cols.union(id_cols)

634112 49
Identifier columns are OFFICE_ID, POLICY_ID
Categorical columns are DAGITIM_KANALI, EGITIM_DURUM, KAPSAM_GRUBU, KAPSAM_TIPI, MEDENI_HAL, MESLEK, MESLEK_KIRILIM, POLICE_SEHIR, SOZLESME_KOKENI, SOZLESME_KOKENI_DETAY, UYRUK, YATIRIM_KARAKTERI


## Feature Engineering

In [10]:
## Blank out non-overlapping category values.
## For example, if "A" exists in the training data for column UYRUK but not in
## the test data, every "A" is replaced with NaN. preprocess_dataset later fills
## the NaN values of object columns with the "UNKNOWN" placeholder.
## The rule is applied to all categorical columns so that train and test share
## the same set of category values.

for cat_col in cat_cols:
    replace_non_recurred_categorical_values(cat_col, train, test)

# Rare encoding: values whose share of the training rows is below
# `sparsity_threshold` are merged into a single "-1" bucket. The rare set is
# derived from train only and then applied to both frames.

for sparse_category in sparse_categories:
    category_share = train[sparse_category].value_counts() / len(train)
    to_replace = list(category_share[category_share < sparsity_threshold].index)
    # Assign the result back instead of calling an inplace method on a column
    # selection, which is not guaranteed to update the parent frame.
    train[sparse_category] = train[sparse_category].mask(
        train[sparse_category].isin(to_replace), "-1"
    )
    test[sparse_category] = test[sparse_category].mask(
        test[sparse_category].isin(to_replace), "-1"
    )


# NOTE: this overwrites the raw frames, so the cell cannot be re-run without
# reloading the data first.
train = extract_feature_expressions(extract_features(preprocess_dataset(train)))
test = extract_feature_expressions(extract_features(preprocess_dataset(test)))

In [11]:
train.shape

(634112, 71)

In [12]:
train.head(15).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
POLICY_ID,7203428,2857821,7833031,4474944,4660085,4736561,8167943,2420140,5548250,1127870,8216966,6218600,1831063,8691355,3628583
OFFICE_ID,10099,10100,10042,10056,10068,10168,10006,10213,10222,10230,10232,10234,10238,10129,10238
SIGORTA_TIP,7,7,1,7,7,7,8,1,1,6,7,7,1,1,1
SOZLESME_KOKENI,NEW,NEW,NEW,NEW,NEW,NEW,NEW,NEW,NEW,NEW,NEW,NEW,NEW,NEW,NEW
SOZLESME_KOKENI_DETAY,NEW,NEW,NEW,NEW,NEW,NEW_MRG,NEW,NEW,NEW,NEW,NEW,NEW,NEW,NEW,NEW
BASLANGIC_TARIHI,2014-04,2016-09,2016-07,2014-09,2008-04,2010-08,2018-12,2015-12,2017-01,2016-02,2015-08,2007-03,2013-03,2006-03,2016-12
KAPSAM_TIPI,PENSION318,PENSION419,PENSION250,-1,PENSION100,PENSION059,-1,PENSION194,PENSION250,PENSION028,-1,PENSION101,PENSION194,PENSION056,PENSION250
KAPSAM_GRUBU,STD B?REYSEL,KATILIM STD B?REYSEL,EV HANIMI,KATILIM GRUP,STD GRUP,STD GRUP,OKS,EV HANIMI,EV HANIMI,STD GRUP,GRUP,STD GRUP,EV HANIMI,EV HANIMI,EV HANIMI
DAGITIM_KANALI,Kanal7,Kanal4 + Kanal2,Kanal4 + Kanal2,Kanal4 + Kanal2,Kanal4 + Kanal2,Kanal4 + Kanal2,Kanal4 + Kanal2,Kanal4 + Kanal2,Kanal4 + Kanal2,Kanal4 + Kanal2,Kanal4 + Kanal2,Kanal1 + Kanal2,Kanal4 + Kanal2,Kanal4 + Kanal2,Kanal4 + Kanal2
POLICE_SEHIR,ESK??EH?R,BALIKES?R,BALIKES?R,SAMSUN,?STANBUL,?STANBUL,?STANBUL,UNKNOWN,NEV?EH?R,ANKARA,BALIKES?R,MAN?SA,KOCAEL?,?ZM?R,KOCAEL?


## Modelling 

In [13]:
# Categorical features: the original categorical columns that survived feature
# engineering, plus every column that is object-typed now.
cat_features = list(
    cat_cols.intersection(train.columns).union(
        train.columns[(train.dtypes == object).to_numpy()]
    )
)

# Model inputs: everything except the dropped columns and the target.
feat_cols = list(train.columns.difference(drop_cols.union([label])))

print(f"Categorical columns after categorical FE are {', '.join(cat_features)}")
print(f"Length of categorical columns after categorical FE are {len(cat_features)}")

# Keyword arguments shared by every per-fold CatBoostClassifier.
params = {
    "allow_writing_files": False,
    "cat_features": cat_features,
    "iterations": ITERATION,
    "random_state": SEED,
    "scale_pos_weight": 4.25,
    "early_stopping_rounds": 1000,
    "depth": 7,
    "learning_rate": 0.06,
    "max_ctr_complexity": 6,
    "eval_metric": "F1:use_weights=False",
    "task_type": DEVICE,
    "devices": "0:1",
}

Categorical columns after categorical FE are DAGITIM_KANALI, GELIR_qcut, KAPSAM_GRUBU, KAPSAM_TIPI, MESLEK_KIRILIM, POLICE_SEHIR, SOZLESME_KOKENI, SOZLESME_KOKENI_DETAY, YATIRIM_KARAKTERI, dogumtarihi_cinsiyet, dogumtarihi_meslek_medenihal, gelir_medenihal_egitim, meslek_medenihal_egitim, sigortatip_kapsamtipi, uyruk_memleket
Length of categorical columns after categorical FE are 15


In [14]:
# Per-fold validation F1 scores and fitted models; filled by the training loop.
f1_errs = []
models = []

In [15]:
# Materialise the stratified folds once so that the training loop and the
# threshold search iterate over exactly the same train/validation splits.
cv = StratifiedKFold(n_splits=FOLD_CNT, shuffle=True, random_state=SEED)
cv_splits = list(cv.split(train.index, train[label]))

In [16]:
# Start from empty containers so that re-running this cell never appends a
# second set of models and scores to the results of a previous run.
f1_errs = []
models = []

for split_train, split_val in tqdm(cv_splits):
    # The splitter yields positional row numbers, so select with iloc; this
    # stays correct even if `train` does not carry a default RangeIndex.
    fold_train = train.iloc[split_train]
    fold_val = train.iloc[split_val]

    model = CatBoostClassifier(**params)
    model.fit(
        fold_train[feat_cols],
        fold_train[label],
        eval_set=(fold_val[feat_cols], fold_val[label]),
        verbose=25,
    )

    # Score with CatBoost's default decision rule; per-fold thresholds are
    # tuned in a later section.
    preds = model.predict(fold_val[feat_cols])
    f1_err = f1_score(fold_val[label], preds)
    f1_errs.append(f1_err)
    models.append(model)

  0%|          | 0/9 [00:00<?, ?it/s]

0:	learn: 0.4013476	test: 0.3986121	best: 0.3986121 (0)	total: 122ms	remaining: 2m 1s
25:	learn: 0.4104774	test: 0.4064133	best: 0.4064133 (25)	total: 2.69s	remaining: 1m 40s
50:	learn: 0.4224244	test: 0.4195467	best: 0.4196340 (48)	total: 5.22s	remaining: 1m 37s
75:	learn: 0.4287491	test: 0.4288715	best: 0.4288715 (75)	total: 7.71s	remaining: 1m 33s
100:	learn: 0.4345968	test: 0.4347524	best: 0.4357565 (96)	total: 10.6s	remaining: 1m 33s
125:	learn: 0.4411278	test: 0.4427895	best: 0.4434408 (122)	total: 13s	remaining: 1m 30s
150:	learn: 0.4458514	test: 0.4479588	best: 0.4479588 (150)	total: 15.5s	remaining: 1m 27s
175:	learn: 0.4483134	test: 0.4494226	best: 0.4495930 (174)	total: 18s	remaining: 1m 24s
200:	learn: 0.4499800	test: 0.4512399	best: 0.4512399 (200)	total: 20.8s	remaining: 1m 22s
225:	learn: 0.4527835	test: 0.4538343	best: 0.4546853 (220)	total: 24.3s	remaining: 1m 23s
250:	learn: 0.4551282	test: 0.4562386	best: 0.4563242 (246)	total: 26.8s	remaining: 1m 19s
275:	learn: 0.4

 11%|█         | 1/9 [03:42<29:36, 222.09s/it]

0:	learn: 0.4005149	test: 0.3984167	best: 0.3984167 (0)	total: 99.4ms	remaining: 1m 39s
25:	learn: 0.4101519	test: 0.4093559	best: 0.4093559 (25)	total: 2.67s	remaining: 1m 40s
50:	learn: 0.4201414	test: 0.4225690	best: 0.4225690 (50)	total: 5.47s	remaining: 1m 41s
75:	learn: 0.4305669	test: 0.4335147	best: 0.4335147 (75)	total: 7.93s	remaining: 1m 36s
100:	learn: 0.4383241	test: 0.4448767	best: 0.4450007 (97)	total: 10.5s	remaining: 1m 33s
125:	learn: 0.4431484	test: 0.4511330	best: 0.4520300 (122)	total: 12.9s	remaining: 1m 29s
150:	learn: 0.4462270	test: 0.4552970	best: 0.4555749 (149)	total: 15.8s	remaining: 1m 28s
175:	learn: 0.4497405	test: 0.4583681	best: 0.4583681 (175)	total: 18.2s	remaining: 1m 25s
200:	learn: 0.4517910	test: 0.4597301	best: 0.4601795 (198)	total: 20.7s	remaining: 1m 22s
225:	learn: 0.4533233	test: 0.4610236	best: 0.4619675 (216)	total: 23.2s	remaining: 1m 19s
250:	learn: 0.4561640	test: 0.4649147	best: 0.4649147 (250)	total: 26.1s	remaining: 1m 17s
275:	lear

 22%|██▏       | 2/9 [05:49<19:23, 166.17s/it]

0:	learn: 0.4017768	test: 0.3908012	best: 0.3908012 (0)	total: 98.3ms	remaining: 1m 38s
25:	learn: 0.4116133	test: 0.4016608	best: 0.4022795 (22)	total: 3.57s	remaining: 2m 13s
50:	learn: 0.4228212	test: 0.4177907	best: 0.4177907 (50)	total: 6.23s	remaining: 1m 55s
75:	learn: 0.4302971	test: 0.4300072	best: 0.4305745 (74)	total: 8.8s	remaining: 1m 46s
100:	learn: 0.4365549	test: 0.4376293	best: 0.4377181 (97)	total: 11.6s	remaining: 1m 42s
125:	learn: 0.4398293	test: 0.4410490	best: 0.4412371 (122)	total: 14s	remaining: 1m 37s
150:	learn: 0.4429713	test: 0.4434383	best: 0.4439411 (146)	total: 16.5s	remaining: 1m 32s
175:	learn: 0.4458635	test: 0.4447445	best: 0.4454552 (163)	total: 19s	remaining: 1m 29s
200:	learn: 0.4489008	test: 0.4491248	best: 0.4494302 (199)	total: 21.8s	remaining: 1m 26s
225:	learn: 0.4517021	test: 0.4536554	best: 0.4536746 (224)	total: 24.3s	remaining: 1m 23s
250:	learn: 0.4544909	test: 0.4564672	best: 0.4571429 (247)	total: 26.8s	remaining: 1m 19s
275:	learn: 0.

 33%|███▎      | 3/9 [07:55<14:46, 147.82s/it]

0:	learn: 0.3980387	test: 0.4029048	best: 0.4029048 (0)	total: 99.2ms	remaining: 1m 39s
25:	learn: 0.4080370	test: 0.4184963	best: 0.4184963 (25)	total: 2.71s	remaining: 1m 41s
50:	learn: 0.4192799	test: 0.4301226	best: 0.4301226 (50)	total: 5.36s	remaining: 1m 39s
75:	learn: 0.4286792	test: 0.4416534	best: 0.4416534 (75)	total: 7.85s	remaining: 1m 35s
100:	learn: 0.4347393	test: 0.4463184	best: 0.4466365 (97)	total: 10.9s	remaining: 1m 37s
125:	learn: 0.4404134	test: 0.4542898	best: 0.4542898 (125)	total: 13.4s	remaining: 1m 32s
150:	learn: 0.4437856	test: 0.4565083	best: 0.4565083 (150)	total: 16.1s	remaining: 1m 30s
175:	learn: 0.4465833	test: 0.4569700	best: 0.4574490 (171)	total: 18.6s	remaining: 1m 27s
200:	learn: 0.4485840	test: 0.4584708	best: 0.4585051 (195)	total: 21.1s	remaining: 1m 23s
225:	learn: 0.4526719	test: 0.4628887	best: 0.4632190 (223)	total: 23.5s	remaining: 1m 20s
250:	learn: 0.4553882	test: 0.4660275	best: 0.4662312 (249)	total: 26s	remaining: 1m 17s
275:	learn:

 44%|████▍     | 4/9 [10:00<11:35, 139.13s/it]

0:	learn: 0.3995221	test: 0.4024816	best: 0.4024816 (0)	total: 102ms	remaining: 1m 41s
25:	learn: 0.4112781	test: 0.4108406	best: 0.4113224 (24)	total: 2.7s	remaining: 1m 41s
50:	learn: 0.4234465	test: 0.4275538	best: 0.4275538 (50)	total: 5.26s	remaining: 1m 37s
75:	learn: 0.4304235	test: 0.4351188	best: 0.4358015 (66)	total: 7.73s	remaining: 1m 33s
100:	learn: 0.4380702	test: 0.4438368	best: 0.4440393 (99)	total: 10.4s	remaining: 1m 32s
125:	learn: 0.4414633	test: 0.4468323	best: 0.4470835 (120)	total: 13.1s	remaining: 1m 30s
150:	learn: 0.4460055	test: 0.4528591	best: 0.4531697 (149)	total: 16.4s	remaining: 1m 32s
175:	learn: 0.4483388	test: 0.4545836	best: 0.4547742 (174)	total: 18.9s	remaining: 1m 28s
200:	learn: 0.4507835	test: 0.4565628	best: 0.4568670 (192)	total: 21.6s	remaining: 1m 25s
225:	learn: 0.4527227	test: 0.4589291	best: 0.4589291 (225)	total: 24.2s	remaining: 1m 22s
250:	learn: 0.4552632	test: 0.4589980	best: 0.4602751 (235)	total: 26.8s	remaining: 1m 19s
275:	learn:

 56%|█████▌    | 5/9 [12:07<08:58, 134.62s/it]

0:	learn: 0.4013254	test: 0.3951451	best: 0.3951451 (0)	total: 103ms	remaining: 1m 43s
25:	learn: 0.4120162	test: 0.4021436	best: 0.4021436 (25)	total: 2.71s	remaining: 1m 41s
50:	learn: 0.4225724	test: 0.4184207	best: 0.4184207 (50)	total: 5.26s	remaining: 1m 37s
75:	learn: 0.4311071	test: 0.4286309	best: 0.4286309 (75)	total: 7.93s	remaining: 1m 36s
100:	learn: 0.4364828	test: 0.4349674	best: 0.4349674 (100)	total: 10.4s	remaining: 1m 32s
125:	learn: 0.4397877	test: 0.4391376	best: 0.4391509 (124)	total: 12.9s	remaining: 1m 29s
150:	learn: 0.4444192	test: 0.4427868	best: 0.4433032 (149)	total: 15.3s	remaining: 1m 26s
175:	learn: 0.4476778	test: 0.4450689	best: 0.4452815 (172)	total: 18s	remaining: 1m 24s
200:	learn: 0.4502591	test: 0.4477387	best: 0.4477387 (199)	total: 21.2s	remaining: 1m 24s
225:	learn: 0.4522323	test: 0.4488059	best: 0.4490804 (221)	total: 23.6s	remaining: 1m 20s
250:	learn: 0.4550180	test: 0.4495092	best: 0.4498935 (246)	total: 26.1s	remaining: 1m 17s
275:	learn:

 67%|██████▋   | 6/9 [14:12<06:34, 131.38s/it]

0:	learn: 0.3995253	test: 0.4011828	best: 0.4011828 (0)	total: 101ms	remaining: 1m 40s
25:	learn: 0.4106971	test: 0.4142422	best: 0.4142422 (25)	total: 2.99s	remaining: 1m 52s
50:	learn: 0.4235221	test: 0.4335114	best: 0.4336078 (47)	total: 5.59s	remaining: 1m 44s
75:	learn: 0.4281622	test: 0.4381443	best: 0.4383142 (72)	total: 8.07s	remaining: 1m 38s
100:	learn: 0.4343972	test: 0.4463507	best: 0.4463507 (100)	total: 10.6s	remaining: 1m 34s
125:	learn: 0.4381591	test: 0.4522315	best: 0.4522315 (125)	total: 13.1s	remaining: 1m 30s
150:	learn: 0.4420783	test: 0.4586066	best: 0.4586066 (150)	total: 15.9s	remaining: 1m 29s
175:	learn: 0.4457891	test: 0.4624828	best: 0.4627700 (174)	total: 18.4s	remaining: 1m 26s
200:	learn: 0.4487169	test: 0.4652126	best: 0.4652126 (200)	total: 20.9s	remaining: 1m 23s
225:	learn: 0.4510893	test: 0.4656853	best: 0.4661093 (219)	total: 23.5s	remaining: 1m 20s
250:	learn: 0.4538901	test: 0.4681381	best: 0.4681381 (250)	total: 26.6s	remaining: 1m 19s
275:	lear

 78%|███████▊  | 7/9 [16:19<04:19, 129.80s/it]

0:	learn: 0.3990693	test: 0.4037752	best: 0.4037752 (0)	total: 99.8ms	remaining: 1m 39s
25:	learn: 0.4102046	test: 0.4145961	best: 0.4148770 (24)	total: 2.65s	remaining: 1m 39s
50:	learn: 0.4226905	test: 0.4244620	best: 0.4244620 (50)	total: 5.33s	remaining: 1m 39s
75:	learn: 0.4313516	test: 0.4322391	best: 0.4327740 (74)	total: 8.05s	remaining: 1m 37s
100:	learn: 0.4356133	test: 0.4372770	best: 0.4374913 (99)	total: 10.6s	remaining: 1m 34s
125:	learn: 0.4405926	test: 0.4436526	best: 0.4436526 (125)	total: 13s	remaining: 1m 30s
150:	learn: 0.4443670	test: 0.4459799	best: 0.4469274 (145)	total: 15.5s	remaining: 1m 27s
175:	learn: 0.4476993	test: 0.4498914	best: 0.4501855 (174)	total: 18s	remaining: 1m 24s
200:	learn: 0.4502796	test: 0.4492084	best: 0.4501855 (174)	total: 20.8s	remaining: 1m 22s
225:	learn: 0.4533777	test: 0.4527932	best: 0.4528434 (224)	total: 23.3s	remaining: 1m 19s
250:	learn: 0.4555085	test: 0.4531535	best: 0.4534851 (249)	total: 25.7s	remaining: 1m 16s
275:	learn: 0

 89%|████████▉ | 8/9 [18:26<02:08, 128.86s/it]

0:	learn: 0.3989897	test: 0.4084607	best: 0.4084607 (0)	total: 97.6ms	remaining: 1m 37s
25:	learn: 0.4099191	test: 0.4187120	best: 0.4187120 (25)	total: 2.9s	remaining: 1m 48s
50:	learn: 0.4215826	test: 0.4313616	best: 0.4314414 (48)	total: 5.97s	remaining: 1m 51s
75:	learn: 0.4281233	test: 0.4372267	best: 0.4379167 (74)	total: 8.97s	remaining: 1m 49s
100:	learn: 0.4357804	test: 0.4467363	best: 0.4472755 (98)	total: 11.4s	remaining: 1m 41s
125:	learn: 0.4403481	test: 0.4503475	best: 0.4506152 (116)	total: 14.2s	remaining: 1m 38s
150:	learn: 0.4428928	test: 0.4531078	best: 0.4532379 (145)	total: 16.8s	remaining: 1m 34s
175:	learn: 0.4462608	test: 0.4566897	best: 0.4569263 (174)	total: 19.3s	remaining: 1m 30s
200:	learn: 0.4487066	test: 0.4580015	best: 0.4584400 (192)	total: 21.7s	remaining: 1m 26s
225:	learn: 0.4513095	test: 0.4599695	best: 0.4601942 (224)	total: 24.2s	remaining: 1m 22s
250:	learn: 0.4539520	test: 0.4636294	best: 0.4636294 (250)	total: 27s	remaining: 1m 20s
275:	learn: 

100%|██████████| 9/9 [20:32<00:00, 136.97s/it]


In [17]:
f1_errs

[0.4770721828439173,
 0.48059894895975813,
 0.4748201438848921,
 0.4837116059282594,
 0.47711711711711713,
 0.46793958392704477,
 0.4847317351598174,
 0.4740923255308356,
 0.4804461285479374]

In [18]:
(np.array(f1_errs).mean(), np.array(f1_errs).std())

(0.47783664132217546, 0.004929168395967689)

## **Fold thresholding**

In [19]:
# Per-fold results of the threshold search. Note that `f1_errs` is reset here
# and from now on holds the F1 scores at the tuned thresholds.
f1_errs = []
test_preds = []
fold_thresholds = []

# Candidate decision thresholds on the positive-class probability.
thresholds = np.arange(0.3, 0.7, 0.0025)

for idx, (split_train, split_val) in enumerate(tqdm(cv_splits)):
    # The splitter yields positional row numbers, hence iloc.
    val_rows = train.iloc[split_val]
    val_labels = val_rows[label]

    proba = models[idx].predict_proba(val_rows[feat_cols])[:, 1]

    scores = [f1_score(val_labels, (proba >= t).astype(int)) for t in thresholds]
    ix = int(np.argmax(scores))

    print("Threshold=%.3f, F-Score=%.5f" % (thresholds[ix], scores[ix]))

    # The score at the selected threshold is already in `scores`. It is
    # measured on the same fold the threshold was tuned on, so it is an
    # optimistic estimate.
    f1_err = scores[ix]

    print("F1 w/ selected threshold:", f1_err)

    f1_errs.append(f1_err)

    test_preds.append(models[idx].predict_proba(test.loc[:, feat_cols])[:, 1])
    fold_thresholds.append(thresholds[ix])

  0%|          | 0/9 [00:00<?, ?it/s]

Threshold=0.520, F-Score=0.48207
F1 w/ selected threshold: 0.4820692812452312


 11%|█         | 1/9 [00:12<01:43, 12.97s/it]

Threshold=0.535, F-Score=0.48221
F1 w/ selected threshold: 0.48220598633042655


 22%|██▏       | 2/9 [00:27<01:36, 13.84s/it]

Threshold=0.545, F-Score=0.47772
F1 w/ selected threshold: 0.47772012190099666


 33%|███▎      | 3/9 [00:40<01:20, 13.45s/it]

Threshold=0.543, F-Score=0.48792
F1 w/ selected threshold: 0.48792231763100874


 44%|████▍     | 4/9 [00:53<01:07, 13.46s/it]

Threshold=0.560, F-Score=0.48026
F1 w/ selected threshold: 0.48026260415789923


 56%|█████▌    | 5/9 [01:06<00:52, 13.18s/it]

Threshold=0.535, F-Score=0.47119
F1 w/ selected threshold: 0.4711877695021561


 67%|██████▋   | 6/9 [01:19<00:38, 13.00s/it]

Threshold=0.543, F-Score=0.48814
F1 w/ selected threshold: 0.48814371257485034


 78%|███████▊  | 7/9 [01:33<00:26, 13.31s/it]

Threshold=0.548, F-Score=0.47581
F1 w/ selected threshold: 0.4758084322554237


 89%|████████▉ | 8/9 [01:46<00:13, 13.23s/it]

Threshold=0.568, F-Score=0.48875
F1 w/ selected threshold: 0.4887525562372188


100%|██████████| 9/9 [01:59<00:00, 13.31s/it]


In [20]:
f1_errs

[0.4820692812452312,
 0.48220598633042655,
 0.47772012190099666,
 0.48792231763100874,
 0.48026260415789923,
 0.4711877695021561,
 0.48814371257485034,
 0.4758084322554237,
 0.4887525562372188]

In [21]:
(np.array(f1_errs).mean(), np.array(f1_errs).std())

(0.48156364242613453, 0.005708914717441986)

### **Creating the submission file**

In [22]:
# Each fold model casts a 0/1 vote using its own tuned threshold; a test row
# is labelled positive when at least half of the fold models vote positive.
fold_votes = [
    (np.asarray(test_preds[fold_i]) >= fold_thresholds[fold_i]).astype(int)
    for fold_i in range(len(test_preds))
]
test["pred"] = (np.mean(fold_votes, axis=0) >= 0.5).astype(int)

In [23]:
# Keep the row order of the sample submission and attach the predictions by
# POLICY_ID. `columns=` is used because pandas 2.x no longer accepts the axis
# as a positional argument of `drop`.
sub = (
    samplesubmission.drop(columns="ARTIS_DURUMU")
    .merge(test[["POLICY_ID", "pred"]], how="left", on="POLICY_ID")
    .rename(columns={"pred": "ARTIS_DURUMU"})
)

In [24]:
sub.ARTIS_DURUMU.value_counts(normalize=True)

0    0.917783
1    0.082217
Name: ARTIS_DURUMU, dtype: float64

In [25]:
sub.shape

(243137, 2)

In [26]:
sub.head()

Unnamed: 0,POLICY_ID,ARTIS_DURUMU
0,6005558,0
1,1227288,0
2,5694750,0
3,3150098,0
4,4622228,0


In [27]:
# Written relative to the current working directory, not to OUTPUT_DIR.
sub.to_csv("submission.csv", index=False)