In [1]:
import os
os.environ['NUMEXPR_MAX_THREADS'] = '32'
os.environ['NUMEXPR_NUM_THREADS'] = '30'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import random
import warnings

warnings.simplefilter('ignore')

pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

random.seed(42)
np.random.seed(42)


libgomp: Invalid value for environment variable OMP_NUM_THREADS


In [4]:
main = pd.read_csv('../data/train.csv')

sample = pd.read_csv('../data/sample_submit_naive.csv').drop('predict', axis=1)
sample['target'] = -1
test_ids = pd.read_csv('../data/test_ids.csv')

main = pd.concat([main, sample])

In [7]:
from sklearn.model_selection import train_test_split
predict = np.zeros(12800)
metadatas = []
for i in range(1, 6):
    frames_for_metamodel = []
    embeddings_path = f'../embeddings/coles_{i}.csv'
    main_embs = pd.read_csv(embeddings_path)
    new_embs = pd.read_csv(f'../embeddings/wtte_coles_{i}.csv')
    main_embs = main_embs.merge(new_embs, on='user_id')
    train = main[main.target != -1]
    test = main[main.target == -1]
    train_with_embs = train.merge(main_embs, on='user_id')
    test_with_embs = test.merge(main_embs, on='user_id')
    train, test = train_with_embs.loc[~train_with_embs['user_id'].isin(test_ids['user_id'])], train_with_embs.loc[train_with_embs['user_id'].isin(test_ids['user_id'])]
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    X, y = train.drop(['time', 'target'], axis=1), train['target']
    scores = []

    models = []
    for train_index, valid_index in strat_kfold.split(train, train['target']):

        X_train, X_val = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_val = y.iloc[train_index], y.iloc[valid_index]
        X_train1 = X_train.drop(['user_id'], axis=1)
        X_val1 = X_val.drop(['user_id'], axis=1)
        model = CatBoostClassifier(
            iterations = 21,
            depth=1,
            learning_rate=0.025,
            eval_metric='AUC',
            early_stopping_rounds=1000,
            task_type="GPU",
            random_seed=42,
            use_best_model = True,
            l2_leaf_reg=5
        )

        model.fit(Pool(X_train1, y_train),
                  eval_set=Pool(X_val1, y_val),
                  verbose=1000)
        models.append(model)

        pred = model.predict_proba(X_val1)[:, 1]
        scores.append(metrics.roc_auc_score(y_val, pred))
        frames_for_metamodel.append(pd.DataFrame({'user_id': X_val.user_id.values, 'pred_emb': pred}))
    metadata = pd.concat(frames_for_metamodel, axis=0).reset_index()
    metadatas.append(metadata)
    for j in range(len(models)):
        predict += models[j].predict_proba(test.drop(['target'], axis=1))[:, 1]

    print(metrics.roc_auc_score(test['target'], predict))

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5795046	best: 0.5795046 (0)	total: 6.08ms	remaining: 122ms
20:	test: 0.6217932	best: 0.6217932 (20)	total: 106ms	remaining: 0us
bestTest = 0.6217931509
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5714928	best: 0.5714928 (0)	total: 5.58ms	remaining: 112ms
20:	test: 0.6104253	best: 0.6104253 (20)	total: 91.7ms	remaining: 0us
bestTest = 0.6104252636
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5780816	best: 0.5780816 (0)	total: 4.21ms	remaining: 84.1ms
20:	test: 0.6215988	best: 0.6215988 (20)	total: 74.6ms	remaining: 0us
bestTest = 0.6215988398
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5609120	best: 0.5609120 (0)	total: 4.3ms	remaining: 86.1ms
20:	test: 0.6067128	best: 0.6067128 (20)	total: 82.2ms	remaining: 0us
bestTest = 0.6067127585
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5667366	best: 0.5667366 (0)	total: 5.3ms	remaining: 106ms
20:	test: 0.6194905	best: 0.6194905 (20)	total: 94.9ms	remaining: 0us
bestTest = 0.6194905341
bestIteration = 20
0.6399161033886576


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5778252	best: 0.5778252 (0)	total: 6.36ms	remaining: 127ms
20:	test: 0.6260607	best: 0.6260607 (20)	total: 103ms	remaining: 0us
bestTest = 0.6260606945
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5725879	best: 0.5725879 (0)	total: 5.73ms	remaining: 115ms
20:	test: 0.6317300	best: 0.6317300 (20)	total: 96.6ms	remaining: 0us
bestTest = 0.63173002
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5669795	best: 0.5669795 (0)	total: 5.17ms	remaining: 103ms
20:	test: 0.6073969	best: 0.6079797 (19)	total: 80.5ms	remaining: 0us
bestTest = 0.6079797149
bestIteration = 19
Shrink model to first 20 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5571909	best: 0.5571909 (0)	total: 6.23ms	remaining: 125ms
20:	test: 0.6211462	best: 0.6211462 (20)	total: 105ms	remaining: 0us
bestTest = 0.6211462021
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5808170	best: 0.5808170 (0)	total: 4.53ms	remaining: 90.7ms
20:	test: 0.6356001	best: 0.6356001 (20)	total: 78.3ms	remaining: 0us
bestTest = 0.6356000602
bestIteration = 20
0.6493352838651324


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5718589	best: 0.5718589 (0)	total: 6.08ms	remaining: 122ms
20:	test: 0.6455424	best: 0.6455424 (20)	total: 104ms	remaining: 0us
bestTest = 0.6455424428
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5761605	best: 0.5761605 (0)	total: 4.53ms	remaining: 90.6ms
20:	test: 0.6358465	best: 0.6358465 (20)	total: 90.1ms	remaining: 0us
bestTest = 0.6358464956
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5744276	best: 0.5744276 (0)	total: 5.84ms	remaining: 117ms
20:	test: 0.6336181	best: 0.6336181 (20)	total: 104ms	remaining: 0us
bestTest = 0.6336180866
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5672620	best: 0.5672620 (0)	total: 5.83ms	remaining: 117ms
20:	test: 0.6144748	best: 0.6154988 (14)	total: 91.5ms	remaining: 0us
bestTest = 0.6154987812
bestIteration = 14
Shrink model to first 15 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5705367	best: 0.5705367 (0)	total: 5.93ms	remaining: 119ms
20:	test: 0.6167338	best: 0.6169741 (19)	total: 104ms	remaining: 0us
bestTest = 0.6169741154
bestIteration = 19
Shrink model to first 20 iterations.
0.6534770792942967


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5785766	best: 0.5785766 (0)	total: 5.85ms	remaining: 117ms
20:	test: 0.6209227	best: 0.6209227 (20)	total: 111ms	remaining: 0us
bestTest = 0.6209227145
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5797179	best: 0.5797179 (0)	total: 4.72ms	remaining: 94.4ms
20:	test: 0.6289101	best: 0.6289101 (20)	total: 87ms	remaining: 0us
bestTest = 0.6289100647
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5607836	best: 0.5607836 (0)	total: 5.9ms	remaining: 118ms
20:	test: 0.5974435	best: 0.5974435 (20)	total: 100ms	remaining: 0us
bestTest = 0.5974434912
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5612241	best: 0.5612241 (0)	total: 4.89ms	remaining: 97.8ms
20:	test: 0.6102076	best: 0.6102076 (20)	total: 101ms	remaining: 0us
bestTest = 0.6102076173
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5778011	best: 0.5778011 (0)	total: 5.71ms	remaining: 114ms
20:	test: 0.6245259	best: 0.6245259 (20)	total: 93ms	remaining: 0us
bestTest = 0.6245259047
bestIteration = 20
0.6528599820255986


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5697834	best: 0.5697834 (0)	total: 6.2ms	remaining: 124ms
20:	test: 0.6323497	best: 0.6323596 (19)	total: 105ms	remaining: 0us
bestTest = 0.6323596239
bestIteration = 19
Shrink model to first 20 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5868518	best: 0.5868518 (0)	total: 4.68ms	remaining: 93.6ms
20:	test: 0.6377155	best: 0.6377155 (20)	total: 76.2ms	remaining: 0us
bestTest = 0.6377155185
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5729147	best: 0.5729147 (0)	total: 5.54ms	remaining: 111ms
20:	test: 0.6240211	best: 0.6240211 (20)	total: 94.9ms	remaining: 0us
bestTest = 0.6240210831
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5613149	best: 0.5613149 (0)	total: 4.6ms	remaining: 92ms
20:	test: 0.6255453	best: 0.6255453 (20)	total: 77.6ms	remaining: 0us
bestTest = 0.6255453229
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5693931	best: 0.5693931 (0)	total: 5.87ms	remaining: 117ms
20:	test: 0.6191851	best: 0.6195283 (9)	total: 99.9ms	remaining: 0us
bestTest = 0.6195282936
bestIteration = 9
Shrink model to first 10 iterations.
0.661730342141184


In [8]:
agg_metadata = metadatas[0].drop('index', axis = 1)
for i in range(1,5):
    agg_metadata=agg_metadata.merge(metadatas[i].drop('index', axis = 1).rename(columns={"pred_emb": f"pred_emb{i}"}), on='user_id')
agg_metadata['pred_emb_sum'] = (agg_metadata['pred_emb'] + agg_metadata['pred_emb1'] + agg_metadata['pred_emb2'] + agg_metadata['pred_emb3'] + agg_metadata['pred_emb4'])/5

In [11]:
test_preds = {'pred_emb_sum': predict/25}
test_preds['user_id'] = test['user_id']
test_preds = pd.DataFrame(test_preds)

In [13]:
final_prediction = pd.concat([agg_metadata[['user_id', 'pred_emb_sum']], test_preds])

In [15]:
from sklearn.model_selection import train_test_split
predict = np.zeros(32000)
for i in range(1, 6):
    frames_for_metamodel = []
    embeddings_path = f'../embeddings/coles_{i}.csv'
    main_embs = pd.read_csv(embeddings_path)
    new_embs = pd.read_csv(f'../embeddings/wtte_coles_{i}.csv')
    main_embs = main_embs.merge(new_embs, on='user_id')
    train = main[main.target != -1]
    test = main[main.target == -1]
    train_with_embs = train.merge(main_embs, on='user_id')
    test_with_embs = test.merge(main_embs, on='user_id')
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    X, y = train_with_embs.drop(['time', 'target'], axis=1), train_with_embs['target']
    scores = []

    models = []
    for train_index, valid_index in strat_kfold.split(train_with_embs, train_with_embs['target']):

        X_train, X_val = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_val = y.iloc[train_index], y.iloc[valid_index]
        X_train1 = X_train.drop(['user_id'], axis=1)
        X_val1 = X_val.drop(['user_id'], axis=1)
        model = CatBoostClassifier(
            iterations = 21,
            depth=1,
            learning_rate=0.025,
            eval_metric='AUC',
            early_stopping_rounds=1000,
            task_type="GPU",
            random_seed=42,
            use_best_model = True,
            l2_leaf_reg=5
        )

        model.fit(Pool(X_train1, y_train),
                  eval_set=Pool(X_val1, y_val),
                  verbose=100)
        models.append(model)

        pred = model.predict_proba(X_val1)[:, 1]
        scores.append(metrics.roc_auc_score(y_val, pred))
    for j in range(len(models)):
        predict += models[j].predict_proba(test_with_embs.drop(['target', 'user_id', 'time'], axis=1))[:, 1]


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5766022	best: 0.5766022 (0)	total: 6.07ms	remaining: 121ms
20:	test: 0.6272591	best: 0.6272591 (20)	total: 111ms	remaining: 0us
bestTest = 0.6272591352
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5625888	best: 0.5625888 (0)	total: 5.98ms	remaining: 120ms
20:	test: 0.6010063	best: 0.6010063 (20)	total: 101ms	remaining: 0us
bestTest = 0.6010062993
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5816417	best: 0.5816417 (0)	total: 5.87ms	remaining: 117ms
20:	test: 0.6302738	best: 0.6304015 (18)	total: 103ms	remaining: 0us
bestTest = 0.6304014623
bestIteration = 18
Shrink model to first 19 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5821679	best: 0.5821679 (0)	total: 5.56ms	remaining: 111ms
20:	test: 0.6244677	best: 0.6244677 (20)	total: 92.1ms	remaining: 0us
bestTest = 0.6244677007
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5786930	best: 0.5786930 (0)	total: 5.14ms	remaining: 103ms
20:	test: 0.6206597	best: 0.6206958 (14)	total: 90.9ms	remaining: 0us
bestTest = 0.6206958294
bestIteration = 14
Shrink model to first 15 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5696266	best: 0.5696266 (0)	total: 6.35ms	remaining: 127ms
20:	test: 0.6102425	best: 0.6102425 (20)	total: 106ms	remaining: 0us
bestTest = 0.6102424562
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5712755	best: 0.5712755 (0)	total: 4.42ms	remaining: 88.3ms
20:	test: 0.6247401	best: 0.6248243 (19)	total: 85.7ms	remaining: 0us
bestTest = 0.6248242855
bestIteration = 19
Shrink model to first 20 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5847065	best: 0.5847065 (0)	total: 5.72ms	remaining: 114ms
20:	test: 0.6374272	best: 0.6374272 (20)	total: 96.7ms	remaining: 0us
bestTest = 0.6374271512
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5823133	best: 0.5823133 (0)	total: 4.34ms	remaining: 86.9ms
20:	test: 0.6409172	best: 0.6409172 (20)	total: 87ms	remaining: 0us
bestTest = 0.6409171522
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5711586	best: 0.5711586 (0)	total: 5.32ms	remaining: 106ms
20:	test: 0.6180885	best: 0.6186083 (19)	total: 93.7ms	remaining: 0us
bestTest = 0.6186083257
bestIteration = 19
Shrink model to first 20 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5716128	best: 0.5716128 (0)	total: 5.97ms	remaining: 119ms
20:	test: 0.6123097	best: 0.6123097 (20)	total: 101ms	remaining: 0us
bestTest = 0.6123096645
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5646079	best: 0.5646079 (0)	total: 4.74ms	remaining: 94.8ms
20:	test: 0.6264457	best: 0.6264457 (20)	total: 79.4ms	remaining: 0us
bestTest = 0.6264457107
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5768075	best: 0.5768075 (0)	total: 7.39ms	remaining: 148ms
20:	test: 0.6256602	best: 0.6256602 (20)	total: 97.1ms	remaining: 0us
bestTest = 0.6256602108
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5815885	best: 0.5815885 (0)	total: 4.33ms	remaining: 86.5ms
20:	test: 0.6502059	best: 0.6502059 (20)	total: 73.2ms	remaining: 0us
bestTest = 0.65020594
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5825735	best: 0.5825735 (0)	total: 6.28ms	remaining: 126ms
20:	test: 0.6423963	best: 0.6423963 (20)	total: 102ms	remaining: 0us
bestTest = 0.642396301
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5818721	best: 0.5818721 (0)	total: 5.97ms	remaining: 119ms
20:	test: 0.6301308	best: 0.6301308 (20)	total: 93.2ms	remaining: 0us
bestTest = 0.6301308274
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5754111	best: 0.5754111 (0)	total: 5.64ms	remaining: 113ms
20:	test: 0.6223245	best: 0.6223245 (20)	total: 99.8ms	remaining: 0us
bestTest = 0.6223244965
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5709727	best: 0.5709727 (0)	total: 6.03ms	remaining: 121ms
20:	test: 0.6142229	best: 0.6142229 (20)	total: 103ms	remaining: 0us
bestTest = 0.6142228842
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5742085	best: 0.5742085 (0)	total: 5.61ms	remaining: 112ms
20:	test: 0.6182312	best: 0.6182312 (20)	total: 95.9ms	remaining: 0us
bestTest = 0.6182311773
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5716962	best: 0.5716962 (0)	total: 5.45ms	remaining: 109ms
20:	test: 0.6194149	best: 0.6194149 (20)	total: 94.8ms	remaining: 0us
bestTest = 0.6194148958
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5811687	best: 0.5811687 (0)	total: 7.93ms	remaining: 159ms
20:	test: 0.6344275	best: 0.6344275 (20)	total: 111ms	remaining: 0us
bestTest = 0.6344275177
bestIteration = 20


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5764677	best: 0.5764677 (0)	total: 5.39ms	remaining: 108ms
20:	test: 0.6454542	best: 0.6460423 (19)	total: 93.6ms	remaining: 0us
bestTest = 0.6460422575
bestIteration = 19
Shrink model to first 20 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5576193	best: 0.5576193 (0)	total: 4.24ms	remaining: 84.9ms
20:	test: 0.6213041	best: 0.6219133 (19)	total: 79.5ms	remaining: 0us
bestTest = 0.6219133139
bestIteration = 19
Shrink model to first 20 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5872072	best: 0.5872072 (0)	total: 5.92ms	remaining: 118ms
20:	test: 0.6524037	best: 0.6525732 (19)	total: 107ms	remaining: 0us
bestTest = 0.6525731981
bestIteration = 19
Shrink model to first 20 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5749865	best: 0.5749865 (0)	total: 6.33ms	remaining: 127ms
20:	test: 0.6283545	best: 0.6283545 (20)	total: 106ms	remaining: 0us
bestTest = 0.6283545494
bestIteration = 20


In [16]:
sbmt_preds = {'pred_emb_sum': predict/25}
sbmt_preds['user_id'] = test['user_id']
sbmt_preds = pd.DataFrame(sbmt_preds)

In [17]:
final_prediction = pd.concat([final_prediction, sbmt_preds])

In [19]:
final_prediction.to_csv("../predicts/coles_coles-wtte-preds.csv", index=False)