In [None]:
import os

# Process-wide settings, written before the numeric / GPU libraries are imported in the
# following cells (presumably so they are picked up at import time — confirm if this
# cell is ever moved): numexpr thread limits and the single visible CUDA device.
os.environ.update({
    'NUMEXPR_MAX_THREADS': '32',
    'NUMEXPR_NUM_THREADS': '30',
    'CUDA_VISIBLE_DEVICES': '0',
})

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import random
import warnings

# Reproducibility: seed both the stdlib and the legacy NumPy global generators.
random.seed(42)
np.random.seed(42)

# Hide every warning for the rest of the session.
warnings.simplefilter('ignore')

# Let pandas display up to 100 columns / 100 rows before truncating.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
# Labelled rows.
main = pd.read_csv('../data/train.csv')

# Submission template: its `predict` column is discarded and every row is tagged with
# target = -1, the sentinel later cells use to tell unlabelled rows from labelled ones.
sample = (
    pd.read_csv('../data/sample_submit_naive.csv')
    .drop(columns='predict')
    .assign(target=-1)
)

# user_ids that the first training cell holds out of the labelled data.
test_ids = pd.read_csv('../data/test_ids.csv')

# Stack labelled and unlabelled rows in one frame (original row indices are kept).
main = pd.concat([main, sample])

In [None]:
from sklearn.model_selection import train_test_split
# Stage 1 - hold-out evaluation.
# For each of the five embedding sets `i`, the labelled users are split into
#   * users NOT listed in `test_ids` -> 5-fold stratified CatBoost training, and
#   * users listed in `test_ids`     -> hold-out scored by every fold model.
# Out-of-fold predictions are collected per embedding set in `metadatas`; hold-out
# probabilities of all fold models are summed into `predict` (a later cell divides it).

# Columns that are never model features. The hold-out frame must drop exactly the same
# columns as the training frame so that its feature columns match what the models saw.
non_feature_cols = ['target', 'user_id', 'time']

# Identical hyper-parameters for every fold model.
catboost_params = dict(
    iterations=15000,
    depth=5,
    learning_rate=0.025,
    eval_metric='AUC',
    early_stopping_rounds=1000,
    task_type="GPU",
    random_seed=42,
    use_best_model=True,
    l2_leaf_reg=5,
)

predict = None   # running SUM of hold-out probabilities; allocated once its size is known
metadatas = []   # one frame of out-of-fold predictions per embedding set
for i in range(1, 6):
    frames_for_metamodel = []

    # Two embedding tables per set, joined on user_id.
    embeddings_path = f'../embeddings/coles_{i}.csv'
    main_embs = pd.read_csv(embeddings_path)
    new_embs = pd.read_csv(f'../embeddings/wtte_coles_{i}.csv')
    main_embs = main_embs.merge(new_embs, on='user_id')

    # Only labelled rows (target != -1) take part in this stage.
    train_with_embs = main[main.target != -1].merge(main_embs, on='user_id')
    is_holdout = train_with_embs['user_id'].isin(test_ids['user_id'])
    train, test = train_with_embs.loc[~is_holdout], train_with_embs.loc[is_holdout]

    if predict is None:
        predict = np.zeros(len(test))

    # The fold split is re-seeded per embedding set.
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    # `X` keeps user_id so the out-of-fold frame can carry it; it is dropped before fitting.
    X, y = train.drop(['time', 'target'], axis=1), train['target']
    scores = []

    models = []
    for train_index, valid_index in strat_kfold.split(train, train['target']):

        X_train, X_val = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_val = y.iloc[train_index], y.iloc[valid_index]
        X_train1 = X_train.drop(['user_id'], axis=1)
        X_val1 = X_val.drop(['user_id'], axis=1)

        model = CatBoostClassifier(**catboost_params)
        model.fit(Pool(X_train1, y_train),
                  eval_set=Pool(X_val1, y_val),
                  verbose=1000)
        models.append(model)

        pred = model.predict_proba(X_val1)[:, 1]
        scores.append(metrics.roc_auc_score(y_val, pred))
        frames_for_metamodel.append(pd.DataFrame({'user_id': X_val.user_id.values, 'pred_emb': pred}))

    # reset_index() (without drop=True) adds an 'index' column that the aggregation
    # cell removes again, so it is kept as is.
    metadata = pd.concat(frames_for_metamodel, axis=0).reset_index()
    metadatas.append(metadata)

    # Score the hold-out users on the same feature columns the models were fitted on.
    test_features = test.drop(non_feature_cols, axis=1)
    for fold_model in models:
        predict += fold_model.predict_proba(test_features)[:, 1]

    # AUC of the ensemble accumulated so far; it only depends on the ranking, so the
    # un-normalised sum can be scored directly.
    print(metrics.roc_auc_score(test['target'], predict))

In [None]:
# Put the out-of-fold predictions of the five embedding sets side by side, one row per
# user_id. The first set keeps the column name `pred_emb`; set i (1..4) becomes `pred_emb{i}`.
agg_metadata = metadatas[0].drop(columns='index')
for i in range(1, 5):
    renamed = metadatas[i].drop(columns='index').rename(columns={"pred_emb": f"pred_emb{i}"})
    agg_metadata = agg_metadata.merge(renamed, on='user_id')

# Despite its name, `pred_emb_sum` holds the MEAN of the five prediction columns.
pred_cols = ['pred_emb', 'pred_emb1', 'pred_emb2', 'pred_emb3', 'pred_emb4']
total = agg_metadata[pred_cols[0]]
for col in pred_cols[1:]:
    total = total + agg_metadata[col]
agg_metadata['pred_emb_sum'] = total / 5

In [None]:
# `predict` is the sum of hold-out probabilities over 25 models (5 embedding sets x 5
# folds); dividing by 25 gives the ensemble mean. user_ids come from the hold-out frame
# left behind by the last iteration of the training loop.
test_preds = pd.DataFrame({
    'pred_emb_sum': predict / 25,
    'user_id': test['user_id'],
})

In [None]:
# Out-of-fold rows first, hold-out rows second; columns are matched by name.
oof_part = agg_metadata[['user_id', 'pred_emb_sum']]
final_prediction = pd.concat([oof_part, test_preds])

In [None]:
from sklearn.model_selection import train_test_split
# Stage 2 - submission models.
# For each of the five embedding sets `i`, 5-fold stratified CatBoost models are trained
# on ALL labelled rows (target != -1) and every fold model scores the unlabelled rows
# (target == -1). Probabilities are summed into `predict`; a later cell divides the sum.

# Identical hyper-parameters for every fold model.
catboost_params = dict(
    iterations=15000,
    depth=5,
    learning_rate=0.025,
    eval_metric='AUC',
    early_stopping_rounds=1000,
    task_type="GPU",
    random_seed=42,
    use_best_model=True,
    l2_leaf_reg=5,
)

predict = None   # running SUM of probabilities; allocated once the scored frame's size is known
for i in range(1, 6):
    # Two embedding tables per set, joined on user_id.
    embeddings_path = f'../embeddings/coles_{i}.csv'
    main_embs = pd.read_csv(embeddings_path)
    new_embs = pd.read_csv(f'../embeddings/wtte_coles_{i}.csv')
    main_embs = main_embs.merge(new_embs, on='user_id')

    train = main[main.target != -1]
    test = main[main.target == -1]
    train_with_embs = train.merge(main_embs, on='user_id')
    test_with_embs = test.merge(main_embs, on='user_id')

    if predict is None:
        predict = np.zeros(len(test_with_embs))

    # The fold split is re-seeded per embedding set.
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    X, y = train_with_embs.drop(['time', 'target'], axis=1), train_with_embs['target']
    scores = []   # per-fold validation AUC (collected, not reported)

    models = []
    for train_index, valid_index in strat_kfold.split(train_with_embs, train_with_embs['target']):

        X_train, X_val = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_val = y.iloc[train_index], y.iloc[valid_index]
        X_train1 = X_train.drop(['user_id'], axis=1)
        X_val1 = X_val.drop(['user_id'], axis=1)

        model = CatBoostClassifier(**catboost_params)
        model.fit(Pool(X_train1, y_train),
                  eval_set=Pool(X_val1, y_val),
                  verbose=100)
        models.append(model)

        pred = model.predict_proba(X_val1)[:, 1]
        scores.append(metrics.roc_auc_score(y_val, pred))

    # Same non-feature columns are removed as for training; built once per embedding set.
    test_features = test_with_embs.drop(['target', 'user_id', 'time'], axis=1)
    for fold_model in models:
        predict += fold_model.predict_proba(test_features)[:, 1]



In [None]:
# `predict` is the sum of probabilities over 25 models (5 embedding sets x 5 folds);
# dividing by 25 gives the ensemble mean.
# The user_ids are taken from `test_with_embs`, the frame that was actually scored, and
# attached positionally, so each id is paired with its own prediction even if the merge
# with the embeddings did not keep the rows of `test` one-to-one and in order.
sbmt_preds = pd.DataFrame({
    'pred_emb_sum': predict / 25,
    'user_id': test_with_embs['user_id'].to_numpy(),
})

In [None]:
# Rebuilt from its three parts (out-of-fold, hold-out, submission) rather than appended
# to itself, so re-running this cell cannot add the submission rows a second time.
# On a fresh top-to-bottom run the result is the same as appending `sbmt_preds` to the
# previous `final_prediction`.
final_prediction = pd.concat([
    agg_metadata[['user_id', 'pred_emb_sum']],
    test_preds,
    sbmt_preds,
])

In [None]:
# Write the combined predictions to disk; the row index is not saved.
output_path = "../predictions/coles_coles-wtte-preds.csv"
final_prediction.to_csv(output_path, index=False)