In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from scipy import sparse, stats
from scipy.linalg import svd
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_score,
                                     cross_validate, train_test_split)
from sklearn.ensemble import VotingClassifier

from tqdm import tqdm
from catboost import CatBoostClassifier

import datetime
import json
import pickle
import copy

from sklearn.model_selection import train_test_split
from catboost import Pool
from catboost import cv as catboost_cv
from scipy import sparse

import xgboost as xgb
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
import lightgbm as lgb

# Preparations

In [3]:
TRAIN_PATH = 'train/'
TEST_PATH = 'test/'

In [4]:
X1 = pd.read_csv(TRAIN_PATH + 'X1.csv')
X2 = pd.read_csv(TRAIN_PATH + 'X2.csv')
X3 = pd.read_csv(TRAIN_PATH + 'X3.csv')

Y = pd.read_csv(TRAIN_PATH + 'Y.csv')

In [5]:
X1_test = pd.read_csv(TEST_PATH + 'X1.csv')
X2_test = pd.read_csv(TEST_PATH + 'X2.csv')
X3_test = pd.read_csv(TEST_PATH + 'X3.csv')

In [34]:
def transform_x2(X2, svd=None):
    rows, row_pos = np.unique(X2.iloc[:, 0], return_inverse=True)
    cols, col_pos = np.unique(X2.iloc[:, 1], return_inverse=True)
    print((len(rows), len(cols)))
    sparse_matrix = sparse.csr_matrix((len(rows), len(cols)))

    sparse_matrix[row_pos, col_pos] = 1

    cols_ = sparse_matrix.sum(axis=0)
    rows_ = sparse_matrix.sum(axis=1)

    minimum_users_per_group = 5
    selected_cols = cols_ >= minimum_users_per_group
    trimmed_sparse_matrix = sparse_matrix[:, np.squeeze(np.asarray(selected_cols))]

    sns.distplot(trimmed_sparse_matrix.sum(axis=0), bins=100)

    if svd is None:
        svd = TruncatedSVD(n_components=150)
        svd.fit(trimmed_sparse_matrix)

    components = pd.DataFrame(svd.transform(trimmed_sparse_matrix))
    components['id'] = X2.iloc[:, 0].unique()
    X2 = components
    return X2, svd

def get_x2_summed(X2):
    rows, row_pos = np.unique(X2.iloc[:, 0], return_inverse=True)
    cols, col_pos = np.unique(X2.iloc[:, 1], return_inverse=True)
    sparse_matrix = sparse.csr_matrix((len(rows), len(cols)))

    sparse_matrix[row_pos, col_pos] = 1

    return np.squeeze(np.asarray(sparse_matrix.sum(axis=1)))

def validate(estimator, X_train, y_train, random_state=None, n_folds=5):

    cv = KFold(n_splits=n_folds, shuffle=False, random_state=random_state)
    cv_iter = list(cv.split(X_train, y_train))

    scores=[]
    for train, test in tqdm(cv_iter):
        estimator.fit(X_train[train, :], y_train[train],
                        # eval_set=(X_train[test, :], y_train[test])
                        )
        pred_statement = estimator.predict_proba(X_train[test, :])[:, 1]
        metric = roc_auc_score(y_train[test], pred_statement)
        # print(metric)
        scores.append(metric)
    return np.array(scores)

def make_predictions(estimator, exp_name, X_train, Y_train, X_test):
    probas = []

    for i in tqdm('12345'):

        # X_train = X1.merge(Y, on='id', suffixes=('', '_y'),).iloc[:, :-5].drop('id', axis=1).values
        y = Y_train[i].values

        estimator.fit(X_train, y)
        proba = estimator.predict_proba(X_test)[:, 1]

        probas.append(proba)

    tmp = pd.DataFrame(probas).T
    baseline = pd.DataFrame(tmp.values, columns=['1', '2', '3', '4', '5'])
    baseline['id'] = X1_test['id']
    baseline[['id', '1', '2', '3', '4', '5']].to_csv(exp_name+'.csv', index=False)

# Dataset construction

In [25]:
def get_X3_denormalized(X3):
    
    return X3\
        .drop('id', axis=1)\
        .multiply(1 / X3[X3 != 0]\
        .min(axis=1), axis=0)

# X3_train_summed = X3\
#     .drop('id', axis=1)\
#     .multiply(1 / X3[X3 != 0]\
#     .drop('id', axis=1)\
#     .min(axis=1), axis=0)\
#     .sum(axis=1)
# X3_test_summed = X3_test\
#     .drop('id', axis=1).multiply(1 / X3_test[X3_test != 0].drop('id', axis=1).min(axis=1), axis=0).sum(axis=1)


X3_train_summed = get_X3_denormalized(X3).sum(axis=1)
X3_test_summed  = get_X3_denormalized(X3_test).sum(axis=1)

In [29]:
X_train = X1\
    .assign(X2_summed=get_x2_summed(X2))\
    .assign(X3_summed_dernomalized=X3_train_summed)\
    .assign(X3_non_zero_count=(X3.drop('id',axis=1) != 0).sum(axis=1))\
    .assign(X3_mean=X3.drop('id', axis=1).mean(axis=1))\
    .merge(Y, on='id', suffixes=('', '_y'),)\
    .iloc[:, :-5]

X_test = X1_test\
    .assign(X2_summed=get_x2_summed(X2_test))\
    .assign(X3_summed_dernomalized=X3_test_summed)\
    .assign(X3_non_zero_count=(X3_test.drop('id',axis=1) != 0).sum(axis=1))\
    .assign(X3_mean=X3_test.drop('id', axis=1).mean(axis=1))\



In [32]:
y_train = Y['1']

# Modeling and Validation

In [35]:
xgb_params = {'objective': 'binary:logistic', 
        'eval_metric': 'auc',
        'eta': 0.001,
        'max_depth': 1, 
        'subsample': 0.6, 
        'colsample_bytree': 0.6,
        'alpha':0.001,
        # 'random_state': 42, 
        'silent': True}

xgb_cls = xgb.XGBClassifier(n_jobs=8, **xgb_params)

scores = validate(xgb_cls, X_train.values, y_train.values)
print(scores)
print(scores.mean())

100%|████████████████████████████████████████████| 5/5 [00:08<00:00,  1.66s/it]


[0.58371091 0.62171739 0.58833971 0.60352552 0.60595521]
0.6006497474402898


In [36]:
params = {}
params['n_estimators'] = 300
params['l2_leaf_reg'] = 30
params['learning_rate'] = 0.0005
params['depth'] = 1

cat_cls = CatBoostClassifier(verbose=False, task_type="GPU", eval_metric="AUC",
                         loss_function='CrossEntropy', use_best_model=False, **params)

scores = validate(cat_cls, X_train.values, y_train.values)
print(scores)
print(scores.mean())

100%|████████████████████████████████████████████| 5/5 [04:15<00:00, 49.87s/it]


[0.57121281 0.6185144  0.57983353 0.59744492 0.58199789]
0.5898007099871575


In [37]:
import lightgbm as lgb

lgb_cls = lgb.LGBMClassifier(objective='cross_entropy', n_estimators=300, max_depth=5, learning_rate=0.001)

scores = validate(lgb_cls, X_train.values, y_train.values)
print(scores)
print(scores.mean())

100%|████████████████████████████████████████████| 5/5 [02:25<00:00, 37.14s/it]


[0.59565217 0.60133171 0.58033324 0.61274577 0.61476035]
0.6009646495264659


In [39]:
ensemble_cls = VotingClassifier(estimators=[('cat', cat_cls), ('xgb', xgb_cls), ('lgb', lgb_cls)], voting='soft')

scores = validate(ensemble_cls, X_train.values, y_train.values)
print(scores)
print(scores.mean())

100%|████████████████████████████████████████████| 5/5 [06:37<00:00, 80.94s/it]


[0.5908505  0.62379293 0.58937268 0.61400669 0.60951762]
0.6055080848951422


# Testing

In [None]:
make_predictions(ensemble_cls, 'ensemble-X1-X2_sum-with_id-X3-sum-mean-non_zero.csv', X_train, Y, X_test)

  0%|                                                    | 0/5 [00:00<?, ?it/s]

**LB score: 0.588304**