In [12]:
import pandas as pd
import numpy as np
import h5py

from sklearn import model_selection, linear_model, metrics, pipeline, preprocessing

# Seed NumPy's global random generator so that repeated runs draw the same numbers.
np.random.seed(seed=42)

In [2]:
# Read the labelled features, their targets and the unlabelled test features
# from the HDF5 store, then split the "id" column off both feature frames.
X_data = pd.read_hdf("cat.hdf5", key="train")
y_data = pd.read_hdf("cat.hdf5", key="train_target")
X_test = pd.read_hdf("cat.hdf5", key="test")

# Keep the identifiers aside before they are removed from the feature frames.
data_id = X_data["id"]
test_id = X_test["id"]

# Rebind to frames without the "id" column.
X_data = X_data.drop(columns="id")
X_test = X_test.drop(columns="id")

In [3]:
# Split the labelled data into training (70%) and validation (30%) sets.
# random_state is passed explicitly so the split does not depend on how much of
# NumPy's global random stream was consumed before this cell ran (for example
# when cells are re-executed out of order). 42 mirrors the global NumPy seed
# set in the import cell.
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X_data, y_data, test_size=0.3, random_state=42
)

In [4]:
# Report the dimensions of every split, followed by the unlabelled test features.
print(*(part.shape for part in (X_train, y_train, X_val, y_val, X_test)))

(210000, 40) (210000,) (90000, 40) (90000,) (200000, 40)


## Logistic Regression as baseline model

In [21]:
# Baseline model: min-max scale the features, then fit a logistic regression
# whose fit is cross-validated over 5 folds.
baseline_steps = [
    ("min_max_scaler", preprocessing.MinMaxScaler()),
    (
        "logistic_classifier",
        linear_model.LogisticRegressionCV(
            cv=5, solver="lbfgs", max_iter=2000, n_jobs=-1
        ),
    ),
]
logistic = pipeline.Pipeline(steps=baseline_steps)

# Trailing semicolon keeps the fitted pipeline's repr out of the cell output.
logistic.fit(X_train, y_train);

Pipeline(memory=None,
         steps=[('min_max_scaler',
                 MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('logistic_classifier',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=5,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0, l1_ratios=None,
                                      max_iter=2000, multi_class='warn',
                                      n_jobs=-1, penalty='l2',
                                      random_state=None, refit=True,
                                      scoring=None, solver='lbfgs', tol=0.0001,
                                      verbose=0))],
         verbose=False)

In [22]:
# Column 1 of predict_proba is used as the score
# (presumably the positive class; confirm against logistic.classes_).
val_proba = logistic.predict_proba(X_val)
y_val_pred = val_proba[:, 1]

In [23]:
# Validation ROC AUC of the logistic baseline.
metrics.roc_auc_score(y_true=y_val, y_score=y_val_pred)

0.7663509689077292

In [45]:
def get_score(model):
    """Return the ROC AUC of ``model`` on the training and validation splits.

    The splits are read from the notebook-level ``X_train``/``y_train`` and
    ``X_val``/``y_val`` variables rather than passed in.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the score.

    Returns
    -------
    tuple
        ``(train_score, val_score)``.
    """
    def _auc(features, labels):
        # Score the labels against column 1 of the predicted probabilities.
        return metrics.roc_auc_score(labels, model.predict_proba(features)[:, 1])

    # Validation is scored first; the returned tuple is ordered train, then validation.
    val_score = _auc(X_val, y_val)
    train_score = _auc(X_train, y_train)
    return train_score, val_score

In [46]:
# (train, validation) ROC AUC for the logistic baseline.
get_score(logistic)

(0.7650231180301985, 0.7663509689077292)

## Now use XGBoost

In [31]:
from xgboost import XGBClassifier

In [48]:
# XGBoost classifier with 100 estimators and subsample=0.8; n_jobs=-1 is passed
# through for parallel training. All other hyperparameters keep library defaults.
xgb_params = {"n_estimators": 100, "subsample": 0.8, "n_jobs": -1}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, verbosity=1)

In [50]:
# (train, validation) ROC AUC for the XGBoost model.
get_score(xgb)

(0.7634729337738386, 0.7593287489853908)

In [47]:
# Sanity check: dimensions of the validation feature frame.
X_val.shape

(90000, 40)