In [1]:
import hyperopt
import mlflow
import numpy as np
from hyperopt import Trials
from hyperopt.pyll.base import scope
from sklearn.model_selection import train_test_split

from encoder import CategoricalToIntegerEncoders
from format_data import DATE_VARIABLES, ID_VARIABLES, TARGET_VARIABLE
from hp import build_train_objective, find_best_run
from spark_utils import SparkSessionContext
from transform_aggregated import (SUMMARY_FEATURE_CATEGORICAL_VARIABLES,
                                  WINDOW_FEATURE_CATEGORICAL_VARIABLES)


In [2]:
with SparkSessionContext() as spark:
    # Load the aggregated training features and the labels from parquet.
    train_data = spark.read.parquet(
        'data_transformed/amex-default-prediction/train_data_aggregated')
    train_labels = spark.read.parquet(
        'data/amex-default-prediction/train_labels')

    # Attach the labels. The inner join must keep the row count and the
    # number of distinct IDs unchanged.
    train_data_labelled = train_data.join(
        train_labels, on=ID_VARIABLES, how='inner')
    assert train_data_labelled.count() == train_data.count()
    labelled_id_count = train_data_labelled.select(
        ID_VARIABLES).distinct().count()
    source_id_count = train_data.select(ID_VARIABLES).distinct().count()
    assert labelled_id_count == source_id_count

    # Everything that is not the target, an ID or a date is a feature.
    non_feature_columns = (
        [TARGET_VARIABLE] + list(ID_VARIABLES) + list(DATE_VARIABLES.keys()))
    feature_columns = [
        name for name in train_data.columns
        if name not in non_feature_columns
    ]
    categorical_feature_columns = (
        list(WINDOW_FEATURE_CATEGORICAL_VARIABLES)
        + list(SUMMARY_FEATURE_CATEGORICAL_VARIABLES)
    )
    numerical_feature_columns = [
        name for name in feature_columns
        if name not in categorical_feature_columns
    ]
    column_summary = (
        f'Feature columns {len(feature_columns)} '
        f'Categorical feature columns {len(categorical_feature_columns)} '
        f'Numerical feature columns {len(numerical_feature_columns)} '
    )
    print(column_summary)

    # The encoders are fitted on the unlabelled training frame and then
    # applied to the labelled one.
    encs = CategoricalToIntegerEncoders(columns=categorical_feature_columns)
    encs = encs.fit(train_data)
    transformed_feature_columns = [
        *numerical_feature_columns, *encs.columns_encoded]

    # Pull the encoded frame into pandas and report its in-memory size.
    train_sdf_encoded = encs.transform(spark=spark, df=train_data_labelled)
    train_pdf = train_sdf_encoded.toPandas()
    train_pdf_bytes = train_pdf.memory_usage(deep=True).sum()
    print(
        f'train_pdf.memory_usage in megabytes: {train_pdf_bytes / 1048576: .2f}')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/08/05 19:18:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

Feature columns 1296 Categorical feature columns 33 Numerical feature columns 1263 


22/08/05 19:18:43 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

train_pdf.memory_usage in megabytes:  3034.25


In [3]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the fit/validation split, and therefore the validation
# metrics reported further down, are identical on every Restart & Run All.
SPLIT_SEED = 42

# Feature matrix (transformed feature columns only) and target vector.
X_train = train_pdf[transformed_feature_columns].reset_index(drop=True)
y_train = np.array(train_pdf[TARGET_VARIABLE])
print(
    f'X_train.shape: {X_train.shape} '
    f'y_train.shape: {y_train.shape} '
    f'y_train uniques: {np.unique(y_train, return_counts=True)} '
)

# Hold out 10% of the rows for validation; the split is seeded (see above).
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=.1, random_state=SPLIT_SEED)
print(
    f'X_fit.shape: {X_fit.shape} '
    f'X_valid.shape: {X_valid.shape} '
    f'y_fit.shape: {y_fit.shape} '
    f'y_valid.shape: {y_valid.shape} '
)

X_train.shape: (458913, 1296) y_train.shape: (458913,) y_train uniques: (array([0., 1.], dtype=float32), array([340085, 118828])) 
X_fit.shape: (413021, 1296) X_valid.shape: (45892, 1296) y_fit.shape: (413021,) y_valid.shape: (45892,) 


In [4]:
from pprint import pprint
import json

experiment_id = '0'
run_id = '0bf17f30bd244ea4a0854942d281667d'

# Fetch the parameters that were logged (as a JSON string) on the chosen run.
client = mlflow.tracking.MlflowClient()
best_run = client.get_run(run_id=run_id)
logged_params = best_run.data.params
best_params = json.loads(logged_params['lgb_params_json'])

# JSON object keys are always strings, so the class labels used as keys of
# 'class_weight' have to be converted back to floats.
if 'class_weight' in best_params:
    best_params['class_weight'] = dict(
        (float(label), weight)
        for label, weight in best_params['class_weight'].items()
    )

pprint(best_params)

{'learning_rate': 0.007067830673760356,
 'num_iterations': 4119,
 'num_leaves': 50,
 'scale_pos_weight': 4.979631827974136}


In [5]:
from pprint import pprint
import mlflow
from hp import get_cv_hp_metrics

# Train with the parameters loaded from the MLflow run, fitting on the fit
# split and evaluating on the held-out validation split.
# The arguments are passed inline on purpose: collecting them in a dict would
# keep extra references to the large frames alive after they are deleted.
model, metrics = get_cv_hp_metrics(
    lgb_params=best_params,
    categorical_feature=encs.columns_encoded,
    X_train=X_fit,
    y_train=y_fit,
    X_test=X_valid,
    y_test=y_valid,
    nested=False,
)
pprint(metrics)


New categorical_feature is ['B_30_CategoricalToIntegerEncoder', 'B_30_mode_CategoricalToIntegerEncoder', 'B_30_previous_CategoricalToIntegerEncoder', 'B_38_CategoricalToIntegerEncoder', 'B_38_mode_CategoricalToIntegerEncoder', 'B_38_previous_CategoricalToIntegerEncoder', 'D_114_CategoricalToIntegerEncoder', 'D_114_mode_CategoricalToIntegerEncoder', 'D_114_previous_CategoricalToIntegerEncoder', 'D_116_CategoricalToIntegerEncoder', 'D_116_mode_CategoricalToIntegerEncoder', 'D_116_previous_CategoricalToIntegerEncoder', 'D_117_CategoricalToIntegerEncoder', 'D_117_mode_CategoricalToIntegerEncoder', 'D_117_previous_CategoricalToIntegerEncoder', 'D_120_CategoricalToIntegerEncoder', 'D_120_mode_CategoricalToIntegerEncoder', 'D_120_previous_CategoricalToIntegerEncoder', 'D_126_CategoricalToIntegerEncoder', 'D_126_mode_CategoricalToIntegerEncoder', 'D_126_previous_CategoricalToIntegerEncoder', 'D_63_CategoricalToIntegerEncoder', 'D_63_mode_CategoricalToIntegerEncoder', 'D_63_previous_Categorical

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[4119]	valid10pct's binary_logloss: 0.285826




{'learning_rate': 0.007067830673760356, 'num_iterations': 4119, 'num_leaves': 50, 'scale_pos_weight': 4.979631827974136}
{'test_feval_amex': 0.7837716053659095, 'test_feval_amex_gini': 0.9215092045336449, 'test_feval_amex_top4': 0.6460340061981741}
{'test_feval_amex': 0.7837716053659095,
 'test_feval_amex_gini': 0.9215092045336449,
 'test_feval_amex_top4': 0.6460340061981741}


In [6]:
import gc

def release_names(names, namespace):
    """Remove each of ``names`` from ``namespace``, skipping missing ones.

    Unlike a bare ``del`` statement this never raises ``NameError``, so the
    cell can be re-run safely.

    Args:
        names: iterable of variable names to drop.
        namespace: mutable mapping to remove them from (e.g. ``globals()``).

    Returns:
        The names that were actually removed, in input order.
    """
    removed = []
    for name in names:
        if name in namespace:
            del namespace[name]
            removed.append(name)
    return removed


# Drop the training-side frames before the (larger) test set is loaded.
# X_train / y_train are released as well: they are separate objects built
# from train_pdf, so deleting train_pdf alone would leave them alive, and the
# prediction cells below do not reference them.
release_names(
    ['train_pdf', 'X_train', 'y_train', 'X_fit', 'y_fit', 'X_valid', 'y_valid'],
    globals(),
)
gc.collect()

354889

In [7]:
with SparkSessionContext() as spark:
    # Load the aggregated test features and the sample submission.
    test_data = spark.read.parquet(
        'data_transformed/amex-default-prediction/test_data_aggregated')
    sample_submission = spark.read.parquet(
        'data/amex-default-prediction/sample_submission')

    # Sanity check only: joining on the IDs must keep the row count and the
    # number of distinct IDs unchanged. The joined frame is not used further;
    # the encoding below runs on test_data itself.
    test_data_labelled = test_data.join(
        sample_submission, on=ID_VARIABLES, how='inner')
    assert test_data_labelled.count() == test_data.count()
    joined_id_count = test_data_labelled.select(
        ID_VARIABLES).distinct().count()
    source_id_count = test_data.select(ID_VARIABLES).distinct().count()
    assert joined_id_count == source_id_count

    # Apply the already-fitted encoders, pull the result into pandas and
    # report its in-memory size.
    test_sdf_encoded = encs.transform(spark=spark, df=test_data)
    test_pdf = test_sdf_encoded.toPandas()
    test_pdf_bytes = test_pdf.memory_usage(deep=True).sum()
    print(
        f'test_pdf.memory_usage in megabytes: {test_pdf_bytes / 1048576: .2f}')


                                                                                

test_pdf.memory_usage in megabytes:  6109.90


In [8]:
# Select the same transformed feature columns, in the same order, as were
# used to build X_train.
X_test = test_pdf.loc[:, transformed_feature_columns].reset_index(drop=True)
print(f'X_test.shape: {X_test.shape} ')

X_test.shape: (924621, 1296) 


In [9]:
from format_data import PREDICTION_VARIABLE
import pandas as pd

# NOTE(review): raw_score=True is passed to predict, so the prediction column
# holds raw model scores rather than the default predict output. Confirm this
# is what the submission is meant to contain.
raw_scores = model.predict(X_test, raw_score=True)

# One row per test customer: ID column first, then the prediction column.
submission_columns = {
    'customer_ID': test_pdf['customer_ID'],
    PREDICTION_VARIABLE: raw_scores,
}
pred_test = pd.DataFrame(submission_columns)
pred_test.to_csv('submission.csv', index=False)
pred_test.head()

Unnamed: 0,customer_ID,prediction
0,64f546fba7cc37633f0e209a00778cc17db40ba318858d...,-0.155468
1,5d64a3afafd1b99ef38abc766bc219a42570a57a7de41c...,-5.227469
2,50232e74a7bdae9c143c34a58bb308513f286231ef58ad...,-6.30651
3,832e2c6479383c7a9f2ddab0c39405b81a452fd8d219ce...,-5.922323
4,2d9c4159dd43f299fb00f2b6e4c875d46256e035a56ea0...,-2.658408
