In [1]:
import mlflow
import numpy as np

from encoder import CategoricalToIntegerEncoders
from format_data import (CATEGORICAL_VARIABLES, DATE_VARIABLES, ID_VARIABLES,
                         TARGET_VARIABLE)
from spark_utils import SparkSessionContext, get_spark_session

In [2]:
with SparkSessionContext() as spark:
    train_data = spark.read.parquet(
        'data_transformed/amex-default-prediction/train_data_latest')
    train_labels = spark.read.parquet(
        'data/amex-default-prediction/train_labels')
    train_data_labelled = train_data.join(
        train_labels, on=ID_VARIABLES, how='inner')
    assert train_data_labelled.count() == train_data.count()
    assert train_data_labelled.select(ID_VARIABLES).distinct(
    ).count() == train_data.select(ID_VARIABLES).distinct().count()

    test_data = spark.read.parquet(
        'data_transformed/amex-default-prediction/test_data_latest')
    sample_submission = spark.read.parquet(
        'data/amex-default-prediction/sample_submission')
    test_data_labelled = test_data.join(
        sample_submission, on=ID_VARIABLES, how='inner')
    assert test_data_labelled.count() == test_data.count()
    assert test_data_labelled.select(ID_VARIABLES).distinct(
    ).count() == test_data.select(ID_VARIABLES).distinct().count()

    non_feature_columns = [
        TARGET_VARIABLE,
        *ID_VARIABLES,
        *DATE_VARIABLES.keys(),
    ]
    feature_columns = [
        c for c in train_data.columns
        if c not in non_feature_columns
    ]
    categorical_feature_columns = CATEGORICAL_VARIABLES
    numerical_feature_columns = [
        c for c in feature_columns if c not in categorical_feature_columns]
    print(
        f'Feature columns {len(feature_columns)} '
        f'Categorical feature columns {len(categorical_feature_columns)} '
        f'Numerical feature columns {len(numerical_feature_columns)} '
    )

    encs = CategoricalToIntegerEncoders(
        columns=categorical_feature_columns).fit(train_data)
    transformed_feature_columns = numerical_feature_columns + encs.columns_encoded

    train_pdf = encs.transform(spark=spark, df=train_data_labelled).toPandas()
    train_pdf_bytes = train_pdf.memory_usage(deep=True).sum()
    print(
        f'train_pdf.memory_usage in megabytes: {train_pdf_bytes / 1048576: .2f}')

    test_pdf = encs.transform(spark=spark, df=test_data).toPandas()
    test_pdf_bytes = test_pdf.memory_usage(deep=True).sum()
    print(
        f'test_pdf.memory_usage in megabytes: {test_pdf_bytes / 1048576: .2f}')


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/08/04 02:06:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

Feature columns 189 Categorical feature columns 11 Numerical feature columns 178 


22/08/04 02:06:48 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

train_pdf.memory_usage in megabytes:  420.59


                                                                                

test_pdf.memory_usage in megabytes:  843.87


In [3]:
from sklearn.model_selection import train_test_split

X_train = train_pdf[transformed_feature_columns].reset_index(drop=True)
y_train = np.array(train_pdf[TARGET_VARIABLE])
print(
    f'X_train.shape: {X_train.shape} '
    f'y_train.shape: {y_train.shape} '
    f'y_train uniques: {np.unique(y_train, return_counts=True)} '
)

X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=.1)
print(
    f'X_fit.shape: {X_fit.shape} '
    f'X_valid.shape: {X_valid.shape} '
    f'y_fit.shape: {y_fit.shape} '
    f'y_valid.shape: {y_valid.shape} '
)


X_test = test_pdf[transformed_feature_columns].reset_index(drop=True)
print(
    f'X_test.shape: {X_test.shape} '
)


X_train.shape: (458913, 189) y_train.shape: (458913,) y_train uniques: (array([0., 1.], dtype=float32), array([340085, 118828])) 
X_fit.shape: (413021, 189) X_valid.shape: (45892, 189) y_fit.shape: (413021,) y_valid.shape: (45892,) 
X_test.shape: (924621, 189) 


In [4]:
from pprint import pprint
import json

experiment_id = '0'
run_id = 'b96eaeb95d4a463ab68fb3078d6c0a83'

client = mlflow.tracking.MlflowClient()
best_run = client.get_run(run_id=run_id)
best_params = json.loads(best_run.data.params['lgb_params_json'])

# we need floats as keys here, but it's string
if 'class_weight' in best_params:
    best_params['class_weight'] = {float(k): v for k, v in best_params['class_weight'].items()}

pprint(best_params)

{'learning_rate': 0.01962380797591788,
 'num_iterations': 1369,
 'num_leaves': 60,
 'scale_pos_weight': 3.9313947986775477}


In [5]:
from pprint import pprint
import mlflow
from hp import get_cv_hp_metrics

model, metrics = get_cv_hp_metrics(
    X_train=X_fit,
    y_train=y_fit,
    X_test=X_valid,
    y_test=y_valid,
    categorical_feature=encs.columns_encoded,
    lgb_params=best_params,
    nested=False,
)
pprint(metrics)


New categorical_feature is ['B_30_CategoricalToIntegerEncoder', 'B_38_CategoricalToIntegerEncoder', 'D_114_CategoricalToIntegerEncoder', 'D_116_CategoricalToIntegerEncoder', 'D_117_CategoricalToIntegerEncoder', 'D_120_CategoricalToIntegerEncoder', 'D_126_CategoricalToIntegerEncoder', 'D_63_CategoricalToIntegerEncoder', 'D_64_CategoricalToIntegerEncoder', 'D_66_CategoricalToIntegerEncoder', 'D_68_CategoricalToIntegerEncoder']


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1369]	valid10pct's binary_logloss: 0.270919




{'learning_rate': 0.01962380797591788, 'num_iterations': 1369, 'num_leaves': 60, 'scale_pos_weight': 3.9313947986775477}
{'test_feval_amex': 0.7822104577291258, 'test_feval_amex_gini': 0.91960987261674, 'test_feval_amex_top4': 0.6448110428415117}
{'test_feval_amex': 0.7822104577291258,
 'test_feval_amex_gini': 0.91960987261674,
 'test_feval_amex_top4': 0.6448110428415117}


In [6]:
from format_data import PREDICTION_VARIABLE
import pandas as pd

pred_test = pd.DataFrame({
    'customer_ID': test_pdf['customer_ID'],
    PREDICTION_VARIABLE: model.predict(X_test, raw_score=True),
})
pred_test.to_csv('submission.csv', index=False)
pred_test.head()

Unnamed: 0,customer_ID,prediction
0,61fb2dd47f7a2eb2e81c3bf89e147bc61d6a42914cf570...,-5.865666
1,117a23d25e3b45d80250045da0d9a1bd50a7f57468cf5e...,1.491204
2,344029ccdb720001501d138a9a5ab1ff7abf827d296bba...,1.79738
3,4b09ee54b4254d63fa938bb67b93b9c134f336d78945d5...,-0.247041
4,c4e6c35ecccc7d71d7d677513fe15cb52dd034420cad2b...,-6.408206
