In [1]:
from spark_utils import get_spark_session
import pyspark.sql.functions as F

spark = get_spark_session()
# run transform_latest.py if this don't exist
test_data = spark.read.parquet('data_transformed/amex-default-prediction/test_data_latest')
train_data = spark.read.parquet('data_transformed/amex-default-prediction/train_data_latest')
# run format_data.py if these don't exist
train_labels = spark.read.parquet('data/amex-default-prediction/train_labels')
sample_submission = spark.read.parquet('data/amex-default-prediction/sample_submission')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/08/01 00:30:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
%%time
from format_data import CATEGORICAL_VARIABLES
from encoder import CategoricalToIntegerEncoders

encs = CategoricalToIntegerEncoders(columns=CATEGORICAL_VARIABLES).fit(train_data)

                                                                                

CPU times: user 413 ms, sys: 24.6 ms, total: 438 ms
Wall time: 10.5 s


In [3]:
%%time
from format_data import TARGET_VARIABLE, DATE_VARIABLES, ID_VARIABLES

# make train_pdf
train_pdf = train_data.join(train_labels, on='customer_ID', how='inner')
train_pdf = encs.transform(spark=spark, df=train_pdf).toPandas()

# make test_pdf
test_pdf = encs.transform(spark=spark, df=test_data).toPandas()

non_feature_columns = [
    TARGET_VARIABLE,
    *ID_VARIABLES,
    *DATE_VARIABLES.keys(),
]
feature_columns = [c for c in train_pdf.columns if c not in non_feature_columns]
print(f'len(feature_columns): {len(feature_columns)}\n', ', '.join(feature_columns))

22/08/01 00:30:19 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

len(feature_columns): 189
 P_2, D_39, B_1, B_2, R_1, S_3, D_41, B_3, D_42, D_43, D_44, B_4, D_45, B_5, R_2, D_46, D_47, D_48, D_49, B_6, B_7, B_8, D_50, D_51, B_9, R_3, D_52, P_3, B_10, D_53, S_5, B_11, S_6, D_54, R_4, S_7, B_12, S_8, D_55, D_56, B_13, R_5, D_58, S_9, B_14, D_59, D_60, D_61, B_15, S_11, D_62, D_65, B_16, B_17, B_18, B_19, B_20, S_12, R_6, S_13, B_21, D_69, B_22, D_70, D_71, D_72, S_15, B_23, D_73, P_4, D_74, D_75, D_76, B_24, R_7, D_77, B_25, B_26, D_78, D_79, R_8, R_9, S_16, D_80, R_10, R_11, B_27, D_81, D_82, S_17, R_12, B_28, R_13, D_83, R_14, R_15, D_84, R_16, B_29, S_18, D_86, D_87, R_17, R_18, D_88, B_31, S_19, R_19, B_32, S_20, R_20, R_21, B_33, D_89, R_22, R_23, D_91, D_92, D_93, D_94, R_24, R_25, D_96, S_22, S_23, S_24, S_25, S_26, D_102, D_103, D_104, D_105, D_106, D_107, B_36, B_37, R_26, R_27, D_108, D_109, D_110, D_111, B_39, D_112, B_40, S_27, D_113, D_115, D_118, D_119, D_121, D_122, D_123, D_124, D_125, D_127, D_128, D_129, B_41, B_42, D_130, D_131, D_1

In [4]:
import numpy as np

X_fit = train_pdf[feature_columns].reset_index(drop=True)
X_test = test_pdf[feature_columns].reset_index(drop=True)
print(X_fit.shape, X_test.shape)

y_fit = np.array(train_pdf[TARGET_VARIABLE])
print(np.unique(y_fit, return_counts=True))

(458913, 189) (924621, 189)
(array([0., 1.], dtype=float32), array([340085, 118828]))


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_fit, y_fit)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(344184, 189) (114729, 189) (344184,) (114729,)


In [6]:
from evaluation import feval_amex, feval_amex_gini, feval_amex_top4
from sklearn import model_selection
import pandas as pd
from lightgbm import LGBMClassifier
import numpy as np


RANDOM_STATE = 20220731

def get_cv_hp_metrics(
    X_train: pd.DataFrame,
    y_train: np.array,
    lgb_params: dict,
) -> dict:
    with mlflow.start_run(nested=True) as run:
        mlflow.log_params(params=lgb_params)
        model = LGBMClassifier(**lgb_params)
        y_pred = model_selection.cross_val_predict(
            estimator=model, 
            X=X_train, 
            y=y_train, 
            method='predict_proba'
        )[:,1]
        metrics = {
            # returns (metric_name, metric_value, higher_is_better)
            'feval_amex': feval_amex(y_true=y_train, y_pred=y_pred)[1],
            'feval_amex_gini': feval_amex_gini(y_true=y_train, y_pred=y_pred)[1],
            'feval_amex_top4': feval_amex_top4(y_true=y_train, y_pred=y_pred)[1],
        }
        mlflow.log_metrics(metrics=metrics)
    return metrics


In [7]:
import hyperopt
import mlflow

def build_train_objective(
    X_train: pd.DataFrame,
    y_train: np.array,
    metric_name: str = 'feval_amex',
    higher_is_better: bool = True,
):
    def obj_fn(lgb_params):
        metrics = get_cv_hp_metrics(
            X_train=X_train,
            y_train=y_train,
            lgb_params=lgb_params,
        )
        return {
            'status': hyperopt.STATUS_OK, 
            'loss': metrics[metric_name] * -1 if higher_is_better else 1.,
        }
    return obj_fn

In [8]:
def find_best_run(
    run: mlflow.entities.Run,
    metric_name: str = 'feval_amex',
    higher_is_better: bool = True,
):
    client = mlflow.tracking.MlflowClient()
    nested_runs = client.search_runs(
        [run.info.experiment_id],
        "tags.mlflow.parentRunId = '{run_id}' ".format(run_id=run.info.run_id)
    )
    best_run = min(
        nested_runs, 
        key=lambda run: run.data.metrics[metric_name] * -1. if higher_is_better else 1.
    )
    mlflow.set_tag("best_run", best_run.info.run_id)
    mlflow.log_metric(f"best_{metric_name}", best_run.data.metrics[metric_name])
    print(
        f'best run id: {best_run.info.run_id} '
        f'achieved {metric_name} of {best_run.data.metrics[metric_name]}'
    )
    return best_run

In [9]:
from hyperopt.pyll.base import scope

MAX_EVALS = 10
PARALLELISM = 1

space = {
    'class_weight': {
        0.: 1.,
        1.: hyperopt.hp.uniform('class_weight', 0., 10.)
    },
    # 'subsample': hyperopt.hp.uniform('subsample', 0.05, 1.0),
    # The parameters below are cast to int using the scope.int() wrapper 
    # 'num_iterations': scope.int(hyperopt.hp.quniform('num_iterations', 10, 200, 1)),
    # 'num_leaves': scope.int(hyperopt.hp.quniform('num_leaves', 20, 50, 1))
}

trials = hyperopt.SparkTrials(parallelism=PARALLELISM)
train_objective = build_train_objective(
    X_train=X_train,
    y_train=y_train,
)

with mlflow.start_run(nested=False) as run:
    hyperopt.fmin(
        fn=train_objective,
        space=space,
        algo=hyperopt.tpe.suggest,
        max_evals=MAX_EVALS,
        trials=trials
    )
    find_best_run(run)
    print(
        f'run_id: {run.info.run_id} '
        f'experiment_id: {run.info.experiment_id} '
    )


  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

[Stage 43:>                                                         (0 + 1) / 1]

 10%|█         | 1/10 [00:56<08:24, 56.06s/trial, best loss: -0.7802366828473533]

                                                                                

 20%|██        | 2/10 [01:50<07:19, 54.89s/trial, best loss: -0.7802366828473533]

                                                                                

 30%|███       | 3/10 [02:52<06:47, 58.17s/trial, best loss: -0.7802366828473533]

                                                                                

 40%|████      | 4/10 [03:58<06:07, 61.29s/trial, best loss: -0.7802366828473533]

                                                                                

 50%|█████     | 5/10 [05:06<05:18, 63.74s/trial, best loss: -0.7802366828473533]

[Stage 48:>                                                         (0 + 1) / 1]

 60%|██████    | 6/10 [06:00<04:01, 60.46s/trial, best loss: -0.7805087730692806]

[Stage 49:>                                                         (0 + 1) / 1]

 70%|███████   | 7/10 [07:05<03:05, 61.97s/trial, best loss: -0.7805087730692806]

                                                                                

 80%|████████  | 8/10 [08:01<02:00, 60.09s/trial, best loss: -0.7805087730692806]

                                                                                

 90%|█████████ | 9/10 [08:57<00:58, 58.83s/trial, best loss: -0.7805087730692806]

                                                                                

100%|██████████| 10/10 [09:52<00:00, 59.27s/trial, best loss: -0.7805087730692806]


Total Trials: 10: 10 succeeded, 0 failed, 0 cancelled.


ValueError: min() arg is an empty sequence

In [10]:
print(
    f'run_id: {run.info.run_id} '
    f'experiment_id: {run.info.experiment_id} '
)

run_id: c1e5c4a2f1cb41e894f626f6faa56a6d experiment_id: 0 
