In [1]:
from spark_utils import get_spark_session


spark = get_spark_session()
# run transform_latest.py if this don't exist
test_data = spark.read.parquet('data_transformed/amex-default-prediction/test_data_latest')
train_data = spark.read.parquet('data_transformed/amex-default-prediction/train_data_latest')
# run format_data.py if these don't exist
train_labels = spark.read.parquet('data/amex-default-prediction/train_labels')
sample_submission = spark.read.parquet('data/amex-default-prediction/sample_submission')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/19 03:18:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
%%time
from format_data import CATEGORICAL_VARIABLES
from encoder import CategoricalToIntegerEncoders
from format_data import TARGET_VARIABLE, DATE_VARIABLES, ID_VARIABLES


encs = CategoricalToIntegerEncoders(columns=CATEGORICAL_VARIABLES).fit(train_data)

train_pdf = train_data.join(train_labels, on='customer_ID', how='inner')
train_pdf = encs.transform(spark=spark, df=train_pdf).toPandas()

test_pdf = encs.transform(spark=spark, df=test_data).toPandas()

feature_columns = [
    c for c in train_pdf.columns 
    if c not in [TARGET_VARIABLE,] + ID_VARIABLES + list(DATE_VARIABLES.keys())
]
print('feature_columns\n', ', '.join(feature_columns))

22/06/19 03:18:52 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

feature_columns
 P_2, D_39, B_1, B_2, R_1, S_3, D_41, B_3, D_42, D_43, D_44, B_4, D_45, B_5, R_2, D_46, D_47, D_48, D_49, B_6, B_7, B_8, D_50, D_51, B_9, R_3, D_52, P_3, B_10, D_53, S_5, B_11, S_6, D_54, R_4, S_7, B_12, S_8, D_55, D_56, B_13, R_5, D_58, S_9, B_14, D_59, D_60, D_61, B_15, S_11, D_62, D_65, B_16, B_17, B_18, B_19, B_20, S_12, R_6, S_13, B_21, D_69, B_22, D_70, D_71, D_72, S_15, B_23, D_73, P_4, D_74, D_75, D_76, B_24, R_7, D_77, B_25, B_26, D_78, D_79, R_8, R_9, S_16, D_80, R_10, R_11, B_27, D_81, D_82, S_17, R_12, B_28, R_13, D_83, R_14, R_15, D_84, R_16, B_29, S_18, D_86, D_87, R_17, R_18, D_88, B_31, S_19, R_19, B_32, S_20, R_20, R_21, B_33, D_89, R_22, R_23, D_91, D_92, D_93, D_94, R_24, R_25, D_96, S_22, S_23, S_24, S_25, S_26, D_102, D_103, D_104, D_105, D_106, D_107, B_36, B_37, R_26, R_27, D_108, D_109, D_110, D_111, B_39, D_112, B_40, S_27, D_113, D_115, D_118, D_119, D_121, D_122, D_123, D_124, D_125, D_127, D_128, D_129, B_41, B_42, D_130, D_131, D_132, D_133,

In [3]:
from sklearn.model_selection import train_test_split

X = train_pdf[feature_columns]
X_test = test_pdf[feature_columns]
y = train_pdf[TARGET_VARIABLE]
print('y.unique()', y.unique())

X_train, X_valid, y_train, y_valid = train_test_split(X, y)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

y.unique() [0. 1.]


((344184, 188), (114729, 188), (344184,), (114729,))

In [4]:
def get_weights(y, negative_label_weight: float = 20.):
    # default is thus 20. because:
    # Note that the negative class has been subsampled for this dataset at 5%, and thus receives a 20x weighting in the scoring metric.
    return y.apply(lambda x: negative_label_weight if x == 0. else 1.)

In [5]:
%%time
import mlflow
from lightgbm import LGBMClassifier
from evaluation import evaluate
import pandas as pd
from format_data import PREDICTION_VARIABLE
from tempfile import TemporaryDirectory
import os
import numpy as np


for negative_label_weight_train in np.linspace(0.01, 5., num=10, endpoint=False):
    # mlflow.lightgbm.autolog()
    experiment_id = mlflow.get_experiment_by_name('use_latest_tune_label_weight.ipynb').experiment_id
    with mlflow.start_run(experiment_id=experiment_id) as run:
        run_id = run.info.run_id
        print(f'run_id: {run_id}')

        w_train = get_weights(y_train, negative_label_weight=negative_label_weight_train)
        mlflow.log_param('negative_label_weight_train', negative_label_weight_train)

        m = LGBMClassifier().fit(
            X=X_train, y=y_train, sample_weight=w_train,
            categorical_feature=encs.columns_encoded,
        )

        score_train = m.score(X=X_train, y=y_train, sample_weight=get_weights(y_train))
        mlflow.log_metric('score_train', score_train)
        score_valid = m.score(X=X_valid, y=y_valid, sample_weight=get_weights(y_valid))
        mlflow.log_metric('score_valid', score_valid)
        score_amex_train = evaluate(X_train, y_train, m=m)
        mlflow.log_metric('score_amex_train', score_amex_train)
        score_amex_valid = evaluate(X_valid, y_valid, m=m)
        mlflow.log_metric('score_amex_valid', score_amex_valid)

        pred_df = pd.DataFrame({
            PREDICTION_VARIABLE: m.predict_proba(X_test)[:, 1],
            'customer_ID': test_pdf['customer_ID'],
        })
        pred_and_sample_joined_counts = (
            spark
            .createDataFrame(pred_df)
            .join(sample_submission, on='customer_ID', how='inner')
            .count()
        )
        assert pred_and_sample_joined_counts == len(pred_df), \
            f'''These should be identical:
            sample_submission has {sample_submission.count()} rows,
            pred_and_sample_joined_counts is {pred_and_sample_joined_counts},
            pred_df has {len(pred_df)} rows
            '''
        with TemporaryDirectory() as p:
            p = os.path.join(p, 'submission.csv')
            pred_df.to_csv(p, header=True, index=False)
            mlflow.log_artifact(local_path=p)


run_id: 0648c2e4f5ee42989989c14ccfe48fb0


22/06/19 03:19:31 WARN TaskSetManager: Stage 43 contains a task of very large size (8582 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

run_id: 2978cb2402734c7890cf7dfa45531854


22/06/19 03:19:55 WARN TaskSetManager: Stage 52 contains a task of very large size (8582 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

run_id: ba3604b2bd994505b1be80b33137d684


22/06/19 03:20:20 WARN TaskSetManager: Stage 61 contains a task of very large size (8582 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

run_id: e10d4b6828964c80b52803fea9ba2877


22/06/19 03:20:43 WARN TaskSetManager: Stage 70 contains a task of very large size (8582 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

run_id: c76a640a39b24fcf9c23495961973f04


22/06/19 03:21:06 WARN TaskSetManager: Stage 79 contains a task of very large size (8582 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

run_id: 9d589fe0533747e7a2f04fb9244ca0b2


22/06/19 03:21:30 WARN TaskSetManager: Stage 88 contains a task of very large size (8582 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

run_id: 3436f72261214893a61bdfdb89ce344a


22/06/19 03:21:53 WARN TaskSetManager: Stage 97 contains a task of very large size (8582 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

run_id: 97cfd11d59274b729e15ff2df056ac74


22/06/19 03:22:16 WARN TaskSetManager: Stage 106 contains a task of very large size (8582 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

run_id: e9189f79e2a84eeda000cfe470865921


22/06/19 03:22:41 WARN TaskSetManager: Stage 115 contains a task of very large size (8582 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

run_id: 6b2463bc29a84fa99c7d2ed0d63e537c


22/06/19 03:23:04 WARN TaskSetManager: Stage 124 contains a task of very large size (8582 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 17min 4s, sys: 27.6 s, total: 17min 32s
Wall time: 3min 54s
