In [1]:
from spark_utils import get_spark_session
import pyspark.sql.functions as F

spark = get_spark_session()
# run transform_latest.py if this don't exist
test_data = spark.read.parquet('data_transformed/amex-default-prediction/test_data_latest')
train_data = spark.read.parquet('data_transformed/amex-default-prediction/train_data_latest')
# run format_data.py if these don't exist
train_labels = spark.read.parquet('data/amex-default-prediction/train_labels')
sample_submission = spark.read.parquet('data/amex-default-prediction/sample_submission')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/20 13:12:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/20 13:12:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

In [2]:
%%time
from format_data import CATEGORICAL_VARIABLES
from encoder import CategoricalToIntegerEncoders

encs = CategoricalToIntegerEncoders(columns=CATEGORICAL_VARIABLES).fit(train_data)

                                                                                

CPU times: user 372 ms, sys: 53 ms, total: 425 ms
Wall time: 8.82 s


In [3]:
%%time
from format_data import TARGET_VARIABLE, DATE_VARIABLES, ID_VARIABLES

# make train_pdf
train_pdf = train_data.join(train_labels, on='customer_ID', how='inner')
train_pdf = encs.transform(spark=spark, df=train_pdf).toPandas()

# make test_pdf
test_pdf = encs.transform(spark=spark, df=test_data).toPandas()

feature_columns = [
    c for c in train_pdf.columns 
    if c not in [TARGET_VARIABLE,] + ID_VARIABLES + list(DATE_VARIABLES.keys())
]
print(f'len(feature_columns): {len(feature_columns)}\n', ', '.join(feature_columns))

22/06/20 13:12:58 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

len(feature_columns): 188
 P_2, D_39, B_1, B_2, R_1, S_3, D_41, B_3, D_42, D_43, D_44, B_4, D_45, B_5, R_2, D_46, D_47, D_48, D_49, B_6, B_7, B_8, D_50, D_51, B_9, R_3, D_52, P_3, B_10, D_53, S_5, B_11, S_6, D_54, R_4, S_7, B_12, S_8, D_55, D_56, B_13, R_5, D_58, S_9, B_14, D_59, D_60, D_61, B_15, S_11, D_62, D_65, B_16, B_17, B_18, B_19, B_20, S_12, R_6, S_13, B_21, D_69, B_22, D_70, D_71, D_72, S_15, B_23, D_73, P_4, D_74, D_75, D_76, B_24, R_7, D_77, B_25, B_26, D_78, D_79, R_8, R_9, S_16, D_80, R_10, R_11, B_27, D_81, D_82, S_17, R_12, B_28, R_13, D_83, R_14, R_15, D_84, R_16, B_29, S_18, D_86, D_87, R_17, R_18, D_88, B_31, S_19, R_19, B_32, S_20, R_20, R_21, B_33, D_89, R_22, R_23, D_91, D_92, D_93, D_94, R_24, R_25, D_96, S_22, S_23, S_24, S_25, S_26, D_102, D_103, D_104, D_105, D_106, D_107, B_36, B_37, R_26, R_27, D_108, D_109, D_110, D_111, B_39, D_112, B_40, S_27, D_113, D_115, D_118, D_119, D_121, D_122, D_123, D_124, D_125, D_127, D_128, D_129, B_41, B_42, D_130, D_131, D_1

In [4]:
import numpy as np

X_fit = train_pdf[feature_columns].reset_index(drop=True)
X_test = test_pdf[feature_columns].reset_index(drop=True)
print(X_fit.shape, X_test.shape)

y_fit = np.array(train_pdf[TARGET_VARIABLE])
print(np.unique(y_fit, return_counts=True))

(458913, 188) (924621, 188)
(array([0., 1.], dtype=float32), array([340085, 118828]))


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X_fit, y_fit)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((344184, 188), (114729, 188), (344184,), (114729,))

In [6]:
from evaluation import amex_metric
from lightgbm import Dataset


def feval_amex(preds, eval_data: Dataset):
    # https://github.com/microsoft/LightGBM/blob/2f5baa3d39efb518cd13a7932fe4d8602c36762f/python-package/lightgbm/engine.py#L54-L71
    eval_result, _, _ = amex_metric(y_true=eval_data.label, y_pred=preds)
    # return eval_name, eval_result, is_higher_better
    return 'amex', eval_result, True

def feval_amex_gini(preds, eval_data: Dataset):
    # https://github.com/microsoft/LightGBM/blob/2f5baa3d39efb518cd13a7932fe4d8602c36762f/python-package/lightgbm/engine.py#L54-L71
    _, eval_result, _ = amex_metric(y_true=eval_data.label, y_pred=preds)
    # return eval_name, eval_result, is_higher_better
    return 'amex_gini', eval_result, True

def feval_amex_top4(preds, eval_data: Dataset):
    # https://github.com/microsoft/LightGBM/blob/2f5baa3d39efb518cd13a7932fe4d8602c36762f/python-package/lightgbm/engine.py#L54-L71
    _, _, eval_result = amex_metric(y_true=eval_data.label, y_pred=preds)
    # return eval_name, eval_result, is_higher_better
    return 'amex_top4', eval_result, True

In [7]:
%%time
from lightgbm import train, Dataset

train_set = Dataset(data=X_train, label=y_train, categorical_feature=encs.columns_encoded)
valid_set = Dataset(data=X_valid, label=y_valid, categorical_feature=encs.columns_encoded)

m = train(
    params={
        'metric': ['auc', 'average_precision'],
    },
    train_set=train_set,
    valid_sets=[valid_set],
    feval=[feval_amex, feval_amex_gini, feval_amex_top4],
)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 44546
[LightGBM] [Info] Number of data points in the train set: 344184, number of used features: 188




[LightGBM] [Info] Start training from score 0.258495
[1]	valid_0's auc: 0.931035	valid_0's average_precision: 0.809948	valid_0's amex: 0.692304	valid_0's amex_gini: 0.862067	valid_0's amex_top4: 0.52254
[2]	valid_0's auc: 0.934723	valid_0's average_precision: 0.833461	valid_0's amex: 0.705269	valid_0's amex_gini: 0.869712	valid_0's amex_top4: 0.540827
[3]	valid_0's auc: 0.938419	valid_0's average_precision: 0.842473	valid_0's amex: 0.711287	valid_0's amex_gini: 0.876959	valid_0's amex_top4: 0.545616
[4]	valid_0's auc: 0.940622	valid_0's average_precision: 0.847922	valid_0's amex: 0.719197	valid_0's amex_gini: 0.881458	valid_0's amex_top4: 0.556936
[5]	valid_0's auc: 0.941942	valid_0's average_precision: 0.851769	valid_0's amex: 0.724439	valid_0's amex_gini: 0.883837	valid_0's amex_top4: 0.565041
[6]	valid_0's auc: 0.943468	valid_0's average_precision: 0.854067	valid_0's amex: 0.726171	valid_0's amex_gini: 0.887168	valid_0's amex_top4: 0.565175
[7]	valid_0's auc: 0.944657	valid_0's aver

In [8]:
from format_data import PREDICTION_VARIABLE
import pandas as pd

pred_test = pd.DataFrame({
    'customer_ID': test_pdf['customer_ID'],
    PREDICTION_VARIABLE: m.predict(X_test),
})
pred_test.head()

Unnamed: 0,customer_ID,prediction
0,d72e30f1857dcdc45eb55a09972c983ddc381108ae3275...,1.035711
1,774996eacc8ff2cab9f5a098556cb0fdb8079386dd3ffa...,1.03649
2,4975dd02572f2a564bf615232fa59d353d6034c46cc7e2...,0.024468
3,d49bca69b1cd613720f1f458fb051d7e5f6b2ae8f1ac82...,0.847613
4,36195b70707d85375966335f249542a1e3c6650fce7f16...,0.014975
