In [1]:
from spark_utils import get_spark_session
import pyspark.sql.functions as F

spark = get_spark_session()

# run format_data.py if these don't exist
train_data = spark.read.parquet('data/amex-default-prediction/train_data')
train_labels = spark.read.parquet('data/amex-default-prediction/train_labels')
print(train_data.count(), train_labels.count())

# test_data = spark.read.parquet('data/amex-default-prediction/test_data')
# sample_submission = spark.read.parquet('data/amex-default-prediction/sample_submission')


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/20 21:14:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


5531451 458913


In [2]:
assert train_data.count() == train_data.select('customer_ID', 'S_2').distinct().count()
# assert test_data.count() == test_data.select('customer_ID', 'S_2').distinct().count()

                                                                                

In [3]:
from pyspark.sql.window import Window
from format_data import TARGET_VARIABLE

window_latest_date_by_id = (
    Window
    .partitionBy('customer_ID')
    .orderBy(F.col('S_2').desc())
    .rangeBetween(
        Window.unboundedPreceding,
        Window.unboundedFollowing,
    )
)

train_data = train_data.withColumn('S_2_max', F.max('S_2').over(window_latest_date_by_id))
# test_data = test_data.withColumn('S_2_max', F.max('S_2').over(window_latest_date_by_id))

In [4]:
# train_labels_imputed = (
#     train_data
#     .select('customer_ID', 'S_2', 'S_2_max')
#     .join(train_labels, on='customer_ID', how='inner')
#     .withColumn(
#         'imputed_target',
#         F.when(F.col('S_2') == F.col('S_2_max'), F.col(TARGET_VARIABLE)).otherwise(F.lit(0.))
#     )
#     .cache()
# )

# train_labels_imputed.filter(F.col(TARGET_VARIABLE) == 1.).show()

In [5]:
%%time
from format_data import CATEGORICAL_VARIABLES
from encoder import CategoricalToIntegerEncoders

encs = CategoricalToIntegerEncoders(columns=CATEGORICAL_VARIABLES).fit(train_data)

CPU times: user 279 ms, sys: 99.8 ms, total: 379 ms
Wall time: 5.11 s


In [6]:
%%time
from format_data import TARGET_VARIABLE, DATE_VARIABLES, ID_VARIABLES

# make train_pdf
# train_pdf = train_data.join(
#     train_labels_imputed.select(
#         'S_2', 'customer_ID', 'imputed_target',
#     ),
#     on=['S_2', 'customer_ID'],
#     how='inner'
# )
train_pdf = train_data.join(train_labels, on='customer_ID', how='inner')
train_pdf = encs.transform(spark=spark, df=train_pdf).toPandas()
print('train_pdf.shape', train_pdf.shape)

# # make test_pdf
# test_pdf = encs.transform(spark=spark, df=test_data).toPandas()
# print('test_pdf.shape', test_pdf.shape)

non_feature_columns = [
    TARGET_VARIABLE,
    'imputed_target',
    *ID_VARIABLES,
    *DATE_VARIABLES.keys(),
    'S_2_max',
]
feature_columns = [c for c in train_pdf.columns if c not in non_feature_columns]
print(f'len(feature_columns): {len(feature_columns)}\n', ', '.join(feature_columns))

22/06/20 21:15:12 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

train_pdf.shape (5531451, 192)
len(feature_columns): 188
 P_2, D_39, B_1, B_2, R_1, S_3, D_41, B_3, D_42, D_43, D_44, B_4, D_45, B_5, R_2, D_46, D_47, D_48, D_49, B_6, B_7, B_8, D_50, D_51, B_9, R_3, D_52, P_3, B_10, D_53, S_5, B_11, S_6, D_54, R_4, S_7, B_12, S_8, D_55, D_56, B_13, R_5, D_58, S_9, B_14, D_59, D_60, D_61, B_15, S_11, D_62, D_65, B_16, B_17, B_18, B_19, B_20, S_12, R_6, S_13, B_21, D_69, B_22, D_70, D_71, D_72, S_15, B_23, D_73, P_4, D_74, D_75, D_76, B_24, R_7, D_77, B_25, B_26, D_78, D_79, R_8, R_9, S_16, D_80, R_10, R_11, B_27, D_81, D_82, S_17, R_12, B_28, R_13, D_83, R_14, R_15, D_84, R_16, B_29, S_18, D_86, D_87, R_17, R_18, D_88, B_31, S_19, R_19, B_32, S_20, R_20, R_21, B_33, D_89, R_22, R_23, D_91, D_92, D_93, D_94, R_24, R_25, D_96, S_22, S_23, S_24, S_25, S_26, D_102, D_103, D_104, D_105, D_106, D_107, B_36, B_37, R_26, R_27, D_108, D_109, D_110, D_111, B_39, D_112, B_40, S_27, D_113, D_115, D_118, D_119, D_121, D_122, D_123, D_124, D_125, D_127, D_128, D_129

In [7]:
import numpy as np

X_fit = train_pdf[feature_columns].reset_index(drop=True)
print(X_fit.shape)

# X_test = test_pdf[feature_columns].reset_index(drop=True)
# print(X_test.shape)

# y_fit = np.array(train_pdf['imputed_target'])
y_fit = np.array(train_pdf[TARGET_VARIABLE])
print(np.unique(y_fit, return_counts=True))

(5531451, 188)
(array([0., 1.], dtype=float32), array([4153582, 1377869]))


In [8]:
from sklearn.model_selection import train_test_split 

X_train, X_valid, y_train, y_valid = train_test_split(X_fit, y_fit)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((4148588, 188), (1382863, 188), (4148588,), (1382863,))

In [9]:
%%time
from lightgbm import train, Dataset
from evaluation import feval_amex, feval_amex_gini, feval_amex_top4

train_set = Dataset(data=X_train, label=y_train, categorical_feature=encs.columns_encoded)
valid_set = Dataset(data=X_valid, label=y_valid, categorical_feature=encs.columns_encoded)

m = train(
    params={
        'metric': ['auc', 'average_precision'],
    },
    train_set=train_set,
    valid_sets=[valid_set],
    feval=[feval_amex, feval_amex_gini, feval_amex_top4],
)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 44501
[LightGBM] [Info] Number of data points in the train set: 4148588, number of used features: 188




[LightGBM] [Info] Start training from score 0.249144
[1]	valid_0's auc: 0.907426	valid_0's average_precision: 0.731737	valid_0's amex: 0.606174	valid_0's amex_gini: 0.814805	valid_0's amex_top4: 0.397543
[2]	valid_0's auc: 0.912403	valid_0's average_precision: 0.74482	valid_0's amex: 0.621747	valid_0's amex_gini: 0.824755	valid_0's amex_top4: 0.418738
[3]	valid_0's auc: 0.915658	valid_0's average_precision: 0.756997	valid_0's amex: 0.631018	valid_0's amex_gini: 0.831229	valid_0's amex_top4: 0.430807
[4]	valid_0's auc: 0.916921	valid_0's average_precision: 0.763836	valid_0's amex: 0.636024	valid_0's amex_gini: 0.833777	valid_0's amex_top4: 0.438272
[5]	valid_0's auc: 0.918905	valid_0's average_precision: 0.770039	valid_0's amex: 0.639743	valid_0's amex_gini: 0.837743	valid_0's amex_top4: 0.441743
[6]	valid_0's auc: 0.919782	valid_0's average_precision: 0.773604	valid_0's amex: 0.642405	valid_0's amex_gini: 0.839489	valid_0's amex_top4: 0.445321
[7]	valid_0's auc: 0.921034	valid_0's aver

In [10]:
train_data_latest = train_data.filter(F.col('S_2') == F.col('S_2_max'))
train_data_latest = train_data_latest.join(train_labels, on='customer_ID', how='inner')
train_data_latest = encs.transform(spark=spark, df=train_data_latest).toPandas()

                                                                                

In [11]:
X_train_latest = train_data_latest[feature_columns].reset_index(drop=True)
train_data_latest = Dataset(
    data=train_data_latest[feature_columns].reset_index(drop=True),
    label=train_data_latest[TARGET_VARIABLE].reset_index(drop=True),
)
for feval in [feval_amex, feval_amex_gini, feval_amex_top4]:
    print(feval(m.predict(X_train_latest), train_data_latest))

('amex', 0.7757070634257448, True)
('amex_gini', 0.9148385722683947, True)
('amex_top4', 0.6365755545830949, True)
