In [1]:
from spark_utils import get_spark_session
import pyspark.sql.functions as F

spark = get_spark_session()

# Feature tables (parquet). Run transform_latest.py first if these don't exist.
test_data = spark.read.parquet('data_transformed/amex-default-prediction/test_data_latest')
train_data = spark.read.parquet('data_transformed/amex-default-prediction/train_data_latest')

# Labels and sample submission (parquet). Run format_data.py first if these don't exist.
train_labels = spark.read.parquet('data/amex-default-prediction/train_labels')
sample_submission = spark.read.parquet('data/amex-default-prediction/sample_submission')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/20 22:14:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/20 22:14:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
%%time
from format_data import CATEGORICAL_VARIABLES
from encoder import CategoricalToIntegerEncoders

# Fit the categorical-to-integer encoders for the columns listed in
# CATEGORICAL_VARIABLES, using the training data only.
encs = (
    CategoricalToIntegerEncoders(columns=CATEGORICAL_VARIABLES)
    .fit(train_data)
)

                                                                                

CPU times: user 327 ms, sys: 53.7 ms, total: 381 ms
Wall time: 8.49 s


In [3]:
%%time
from format_data import TARGET_VARIABLE, DATE_VARIABLES, ID_VARIABLES

# Training frame: inner-join the labels on customer_ID, apply the fitted
# encoders, then collect the result into a pandas DataFrame.
train_labelled = train_data.join(train_labels, on='customer_ID', how='inner')
train_pdf = encs.transform(spark=spark, df=train_labelled).toPandas()

# Test frame: same encoders, no labels to join.
test_pdf = encs.transform(spark=spark, df=test_data).toPandas()

# Everything that is not the target, an identifier or a date column is
# treated as a model feature.
non_feature_columns = [TARGET_VARIABLE] + list(ID_VARIABLES) + list(DATE_VARIABLES.keys())
feature_columns = [
    column
    for column in train_pdf.columns
    if column not in non_feature_columns
]
print(f'len(feature_columns): {len(feature_columns)}\n', ', '.join(feature_columns))

22/06/20 22:14:25 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

len(feature_columns): 189
 P_2, D_39, B_1, B_2, R_1, S_3, D_41, B_3, D_42, D_43, D_44, B_4, D_45, B_5, R_2, D_46, D_47, D_48, D_49, B_6, B_7, B_8, D_50, D_51, B_9, R_3, D_52, P_3, B_10, D_53, S_5, B_11, S_6, D_54, R_4, S_7, B_12, S_8, D_55, D_56, B_13, R_5, D_58, S_9, B_14, D_59, D_60, D_61, B_15, S_11, D_62, D_65, B_16, B_17, B_18, B_19, B_20, S_12, R_6, S_13, B_21, D_69, B_22, D_70, D_71, D_72, S_15, B_23, D_73, P_4, D_74, D_75, D_76, B_24, R_7, D_77, B_25, B_26, D_78, D_79, R_8, R_9, S_16, D_80, R_10, R_11, B_27, D_81, D_82, S_17, R_12, B_28, R_13, D_83, R_14, R_15, D_84, R_16, B_29, S_18, D_86, D_87, R_17, R_18, D_88, B_31, S_19, R_19, B_32, S_20, R_20, R_21, B_33, D_89, R_22, R_23, D_91, D_92, D_93, D_94, R_24, R_25, D_96, S_22, S_23, S_24, S_25, S_26, D_102, D_103, D_104, D_105, D_106, D_107, B_36, B_37, R_26, R_27, D_108, D_109, D_110, D_111, B_39, D_112, B_40, S_27, D_113, D_115, D_118, D_119, D_121, D_122, D_123, D_124, D_125, D_127, D_128, D_129, B_41, B_42, D_130, D_131, D_1

In [4]:
import numpy as np

# Feature matrices for fitting and for scoring; reset_index(drop=True) gives
# both a fresh 0..n-1 index.
X_fit = train_pdf.loc[:, feature_columns].reset_index(drop=True)
X_test = test_pdf.loc[:, feature_columns].reset_index(drop=True)

# Target as a plain NumPy array, in the same row order as X_fit.
y_fit = np.array(train_pdf[TARGET_VARIABLE])

# Sanity checks: matrix shapes, then the class values with their counts.
print(X_fit.shape, X_test.shape)
print(np.unique(y_fit, return_counts=True))

(458913, 189) (924621, 189)
(array([0., 1.], dtype=float32), array([340085, 118828]))


In [5]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation split, and every metric computed on it,
# is the same after Restart & Run All.
SPLIT_SEED = 42

# stratify=y_fit keeps the class ratio of the (imbalanced) target the same in
# both partitions. test_size is left at scikit-learn's default.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_fit,
    y_fit,
    random_state=SPLIT_SEED,
    stratify=y_fit,
)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(344184, 189) (114729, 189) (344184,) (114729,)


In [6]:
%%time
import mlflow
from lightgbm import LGBMClassifier
from evaluation import feval_amex, feval_amex_gini, feval_amex_top4

# Name of the MLflow experiment that collects runs from this notebook.
EXPERIMENT_NAME = 'v2.ipynb'

mlflow.lightgbm.autolog()

# get_experiment_by_name returns None for an unknown name, so create the
# experiment on first use instead of failing on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(experiment_id=experiment_id) as run:
    run_id = run.info.run_id  # id of this MLflow run
    # Default-hyperparameter LightGBM classifier; the encoded categorical
    # columns are declared as categorical features, and the listed metrics are
    # evaluated on the validation split at each boosting iteration.
    m = LGBMClassifier().fit(
        X=X_train, y=y_train, categorical_feature=encs.columns_encoded,
        eval_set=[(X_valid, y_valid)],
        eval_metric=['auc', 'average_precision', feval_amex, feval_amex_gini, feval_amex_top4],
    )

New categorical_feature is ['B_30_CategoricalToIntegerEncoder', 'B_38_CategoricalToIntegerEncoder', 'D_114_CategoricalToIntegerEncoder', 'D_116_CategoricalToIntegerEncoder', 'D_117_CategoricalToIntegerEncoder', 'D_120_CategoricalToIntegerEncoder', 'D_126_CategoricalToIntegerEncoder', 'D_63_CategoricalToIntegerEncoder', 'D_64_CategoricalToIntegerEncoder', 'D_66_CategoricalToIntegerEncoder', 'D_68_CategoricalToIntegerEncoder']


[1]	valid_0's auc: 0.931128	valid_0's average_precision: 0.813688	valid_0's binary_logloss: 0.519549	valid_0's amex: 0.694826	valid_0's amex_gini: 0.862129	valid_0's amex_top4: 0.527523
[2]	valid_0's auc: 0.937062	valid_0's average_precision: 0.835216	valid_0's binary_logloss: 0.47978	valid_0's amex: 0.706384	valid_0's amex_gini: 0.874621	valid_0's amex_top4: 0.538148
[3]	valid_0's auc: 0.940298	valid_0's average_precision: 0.845113	valid_0's binary_logloss: 0.44767	valid_0's amex: 0.716623	valid_0's amex_gini: 0.880764	valid_0's amex_top4: 0.552482
[4]	valid_0's auc: 0.941381	valid_0's average_precision: 0.849213	valid_0's binary_logloss: 0.421264	valid_0's amex: 0.719297	valid_0's amex_gini: 0.882772	valid_0's amex_top4: 0.555822
[5]	valid_0's auc: 0.942403	valid_0's average_precision: 0.851436	valid_0's binary_logloss: 0.399134	valid_0's amex: 0.722836	valid_0's amex_gini: 0.884858	valid_0's amex_top4: 0.560814
[6]	valid_0's auc: 0.943176	valid_0's average_precision: 0.853406	valid_



CPU times: user 2min 1s, sys: 2.56 s, total: 2min 4s
Wall time: 53.5 s


In [7]:
from format_data import PREDICTION_VARIABLE
import pandas as pd

# raw_score=True asks LightGBM for raw scores rather than class labels, which
# is why the values shown below are not confined to [0, 1].
raw_scores = m.predict(X_test, raw_score=True)

# One row per test customer: the identifier next to its score.
pred_test = pd.DataFrame(
    {
        'customer_ID': test_pdf['customer_ID'],
        PREDICTION_VARIABLE: raw_scores,
    }
)
pred_test.head()

Unnamed: 0,customer_ID,prediction
0,61fb2dd47f7a2eb2e81c3bf89e147bc61d6a42914cf570...,-5.33321
1,117a23d25e3b45d80250045da0d9a1bd50a7f57468cf5e...,0.601343
2,344029ccdb720001501d138a9a5ab1ff7abf827d296bba...,0.164597
3,4b09ee54b4254d63fa938bb67b93b9c134f336d78945d5...,-1.942375
4,c4e6c35ecccc7d71d7d677513fe15cb52dd034420cad2b...,-6.197563
