In [1]:
from spark_utils import get_spark_session
import pyspark.sql.functions as F

def _read_parquet(path):
    """Return the parquet dataset stored at `path` as a Spark DataFrame."""
    return spark.read.parquet(path)


spark = get_spark_session()

# Aggregated train/test tables: run transform_latest.py first if these don't exist.
test_data = _read_parquet('data_transformed/amex-default-prediction/test_data_aggregated')
train_data = _read_parquet('data_transformed/amex-default-prediction/train_data_aggregated')

# Labels and sample submission: run format_data.py first if these don't exist.
train_labels = _read_parquet('data/amex-default-prediction/train_labels')
sample_submission = _read_parquet('data/amex-default-prediction/sample_submission')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/21 03:34:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
%%time
from format_data import CATEGORICAL_VARIABLES
from encoder import CategoricalToIntegerEncoders

# Every categorical variable contributes three columns, in this order:
#   <name>, <name>_previous  -> windowed features
#   <name>_mode              -> aggregated feature
categorical_cols = []
for c in CATEGORICAL_VARIABLES:
    categorical_cols.extend((c, f'{c}_previous', f'{c}_mode'))

# Fit the categorical -> integer encoders on the training data only.
encs = CategoricalToIntegerEncoders(
    columns=categorical_cols,
).fit(train_data)

                                                                                

CPU times: user 417 ms, sys: 74.6 ms, total: 491 ms
Wall time: 17.7 s


In [3]:
%%time
from format_data import TARGET_VARIABLE, DATE_VARIABLES, ID_VARIABLES

# make train_pdf: attach the labels, encode the categorical columns, then
# collect the result into a pandas DataFrame on the driver.
train_pdf = encs.transform(
    spark=spark,
    df=train_data.join(train_labels, on='customer_ID', how='inner'),
).toPandas()

# make test_pdf (currently disabled)
# test_pdf = encs.transform(spark=spark, df=test_data).toPandas()

# Everything that is not the target, an ID or a date column is used as a feature.
non_feature_columns = [TARGET_VARIABLE] + list(ID_VARIABLES) + list(DATE_VARIABLES.keys())
feature_columns = [
    column
    for column in train_pdf.columns
    if column not in non_feature_columns
]
print(f'len(feature_columns): {len(feature_columns)}\n', ', '.join(feature_columns))

22/06/21 03:34:41 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

len(feature_columns): 1307
 S_2_days_since_previous, P_2, P_2_previous, D_39, D_39_previous, B_1, B_1_previous, B_2, B_2_previous, R_1, R_1_previous, S_3, S_3_previous, D_41, D_41_previous, B_3, B_3_previous, D_42, D_42_previous, D_43, D_43_previous, D_44, D_44_previous, B_4, B_4_previous, D_45, D_45_previous, B_5, B_5_previous, R_2, R_2_previous, D_46, D_46_previous, D_47, D_47_previous, D_48, D_48_previous, D_49, D_49_previous, B_6, B_6_previous, B_7, B_7_previous, B_8, B_8_previous, D_50, D_50_previous, D_51, D_51_previous, B_9, B_9_previous, R_3, R_3_previous, D_52, D_52_previous, P_3, P_3_previous, B_10, B_10_previous, D_53, D_53_previous, S_5, S_5_previous, B_11, B_11_previous, S_6, S_6_previous, D_54, D_54_previous, R_4, R_4_previous, S_7, S_7_previous, B_12, B_12_previous, S_8, S_8_previous, D_55, D_55_previous, D_56, D_56_previous, B_13, B_13_previous, R_5, R_5_previous, D_58, D_58_previous, S_9, S_9_previous, B_14, B_14_previous, D_59, D_59_previous, D_60, D_60_previous, D_61

In [7]:
import numpy as np

# Feature matrix: selected feature columns, cast to float, on a fresh 0..n-1 index.
X_fit = train_pdf.loc[:, feature_columns].reset_index(drop=True).astype(float)
print(X_fit.shape)
# X_test = test_pdf[feature_columns].reset_index(drop=True)
# print(X_test.shape)

# Target vector as a standalone numpy array (a copy, not a view of train_pdf).
y_fit = train_pdf[TARGET_VARIABLE].to_numpy(copy=True)
# Show the class balance: (distinct labels, number of rows per label).
print(np.unique(y_fit, return_counts=True))

(458913, 1307)
(array([0., 1.], dtype=float32), array([340085, 118828]))


In [8]:
from sklearn.model_selection import train_test_split

# Split into train / validation parts using train_test_split's default
# proportions (25% of the rows are held out for validation).
#
# random_state: without a fixed seed the rows landing in the validation set
#   change on every kernel restart, so logged metrics are not comparable
#   between runs.
# stratify: the target is imbalanced (roughly 26% positives), so keep the same
#   class ratio in both parts instead of leaving it to chance.
SPLIT_SEED = 42
X_train, X_valid, y_train, y_valid = train_test_split(
    X_fit,
    y_fit,
    random_state=SPLIT_SEED,
    stratify=y_fit,
)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(344184, 1307) (114729, 1307) (344184,) (114729,)


In [9]:
%%time
import mlflow
from lightgbm import LGBMClassifier
from evaluation import feval_amex, feval_amex_gini, feval_amex_top4

# Name of the MLflow experiment that collects the runs of this notebook.
EXPERIMENT_NAME = 'v2_aggregated.ipynb'

# Let MLflow capture LightGBM parameters, metrics and the fitted model automatically.
mlflow.lightgbm.autolog()

# get_experiment_by_name() returns None when the experiment does not exist yet
# (e.g. on a fresh tracking store), which previously crashed with
# "AttributeError: 'NoneType' object has no attribute 'experiment_id'".
# Create the experiment on first use instead.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(experiment_id=experiment_id) as run:
    # Kept at notebook scope: the (currently commented-out) submission cell
    # uses run_id to name the prediction CSV.
    run_id = run.info.run_id
    # LightGBM with default hyper-parameters; the encoded categorical columns
    # are declared explicitly and every metric is evaluated on the validation
    # split after each boosting round.
    m = LGBMClassifier().fit(
        X=X_train, y=y_train, categorical_feature=encs.columns_encoded,
        eval_set=[(X_valid, y_valid)],
        eval_metric=['auc', 'average_precision', feval_amex, feval_amex_gini, feval_amex_top4],
    )

New categorical_feature is ['B_30_CategoricalToIntegerEncoder', 'B_30_mode_CategoricalToIntegerEncoder', 'B_30_previous_CategoricalToIntegerEncoder', 'B_38_CategoricalToIntegerEncoder', 'B_38_mode_CategoricalToIntegerEncoder', 'B_38_previous_CategoricalToIntegerEncoder', 'D_114_CategoricalToIntegerEncoder', 'D_114_mode_CategoricalToIntegerEncoder', 'D_114_previous_CategoricalToIntegerEncoder', 'D_116_CategoricalToIntegerEncoder', 'D_116_mode_CategoricalToIntegerEncoder', 'D_116_previous_CategoricalToIntegerEncoder', 'D_117_CategoricalToIntegerEncoder', 'D_117_mode_CategoricalToIntegerEncoder', 'D_117_previous_CategoricalToIntegerEncoder', 'D_120_CategoricalToIntegerEncoder', 'D_120_mode_CategoricalToIntegerEncoder', 'D_120_previous_CategoricalToIntegerEncoder', 'D_126_CategoricalToIntegerEncoder', 'D_126_mode_CategoricalToIntegerEncoder', 'D_126_previous_CategoricalToIntegerEncoder', 'D_63_CategoricalToIntegerEncoder', 'D_63_mode_CategoricalToIntegerEncoder', 'D_63_previous_Categorical

[1]	valid_0's auc: 0.932108	valid_0's average_precision: 0.819139	valid_0's binary_logloss: 0.520107	valid_0's amex: 0.693407	valid_0's amex_gini: 0.864037	valid_0's amex_top4: 0.522777
[2]	valid_0's auc: 0.938503	valid_0's average_precision: 0.843151	valid_0's binary_logloss: 0.480095	valid_0's amex: 0.712521	valid_0's amex_gini: 0.876681	valid_0's amex_top4: 0.548361
[3]	valid_0's auc: 0.940349	valid_0's average_precision: 0.847258	valid_0's binary_logloss: 0.448055	valid_0's amex: 0.719135	valid_0's amex_gini: 0.880463	valid_0's amex_top4: 0.557808
[4]	valid_0's auc: 0.942117	valid_0's average_precision: 0.85239	valid_0's binary_logloss: 0.421351	valid_0's amex: 0.726549	valid_0's amex_gini: 0.884128	valid_0's amex_top4: 0.56897
[5]	valid_0's auc: 0.942932	valid_0's average_precision: 0.854871	valid_0's binary_logloss: 0.399076	valid_0's amex: 0.728253	valid_0's amex_gini: 0.885587	valid_0's amex_top4: 0.570919
[6]	valid_0's auc: 0.94459	valid_0's average_precision: 0.85828	valid_0'

In [None]:
# from format_data import PREDICTION_VARIABLE
# import pandas as pd

# pred_test = pd.DataFrame({
#     'customer_ID': test_pdf['customer_ID'],
#     PREDICTION_VARIABLE: m.predict(X_test, raw_score=True),
# })
# pred_test.head()

In [None]:
# pred_test.to_csv(f'{run_id}.csv', index=False)

In [None]:
# !kaggle competitions submit -c amex-default-prediction -f 0261ff0b99024adc88cd4b8566e2a4d1.csv -m "experiments/7/runs/0261ff0b99024adc88cd4b8566e2a4d1"