In [1]:
from spark_utils import get_spark_session
import pyspark.sql.functions as F

spark = get_spark_session()


def _read_parquet(path):
    """Load the parquet dataset stored at `path` through the notebook's Spark session."""
    return spark.read.parquet(path)


# Aggregated train/test data; run transform_latest.py first if these do not exist.
train_data = _read_parquet('data_transformed/amex-default-prediction/train_data_aggregated')
test_data = _read_parquet('data_transformed/amex-default-prediction/test_data_aggregated')

# Labels and sample submission; run format_data.py first if these do not exist.
train_labels = _read_parquet('data/amex-default-prediction/train_labels')
sample_submission = _read_parquet('data/amex-default-prediction/sample_submission')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/20 22:41:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
%%time
from format_data import CATEGORICAL_VARIABLES
from encoder import CategoricalToIntegerEncoders

# Every categorical variable appears three times in the aggregated data,
# once per aggregation suffix, so expand each base name into its three columns.
categorical_cols = [
    f'{variable}_{suffix}'
    for variable in CATEGORICAL_VARIABLES
    for suffix in ('first', 'last', 'mode')
]

# Fit the categorical encoders on the training data only; the fitted
# encoders are reused later to transform both train and test frames.
encs = CategoricalToIntegerEncoders(columns=categorical_cols).fit(train_data)

                                                                                

CPU times: user 394 ms, sys: 77.7 ms, total: 472 ms
Wall time: 19.1 s


In [3]:
%%time
from format_data import TARGET_VARIABLE, DATE_VARIABLES, ID_VARIABLES

# Training frame: attach the labels by customer, apply the fitted encoders,
# then collect the Spark result into a pandas DataFrame.
labelled_train = train_data.join(train_labels, on='customer_ID', how='inner')
train_pdf = encs.transform(spark=spark, df=labelled_train).toPandas()

# Test frame: same encoders, no labels to join.
test_pdf = encs.transform(spark=spark, df=test_data).toPandas()

# Columns that are excluded from the model inputs: the target, the ID
# columns, the date columns and the first/last aggregates of S_2.
non_feature_columns = (
    [TARGET_VARIABLE]
    + list(ID_VARIABLES)
    + list(DATE_VARIABLES.keys())
    + ['S_2_first', 'S_2_last']
)

# Everything else in the training frame is a feature, kept in frame order.
feature_columns = []
for column in train_pdf.columns:
    if column not in non_feature_columns:
        feature_columns.append(column)
print(f'len(feature_columns): {len(feature_columns)}\n', ', '.join(feature_columns))

22/06/20 22:42:08 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

len(feature_columns): 565
 num_statements, P_2_first, P_2_last, P_2_mean, D_39_first, D_39_last, D_39_mean, B_1_first, B_1_last, B_1_mean, B_2_first, B_2_last, B_2_mean, R_1_first, R_1_last, R_1_mean, S_3_first, S_3_last, S_3_mean, D_41_first, D_41_last, D_41_mean, B_3_first, B_3_last, B_3_mean, D_42_first, D_42_last, D_42_mean, D_43_first, D_43_last, D_43_mean, D_44_first, D_44_last, D_44_mean, B_4_first, B_4_last, B_4_mean, D_45_first, D_45_last, D_45_mean, B_5_first, B_5_last, B_5_mean, R_2_first, R_2_last, R_2_mean, D_46_first, D_46_last, D_46_mean, D_47_first, D_47_last, D_47_mean, D_48_first, D_48_last, D_48_mean, D_49_first, D_49_last, D_49_mean, B_6_first, B_6_last, B_6_mean, B_7_first, B_7_last, B_7_mean, B_8_first, B_8_last, B_8_mean, D_50_first, D_50_last, D_50_mean, D_51_first, D_51_last, D_51_mean, B_9_first, B_9_last, B_9_mean, R_3_first, R_3_last, R_3_mean, D_52_first, D_52_last, D_52_mean, P_3_first, P_3_last, P_3_mean, B_10_first, B_10_last, B_10_mean, D_53_first, D_53

In [4]:
import numpy as np

# Select the feature columns from both frames and give each a fresh
# 0..n-1 index so rows are addressed positionally from here on.
X_fit, X_test = (
    frame[feature_columns].reset_index(drop=True)
    for frame in (train_pdf, test_pdf)
)
# Target values as a plain NumPy array, in the same row order as X_fit.
y_fit = np.array(train_pdf[TARGET_VARIABLE])

print(X_fit.shape, X_test.shape)
# Class balance of the target: (distinct values, count of each).
print(np.unique(y_fit, return_counts=True))

(458913, 565) (924621, 565)
(array([0., 1.], dtype=float32), array([340085, 118828]))


In [5]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation split (and therefore every metric
# logged for this run) is reproducible under Restart & Run All.
SPLIT_SEED = 42

# Hold out scikit-learn's default 25% for validation. Stratify on the target
# because the classes are imbalanced, which keeps the positive rate the same
# in both partitions.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_fit, y_fit,
    random_state=SPLIT_SEED,
    stratify=y_fit,
)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(344184, 565) (114729, 565) (344184,) (114729,)


In [6]:
%%time
import mlflow
from lightgbm import LGBMClassifier
from evaluation import feval_amex, feval_amex_gini, feval_amex_top4

EXPERIMENT_NAME = 'v2_aggregated.ipynb'

# Let MLflow record LightGBM parameters, metrics and the model automatically.
mlflow.lightgbm.autolog()

# get_experiment_by_name returns None when the experiment does not exist yet
# (e.g. a fresh tracking store); create it instead of failing on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(experiment_id=experiment_id) as run:
    # Kept at notebook scope: later cells name the submission file after it.
    run_id = run.info.run_id
    # Default-hyperparameter LightGBM classifier. The encoded categorical
    # columns are declared so LightGBM treats them as categories, and the
    # held-out split is scored every boosting round with the built-in metrics
    # plus the project's feval_amex* callables from evaluation.py.
    m = LGBMClassifier().fit(
        X=X_train, y=y_train, categorical_feature=encs.columns_encoded,
        eval_set=[(X_valid, y_valid)],
        eval_metric=['auc', 'average_precision', feval_amex, feval_amex_gini, feval_amex_top4],
    )

New categorical_feature is ['B_30_first_CategoricalToIntegerEncoder', 'B_30_last_CategoricalToIntegerEncoder', 'B_30_mode_CategoricalToIntegerEncoder', 'B_38_first_CategoricalToIntegerEncoder', 'B_38_last_CategoricalToIntegerEncoder', 'B_38_mode_CategoricalToIntegerEncoder', 'D_114_first_CategoricalToIntegerEncoder', 'D_114_last_CategoricalToIntegerEncoder', 'D_114_mode_CategoricalToIntegerEncoder', 'D_116_first_CategoricalToIntegerEncoder', 'D_116_last_CategoricalToIntegerEncoder', 'D_116_mode_CategoricalToIntegerEncoder', 'D_117_first_CategoricalToIntegerEncoder', 'D_117_last_CategoricalToIntegerEncoder', 'D_117_mode_CategoricalToIntegerEncoder', 'D_120_first_CategoricalToIntegerEncoder', 'D_120_last_CategoricalToIntegerEncoder', 'D_120_mode_CategoricalToIntegerEncoder', 'D_126_first_CategoricalToIntegerEncoder', 'D_126_last_CategoricalToIntegerEncoder', 'D_126_mode_CategoricalToIntegerEncoder', 'D_63_first_CategoricalToIntegerEncoder', 'D_63_last_CategoricalToIntegerEncoder', 'D_63_

[1]	valid_0's auc: 0.929956	valid_0's average_precision: 0.814174	valid_0's binary_logloss: 0.520128	valid_0's amex: 0.693264	valid_0's amex_gini: 0.85918	valid_0's amex_top4: 0.527347
[2]	valid_0's auc: 0.937917	valid_0's average_precision: 0.83757	valid_0's binary_logloss: 0.480084	valid_0's amex: 0.709294	valid_0's amex_gini: 0.875374	valid_0's amex_top4: 0.543214
[3]	valid_0's auc: 0.940007	valid_0's average_precision: 0.846702	valid_0's binary_logloss: 0.447958	valid_0's amex: 0.719678	valid_0's amex_gini: 0.879433	valid_0's amex_top4: 0.559922
[4]	valid_0's auc: 0.941333	valid_0's average_precision: 0.849337	valid_0's binary_logloss: 0.421531	valid_0's amex: 0.721893	valid_0's amex_gini: 0.882283	valid_0's amex_top4: 0.561502
[5]	valid_0's auc: 0.942092	valid_0's average_precision: 0.851671	valid_0's binary_logloss: 0.399342	valid_0's amex: 0.725622	valid_0's amex_gini: 0.883825	valid_0's amex_top4: 0.567419
[6]	valid_0's auc: 0.943275	valid_0's average_precision: 0.854129	valid_



CPU times: user 4min 53s, sys: 8.85 s, total: 5min 2s
Wall time: 1min 31s


In [7]:
from format_data import PREDICTION_VARIABLE
import pandas as pd

# raw_score=True requests LightGBM's untransformed scores instead of class
# labels; the values shown below are unbounded and can be negative.
test_scores = m.predict(X_test, raw_score=True)

# Pair each test customer with its score in the submission layout.
pred_test = pd.DataFrame({
    'customer_ID': test_pdf['customer_ID'],
    PREDICTION_VARIABLE: test_scores,
})
pred_test.head()

Unnamed: 0,customer_ID,prediction
0,f46725e7c0d8fafde0552b32932ffca160824299931b08...,1.165237
1,cfff19edd4504ac82b42e5f3bb4a25ccd113bc52693fc5...,-6.411117
2,2c2d36b88f7fb75d0a184e1a6572c5b17991983be33617...,-1.706765
3,e71386714f6336d86973d5f01be027172291ec5c03a0b2...,-2.529496
4,7830708a49202bab905d42cede695a4e04ae60dc20bfcd...,-0.75349


In [8]:
# Write the predictions to a CSV named after the MLflow run that produced
# them, without the pandas index column.
submission_path = f'{run_id}.csv'
pred_test.to_csv(submission_path, index=False)

In [10]:
!kaggle competitions submit -c amex-default-prediction -f 0261ff0b99024adc88cd4b8566e2a4d1.csv -m "experiments/7/runs/0261ff0b99024adc88cd4b8566e2a4d1"

100%|██████████████████████████████████████| 74.2M/74.2M [00:01<00:00, 45.7MB/s]
Successfully submitted to American Express - Default Prediction