In [1]:
from spark_utils import get_spark_session

# Input locations, relative to the notebook's working directory.
TRAIN_DATA_PATH = 'data_transformed/amex-default-prediction/train_data_aggregated'
TRAIN_LABELS_PATH = 'data/amex-default-prediction/train_labels'

# One Spark session is shared by every cell below.
spark = get_spark_session()
train_data = spark.read.parquet(TRAIN_DATA_PATH)
train_labels = spark.read.parquet(TRAIN_LABELS_PATH)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/21 11:36:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
%%time
from format_data import CATEGORICAL_VARIABLES
from encoder import CategoricalToIntegerEncoders

# Each categorical variable appears three times in the aggregated data:
#   <name>           windowed feature
#   <name>_previous  windowed feature
#   <name>_mode      aggregated feature
categorical_cols = [
    column
    for c in CATEGORICAL_VARIABLES
    for column in (c, f'{c}_previous', f'{c}_mode')
]

# Fit the categorical encoders on the training data only.
encs = CategoricalToIntegerEncoders(columns=categorical_cols).fit(train_data)

                                                                                

CPU times: user 423 ms, sys: 92.1 ms, total: 515 ms
Wall time: 22.4 s


In [3]:
%%time
from format_data import TARGET_VARIABLE, DATE_VARIABLES, ID_VARIABLES

# Build train_pdf: attach the labels to the aggregated features in Spark,
# run the fitted encoders over the result, then collect it as a pandas frame.
labelled_sdf = train_data.join(train_labels, on='customer_ID', how='inner')
train_pdf = encs.transform(spark=spark, df=labelled_sdf).toPandas()

# Anything that is not the target, an ID or a date column is a model feature.
non_feature_columns = [TARGET_VARIABLE] + list(ID_VARIABLES) + list(DATE_VARIABLES.keys())
feature_columns = [
    column for column in train_pdf.columns
    if column not in non_feature_columns
]
header = f'len(feature_columns): {len(feature_columns)}\n'
print(header, ', '.join(feature_columns))

22/06/21 11:37:18 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

len(feature_columns): 931
 S_2_days_since_previous, P_2, P_2_previous, D_39, D_39_previous, B_1, B_1_previous, B_2, B_2_previous, R_1, R_1_previous, S_3, S_3_previous, D_41, D_41_previous, B_3, B_3_previous, D_42, D_42_previous, D_43, D_43_previous, D_44, D_44_previous, B_4, B_4_previous, D_45, D_45_previous, B_5, B_5_previous, R_2, R_2_previous, D_46, D_46_previous, D_47, D_47_previous, D_48, D_48_previous, D_49, D_49_previous, B_6, B_6_previous, B_7, B_7_previous, B_8, B_8_previous, D_50, D_50_previous, D_51, D_51_previous, B_9, B_9_previous, R_3, R_3_previous, D_52, D_52_previous, P_3, P_3_previous, B_10, B_10_previous, D_53, D_53_previous, S_5, S_5_previous, B_11, B_11_previous, S_6, S_6_previous, D_54, D_54_previous, R_4, R_4_previous, S_7, S_7_previous, B_12, B_12_previous, S_8, S_8_previous, D_55, D_55_previous, D_56, D_56_previous, B_13, B_13_previous, R_5, R_5_previous, D_58, D_58_previous, S_9, S_9_previous, B_14, B_14_previous, D_59, D_59_previous, D_60, D_60_previous, D_61,

In [4]:
import numpy as np

# Feature matrix: selected columns, fresh 0..n-1 index, everything cast to float.
features_pdf = train_pdf[feature_columns]
X_fit = features_pdf.reset_index(drop=True).astype(float)
print(X_fit.shape)

# Target vector, followed by the class balance as (values, counts).
y_fit = np.array(train_pdf[TARGET_VARIABLE])
class_values, class_counts = np.unique(y_fit, return_counts=True)
print((class_values, class_counts))

(458913, 931)
(array([0., 1.], dtype=float32), array([340085, 118828]))


In [5]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation membership, and therefore every
# validation metric logged below, is identical on each Restart & Run All.
SPLIT_SEED = 42

# Default proportions are kept (75% train / 25% validation).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_fit, y_fit, random_state=SPLIT_SEED)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(344184, 931) (114729, 931) (344184,) (114729,)


In [6]:
%%time
import mlflow
from lightgbm import LGBMClassifier
from evaluation import feval_amex, feval_amex_gini, feval_amex_top4
import tempfile
import pickle
import os

EXPERIMENT_NAME = 'v2_aggregated.ipynb'

mlflow.lightgbm.autolog()

# get_experiment_by_name returns None for an unknown name, so create the
# experiment on first use instead of failing with AttributeError.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(experiment_id=experiment_id) as run:
    # run_id is needed later to reload the model and artifacts for prediction.
    run_id = run.info.run_id
    print('run_id', run_id)

    # Store everything (besides the model itself) required for prediction:
    # the fitted encoders and the exact feature column order.
    with tempfile.TemporaryDirectory() as p:
        prediction_artifacts = {
            'encs': encs,
            'feature_columns': feature_columns,
            # TODO: pack a spark dataframe to X transformer here so simplify prediction
        }
        with open(os.path.join(p, 'prediction_artifacts.pickle'), 'wb') as f:
            pickle.dump(prediction_artifacts, f)
        mlflow.log_artifacts(p)

    # Training. n_estimators is the scikit-learn wrapper's documented name for
    # the number of boosting rounds (num_boost_round is only accepted as an alias).
    m = LGBMClassifier(
        n_estimators=200,
    ).fit(
        X=X_train, y=y_train, categorical_feature=encs.columns_encoded,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_names=['train', 'valid'],
        eval_metric=[feval_amex, feval_amex_gini, feval_amex_top4],
    )

run_id a4afb732fa744b81b8e3163f9af307b0


New categorical_feature is ['B_30_CategoricalToIntegerEncoder', 'B_30_mode_CategoricalToIntegerEncoder', 'B_30_previous_CategoricalToIntegerEncoder', 'B_38_CategoricalToIntegerEncoder', 'B_38_mode_CategoricalToIntegerEncoder', 'B_38_previous_CategoricalToIntegerEncoder', 'D_114_CategoricalToIntegerEncoder', 'D_114_mode_CategoricalToIntegerEncoder', 'D_114_previous_CategoricalToIntegerEncoder', 'D_116_CategoricalToIntegerEncoder', 'D_116_mode_CategoricalToIntegerEncoder', 'D_116_previous_CategoricalToIntegerEncoder', 'D_117_CategoricalToIntegerEncoder', 'D_117_mode_CategoricalToIntegerEncoder', 'D_117_previous_CategoricalToIntegerEncoder', 'D_120_CategoricalToIntegerEncoder', 'D_120_mode_CategoricalToIntegerEncoder', 'D_120_previous_CategoricalToIntegerEncoder', 'D_126_CategoricalToIntegerEncoder', 'D_126_mode_CategoricalToIntegerEncoder', 'D_126_previous_CategoricalToIntegerEncoder', 'D_63_CategoricalToIntegerEncoder', 'D_63_mode_CategoricalToIntegerEncoder', 'D_63_previous_Categorical





[1]	train's binary_logloss: 0.519178	train's amex: 0.706113	train's amex_gini: 0.869267	train's amex_top4: 0.54296	valid's binary_logloss: 0.520279	valid's amex: 0.697641	valid's amex_gini: 0.865518	valid's amex_top4: 0.529764
[2]	train's binary_logloss: 0.478901	train's amex: 0.722597	train's amex_gini: 0.880396	train's amex_top4: 0.564799	valid's binary_logloss: 0.480236	valid's amex: 0.712963	valid's amex_gini: 0.876376	valid's amex_top4: 0.54955
[3]	train's binary_logloss: 0.446547	train's amex: 0.728289	train's amex_gini: 0.883908	train's amex_top4: 0.57267	valid's binary_logloss: 0.448012	valid's amex: 0.718757	valid's amex_gini: 0.879969	valid's amex_top4: 0.557545
[4]	train's binary_logloss: 0.419907	train's amex: 0.732422	train's amex_gini: 0.8872	train's amex_top4: 0.577644	valid's binary_logloss: 0.421526	valid's amex: 0.724649	valid's amex_gini: 0.883019	valid's amex_top4: 0.566279
[5]	train's binary_logloss: 0.397446	train's amex: 0.734754	train's amex_gini: 0.889012	train



CPU times: user 11min 40s, sys: 12.9 s, total: 11min 52s
Wall time: 5min 55s


In [1]:
# run these if kernel dies due to OOM

import mlflow
import tempfile
import pickle
from spark_utils import get_spark_session

spark = get_spark_session()

# ID of the training run whose model and artifacts are restored here.
run_id = 'a4afb732fa744b81b8e3163f9af307b0'

loaded_model = mlflow.lightgbm.load_model(f'runs:/{run_id}/model')

# Fetch the pickled prediction artifacts into a throwaway directory and load
# them. NOTE: pickle.load executes code from the file; only use trusted runs.
with tempfile.TemporaryDirectory() as download_dir:
    artifact_file = mlflow.artifacts.download_artifacts(
        run_id=run_id,
        artifact_path='prediction_artifacts.pickle',
        dst_path=download_dir,
    )
    with open(artifact_file, 'rb') as artifact_fh:
        prediction_artifacts = pickle.load(artifact_fh)

# Re-create the names the prediction cells expect from the training session.
m = loaded_model
encs = prediction_artifacts['encs']
feature_columns = prediction_artifacts['feature_columns']

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/22 03:20:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from format_data import PREDICTION_VARIABLE
import pandas as pd
import pyspark.sql.functions as F
import warnings


# Silence pandas PerformanceWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

test_batch_size = 458913
test_data = spark.read.parquet('data_transformed/amex-default-prediction/test_data_aggregated')

# Tag every row with a batch number so the test set can be scored in pieces.
# NOTE(review): monotonically_increasing_id() is unique but not consecutive,
# so a batch does not necessarily hold test_batch_size rows; the batch numbers
# actually produced are whatever distinct values appear below.
batch_num_col = F.floor(F.monotonically_increasing_id()/F.lit(test_batch_size))
test_data = test_data.withColumn('batch_num', batch_num_col)

distinct_batch_rows = test_data.select('batch_num').distinct().collect()
test_batch_indices = [batch_row['batch_num'] for batch_row in distinct_batch_rows]
print(len(test_batch_indices), test_batch_indices)



19 [0, 18718, 37436, 56154, 74872, 93590, 112308, 131026, 149744, 168462, 187180, 205898, 224616, 243334, 262052, 280770, 299488, 318206, 336924]


                                                                                

In [3]:
%%time
# Imported here so this cell also works in the post-restart session, where the
# training cell's imports (including `os`) were never executed.
import subprocess

# Score the test set one batch at a time to keep memory bounded.
pred_test_l = []
n_test_batches = len(test_batch_indices)
for i, test_batch_index in enumerate(test_batch_indices):
    print(f'[{i}/{n_test_batches}]')
    batch_sdf = test_data.filter(F.col('batch_num') == test_batch_index)
    test_pdf = encs.transform(spark=spark, df=batch_sdf).toPandas()

    # Same column order and dtype as the training matrix.
    X_test = test_pdf[feature_columns].reset_index(drop=True).astype(float)
    print(X_test.shape)

    pred_test_l.append(pd.DataFrame({
        'customer_ID': test_pdf['customer_ID'],
        PREDICTION_VARIABLE: m.predict(X_test, raw_score=True),
    }))

pred_test = pd.concat(pred_test_l, axis=0)

# Check the submission before sending it: each customer must appear once.
assert pred_test['customer_ID'].is_unique, 'duplicate customer_ID in predictions'

submission_path = f'{run_id}.csv'
pred_test.to_csv(submission_path, index=False)

# Argument list instead of a shell string: nothing is shell-interpolated, and
# check=True raises if the kaggle CLI exits with a non-zero status.
submit_result = subprocess.run(
    ['kaggle', 'competitions', 'submit',
     '-c', 'amex-default-prediction',
     '-f', submission_path,
     '-m', run_id],
    check=True,
)

[0/19


22/06/22 03:20:27 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

(48694, 931)
[1/19


                                                                                

(48701, 931)
[2/19


                                                                                

(48686, 931)
[3/19


                                                                                

(48698, 931)
[4/19


                                                                                

(48678, 931)
[5/19


                                                                                

(48676, 931)
[6/19


                                                                                

(48680, 931)
[7/19


                                                                                

(48662, 931)
[8/19


                                                                                

(48667, 931)
[9/19


                                                                                

(48654, 931)
[10/19


                                                                                

(48666, 931)
[11/19


                                                                                

(48655, 931)
[12/19


                                                                                

(48657, 931)
[13/19


                                                                                

(48660, 931)
[14/19


                                                                                

(48638, 931)
[15/19


                                                                                

(48638, 931)
[16/19


                                                                                

(48635, 931)
[17/19


                                                                                

(48632, 931)
[18/19


                                                                                

(48644, 931)


100%|██████████| 74.2M/74.2M [00:01<00:00, 41.9MB/s]


Successfully submitted to American Express - Default PredictionCPU times: user 34.4 s, sys: 12 s, total: 46.4 s
Wall time: 5min 58s


0

In [4]:
pred_test_df = spark.createDataFrame(pred_test)
n_predictions = pred_test_df.count()

# The number of rows must equal the number of distinct customer_IDs.
n_distinct_customers = pred_test_df.select('customer_ID').distinct().count()
assert n_predictions == n_distinct_customers

# An inner join with the sample submission must keep the same number of rows.
sample_submission = spark.read.parquet('data/amex-default-prediction/sample_submission')
n_joined = pred_test_df.join(sample_submission, on='customer_ID', how='inner').count()
assert n_predictions == n_joined

22/06/22 03:26:20 WARN TaskSetManager: Stage 23 contains a task of very large size (8582 KiB). The maximum recommended task size is 1000 KiB.
22/06/22 03:26:21 WARN TaskSetManager: Stage 26 contains a task of very large size (8582 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

(924621, 924621)