In [1]:
from spark_utils import get_spark_session
import pyspark.sql.functions as F

# Obtain the Spark session through the project helper.
spark = get_spark_session()

# Parquet inputs. If the "latest" training table does not exist yet,
# run rank_by_latest.py first to create it.
TRAIN_DATA_PATH = 'data_transformed/amex-default-prediction/train_data_latest'
TRAIN_LABELS_PATH = 'data/amex-default-prediction/train_labels'

train_data = spark.read.parquet(TRAIN_DATA_PATH)
train_labels = spark.read.parquet(TRAIN_LABELS_PATH)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/11 15:27:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Integrity checks on the loaded inputs. Every `count()` is a Spark action, so the
# row count of `train_data` is computed once and reused by the checks that need it.

# 1) train_data holds exactly one row per customer.
n_train_rows = train_data.count()
n_train_customers = train_data.select('customer_ID').distinct().count()
assert n_train_rows == n_train_customers, (
    f'train_data has repeated customer_ID values: '
    f'{n_train_rows} rows vs {n_train_customers} distinct customers'
)

# 2) train_labels holds exactly one row per customer.
n_label_rows = train_labels.count()
n_label_customers = train_labels.select('customer_ID').distinct().count()
assert n_label_rows == n_label_customers, (
    f'train_labels has repeated customer_ID values: '
    f'{n_label_rows} rows vs {n_label_customers} distinct customers'
)

# 3) Every train_data row finds a label: the inner join must not drop (or duplicate) rows.
n_joined_rows = train_data.join(train_labels, on='customer_ID', how='inner').count()
assert n_train_rows == n_joined_rows, (
    f'joining train_data with train_labels changed the row count: '
    f'{n_train_rows} rows before vs {n_joined_rows} after'
)

                                                                                

In [3]:
%%time
from format_data import CATEGORICAL_VARIABLES
from encoder import CategoricalToIntegerEncoders

# Build the encoders for the categorical columns, then fit them on the training data.
encs = CategoricalToIntegerEncoders(columns=CATEGORICAL_VARIABLES)
encs = encs.fit(train_data)

# Apply the fitted encoders to the training data.
# NOTE(review): the original author notes that nothing is "actually transformed yet,
# just fitted" at this point -- presumably because the Spark transformation is lazy
# and only runs once an action is triggered; confirm against encoder.py.
train_data_transformed = encs.transform(df=train_data, spark=spark)

In [None]:
%%time
# Pair every training row with its label (inner join on customer_ID), run the
# fitted encoders over the joined frame, and collect the result as a pandas DataFrame.
train_with_labels = train_data.join(train_labels, how='inner', on='customer_ID')
train_pdf = encs.transform(df=train_with_labels, spark=spark).toPandas()

In [None]:
from format_data import TARGET_VARIABLE, DATE_VARIABLES, ID_VARIABLES

# Columns that must not be fed to the model: the target, the identifier columns
# and the date columns (the keys of DATE_VARIABLES).
non_feature_columns = [TARGET_VARIABLE, *ID_VARIABLES, *DATE_VARIABLES.keys()]

# Every remaining column of the collected frame is a feature, in frame order.
feature_columns = [
    column for column in train_pdf.columns
    if column not in non_feature_columns
]

# Show the selected feature names as one comma-separated string.
', '.join(feature_columns)

In [None]:
# Feature matrix and target column taken from the collected training frame.
X = train_pdf[feature_columns]
y = train_pdf[TARGET_VARIABLE]
print(y.unique())

# Note that the negative class has been subsampled for this dataset at 5%, and thus
# receives a 20x weighting in the scoring metric.
# The negative class is target == 0, so those rows get weight 20 and every other row
# (the positives, target == 1) gets weight 1. The result keeps y's index, so it stays
# row-aligned with X and y when the three are split together.
w = y.eq(0).map({True: 20., False: 1.})
print(w.unique())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a validation set. Features, target and sample weights are passed to a single
# train_test_split call so the three are partitioned over the same rows.
# The fixed seed makes the split -- and therefore every score computed from it --
# reproducible across kernel restarts; the default split proportions are kept.
SPLIT_SEED = 42
X_train, X_valid, y_train, y_valid, w_train, w_valid = train_test_split(
    X, y, w, random_state=SPLIT_SEED
)

# Show the shape of each piece as a quick sanity check.
(
    X_train.shape, X_valid.shape,
    y_train.shape, y_valid.shape,
    w_train.shape, w_valid.shape,
)

In [None]:
%%time
from lightgbm import LGBMClassifier

# Fit a LightGBM classifier with default hyper-parameters on the weighted training
# split. The columns listed in `encs.columns_encoded` are declared as categorical features.
fit_kwargs = dict(
    X=X_train,
    y=y_train,
    sample_weight=w_train,
    categorical_feature=encs.columns_encoded,
)
m = LGBMClassifier().fit(**fit_kwargs)

# Display the classifier's `score` on the training and validation splits, each
# computed with that split's sample weights.
train_score = m.score(X=X_train, y=y_train, sample_weight=w_train)
valid_score = m.score(X=X_valid, y=y_valid, sample_weight=w_valid)
train_score, valid_score

In [None]:
from evaluation import amex_metric
from format_data import TARGET_VARIABLE, PREDICTION_VARIABLE
import pandas as pd


def evaluate(X, y_true, model=None) -> float:
    """Score a classifier's predicted probabilities with ``amex_metric``.

    Args:
        X: Feature rows to predict on.
        y_true: pandas Series with the true target value of each row of ``X``,
            in the same row order. Its index is discarded.
        model: Fitted classifier exposing ``predict_proba``. When omitted, the
            notebook-level model ``m`` is used, which keeps existing two-argument
            calls working unchanged.

    Returns:
        Whatever ``amex_metric`` returns for the given targets and predictions.
    """
    if model is None:
        # Fall back to the classifier fitted earlier in the notebook.
        model = m

    # Keep only the second probability column, i.e. the class at index 1
    # (the positive class for a 0/1 target).
    scores = model.predict_proba(X)[:, 1]

    # Both frames end up with a fresh 0..n-1 index: the target's original index is
    # dropped, and the prediction frame is built from a plain array. Rows therefore
    # correspond by position rather than by the shuffled index left by the split.
    target_frame = pd.DataFrame({TARGET_VARIABLE: y_true.reset_index(drop=True)})
    prediction_frame = pd.DataFrame({PREDICTION_VARIABLE: scores})
    return amex_metric(y_true=target_frame, y_pred=prediction_frame)


evaluate(X_train, y_train)
