In this notebook we are going to explore the efficiency of some tree-based approaches, namely:
- RandomForestClassifier
- BalancedRandomForestClassifier
- LightGBM


The reason for this approach is that tree-based models don't require as much preprocessing and they can work quite well with imbalanced data. In particular, `BalancedRandomForestClassifier` and gradient boosting methods (like `LightGBM`) could potentially work a bit better than a plain `RandomForestClassifier`.

In [None]:
%load_ext nb_black

In [None]:
import datetime as dt
import os
import os.path as osp
from functools import wraps

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

try:
    from sklearnex import patch_sklearn

    patch_sklearn()
except ImportError as e:
    print(e)
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import FunctionTransformer, FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

from sklearn.metrics import precision_recall_curve

from helpers.read_file import read_feather
from helpers.profiling import timing
from helpers.series_list import bag_of_words_series, get_n_elements

from modelling.utils.performance import (
    get_classification_report,
    plot_conf_matrix,
    plot_precision_recall_curve_over_thresholds,
    plot_precision_recall_curve,
)
from pipelines.preprocessing_pandas import (
    extract_hour_str,
    extract_weekday_timestamp,
    frequency_encoding,
    is_weekend,
    remove_dollar_sign,
)

from pipelines.decorators import apply_function_to_row

In [None]:
# Use matplotlib's "ggplot" style sheet for the figures drawn in this notebook.
plt.style.use("ggplot")

# Load data

In [None]:
# Load the training data through the project's read_feather helper.
# The path is relative, so it resolves against the notebook's working directory.
df = read_feather("../../data/train.fth")
# Preview the first rows.
df.head()

In [None]:
# Assemble one datetime column from the separate Year / Month / Day columns.
# pd.to_datetime builds every date in a single vectorised call instead of running
# a Python lambda once per row; the columns are renamed to the lowercase
# "year" / "month" / "day" names that the assembler documents.
df["date"] = pd.to_datetime(
    df[["Year", "Month", "Day"]].rename(columns=str.lower)
)

In [None]:
# Replace missing merchant states with an empty string so the column holds no
# NaN values when it is encoded later on.
df["Merchant State"] = df["Merchant State"].fillna("")

In [None]:
# Binary target: 1 when "Errors?" holds any value, 0 when it is missing.
# notna() states this directly (the previous form relied on NaN != NaN) and keeps
# df's index, so boolean masks built from the labels align with the rows of df.
labels_error_indicator = (
    df["Errors?"].notna().astype(int).rename("errors_indicator")
)

In [None]:
# X is the full frame itself (not a copy); the ColumnTransformer defined below
# selects the columns it needs. y is the binary error indicator.
X = df
y = labels_error_indicator

# Split train, test sets

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the same
# error / no-error ratio in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Preprocessing data/creating pipelines

In [None]:
# Shared building blocks reused by the per-column pipelines below.
# frequency_encoding is a project helper, called here with normalize=False.
# NOTE(review): FunctionTransformer is stateless, so the encoding is presumably
# recomputed from whatever data reaches transform() (train and test separately)
# — confirm that this is intended.
frequency_encoding_fn = FunctionTransformer(
    frequency_encoding, kw_args={"normalize": False}
)
# One-hot encoder; categories not seen during fit are ignored instead of raising.
ohe = OneHotEncoder(handle_unknown="ignore")

## Card

## Time

### Hour

In [None]:
# Format string handed to extract_hour_str as `time_format` for the "Time" column
# (presumably strptime-style "HH:MM" — verify against the helper).
TIME_FORMAT = "%H:%M"

In [None]:
# Hour feature: run extract_hour_str (with time_format=TIME_FORMAT) and then
# one-hot encode its output.
hour_pipeline = Pipeline(
    steps=[
        (
            "extract_hour",
            FunctionTransformer(
                extract_hour_str,
                kw_args=dict(time_format=TIME_FORMAT),
            ),
        ),
        ("ohe", ohe),
    ],
)

### Weekday

In [None]:
# Weekday feature: run extract_weekday_timestamp and then one-hot encode its output.
weekday_pipeline = Pipeline(
    steps=[
        ("extract_weekday", FunctionTransformer(extract_weekday_timestamp)),
        ("ohe", ohe),
    ],
)


## Amount

In [None]:
# Amount feature: a single step that applies the remove_dollar_sign helper.
amount_pipeline = Pipeline(
    steps=[
        ("remove_dollar_sign", FunctionTransformer(remove_dollar_sign)),
    ],
)


## Use chip

In [None]:
# "Use Chip" only needs one-hot encoding, so the shared encoder is used directly.
use_chip_pipeline = ohe

## Merchant name

In [None]:
# Merchant name: a single frequency-encoding step.
merchant_name_pipeline = Pipeline(
    steps=[("merchant_name", frequency_encoding_fn)],
)


## Merchant city

In [None]:
# Merchant city: a single frequency-encoding step.
merchant_city_pipeline = Pipeline(
    steps=[("merchant_city", frequency_encoding_fn)],
)

## Merchant state

In [None]:
# Merchant state: a single frequency-encoding step.
merchant_state_pipeline = Pipeline(
    steps=[("merchant_state", frequency_encoding_fn)],
)

## MCC

In [None]:
# MCC: a single frequency-encoding step.
mcc_pipeline = Pipeline(
    steps=[("mcc", frequency_encoding_fn)],
)

## Creating pipeline

In [None]:
# Route each raw column to its own sub-pipeline and concatenate the results.
# Columns not listed here are not forwarded (ColumnTransformer's default remainder).
# A plain string selector hands the transformer a 1-D column, while a list
# selector (as used for "Use Chip") hands it a 2-D frame.
pipeline = ColumnTransformer(
    [
        # Card is not encoded yet (no card_pipeline is defined in the visible cells):
        #         ("Card", card_pipeline, ["Card"]),
        ("Amount", amount_pipeline, "Amount"),
        ("Weekday", weekday_pipeline, "date"),
        ("Hour", hour_pipeline, "Time"),
        ("Use Chip", use_chip_pipeline, ["Use Chip"]),
        ("Merchant Name", merchant_name_pipeline, "Merchant Name"),
        ("Merchant City", merchant_city_pipeline, "Merchant City"),
        ("Merchant State", merchant_state_pipeline, "Merchant State"),
        ("MCC", mcc_pipeline, "MCC"),
    ]
)

In [None]:
# Show the first ten training rows whose label is 1 (transactions with an error).
X_train[y_train == 1][:10]

In [None]:
# Fit the preprocessing on the training split only, then transform that split.
X_train_prepared = pipeline.fit_transform(X_train)

In [None]:
# Apply the already-fitted preprocessing to the held-out split (no refitting).
X_test_prepared = pipeline.transform(X_test)

# Modelling

In [None]:
# "Balanced" class weights, n_samples / (n_classes * count(class)), for labels 0 and 1.
# classes is passed as an ndarray: recent scikit-learn releases validate the
# argument type and reject a plain Python list.
weights = compute_class_weight(
    class_weight="balanced", classes=np.array([0, 1]), y=y_train
)
# Display the two weights.
weights

## Hyperparameter tuning

Here, we are using `RandomizedSearchCV` (a better option might be `skopt.BayesSearchCV`) and testing only a few hyperparameters (although we could add more).

In [None]:
# Optional SMOTE oversampling of the prepared training data (disabled by default).
# The fixed random_state keeps the synthetic samples reproducible, matching the
# seed used for the split, the estimators and the search in this notebook.
# NOTE(review): resampling before cross-validation also places synthetic points
# in the validation folds — consider resampling inside an imblearn Pipeline.
RESAMPLING = False
if RESAMPLING:
    method = SMOTE(random_state=42)
    # Overwrites the prepared training features and labels with the resampled ones.
    X_train_prepared, y_train = method.fit_resample(X_train_prepared, y_train)

### Models

In [None]:
# Registry of candidate classifiers, keyed by the name passed to get_model().
# Each entry holds the unfitted estimator ("clf") and the hyperparameter values
# the randomized search samples from ("param_distributions").
models = {
    "random_forest": dict(
        clf=RandomForestClassifier(random_state=42),
        param_distributions=dict(
            n_estimators=[50, 100, 200],
            max_depth=[10, 30, 100],
        ),
    ),
    "lgb": dict(
        clf=lgb.LGBMClassifier(
            objective="binary",
            learning_rate=0.01,
            # Hand-picked weighting: the positive class counts 20x the negative one.
            class_weight={0: 1, 1: 20},
            random_state=42,
        ),
        # Nothing is searched for LightGBM at the moment; the n_estimators and
        # max_depth candidates used for the forests are disabled here.
        param_distributions=dict(),
    ),
    "balanced_tree": dict(
        clf=BalancedRandomForestClassifier(random_state=42),
        param_distributions=dict(
            n_estimators=[50, 100, 200],
            max_depth=[10, 30, 100],
        ),
    ),
}

In [None]:
def get_model(
    model: str,
    scoring="f1",
    cv=2,
    verbose=3,
    n_jobs=-1,
    n_iter=10,
    random_state=42,
):
    """Build an unfitted ``RandomizedSearchCV`` for one of the entries in ``models``.

    Args:
        model: Key into the module-level ``models`` registry.
        scoring: Metric used to rank candidates (forwarded to the search).
        cv: Cross-validation setting forwarded to the search.
        verbose: Verbosity level forwarded to the search.
        n_jobs: Number of parallel jobs forwarded to the search.
        n_iter: Number of parameter settings to sample. Defaults to 10, the
            value scikit-learn used implicitly before this parameter existed.
        random_state: Seed for the parameter sampling. Defaults to 42, the
            value that was previously hard-coded.

    Returns:
        The configured, not yet fitted, ``RandomizedSearchCV`` instance.

    Raises:
        KeyError: If ``model`` is not a key of ``models``.
    """
    try:
        selected_model = models[model]
    except KeyError:
        # Same exception type as a plain lookup, but it names the valid choices.
        raise KeyError(
            f"Unknown model {model!r}; expected one of {sorted(models)}"
        ) from None

    return RandomizedSearchCV(
        estimator=selected_model["clf"],
        param_distributions=selected_model["param_distributions"],
        n_iter=n_iter,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
        random_state=random_state,
        scoring=scoring,
    )




In [None]:
# Pick the model to tune by its registry key ("lgb" here) and run the search
# on the prepared training data.
rnd_search = get_model("lgb")
rnd_search.fit(X_train_prepared, y_train)

In [None]:
# Hyperparameters of the best-scoring candidate found by the search.
rnd_search.best_params_

In [None]:
# Cross-validation results for every sampled candidate, best rank first.
pd.DataFrame(rnd_search.cv_results_).sort_values("rank_test_score")

## Feature importance

In [None]:
# Raw feature importances of the refitted best estimator, one value per prepared column.
rnd_search.best_estimator_.feature_importances_

In [None]:
# Horizontal bar chart of the importances. The Series has a default integer index,
# so bars are labelled by column position in the prepared matrix, not by feature name.
pd.Series(rnd_search.best_estimator_.feature_importances_).plot(kind="barh")

# Performance on training set

In [None]:
# Display names passed to the reporting helpers
# (presumably index 0 -> label 0 and index 1 -> label 1; verify against the helpers).
CLASSES = ["errorless", "errors"]

In [None]:
# Hard class predictions of the best estimator on the training data.
y_train_pred = rnd_search.predict(X_train_prepared)

In [None]:
# Classification report on the training set via the project's helper.
get_classification_report(y_train, y_train_pred, digits=4, classes=CLASSES)

In [None]:
# Confusion matrix on the training set (rows: true label, columns: predicted label).
conf_matrix = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
plot_conf_matrix(conf_matrix, figsize=(5, 4), classes=CLASSES)

In [None]:
# Scratch calculation with counts typed in by hand.
# NOTE(review): the numbers look copied from a confusion matrix of an earlier run
# (a recall- or precision-style ratio); they will not update when the model changes —
# confirm which metric is meant and derive it from conf_matrix instead.
56 / (56 + 22)

# Performance on test set

In [None]:
# Hard class predictions of the best estimator on the held-out test data.
y_test_pred = rnd_search.predict(X_test_prepared)
# Same report settings as the training-set report (digits=4), so the two
# reports can be compared at the same precision.
get_classification_report(y_test, y_test_pred, digits=4, classes=CLASSES)

In [None]:
# Confusion matrix on the test set (rows: true label, columns: predicted label).
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
# Plot with the same figure size and class names as the training-set matrix,
# so the two figures are directly comparable.
plot_conf_matrix(conf_matrix, figsize=(5, 4), classes=CLASSES)

## Record performance

### Random Forest


### LGB






### Balanced Random Forest Classifier





## Precision recall curves

In [None]:
# Predicted probability of the positive class (column 1) for each test row.
y_test_pred_proba = rnd_search.predict_proba(X_test_prepared)[:, 1]

# Both arguments are passed positionally: the keyword for the scores was renamed
# from probas_pred to y_score in newer scikit-learn releases, and the positional
# form works with either version.
precisions, recalls, thresholds = precision_recall_curve(
    y_test, y_test_pred_proba
)

In [None]:
# Precision and recall as functions of the decision threshold (project helper).
ax = plot_precision_recall_curve_over_thresholds(precisions, recalls, thresholds)
# Dashed vertical marker at threshold 0.5
# (presumably the cut-off predict() applies by default — verify for this estimator).
ax.vlines(0.5, ymin=0, ymax=1, color="orange", linestyle="--", alpha=0.5)


In [None]:
# Precision plotted against recall (project helper).
ax = plot_precision_recall_curve(precisions, recalls)

# Error analysis

# <font color="green"> TEST </font>