In [None]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import (
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    plot_confusion_matrix,
    precision_recall_fscore_support,
    precision_score,
    recall_score,
    roc_curve,
    roc_auc_score,
    precision_recall_curve,
    auc,
)
import matplotlib.pyplot as plt

%matplotlib inline
%load_ext nb_black

In [None]:
# Load the training table; the company_id column becomes the row index.
train_csv_path = "../data/input/train.csv"
df = pd.read_csv(train_csv_path, index_col="company_id")

In [None]:
# Preprocessing + model, assembled with imblearn's pipeline builder so that a
# resampling step (e.g. RandomUnderSampler(random_state=42)) can be slotted in
# between the imputer and the classifier. No resampler is active right now.
imputer = SimpleImputer(strategy="constant", fill_value=0)  # missing values -> 0
classifier = RandomForestClassifier(random_state=42)  # fixed seed for repeatable runs
pipeline = make_pipeline_with_sampler(imputer, classifier)

## Single train/test split

In [None]:
# Features are every column except the last one; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# 80/20 hold-out split, stratified on the target and seeded for repeatability.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Fit on the training split (fit() hands back the same pipeline object, so no
# rebinding is required) and predict hard labels for the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Per-class metric arrays (no averaging is requested), then the single
# balanced-accuracy scalar.
precision, recall, fscore, support = precision_recall_fscore_support(
    y_test, y_pred
)
bal_acc = balanced_accuracy_score(y_test, y_pred)

In [None]:
# Headline test-split metrics. Index 1 selects the second entry of each
# per-class array (presumably the positive class — confirm the label order).
print(f"Balanced accuracy score : {bal_acc:.3f}")
print(f"Precision : {precision[1]:.3f}")
print(f"Recall : {recall[1]:.3f}")
print(f"Fscore : {fscore[1]:.3f}")
# Support is a sample count, so show it as an integer rather than "N.000".
print(f"Support : {support[1]}")

In [None]:
# classification_report returns a pre-formatted multi-line string; print it so
# the table renders with real line breaks instead of a repr full of "\n".
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the fitted pipeline, evaluated on the held-out test split.
# (confusion_matrix(y_test, y_pred) is the non-plotting alternative.)
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions the replacement is
# ConfusionMatrixDisplay.from_estimator — confirm the pinned sklearn version.
plot_confusion_matrix(pipeline, X_test, y_test)

In [None]:
# Predicted class probabilities for the test split, one column per class.
y_pred_proba = pipeline.predict_proba(X_test)
# Keep the second column as the positive-class score used by the curves below.
# NOTE(review): assumes the positive label is the second entry of the
# classifier's class ordering — confirm for this dataset.
y_pred_proba_positive = y_pred_proba[:, 1]

In [None]:
# ROC analysis of the positive-class scores on the test split.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_positive)
roc_auc = roc_auc_score(y_test, y_pred_proba_positive)

fig, ax = plt.subplots()
# The diagonal is the chance-level reference a useful model should beat.
ax.plot([0, 1], [0, 1], linestyle="--", label="Baseline")
ax.plot(fpr, tpr, marker=".", label="Random Forest")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()

# Area under the ROC curve as a single summary number.
print(f"ROC AUC : {roc_auc:.3f}")

In [None]:
# Precision-recall analysis of the positive-class scores on the test split.
# The curve arrays get their own names so the per-class `precision` / `recall`
# arrays computed earlier in the notebook are not overwritten; otherwise
# re-running the metric print cell after this one would report curve points.
pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_pred_proba_positive)

# No-skill baseline: the share of test samples labelled 1.
baseline = (y_test == 1).mean()
plt.plot([0, 1], [baseline, baseline], linestyle="--", label="Baseline")
# Model curve, drawn with recall on the x-axis and precision on the y-axis.
plt.plot(pr_recall, pr_precision, marker=".", label="Random Forest")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()

# Area under the precision-recall curve (trapezoidal rule via sklearn's auc).
precision_recall_auc_score = auc(pr_recall, pr_precision)
print(f"Precision Recall AUC : {precision_recall_auc_score:.3f}")

In [None]:
# Distribution of the predicted positive-class probabilities over the test split.
fig, ax = plt.subplots()
ax.hist(y_pred_proba_positive, bins=100)
ax.set_title("Histogram plot for predicted probability of positive class")
plt.show()

In [None]:
# Precision and recall at every decision threshold, as a lookup table.
# Dedicated names keep the per-class `precision` / `recall` arrays computed
# earlier in the notebook intact, so earlier cells stay re-runnable.
pr_precision, pr_recall, pr_thresholds = precision_recall_curve(
    y_test, y_pred_proba_positive
)
# The curve arrays hold one more point than there are thresholds (the final
# point has no threshold), so drop that last point to align the columns.
pd.DataFrame(
    {"precision": pr_precision[:-1], "recall": pr_recall[:-1]},
    index=pd.Index(pr_thresholds, name="threshold"),
)

## Cross-validation

In [None]:
# Five shuffled, stratified folds with a fixed seed so the folds are repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics evaluated on every fold (train and test side).
scoring = ["accuracy", "balanced_accuracy", "precision", "recall", "roc_auc"]

# Run the folds in parallel on all cores and keep each fold's fitted estimator.
cv_results = cross_validate(
    pipeline,
    X,
    y,
    cv=cv,
    scoring=scoring,
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

In [None]:
# One row per fold; the fitted-estimator column is dropped to keep the table readable.
cv_scores = pd.DataFrame(cv_results)
cv_scores.drop(columns="estimator")