In [4]:
import os
import sys
import pandas
from copy import deepcopy
from itertools import product
from tqdm import tqdm
import numpy as np
module_path = os.path.abspath(os.path.join('..'))
sys.path.append(module_path)
from joblib import Parallel, delayed
# %%

from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from pysurvival.models.survival_forest import RandomSurvivalForestModel
import lifelines
from lifelines import CoxPHFitter
from matplotlib.axes import Axes

# %%
from bsa.dataset.data_loader import load_raw_data, preprocess_data, splitting_function, drop_unknown_horizon
# %%
import random
from sklearnex import patch_sklearn
patch_sklearn()


seed = 3407
random.seed(seed)
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [1]:
def load_and_preprocess(path = '../data/data_for_bankruptcy_prediction_no_lags_corrected.csv', print_shape = False):
    raw_data, raw_labels = load_raw_data(path)
    x, outcomes = preprocess_data(raw_data, raw_labels)

    x_train, x_test, outcomes_train, outcomes_test = splitting_function(x, outcomes, 0.5)
    x_val, x_test, outcomes_val, outcomes_test = splitting_function(x_test, outcomes_test, 0.5)

    if print_shape:
        print("Training features shape: ", x_train.shape)
        print("Training labels shape: ", outcomes_train.shape)
        print("Testing features shape: ", x_test.shape)
        print("Testing labels shape: ", outcomes_test.shape)
        print("Validation features shape: ", x_val.shape)
        print("Validation labels shape: ", outcomes_val.shape)
    
    # Sanity check
    assert len(np.intersect1d(x_train.index.get_level_values(0).values, x_test.index.get_level_values(0).values)) == 0
    assert len(np.intersect1d(x_train.index.get_level_values(0).values, x_val.index.get_level_values(0).values)) == 0
    assert len(np.intersect1d(x_test.index.get_level_values(0).values, x_val.index.get_level_values(0).values)) == 0

    return (x_train, outcomes_train), (x_test, outcomes_test), (x_val, outcomes_val)

In [2]:
def get_horizon(x_train, outcomes_train, x_test, outcomes_test, x_val, outcomes_val, horizon):

    x_train = x_train.copy()
    x_test = x_test.copy()
    x_val = x_val.copy()
    outcomes_train = outcomes_train.copy()
    outcomes_test = outcomes_test.copy()
    outcomes_val = outcomes_val.copy()

    outcomes_test = drop_unknown_horizon(outcomes_test, horizon)
    outcomes_train = drop_unknown_horizon(outcomes_train, horizon)
    outcomes_val = drop_unknown_horizon(outcomes_val, horizon)

    x_train = x_train.loc[outcomes_train.index]
    x_test = x_test.loc[outcomes_test.index]
    x_val = x_val.loc[outcomes_val.index]

    # return (x_train, y_train), (x_test, y_test), (x_val, y_val)

    return (x_train, outcomes_train.values[:, 3]), (x_test, outcomes_test.values[:, 3]), (x_val, outcomes_val.values[:, 3])

In [5]:
(x_train_raw, outcomes_train), (x_test_raw, outcomes_test), (x_val_raw, outcomes_val) = load_and_preprocess()

(x_train_h, y_train), (x_test_h, y_test), (x_val_h, y_val) = get_horizon(
            x_train_raw, outcomes_train, x_test_raw, outcomes_test, x_val_raw, outcomes_val, 3
        )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['HBankrupt'] = ((y['T'] <= (horizon + 1)) & bankrupted_filter).astype(int)


In [6]:
data_cox = x_train_raw.join(outcomes_train)
data_cox = data_cox.drop("IBankrupt", axis=1)
data_cox = data_cox.drop(['INaics3_7', 'INaics3_8', 'INaics3_19', 'INaics3_21', 'INaics3_23', 'INaics3_78', 'INaics3_81', 'INaics3_97'], axis=1)
x_val_h_cox = x_val_h.drop(['INaics3_7', 'INaics3_8', 'INaics3_19', 'INaics3_21', 'INaics3_23', 'INaics3_78', 'INaics3_81', 'INaics3_97'], axis=1)

In [8]:
cox = CoxPHFitter(penalizer=0.1).fit(data_cox, duration_col='T', event_col='E')

In [9]:
y_pred_val = cox.predict_survival_function(x_val_h_cox)

In [22]:
y_pred_np = y_pred_val.values[3, :]

In [8]:
rsf = RandomSurvivalForestModel(num_trees=10)

In [12]:
T = y_train_rsf_arr[:, 0]
E = y_train_rsf_arr[:, 1]

In [17]:
rsf.fit(x_train_raw, T, E, sample_size_pct=1.0)

RandomSurvivalForestModel

In [23]:
pred = rsf.predict_survival(x_val, 1)

In [23]:
pred.shape

NameError: name 'pred' is not defined

In [22]:
y_val.shape

(18334,)

In [24]:
fpr, tpr, thresh = roc_curve(y_val, 1 - y_pred_np)
auc = roc_auc_score(y_val, 1 - y_pred_np)