In [29]:
from utils import load_data_from_csv, c_index
from sklearn.model_selection import train_test_split 
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import random
from six import StringIO
from sklearn.tree import export_graphviz
from IPython.display import Image 
import pydotplus

# Load dataset 

In [30]:
# Load the data through the project helper and unpack its four return values.
# NOTE(review): the meaning of `threshold` is defined in utils.load_data_from_csv
# and is not visible here; `test_size=0.2` presumably holds out 20% as the test
# split -- confirm against the helper.
(
    X_train_org,
    X_test,
    y_train_org,
    y_test,
) = load_data_from_csv(threshold=10, test_size=0.2)

In [31]:
# Show the dimensions of the two feature sets returned by the loader.
print(f"X_train shape: {X_train_org.shape!s}")
print(f"X_test shape: {X_test.shape!s}")

X_train shape: (6863, 18)
X_test shape: (1716, 18)


In [32]:
# Carve a validation set out of the original training data: 25% is held out,
# and the seed is fixed so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_org,
    y_train_org,
    test_size=0.25,
    random_state=42,
)

In [33]:
# Show the dimensions of the train / validation / test feature sets.
print(f"X_train shape: {X_train.shape!s}")
print(f"X_val shape: {X_val.shape!s}")
print(f"X_test shape: {X_test.shape!s}")

X_train shape: (5147, 18)
X_val shape: (1716, 18)
X_test shape: (1716, 18)


# Random Forest 

In [34]:
# Baseline forest: only the number of trees and the seed are set explicitly;
# every other hyperparameter is left at the library default.
random_forest_default = RandomForestClassifier(
    n_estimators=50,
    random_state=42,
)
# Kept as the last expression so the fitted estimator is displayed as cell output.
random_forest_default.fit(X_train, y_train)

RandomForestClassifier(n_estimators=50, random_state=42)

In [45]:
# C-index of the baseline forest on the data it was fitted on.
# Column 1 of predict_proba is used as the score handed to c_index
# (presumably the positive-class probability -- confirm the label encoding).
y_train_predict = random_forest_default.predict_proba(X_train)[:, 1]
print(f"C-index score for Random Forest: {c_index(y_train, y_train_predict)!s}")

C-index score for Random Forest: 1.0


In [46]:
# Held-out evaluation of the baseline forest: column 1 of predict_proba is
# scored against the labels of the validation split and of the test split.
y_pred_val = random_forest_default.predict_proba(X_val)[:, 1]
y_pred_test = random_forest_default.predict_proba(X_test)[:, 1]

# NOTE(review): the test score is reported next to the validation score here;
# model selection should rely on the validation number only.
print("Validation C-index:", c_index(y_val, y_pred_val))
print("Test C-index:", c_index(y_test, y_pred_test))

Validation C-index: 0.7370878013567458
Test C-index: 0.7506640680431389


# Change hyperparameters

In [56]:
# Forest with hand-picked regularisation: tree depth is capped at 10 and every
# leaf must hold at least 10 samples. Values are fixed here, not searched.
random_forest_hypertuning = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    min_samples_leaf=10,
    random_state=42,
)
random_forest_hypertuning.fit(X_train, y_train)

# C-index on the training split, using column 1 of predict_proba as the score.
y_train_predict = random_forest_hypertuning.predict_proba(X_train)[:, 1]
print(f"C-index score for Random Forest: {c_index(y_train, y_train_predict)!s}")

C-index score for Random Forest: 0.907367369725477


In [57]:
# Held-out evaluation of the regularised forest: column 1 of predict_proba is
# scored against the labels of the validation split and of the test split.
y_pred_val = random_forest_hypertuning.predict_proba(X_val)[:, 1]
y_pred_test = random_forest_hypertuning.predict_proba(X_test)[:, 1]

# NOTE(review): the test score is shown for reference; hyperparameter choices
# should be driven by the validation score so the test split stays unbiased.
print("Validation C-index:", c_index(y_val, y_pred_val))
print("Test C-index:", c_index(y_test, y_pred_test))

Validation C-index: 0.7625034442397509
Test C-index: 0.7603856250886902
