In [1]:
%cd ..
%matplotlib nbagg

/home/elias/work/data_science


In [2]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from transformer import PandasOneHotEncoder, PandasScaler
from metrics import cindex
from config import REG_COLS, CAT_COLS

In [3]:
data_path = Path("~/datasets/tumor/x_train/features/clinical_data.csv").expanduser()
df = pd.read_csv(data_path)

target_path = Path("~/datasets/tumor/y_train.csv").expanduser()
truth = pd.read_csv(target_path)

In [4]:
train_df = df.head(271)
test_df = df.tail(29)

y_train_ = truth[truth.PatientID.isin(train_df.PatientID)]
y_test_ = truth[truth.PatientID.isin(test_df.PatientID)]

In [5]:
encoder = PandasOneHotEncoder(CAT_COLS).fit(train_df)
input_scaler = PandasScaler(REG_COLS).fit(train_df)
scaler = PandasScaler(["SurvivalTime"]).fit(y_train_)

train_df = encoder.transform(train_df)
train_df = input_scaler.transform(train_df)
y_train = scaler.transform(y_train_)

test_df = encoder.transform(test_df)
test_df = input_scaler.transform(test_df)
y_test = scaler.transform(y_test_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [18]:
RANDOM_STATE = 80

regressor = RandomForestRegressor(
    n_estimators=4000,
    criterion='mae',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbose=0,
    warm_start=False,
    ccp_alpha=0.0,
    max_samples=None
)

In [19]:
regressor = regressor.fit(train_df[REG_COLS +  encoder.cols], y_train.SurvivalTime)

In [20]:
cindex_train_df = y_train.copy()
cindex_train_df.SurvivalTime = ((regressor.predict(train_df[REG_COLS +  encoder.cols])
                                 * scaler.mapping["SurvivalTime"]["std"])
                                + scaler.mapping["SurvivalTime"]["mean"])

y_test_ = scaler.inverse_transform(y_test)
cindex_test_df = y_test.copy()
cindex_test_df.SurvivalTime = ((regressor.predict(test_df[REG_COLS +  encoder.cols])
                                * scaler.mapping["SurvivalTime"]["std"])
                               + scaler.mapping["SurvivalTime"]["mean"])

In [21]:
print(f"Train score is: {regressor.score(train_df[REG_COLS +  encoder.cols], y_train.SurvivalTime)}")
print(f"Test score is: {regressor.score(test_df[REG_COLS +  encoder.cols], y_test.SurvivalTime)}")

print()
print()

print(f"Cindex train score is: {cindex(y_train_.set_index('PatientID'), cindex_train_df.set_index('PatientID'))}")
print(f"Cindex test score is: {cindex(y_test_.set_index('PatientID'), cindex_test_df.set_index('PatientID'))}")

Train score is: 0.6234641875614646
Test score is: 0.22772394945432606


Cindex train score is: 0.8216430551274939
Cindex test score is: 0.6315789473446775


## Parameters optimization

In [23]:
RANDOM_STATE = 23

base_regressor = RandomForestRegressor(
    n_estimators=400,
    criterion='mae',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbose=0,
    warm_start=False,
    ccp_alpha=0.0,
    max_samples=None
)

parameters = {'n_estimators':range(500, 5000, 300), 'min_samples_split':range(2, 6)}

best_regressor = GridSearchCV(base_regressor, parameters)
best_regressor.fit(train_df[REG_COLS +  encoder.cols], y_train.SurvivalTime)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=2,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=400, n_jobs=-1,
                                             oob_score=False, random_state=23,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_job

In [26]:
best_regressor.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=5, min_weight_fraction_leaf=0.0,
                      n_estimators=2600, n_jobs=-1, oob_score=False,
                      random_state=23, verbose=0, warm_start=False)

In [27]:
cindex_train_df = y_train.copy()
cindex_train_df.SurvivalTime = ((best_regressor.best_estimator_.predict(train_df[REG_COLS +  encoder.cols])
                                 * scaler.mapping["SurvivalTime"]["std"])
                                + scaler.mapping["SurvivalTime"]["mean"])

y_test_ = scaler.inverse_transform(y_test)
cindex_test_df = y_test.copy()
cindex_test_df.SurvivalTime = ((best_regressor.best_estimator_.predict(test_df[REG_COLS +  encoder.cols])
                                * scaler.mapping["SurvivalTime"]["std"])
                               + scaler.mapping["SurvivalTime"]["mean"])

print(f"Train score is: {regressor.score(train_df[REG_COLS +  encoder.cols], y_train.SurvivalTime)}")
print(f"Test score is: {regressor.score(test_df[REG_COLS +  encoder.cols], y_test.SurvivalTime)}")

print()
print()

print(f"Cindex train score is: {cindex(y_train_.set_index('PatientID'), cindex_train_df.set_index('PatientID'))}")
print(f"Cindex test score is: {cindex(y_test_.set_index('PatientID'), cindex_test_df.set_index('PatientID'))}")

Train score is: 0.6234641875614646
Test score is: 0.22772394945432606


Cindex train score is: 0.8095549738216778
Cindex test score is: 0.6729323308017694
