In [2]:
# Import libraries
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Notebook-wide settings
db_file = "your_database.csv"   # CSV file that is loaded into the data frame
split_date = "2020-01-01"       # only rows with pretb_date after this date are kept
test_size = 0.2                 # share of rows held out as test set
random_state = 6                # seed used for the split and the random forest

In [4]:
# Read the semicolon-separated, UTF-8 encoded CSV; the first row holds the column names
df = pd.read_csv(db_file, sep=";", encoding="utf-8", header=0)

In [5]:
# Include only the most recent tumor boards to reflect current tumor board guidelines.
# Dates are parsed day-first; only rows strictly after split_date are kept.
df['pretb_date'] = pd.to_datetime(df['pretb_date'], dayfirst=True)
# .copy() makes the filtered frame independent of the original one, so the column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
df = df.loc[df['pretb_date'] > split_date].copy()

In [6]:
# Encoder that maps category values to ordinal numeric codes (default settings)
ord_enc = OrdinalEncoder()

In [7]:
# Convert categorical variables from text to numerical.
# Both columns are fitted in one call so that ord_enc retains the category mapping
# of every encoded column (one entry per column in ord_enc.categories_). Re-fitting
# the same encoder column by column would discard the 'dre' mapping as soon as
# 'site' is fitted. Each column is still encoded independently, so the resulting
# codes are the same as with per-column fitting.
categorical_text_cols = ['dre', 'site']
df[categorical_text_cols] = ord_enc.fit_transform(df[categorical_text_cols])

In [8]:
# Columns used as model inputs and the outcome columns to predict
feature_cols = [
    'age', 'psa', 'dre', 'site', 'isup',
    'cylinder_pos', 'cylinder_total',
    'ht', 'dm', 'cad', 'bmi', 'preop',
]
outcome_cols = ['psma', 'conv_staging', 'as', 'rp_rt']
X, y = df[feature_cols], df[outcome_cols]

In [9]:
# The train/test split is stratified on the outcomes with the lowest numbers
y_strat = y[['psma', 'as']]

In [10]:
# Hold out a test set, stratified on y_strat and reproducible via random_state
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
    stratify=y_strat,
)

In [11]:
# Impute missing values for continuous variables using the median of the training set
continuous_cols = ['psa', 'cylinder_pos', 'cylinder_total']
imp1 = SimpleImputer(missing_values=np.nan, strategy='median')

# Work on independent copies so the column assignments below do not trigger
# pandas' SettingWithCopyWarning on the frames returned by the split.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training set only (one median per column) and transform it
X_train[continuous_cols] = imp1.fit_transform(X_train[continuous_cols])

# Transform the test set with the training medians. The imputer must not be
# re-fitted here, otherwise the test set would be filled with its own statistics.
X_test[continuous_cols] = imp1.transform(X_test[continuous_cols])

In [12]:
# Impute missing values for categorical variables using the most frequent value in the training set
categorical_cols = ['isup', 'site', 'dre']
imp2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Work on independent copies so the column assignments below do not trigger
# pandas' SettingWithCopyWarning on the frames returned by the split.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training set only (one mode per column) and transform it
X_train[categorical_cols] = imp2.fit_transform(X_train[categorical_cols])

# Transform the test set with the training modes. The imputer must not be
# re-fitted here, otherwise the test set would be filled with its own statistics.
X_test[categorical_cols] = imp2.transform(X_test[categorical_cols])

In [13]:
# Fit a random forest with 100 trees and entropy split criterion on the training data
rnd_clf = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    random_state=random_state,
)

rnd_clf.fit(X_train, y_train)

In [14]:
# Save model
model_path = 'models/rnd_clf.pkl'
# Create the target directory first: joblib.dump opens the file for writing and
# fails if the directory does not exist.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# NOTE(review): only the classifier is persisted here; the fitted encoder and
# imputers are not saved, so inference code has to reproduce the same
# preprocessing -- confirm whether they should be dumped as well.
joblib.dump(rnd_clf, model_path)

['models/rnd_clf.pkl']