In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.validation import check_is_fitted

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", 200)

# Data Preparation

In [3]:
# Location of the harmonized MHAS Stata file, relative to this notebook
data_path = "../dataset/H_MHAS_c2.dta"

# Load the raw survey data
df = pd.read_stata(data_path)

# Lower-cased copy of the column labels (kept for reference; df itself is not renamed)
column_names = [name.lower() for name in df.columns]

## Feature Pre-Selection

In [4]:
# Hospitalization target, one column per survey wave (waves 1 through 5)
target_columns = [f"r{wave}hosp1y" for wave in range(1, 6)]
target_columns_count = len(target_columns)

In [5]:
# Columns without a wave prefix: unhhidnp (presumably the respondent
# identifier -- confirm against the codebook) and the respondent's gender.
no_wave_columns = ["unhhidnp", "ragender"]
no_wave_columns_count = len(no_wave_columns)

In [6]:
def _per_wave(var, waves=range(1, 6)):
    """Return the respondent column names ``r<wave><var>`` for the given waves."""
    return [f"r{wave}{var}" for wave in waves]


# Predictor columns, grouped by topic. Variables are listed for waves 1-5
# unless an explicit wave list says otherwise.
picked_columns = [
    # Age at Interview (Months and Years)
    *_per_wave("agey"),
    # Self-Report of Health
    *_per_wave("shlt"),
    # Doctor Diagnosed Health Problems: Ever Have Condition
    *_per_wave("hibpe"),
    *_per_wave("diabe"),
    *_per_wave("cancre"),
    *_per_wave("respe"),
    *_per_wave("hrtatte"),
    *_per_wave("hearte", waves=(4, 5)),
    *_per_wave("stroke"),
    *_per_wave("arthre"),
    "s5arthre",
    # RwBMI is the respondent's self-reported body mass index
    *_per_wave("bmi"),
    # Health Behaviors: Physical Activity or Exercise
    *_per_wave("vigact"),
    # Health Behaviors: Smoking (Cigarettes)
    *_per_wave("smokev"),
    *_per_wave("smoken"),
    *_per_wave("smokef"),
    *_per_wave("strtsmok"),
    *_per_wave("quitsmok"),
    # Health Behaviors: Preventive Care
    *_per_wave("cholst"),
    *_per_wave("flusht", waves=(3, 4, 5)),
    *_per_wave("breast"),
    *_per_wave("mammog"),
    *_per_wave("papsm"),
    *_per_wave("prost"),
    # ADL Help
    *_per_wave("dresshlp"),
    *_per_wave("walkhlp"),
    *_per_wave("bathehlp"),
    *_per_wave("eathlp"),
    *_per_wave("bedhlp"),
    *_per_wave("toilethlp"),
    # IADL Help
    *_per_wave("mealhlp"),
    *_per_wave("shophlp"),
    *_per_wave("medhlp"),
    *_per_wave("moneyhlp"),
    # Whether Uses Personal Aids
    *_per_wave("walkre"),
    *_per_wave("bede"),
    # Activities of Daily Living: Whether Receives Any Care
    *_per_wave("racany"),
    # Instrumental Activities of Daily Living: Whether Receives Any Care
    *_per_wave("ricany"),
]

In [7]:
from collections import OrderedDict

# Columns pulled from the raw frame: identifiers first, then targets, then predictors
screening = [*no_wave_columns, *target_columns, *picked_columns]

# Restrict the analysis to that subset of the dataset
df_sub = df[screening]

# Everything after the identifier columns still carries its wave prefix
wave_vars_orig_name = screening[no_wave_columns_count:]

# Strip the two-character wave prefix and de-duplicate, keeping first-seen order
wave_vars_clean_name = list(
    OrderedDict.fromkeys(name[2:] for name in wave_vars_orig_name)
)
wave_vars_clean_name_count = len(wave_vars_clean_name)

In [8]:
# Decouple the wave as a column from the variables: reshape the wide frame
# (one column per variable per wave) into a long one (one row per respondent
# per wave) with an explicit "wave" column.

from pprint import pprint

# Collect one frame per wave and concatenate once at the end instead of
# growing a DataFrame inside the loop.
wave_frames = []

for wave in range(1, 6):
    # Map "r<wave><var>" -> "<var>", keeping only the columns that exist in
    # df_sub (not every variable is present in every wave).
    column_mapping = {
        f"r{wave}{var}": var
        for var in wave_vars_clean_name
        if f"r{wave}{var}" in df_sub.columns
    }

    # Select this wave's columns and drop the wave prefix; rename() returns a
    # new frame, so df_sub itself is never modified.
    wave_df = df_sub[no_wave_columns + list(column_mapping)].rename(columns=column_mapping)

    # Record which wave these rows came from
    wave_df.insert(0, "wave", wave)

    wave_frames.append(wave_df)

# Columns missing from a given wave are filled with NaN by the outer join
accumulated_df = pd.concat(wave_frames)


## Get clean dataset to work with

In [9]:

# The first two columns (wave and unhhidnp) are not needed for modelling
drop_first_columns_count = 2

# Keep only rows where the target is known
df_clean = (
    accumulated_df
    .iloc[:, drop_first_columns_count:]
    .dropna(subset=["hosp1y"])
    .copy()
)
print(df_clean.shape)

# Put hosp1y ahead of ragender for readability: swap the first two columns
cols = df_clean.columns.tolist()
cols[:2] = [cols[1], cols[0]]

df_clean = df_clean.reindex(columns=cols)

(76353, 39)


## Split the dataset

In [10]:
from sklearn.model_selection import train_test_split

# Split the df_clean dataframe in train, val and test

def split_dataset(df, target, random_state=1):
    """
    Split a dataframe into train, validation and test parts.

    20% of the rows are held out as the test set; 25% of the remainder
    (.2/.8) becomes the validation set.

    Parameters:
    - df: The dataframe to split.
    - target: Name of the target column.
    - random_state: Seed passed to both train_test_split calls.

    Returns:
    - df_full_train: Train + validation rows, target column still present.
    - df_train, df_val, df_test: The three parts with the target column removed.
    - y_train, y_val, y_test: The target values of the three parts.
    """
    full_part, test_part = train_test_split(df, test_size=.2, random_state=random_state)
    train_part, val_part = train_test_split(full_part, test_size=.2/.8, random_state=random_state)

    # Give every part a fresh 0..n-1 index
    full_part, train_part, val_part, test_part = (
        part.reset_index(drop=True)
        for part in (full_part, train_part, val_part, test_part)
    )

    # pop() removes the target column from the frame and hands it back
    y_train, y_val, y_test = (
        part.pop(target).values for part in (train_part, val_part, test_part)
    )

    return full_part, train_part, val_part, test_part, y_train, y_val, y_test

In [11]:
df_full_train, df_train, df_val, df_test, y_train, y_val, y_test = split_dataset(df_clean, "hosp1y")
len(df_train), len(df_val), len(df_test)

(45811, 15271, 15271)

# Model Training

In [12]:
# Split the feature columns by dtype, excluding the target by name.
# The previous positional `[1:]` slices dropped whichever column happened to be
# first in each list, so when hosp1y is categorical the first numeric feature
# was silently moved out of `numerical`.
_feature_dtypes = df_full_train.dtypes.drop("hosp1y")
_is_numeric = _feature_dtypes.ne("category")

numerical = list(_feature_dtypes[_is_numeric].index)
# numerical
categorical = list(_feature_dtypes[~_is_numeric].index)
# categorical

In [13]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

def transform_data(dict_data, transformers):
    """
    Turn record dict(s) into the X array ready to be used for predictions.

    Parameters:
    - dict_data: The records to transform, passed to the encoder's transform().
    - transformers: Dictionary with the fitted objects generated during
      training under the keys "dv" (encoder), "imputer" and "scaler".

    Returns:
    - X: The data after encoding, imputing and scaling, in that order.
    """
    # Resolve all three stages up front so a missing key fails before any work
    stages = [transformers[key] for key in ("dv", "imputer", "scaler")]

    X = dict_data
    for stage in stages:
        X = stage.transform(X)

    return X

def train(df, y, C=1.0):
    """
    Fit the encoder, imputer and scaler, then train a logistic regression.

    The feature columns are taken from the notebook-level `categorical` and
    `numerical` lists.

    Parameters:
    - df: Dataframe holding the feature columns.
    - y: Target values aligned with the rows of df.
    - C: Value passed as `C` to LogisticRegression.

    Returns:
    - transformers: Dictionary with the fitted "dv", "imputer" and "scaler".
    - model: The fitted LogisticRegression.
    """
    records = df[categorical + numerical].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    imputer = SimpleImputer(strategy='median')
    scaler = MinMaxScaler(feature_range = (0, 1))

    # Each stage is fitted on the output of the previous one
    X = records
    for stage in (dv, imputer, scaler):
        stage.fit(X)
        X = stage.transform(X)

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    model.fit(X, y)

    return {"dv": dv, "imputer": imputer, "scaler": scaler}, model

def predict(df, transformers, model):
    """
    Score every row of a dataframe with the trained model.

    The feature columns are taken from the notebook-level `categorical` and
    `numerical` lists.

    Parameters:
    - df: Dataframe holding the feature columns.
    - transformers: Dictionary with the fitted encoder, imputer and scaler.
    - model: Fitted classifier exposing predict_proba.

    Returns:
    - y_pred: Second column of predict_proba, one value per row.
    """
    features = df[categorical + numerical]
    X = transform_data(features.to_dict(orient='records'), transformers)

    return model.predict_proba(X)[:, 1]

In [14]:
# Train the model using the previously created functions
C = 1.0
transformers, model = train(df_train, y_train, C=C)

In [15]:
print(transformers)
print(model)

{'dv': DictVectorizer(sparse=False), 'imputer': SimpleImputer(strategy='median'), 'scaler': MinMaxScaler()}
LogisticRegression(max_iter=1000, solver='liblinear')


In [16]:
# Check the AUC Score of the validation dataset
y_pred = predict(df_val, transformers, model)
roc_auc_score(y_val, y_pred)

0.7348594096930517

## Model evaluation

In [17]:
# Compare train and validation performance

# Train data predictions (class 1)
log_reg_train = predict(df_train, transformers, model)

# Validation data predictions (class 1)
log_reg_val = predict(df_val, transformers, model)

In [18]:
from sklearn.metrics import roc_auc_score

# Train ROC AUC Score
roc_auc_train = roc_auc_score(y_true=y_train, y_score=log_reg_train)
print(f"Train ROC AUC Score: {roc_auc_train:.4f}")

# Validation ROC AUC Score
roc_auc_val = roc_auc_score(y_true=y_val, y_score=log_reg_val)
print(f"Validation ROC AUC Score: {roc_auc_val:.4f}")

Train ROC AUC Score: 0.7367
Validation ROC AUC Score: 0.7349


In [19]:
# Check the test score
y_pred = predict(df_test, transformers, model)
roc_auc_score(y_test, y_pred)

0.7530830512054058

## Save the model

In [20]:
import pickle

model_type = "logistic_model_test" # This is the name for the filename of the model

output_file = f"hospitalization-{model_type}.bin"

with open(output_file, "wb") as f_out:
    pickle.dump((transformers, model), f_out)

## Load the model

In [21]:
import pickle

input_file = f"hospitalization-{model_type}.bin" # Use the same name than above

with open(input_file, "rb") as f_in:
    transformers, model = pickle.load(f_in)

transformers, model

({'dv': DictVectorizer(sparse=False),
  'imputer': SimpleImputer(strategy='median'),
  'scaler': MinMaxScaler()},
 LogisticRegression(max_iter=1000, solver='liblinear'))

## Example patient

In [22]:
pos = 157 # change this number to extract a different patient

print(y_test[pos]) # Print whether the patient was hospitalized

# Convert the patient to a dict in a format similar to the JSON used by the API
patient = df_test[categorical + numerical].to_dict(orient='records')[pos]

print(patient)

0.No
{'ragender': '2.Woman', 'agey': 61.0, 'shlt': '2.Very good', 'hibpe': '0.no', 'diabe': '0.no', 'cancre': '0.no', 'respe': '1.yes', 'hrtatte': '0.no', 'stroke': '0.no', 'arthre': '0.no', 'vigact': '1.Yes', 'smokev': '0.No', 'smoken': '0.No', 'cholst': '1.Yes', 'breast': '1.Yes', 'mammog': '0.No', 'papsm': '1.Yes', 'prost': nan, 'dresshlp': nan, 'walkhlp': nan, 'bathehlp': nan, 'eathlp': nan, 'bedhlp': nan, 'toilethlp': nan, 'mealhlp': nan, 'shophlp': nan, 'medhlp': nan, 'moneyhlp': nan, 'walkre': nan, 'bede': nan, 'racany': nan, 'ricany': nan, 'flusht': '1.Yes', 'hearte': nan, 'bmi': nan, 'smokef': 0.0, 'strtsmok': nan, 'quitsmok': nan}


In [23]:
patient["prost"] # Beware of nan values

nan

In [24]:
def predict_single_patient(patient, transformers, model=None, threshold=0.5):
    """
    Predicts the probability of hospitalization for a single patient.

    Parameters:
    - patient: The data of the patient to be predicted (a single record dict).
    - transformers: The transformers used to preprocess the data.
    - model: Fitted classifier exposing predict_proba. When omitted, the
      notebook-level `model` variable is used, which keeps existing
      two-argument calls working.
    - threshold: Probability at or above which the patient is flagged.

    Returns:
    - y_pred: The predicted probability of hospitalization.
    - hospitalization: True if the predicted probability is greater than or
      equal to `threshold`, False otherwise.

    Raises:
    - NameError: If no model is passed and no notebook-level `model` exists.
    """
    if model is None:
        # Explicit fallback instead of a silent global lookup
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    X_patient = transform_data([patient], transformers)
    y_pred = model.predict_proba(X_patient)[0, 1]
    hospitalization = y_pred >= threshold
    return y_pred, hospitalization

In [25]:
y_test[pos], predict_single_patient(patient, transformers)

('0.No', (0.055877408312371896, False))

## Export to json

In [26]:
patient

{'ragender': '2.Woman',
 'agey': 61.0,
 'shlt': '2.Very good',
 'hibpe': '0.no',
 'diabe': '0.no',
 'cancre': '0.no',
 'respe': '1.yes',
 'hrtatte': '0.no',
 'stroke': '0.no',
 'arthre': '0.no',
 'vigact': '1.Yes',
 'smokev': '0.No',
 'smoken': '0.No',
 'cholst': '1.Yes',
 'breast': '1.Yes',
 'mammog': '0.No',
 'papsm': '1.Yes',
 'prost': nan,
 'dresshlp': nan,
 'walkhlp': nan,
 'bathehlp': nan,
 'eathlp': nan,
 'bedhlp': nan,
 'toilethlp': nan,
 'mealhlp': nan,
 'shophlp': nan,
 'medhlp': nan,
 'moneyhlp': nan,
 'walkre': nan,
 'bede': nan,
 'racany': nan,
 'ricany': nan,
 'flusht': '1.Yes',
 'hearte': nan,
 'bmi': nan,
 'smokef': 0.0,
 'strtsmok': nan,
 'quitsmok': nan}

In [27]:
# NaN values have to be removed from the dictionary before dumping it to JSON.
# Build a cleaned copy and leave the original patient untouched:
#   - ndarray values become plain lists
#   - float NaN values become the empty string ""
patient_clean = {
    key: (
        value.tolist() if isinstance(value, np.ndarray)
        else "" if isinstance(value, float) and np.isnan(value)
        else value
    )
    for key, value in patient.items()
}

patient_clean

{'ragender': '2.Woman',
 'agey': 61.0,
 'shlt': '2.Very good',
 'hibpe': '0.no',
 'diabe': '0.no',
 'cancre': '0.no',
 'respe': '1.yes',
 'hrtatte': '0.no',
 'stroke': '0.no',
 'arthre': '0.no',
 'vigact': '1.Yes',
 'smokev': '0.No',
 'smoken': '0.No',
 'cholst': '1.Yes',
 'breast': '1.Yes',
 'mammog': '0.No',
 'papsm': '1.Yes',
 'prost': '',
 'dresshlp': '',
 'walkhlp': '',
 'bathehlp': '',
 'eathlp': '',
 'bedhlp': '',
 'toilethlp': '',
 'mealhlp': '',
 'shophlp': '',
 'medhlp': '',
 'moneyhlp': '',
 'walkre': '',
 'bede': '',
 'racany': '',
 'ricany': '',
 'flusht': '1.Yes',
 'hearte': '',
 'bmi': '',
 'smokef': 0.0,
 'strtsmok': '',
 'quitsmok': ''}

In [28]:
import json

json_filename = "lr-single_patient"
with open(f"{json_filename}.json", "w") as fp:
    json.dump(patient_clean, fp) 

## Test using json 

This code will be used in the API codebase; it is tested here for convenience.

In [30]:
input_file = "lr-single_patient.json"

with open(input_file, "rb") as f_in:
    patient_json = json.load(f_in)
print(patient_json)

{'ragender': '2.Woman', 'agey': 61.0, 'shlt': '2.Very good', 'hibpe': '0.no', 'diabe': '0.no', 'cancre': '0.no', 'respe': '1.yes', 'hrtatte': '0.no', 'stroke': '0.no', 'arthre': '0.no', 'vigact': '1.Yes', 'smokev': '0.No', 'smoken': '0.No', 'cholst': '1.Yes', 'breast': '1.Yes', 'mammog': '0.No', 'papsm': '1.Yes', 'prost': '', 'dresshlp': '', 'walkhlp': '', 'bathehlp': '', 'eathlp': '', 'bedhlp': '', 'toilethlp': '', 'mealhlp': '', 'shophlp': '', 'medhlp': '', 'moneyhlp': '', 'walkre': '', 'bede': '', 'racany': '', 'ricany': '', 'flusht': '1.Yes', 'hearte': '', 'bmi': '', 'smokef': 0.0, 'strtsmok': '', 'quitsmok': ''}


In [31]:
def revive_nan(data):
    """
    Recursively replaces empty strings with NaN values in a nested dictionary or list.

    Dictionaries and lists are modified in place and also returned, so the
    function can be used either for its side effect or for its return value.

    Parameters:
    data (dict or list): The input data to be processed.

    Returns:
    dict or list: The processed data with empty strings replaced by NaN values.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            data[key] = revive_nan(value)
        # Without this return, nested containers were overwritten with None
        return data

    if isinstance(data, list):
        for i, value in enumerate(data):
            data[i] = revive_nan(value)
        return data

    # Only a real empty string marks a missing value; the type check keeps
    # array-like leaves from being compared element-wise against "".
    if isinstance(data, str) and data == "":
        return np.nan

    return data

In [32]:
revive_nan(patient_json)

In [33]:
patient_json

{'ragender': '2.Woman',
 'agey': 61.0,
 'shlt': '2.Very good',
 'hibpe': '0.no',
 'diabe': '0.no',
 'cancre': '0.no',
 'respe': '1.yes',
 'hrtatte': '0.no',
 'stroke': '0.no',
 'arthre': '0.no',
 'vigact': '1.Yes',
 'smokev': '0.No',
 'smoken': '0.No',
 'cholst': '1.Yes',
 'breast': '1.Yes',
 'mammog': '0.No',
 'papsm': '1.Yes',
 'prost': nan,
 'dresshlp': nan,
 'walkhlp': nan,
 'bathehlp': nan,
 'eathlp': nan,
 'bedhlp': nan,
 'toilethlp': nan,
 'mealhlp': nan,
 'shophlp': nan,
 'medhlp': nan,
 'moneyhlp': nan,
 'walkre': nan,
 'bede': nan,
 'racany': nan,
 'ricany': nan,
 'flusht': '1.Yes',
 'hearte': nan,
 'bmi': nan,
 'smokef': 0.0,
 'strtsmok': nan,
 'quitsmok': nan}

In [34]:
predict_single_patient(patient_json, transformers)

(0.055877408312371896, False)

The API drawing board ends here. The code below was used to explore other models and is not part of the final API.

## Old method

In [None]:
enc_mapper = {"numerical": numerical, "categorical": categorical}

In [None]:
enc_mapper["categorical"], enc_mapper["numerical"]

(['ragender',
  'shlt',
  'hibpe',
  'diabe',
  'cancre',
  'respe',
  'hrtatte',
  'stroke',
  'arthre',
  'vigact',
  'smokev',
  'smoken',
  'cholst',
  'breast',
  'mammog',
  'papsm',
  'prost',
  'dresshlp',
  'walkhlp',
  'bathehlp',
  'eathlp',
  'bedhlp',
  'toilethlp',
  'mealhlp',
  'shophlp',
  'medhlp',
  'moneyhlp',
  'walkre',
  'bede',
  'racany',
  'ricany',
  'flusht',
  'hearte'],
 ['agey', 'bmi', 'smokef', 'strtsmok', 'quitsmok'])

In [43]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder


def preprocess_data(df_train, df_val, df_test, enc_mapper):
    """
    Build the train, validation and test feature matrices.

    Categorical columns are one-hot encoded and appended after the numerical
    columns; the result is median-imputed and min-max scaled to (0, 1).
    Every transformer is fitted on the training data only.

    Parameters:
    - df_train, df_val, df_test: The three feature dataframes.
    - enc_mapper: Dictionary with the column lists under the keys
      "categorical" and "numerical".

    Returns:
    - X_train, X_val, X_test: The transformed arrays, in that order.
    """
    # Print shape of input data
    print("Input train data shape: ", df_train.shape)
    print("Input val data shape: ", df_val.shape)
    print("Input test data shape: ", df_test.shape, "\n")

    # Work on copies; order is train, val, test throughout
    frames = [df_train.copy(), df_val.copy(), df_test.copy()]

    # Group cols by type
    cat_cols = enc_mapper["categorical"]
    num_cols = enc_mapper["numerical"]

    # The encoder only ever sees the training data when fitting
    ohe = OneHotEncoder()
    ohe.fit(frames[0][cat_cols])

    def _assemble(frame):
        # Numerical columns first, one-hot columns after, as a plain ndarray
        encoded = ohe.transform(frame[cat_cols]).todense()
        stacked = np.concatenate((frame[num_cols].values, encoded), axis=1)
        return np.asarray(stacked)

    matrices = [_assemble(frame) for frame in frames]

    # Impute, then scale; each step is fitted on the current training matrix
    for step in (SimpleImputer(strategy='median'), MinMaxScaler(feature_range = (0, 1))):
        step.fit(matrices[0])
        matrices = [step.transform(matrix) for matrix in matrices]

    X_train, X_val, X_test = matrices
    return X_train, X_val, X_test

## Preprocessing

In [44]:
# Bind the training matrix as X_train: the fitting cells below call
# fit(X_train, y_train), and X_train was not defined anywhere in the notebook.
X_train, X_val, X_test = preprocess_data(df_train, df_val, df_test, enc_mapper)

# Keep the old name as an alias for the cells that score on X
X = X_train

Input train data shape:  (45811, 38)
Input val data shape:  (15271, 38)
Input test data shape:  (15271, 38) 



## Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
%%time

log_reg = None
log_reg = LogisticRegression(C=0.0001, solver='liblinear', max_iter=1000)
log_reg.fit(X_train, y_train)

CPU times: user 278 ms, sys: 5.57 ms, total: 283 ms
Wall time: 277 ms


In [47]:
# Train data predictions (class 1)
log_reg_train = log_reg.predict_proba(X)[:, 1]

# Validation data predictions (class 1)
log_reg_val = log_reg.predict_proba(X_val)[:, 1]

In [66]:
from sklearn.metrics import roc_auc_score

# Train ROC AUC Score
roc_auc_train = roc_auc_score(y_true=y_train, y_score=log_reg_train)
print(f"Train ROC AUC Score: {roc_auc_train:.4f}")

# Validation ROC AUC Score
roc_auc_val = roc_auc_score(y_true=y_val, y_score=log_reg_val)
print(f"Validation ROC AUC Score: {roc_auc_val:.4f}")

Train ROC AUC Score: 0.7145
Validation ROC AUC Score: 0.7185


## Random Forest

In [304]:
%%time
rf = RandomForestClassifier(n_estimators = 100, random_state = 47, verbose = 1, n_jobs = -1, max_depth=10)
rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.6s


CPU times: user 10.9 s, sys: 41.8 ms, total: 11 s
Wall time: 1.81 s


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.8s finished


In [305]:
# Train data predictions (class 1)
rf_pred_train = rf.predict_proba(X)[:, 1]

# Validation data predictions (class 1)
rf_pred_val = rf.predict_proba(X_val)[:, 1]

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


In [306]:
roc_auc_train = roc_auc_score(y_true=y_train, y_score=rf_pred_train)
print(f"Train ROC AUC Score: {roc_auc_train:.4f}")

# Validation ROC AUC Score
roc_auc_val = roc_auc_score(y_true=y_val, y_score=rf_pred_val)
print(f"Validation ROC AUC Score: {roc_auc_val:.4f}")

Train ROC AUC Score: 0.7832
Validation ROC AUC Score: 0.7295


In [266]:
rf.feature_importances_

array([9.14511471e-02, 9.60600421e-02, 9.59689377e-03, 3.99633498e-02,
       3.88595647e-02, 3.48119807e-02, 5.12213797e-03, 5.23831865e-03,
       1.74946475e-03, 2.56871035e-03, 8.51070857e-03, 1.09811233e-02,
       9.21210717e-03, 5.46707466e-04, 9.91863208e-03, 1.00470574e-02,
       3.25541079e-04, 9.92392229e-03, 1.05204886e-02, 2.29541418e-04,
       5.20682682e-03, 5.70797702e-03, 3.00202350e-04, 7.53026442e-03,
       7.54358519e-03, 2.15189107e-04, 6.66865070e-03, 7.42405496e-03,
       3.56216576e-04, 4.98124745e-03, 5.16866042e-03, 2.46982411e-04,
       1.15919927e-02, 1.18063507e-02, 3.05330170e-04, 1.19825209e-02,
       1.15073066e-02, 9.57661409e-04, 1.04077003e-02, 1.06442986e-02,
       3.95505510e-05, 5.40721416e-03, 5.31058827e-03, 1.33377501e-04,
       9.59803274e-03, 8.12920967e-03, 1.43743565e-03, 8.93001628e-03,
       9.17971449e-03, 3.44561554e-03, 8.65518822e-03, 8.72932912e-03,
       3.42634580e-03, 7.47488307e-03, 8.66264127e-03, 4.05431506e-03,
      

In [307]:
%%time

rf = RandomForestClassifier(random_state = 47, n_jobs = -1)
distributions = {
    "n_estimators": [10, 20, 50, 100],
    'max_depth': [10, 30, 50],
}
rf_random = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=distributions, 
    scoring='roc_auc',  
    n_jobs=-1,
    random_state=47)
rf_random.fit(X_train, y_train)


CPU times: user 12 s, sys: 763 ms, total: 12.8 s
Wall time: 40.8 s


In [308]:
print(f"Best params: {rf_random.best_params_}")
print(f"Best score: {rf_random.best_score_}")

Best params: {'n_estimators': 100, 'max_depth': 10}
Best score: 0.7299960438741016
