In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn.metrics
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns

## Load sampled_data_demographics

In [None]:
# Read the sampled demographics table (path is relative to the notebook's working directory).
demographics = pd.read_csv(
    filepath_or_buffer='demographics/sampled_data_demographics.csv',
)

In [None]:
# Display the column labels of the loaded demographics table.
demographics.columns

## one-hot encoding for race

In [None]:
# Keep only the columns needed for the model, on an independent copy so the
# loaded table is not modified.
base_columns = ['actual_age', 'gender', 'race', 'abbrev', 'thirty_day_readmission']
personal_info = demographics.loc[:, base_columns].copy()

# Add one boolean indicator column per distinct race value; each new column is
# named after the race value it marks.
race_column = personal_info['race']
for race_value in race_column.unique():
    personal_info[race_value] = race_column.eq(race_value)

## Binary encoding for gender (`male` indicator)

In [None]:
# Boolean indicator: False only where gender is exactly 'F'. Every other value,
# including a missing one, is encoded as True.
personal_info['male'] = personal_info['gender'].ne('F')

## drop gender and race

In [None]:
# The raw categorical columns are no longer needed once their indicator
# columns exist.
personal_info = personal_info.drop(columns=['gender', 'race'])

## Split into features and target (random split, stratified by target only)

In [None]:
# Features are every column except the target and the subgroup label.
X_a = personal_info.drop(columns=['thirty_day_readmission', 'abbrev'])
y_a = personal_info['thirty_day_readmission']

# 50/50 split with a fixed seed, stratified on the target only.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(
    X_a,
    y_a,
    test_size=0.5,
    random_state=42,
    stratify=y_a,
)


# Candidate hyperparameters: regularisation strength and class weighting.
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0, 100.0],
    'class_weight': [None, 'balanced'],
}

# 5-fold cross-validated search scored by AUROC. refit=False: the search only
# selects parameters; the final model is fitted explicitly afterwards.
grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    refit=False,
    verbose=0,
)
grid_search.fit(X_train_a, y_train_a)
best_params = grid_search.best_params_

# Fit on the full training half with the selected parameters, then score the
# positive-class probabilities on the held-out half.
classifier = LogisticRegression(random_state=42, **best_params).fit(X_train_a, y_train_a)
y_prediction_a = classifier.predict_proba(X_test_a)[:, 1]
sklearn.metrics.roc_auc_score(y_test_a, y_prediction_a)

## Split into features and target, stratified within each `abbrev` subgroup

In [None]:
personal_info_grouped = personal_info.groupby('abbrev')

# Split every subgroup 50/50 on its own (stratified on the target) and stack
# the per-subgroup halves into one train set and one test set.
#
# 'F_AIAN' is processed first and the remaining subgroups follow in groupby
# order. Row order of the stacked frames is kept deliberately, because the
# un-shuffled cross-validation folds downstream depend on it.
first_group = 'F_AIAN'
ordered_groups = [first_group] + [
    abbrev for abbrev in personal_info_grouped.groups if abbrev != first_group
]

# Collect the pieces and concatenate once at the end instead of re-copying the
# growing frames on every iteration.
X_train_parts, X_test_parts, y_train_parts, y_test_parts = [], [], [], []

for abbrev in ordered_groups:
    subgroup = personal_info_grouped.get_group(abbrev)
    X = subgroup.drop(columns=['thirty_day_readmission', 'abbrev'])
    y = subgroup['thirty_day_readmission']

    this_X_train, this_X_test, this_y_train, this_y_test = train_test_split(
        X,
        y,
        test_size=0.5,
        random_state=42,
        stratify=y,
    )

    X_train_parts.append(this_X_train)
    X_test_parts.append(this_X_test)
    y_train_parts.append(this_y_train)
    y_test_parts.append(this_y_test)

X_train = pd.concat(X_train_parts)
X_test = pd.concat(X_test_parts)
y_train = pd.concat(y_train_parts)
y_test = pd.concat(y_test_parts)

# Candidate hyperparameters: regularisation strength and class weighting.
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0, 100.0],
    'class_weight': [None, 'balanced'],
}

# 5-fold cross-validated search scored by AUROC. refit=False: the search only
# selects parameters; the final model is fitted explicitly afterwards.
grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    refit=False,
    verbose=0,
)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

# Fit on the full training set with the selected parameters, then score the
# positive-class probabilities on the test set.
classifier = LogisticRegression(random_state=42, **best_params).fit(X_train, y_train)
y_prediction = classifier.predict_proba(X_test)[:, 1]
sklearn.metrics.roc_auc_score(y_test, y_prediction)

In [None]:
# Display the hyperparameters taken from the most recent grid search.
best_params

In [None]:
# Permutation importance shuffles each feature at random, so the seed is fixed
# (42, as for the splits and models in this notebook) to make the result and
# the exported figure reproducible across runs.
# Importances are measured on the training data, as AUROC change over 100 repeats.
ft_importance = permutation_importance(
    classifier,
    X_train,
    y_train,
    scoring='roc_auc',
    n_repeats=100,
    random_state=42,
)

In [None]:
#ft_importance

In [None]:
#copied from https://inria.github.io/scikit-learn-mooc/python_scripts/dev_features_importance.html, accessed 16.10.25

def plot_feature_importances(perm_importance_result, feat_name):
    """Draw a horizontal bar chart of permutation importances, sorted ascending.

    Parameters
    ----------
    perm_importance_result : mapping-like
        Object indexable by ``"importances_mean"`` and ``"importances_std"``.
        Both entries must support ``.argsort()`` / integer-array indexing
        (e.g. NumPy arrays) and hold one value per feature.
    feat_name : sequence of str
        Feature names, positionally aligned with the importance arrays.
        Any integer-indexable sequence works (list, tuple, ``pandas.Index``,
        array).

    Returns
    -------
    (fig, ax)
        The newly created matplotlib figure and its axes.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    means = perm_importance_result["importances_mean"]
    stds = perm_importance_result["importances_std"]

    # Ascending order by mean importance; with the default axis orientation
    # the largest bar is drawn at the top.
    order = means.argsort()
    positions = range(len(order))

    # Error bars show the standard deviation over the permutation repeats.
    ax.barh(positions, means[order], xerr=stds[order])

    ax.set_yticks(positions)
    # Element-wise lookup so plain lists/tuples are accepted, not only
    # containers that support integer-array indexing.
    ax.set_yticklabels([feat_name[i] for i in order])

    ax.set_xlabel('AUROC difference', fontsize=14)
    ax.set_ylabel('Feature', fontsize=14)
    # Set the tick-label size on the axes itself rather than through the
    # pyplot "current axes" state.
    ax.tick_params(axis='both', labelsize=11)
    fig.tight_layout()

    return fig, ax

In [None]:
# Show the importance chart, labelled with the training feature names.
plot_feature_importances(
    perm_importance_result=ft_importance,
    feat_name=X_train.columns,
)

## Export the feature-importance plot

In [None]:
# Draw the chart again to obtain a figure handle, then write it to disk as SVG.
plot, ax = plot_feature_importances(
    perm_importance_result=ft_importance,
    feat_name=X_train.columns,
)
plot.savefig('images/personal_info_one_hot_strat.svg')