In [23]:
import numpy
import pandas as pd
from fairlearn.metrics import *
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
def make_clean_X_y(dataset, label_to_use = "mortality_five_years"):
    y = dataset[label_to_use]
   #drop all data that is completely empty
    # dataset = dataset.loc[~dataset.index.duplicated(keep='first')]
    dataset = dataset.drop(columns=["rmonth_survey","ryear_survey","HHID","PN","heart_condition_compared_to_prev", #maybe later just drop 1st heart featuer
                          "mortality_five_years","mortality_ten_years","Key","year_death"]) # this has duplicate
    dataset = dataset.dropna(axis=1, thresh=len(dataset)*0.75)
    return dataset, y

In [22]:
pd.read_pickle("hrs_data_2016.pkl").iloc[0:1].dropna(axis=1, how='all')

Unnamed: 0,HHID,PN,rmonth_survey,ryear_survey,age,live_nursing_facility,divorce_widow_status,health_status,health_status_compared_to_prev,high_blood_pressure,diabetes,cancer,lung_disease,heart_condition,stroke,psychiatric_emotional_problem,arthritis,pain,alcohol,shortness_breath,fatigue,cough,depressed_x,memory,depressed_y,dependents,type_house,own_or_rent,cooperation,tiredness,employment_status,state_live,gender,year_born_y,leave_inheritance,wave,Key,year_death,mortality_ten_years,mortality_five_years
0,10001,10,5.0,2016.0,76.0,5.0,5.0,2.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,5.0,5.0,4.0,2.0,3.0,1.0,5.0,2.0,1.0,1939.0,0.0,13.0,010001-010,0,0,0


In [3]:
small_data_2002_X,small_data_2002_y = make_clean_X_y(pd.read_pickle("hrs_data_2002.pkl").iloc[0:100])
small_data_2004_X,small_data_2004_y = make_clean_X_y(pd.read_pickle("hrs_data_2004.pkl").iloc[0:100])

In [19]:
small_data_2002_X.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             100 non-null    object 
 1   live_nursing_facility           100 non-null    float64
 2   race                            100 non-null    object 
 3   divorce_widow_status            96 non-null     object 
 4   marital_status                  100 non-null    object 
 5   health_status                   100 non-null    float64
 6   health_status_compared_to_prev  100 non-null    object 
 7   high_blood_pressure             100 non-null    float64
 8   diabetes                        100 non-null    float64
 9   cancer                          100 non-null    float64
 10  lung_disease                    100 non-null    float64
 11  heart_condition                 100 non-null    float64
 12  stroke                          100 n

In [4]:
small_data_2002_X.iloc[4]["divorce_widow_status"]==pd.NA

<NA>

In [5]:
full_dataset_X = pd.concat([small_data_2002_X,small_data_2004_X], ignore_index=True)
full_dataset_y = pd.concat([small_data_2002_y,small_data_2004_y], ignore_index=True)

In [6]:
def train_with_subset(data_subset_X, data_subset_y, data_full_X,data_full_y, model):
    # (regular) full to full eg. 2002 + all other years tested on 2002 + all other years
    X_train, X_test, y_train, y_test, race_train,race_test = train_test_split(data_full_X, data_full_y,data_full_X["race"], random_state=5)
    model.fit(X_train, y_train)

    # choose more from here: https://fairlearn.org/main/api_reference/generated/fairlearn.metrics.equal_opportunity_difference.html
    metrics_needed = {
        'demo_parity_diff' : demographic_parity_difference,
        'demo_parity_ratio' : demographic_parity_ratio,
        'eq_parity_diff' : equalized_odds_difference,
        'eq_parity_ratio' : equalized_odds_ratio,
        'count' : count
    }
    y_pred = classifier.predict(X_test)
    metric_frame = MetricFrame(
        metrics=metrics_needed, y_true=y_test, y_pred=y_pred, sensitive_features=race_test
    )
    metric_frame.by_group.plot.bar(
        subplots=True,
        layout=[3, 3],
        legend=False,
        figsize=[12, 8],
        title="Show all metrics",
    )

    # just duplicate above code for other sections

    # # subset to subset ex. 2002 only tested on 2002
    # X_train, X_test, y_train, y_test = train_test_split(data_full_X, data_full_y, random_state=5)
    # model.fit(X_train, y_train)

    # # subset to full ex. 2002 tested on 2002 + all other years
    # X_train, X_test, y_train, y_test = train_test_split(data_full_X, data_full_y, random_state=5)
    # model.fit(X_train, y_train)
    
    
    

In [None]:
## Might need to use the column transformer instead if there's still trouble with imputation
ct = make_column_transformer(
    (
        Pipeline(
            [
                ("imputer", SimpleImputer(strategy="mean")),
                ("normalizer", StandardScaler()),
            ]
        ),
        make_column_selector(dtype_include=number),
    ),
    (
        Pipeline(
            [
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("encoder", OneHotEncoder(drop="if_binary", sparse_output=False)),
            ]
        ),
        make_column_selector(dtype_include="category"),
    ),
)

In [24]:
from fairlearn.reductions import ErrorRate, EqualizedOdds, ExponentiatedGradient

# Control, no fairness (Decision Tree), should be replaceable with RandomForestClassifier()
classifier = DecisionTreeClassifier(min_samples_leaf=10, max_depth=4)
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant")),
    ("normalizer", StandardScaler()),
    ("clf", classifier),
    ])

train_with_subset(None, None, full_dataset_X,  full_dataset_X, pipe)



ValueError: could not convert string to float: 'missing_value'

In [None]:
# Expotentiated Gradient constraint, should be replaceable with RandomForestClassifier()

objective = ErrorRate(costs={'fp': 0.1, 'fn': 0.9})
constraint = EqualizedOdds(difference_bound=0.01) # can be changed to other constraints instead 
# see fairlearn.reduction around here: https://fairlearn.org/main/api_reference/generated/fairlearn.reductions.DemographicParity.html
classifier = DecisionTreeClassifier(min_samples_leaf=10, max_depth=4)
mitigator = ExponentiatedGradient(classifier, constraint, objective=objective)

pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant")),
    ("normalizer", StandardScaler()),
    ("clf", mitigator),
    ])

train_with_subset(None, None, full_dataset_X,  full_dataset_X, pipe)

In [None]:
# Adversarial learning, should be replaceable with RandomForestClassifier()
from fairlearn.adversarial import AdversarialFairnessClassifier

mitigator = AdversarialFairnessClassifier(
    backend="torch",
    predictor_model=[50, "leaky_relu"], # this is number of nodes and then activation function, see here: https://fairlearn.org/main/api_reference/generated/fairlearn.adversarial.AdversarialFairnessClassifier.html#:~:text=a%20BackendEngine%20class.-,predictor_model,-list%2C%20torch.nn
    adversary_model=[3, "leaky_relu"],
    batch_size=2**8,
    progress_updates=0.5,
    random_state=5,
    #contraints = "demographic_parity" # we can chagne this to equalized_odds as per https://fairlearn.org/main/api_reference/generated/fairlearn.adversarial.AdversarialFairnessClassifier.html#:~:text=constraintsstr%2C%20default,demographic_parity%E2%80%99%20or%20%E2%80%98equalized_odds%E2%80%99.
)


pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant")),
    ("normalizer", StandardScaler()),
    ("clf", mitigator),
    ])

train_with_subset(None, None, full_dataset_X,  full_dataset_X, pipe)

# slightly more advanced usage of adversarial training here: https://fairlearn.org/main/auto_examples/plot_adversarial_fine_tuning.html