In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# Datasets
from aif360.datasets import MEPSDataset19
from aif360.explainers import MetricTextExplainer

# Fairness metrics
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
from sklearn.metrics import accuracy_score, balanced_accuracy_score


#graphiques
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Modèles d'entrainement 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


In [2]:
data_dir = "HERRERA_NATIVI_VLADIMIR"
metadata_Vlad = pd.read_csv("HERRERA_NATIVI_VLADIMIR/metadata.csv")
metadata_Javi = pd.read_csv("PENA_CASTANO_JAVIER/metadata.csv")
# La prof ma parlé d'un tensorboard

In [3]:
# Predictions : 

preds_weights = pd.read_csv("expe_log/predsOrig.csv")

In [4]:
print(f"Vladi : {metadata_Vlad.shape} | Javi : {metadata_Javi.shape}")

Vladi : (5627, 13) | Javi : (5477, 12)


In [5]:
# Elimination des colonnes non necessaire pour notre étude initial
colonnes_a_eliminer = ['OriginalImage[Width','Height]', 'OriginalImagePixelSpacing[x', 'y]']
data = metadata_Vlad.drop(columns=colonnes_a_eliminer, inplace=False) 
data = data.loc[data['Patient Age'] <= 120] # On élimine les patients qui ont plus de 120 ans 

# Analyse quantitative des données 

In [6]:
data_graphiques = data.copy() 

In [7]:
#Affichage de graphiques pour la visualisation de la repartition de l'age, le follow up, le genre et la position
fig = make_subplots(rows = 2, cols=2, subplot_titles = ["Follow Up", "Age", "Gender", "Position"])
fig.add_trace (go.Histogram(x=data_graphiques['Follow-up #'], name= "follow up"), row = 1, col=1)
fig.add_trace (go.Histogram(x=data_graphiques['Patient Age'], name= "Age"), row = 1, col=2)
fig.add_trace (go.Histogram(x=data_graphiques['Patient Gender'], name= "Gender"), row = 2, col=1)
fig.add_trace (go.Histogram(x=data_graphiques['View Position'], name= "Position"), row = 2, col=2)

fig.update_layout(title = "Histograms")
fig.write_image("Histogrammes1.png")

In [8]:
# Créons d'abord une colonne pour identifier les patients malades et non malades
data_graphiques['is_sick'] = ~data_graphiques['Finding Labels'].str.contains('No Finding')
data_graphiques['status'] = data_graphiques['is_sick'].map({True: 'Malade', False: 'Non malade'})

# 1. Histogramme pour l'âge
fig = make_subplots(rows=1, cols=3, subplot_titles=["Âge", "Genre", "Position"])

# Groupes d'âge
age_bins = [0, 20, 40, 60, 80, 100]  # Définir 4 tranches d'âge
age_labels = ['0-20', '21-40', '41-60', '61-80', '81+']
data_graphiques['age_group'] = pd.cut(data_graphiques['Patient Age'], bins=age_bins, labels=age_labels)

# Compte normalisé par groupe d'âge et par statut (malade/non malade)
age_counts = data_graphiques.groupby(['age_group', 'status']).size().unstack(fill_value=0)
age_counts_normalized = age_counts.div(age_counts.sum(axis=0), axis=1)

# Ajout du trace pour l'âge
for status, color in zip(['Malade', 'Non malade'], ['red', 'green']):
    if status in age_counts_normalized.columns:
        fig.add_trace(
            go.Bar(
                x=age_counts_normalized.index,
                y=age_counts_normalized[status],
                name=status,
                marker_color=color
            ),
            row=1, col=1
        )

# 2. Histogramme pour le genre
gender_counts = data_graphiques.groupby(['Patient Gender', 'status']).size().unstack(fill_value=0)
gender_counts_normalized = gender_counts.div(gender_counts.sum(axis=0), axis=1)

for status, color in zip(['Malade', 'Non malade'], ['red', 'green']):
    if status in gender_counts_normalized.columns:
        fig.add_trace(
            go.Bar(
                x=gender_counts_normalized.index,
                y=gender_counts_normalized[status],
                name=status,
                marker_color=color,
                showlegend=False
            ),
            row=1, col=2
        )

# 3. Histogramme pour la position
position_counts = data_graphiques.groupby(['View Position', 'status']).size().unstack(fill_value=0)
position_counts_normalized = position_counts.div(position_counts.sum(axis=0), axis=1)

for status, color in zip(['Malade', 'Non malade'], ['red', 'green']):
    if status in position_counts_normalized.columns:
        fig.add_trace(
            go.Bar(
                x=position_counts_normalized.index,
                y=position_counts_normalized[status],
                name=status,
                marker_color=color,
                showlegend=False
            ),
            row=1, col=3
        )

# Mise à jour de la mise en page
fig.update_layout(
    title="Distribution normalisée par catégorie (Malades vs Non malades)",
    height=500,
    width=1200,
    bargap=0.1,
    bargroupgap=0.2,
)

fig.update_yaxes(title_text="Proportion", range=[0, 1])
fig.update_xaxes(title_text="Groupe d'âge", row=1, col=1)
fig.update_xaxes(title_text="Genre", row=1, col=2)
fig.update_xaxes(title_text="Position", row=1, col=3)

# Sauvegarde de l'image
fig.write_image("Histogrammes_normalises.png")





## Encodage des dataframes

In [9]:
# Encodage des colonnes a valeurs non continue 

# Encodage binaire pour Patient Gender et View Position
data['Patient Gender'] = (data['Patient Gender'] == 'F').astype(int) # F est associé a la valeur 1 
data['View Position'] = (data['View Position'] == 'AP').astype(int) # AP est associé a la valeur 1  

# On effectue un encodage binaire pour finding labels du fait qu'on s'interesse uniquement au fait d'être malade ou pas (on n'essaye pas de prédire les maladies)
data['Finding Labels'] = (data['Finding Labels'] == 'No Finding').astype(int) # No Finding est associé a la valeur 1 

# dataset_encod.head()
# Encodage des predictions 
preds_weights['preds'] = (preds_weights['preds'] == 'sain').astype(int) # sain est associé a 1 

# Ages 
seuil_age = 40
data['Age Group'] = np.where(data['Patient Age'] >= seuil_age, 1, 0) # 1 répresent les "vieux"



In [10]:

# Sauvegarder une fois que le pretraitement sera fait 
#data.to_csv(f"{data_dir}/metadata_encod.csv", index=False) # sauvegarde de csv encodé 

# Pre processing

### Première methode naïve de pre-processing : reweighting

### Première aproche a la main pour "Patient Gender"


In [11]:

# -------------------------------------------  Répondération en fonction du genre  -------------------------------------------
nbtotSexe =  (data["Patient Gender"]).count()
nbFemmes = (data["Patient Gender"] == 1).sum()
nbHommes = (data["Patient Gender"] == 0).sum()
nbMalades = (data["Finding Labels"] == 0).sum()
nbSains= nbtotSexe - nbMalades


propH = nbHommes/nbtotSexe
propF = nbFemmes/nbtotSexe
propM = nbMalades/nbtotSexe
propS = 1 - propM

# études pour les femmes
nbFemmesMalades = ((data["Patient Gender"] == 1) & (data["Finding Labels"] == 0)).sum()
nbFemmesSain= nbFemmes - nbFemmesMalades
propFMF = nbFemmesMalades / nbFemmes
propFM = nbFemmesMalades / nbtotSexe
propFS = 1 - propFM
propFSF = 1 - propFMF

# étude pour les hommes
nbHommesMalades = ((data["Patient Gender"] == 0) & (data["Finding Labels"] == 0)).sum()
nbHommesSain = nbHommes - nbHommesMalades
propHMH = nbHommesMalades / nbHommes  # proportions des Hommes malades sur popHommes 
propHM = nbHommesMalades / nbtotSexe 
propHS = 1 - propHM
propHSH = 1 - propHMH  # proportions des Hommes sains sur popHommes 


poidsFM = (nbFemmes * nbMalades) / (nbFemmesMalades * nbtotSexe)
poidsFS = (nbFemmes * nbSains) / (nbFemmesSain * nbtotSexe)

poidsHM = (nbHommes * nbMalades) / (nbHommesMalades * nbtotSexe)
poidsHS = (nbHommes * nbSains) / (nbHommesSain * nbtotSexe)

print(f"Poids calculées pour les sexes :\nFemme Malade : {poidsFM}\nFemme Saine : {poidsFS}\nHomme Malade : {poidsHM}\nHomme Sain : {poidsHS}\n")

Poids calculées pour les sexes :
Femme Malade : 1.033302225044209
Femme Saine : 0.9734456018962356
Homme Malade : 0.9754662395970608
Homme Sain : 1.0217508263870834



### Implémentation de l'algorithme automatisé

In [12]:
# Favorisés : Jeunes (0-20 ans), Femmes (F), PA

# Défavorisés : Adultes (41-80 ans),Hommes(M),AP

def groupes_bw(S):
    if S == "Patient Gender":
        return [0, 1]
    elif S == "View Position":
        return [1, 0]
    elif S == "Age Group":
        return [1, 0]




def re_sampling_naive(D, S, Label):
    W = []
    groupes = groupes_bw(S)
    for s in groupes:
        for c in [0, 1]:  # classes - et +
            num = ((D[S] == s).sum())*((D[Label]==c).sum())
            denom = (((D[S] == s) & (D[Label] == c)).sum()) * D.shape[0]
            poid =   num / denom
            W.append(poid)
    return W


In [13]:
poids_VP = re_sampling_naive(data,"View Position", "Finding Labels" )

poidsAPM = poids_VP[0]
poidsAPS = poids_VP[1]

poidsPAM = poids_VP[2]
poidsPAS = poids_VP[3]
poids_VP

[np.float64(0.8660089311858132),
 np.float64(1.150692505153632),
 np.float64(1.1149736044363807),
 np.float64(0.9197266785927081)]

In [14]:
data['Age Group'] = np.where(data['Patient Age'] >= seuil_age, 1, 0) # 1 répresent les "vieux"

poids_AGE = re_sampling_naive(data,"Age Group", "Finding Labels" )

poidsVieuxM = poids_AGE[0]
poidsVieuxS = poids_AGE[1]

poidsJeunesM = poids_AGE[2]
poidsJeunesS = poids_AGE[3]
poids_AGE

[np.float64(0.9855372894715779),
 np.float64(1.0125771911382422),
 np.float64(1.0322783050765911),
 np.float64(0.9742161286775578)]

In [15]:
# First, create the new columns with default values
data_c = metadata_Vlad.drop(columns=colonnes_a_eliminer, inplace=False) 
data_c = data_c.loc[data_c['Patient Age'] <= 120] # On élimine les patients qui ont plus de 120 ans
data_c['Age Group'] = np.where(data_c['Patient Age'] >= seuil_age, 1, 0)
data_c['poids_reweigth_gender'] = 1.0
data_c['poids_reweigth_PA'] = 1.0
data_c['poids_reweigth_age'] = 1.0

# Assign weights based on gender and health status
# For females (Gender == "F")
data_c.loc[(data_c['Patient Gender'] == "F") & (data_c['Finding Labels'] != "No Finding"), 'poids_reweigth_gender'] = poidsFM
data_c.loc[(data_c['Patient Gender'] == "F") & (data_c['Finding Labels'] == "No Finding"), 'poids_reweigth_gender'] = poidsFS

# For males (Gender == 0)
data_c.loc[(data_c['Patient Gender'] == "M") & (data_c['Finding Labels'] != "No Finding"), 'poids_reweigth_gender'] = poidsHM
data_c.loc[(data_c['Patient Gender'] == "M") & (data_c['Finding Labels'] == "No Finding"), 'poids_reweigth_gender'] = poidsHS

# Assign weights based on view position and health status
# For AP position (View Position == 1)
data_c.loc[(data_c['View Position'] == "AP") & (data_c['Finding Labels'] != "No Finding"), 'poids_reweigth_PA'] = poidsAPM
data_c.loc[(data_c['View Position'] == "AP") & (data_c['Finding Labels'] == "No Finding"), 'poids_reweigth_PA'] = poidsAPS

# For PA position (View Position == 0)
data_c.loc[(data_c['View Position'] == "PA") & (data_c['Finding Labels'] != "No Finding"), 'poids_reweigth_PA'] = poidsPAM
data_c.loc[(data_c['View Position'] == "PA") & (data_c['Finding Labels'] == "No Finding"), 'poids_reweigth_PA'] = poidsPAS

# Assign weights based on age and health status
# For older patients (Age Group == 1)
data_c.loc[(data_c['Age Group'] == 1) & (data_c['Finding Labels'] != "No Finding"), 'poids_reweigth_age'] = poidsVieuxM
data_c.loc[(data_c['Age Group'] == 1) & (data_c['Finding Labels'] == "No Finding"), 'poids_reweigth_age'] = poidsVieuxS

# For younger patients (Age Group == 0)
data_c.loc[(data_c['Age Group'] == 0) & (data_c['Finding Labels'] != "No Finding"), 'poids_reweigth_age'] = poidsJeunesM
data_c.loc[(data_c['Age Group'] == 0) & (data_c['Finding Labels'] == "No Finding"), 'poids_reweigth_age'] = poidsJeunesS

# Save the updated data_cset with the new columns
data_c.to_csv("HERRERA_NATIVI_VLADIMIR/metadata_c_with_weights.csv", index=False)

# Display a sample to verify the weights were added correctly
print(data_c[['Patient Gender', 'View Position', 'Age Group', 'Finding Labels', 
           'poids_reweigth_gender', 'poids_reweigth_PA', 'poids_reweigth_age']].head(10))

  Patient Gender View Position  Age Group                   Finding Labels  \
0              M            PA          1                       No Finding   
1              M            PA          1                         Effusion   
2              F            PA          1                       No Finding   
3              F            PA          1                      Atelectasis   
4              F            PA          1                       No Finding   
5              F            AP          1                       No Finding   
6              F            AP          1                       No Finding   
7              F            AP          1                         Effusion   
8              F            AP          1             Atelectasis|Effusion   
9              F            AP          1  Effusion|Emphysema|Infiltration   

   poids_reweigth_gender  poids_reweigth_PA  poids_reweigth_age  
0               1.021751           0.919727            1.012577  
1        

## Uniform-Sampling reweighting

In [16]:
# Favorisés : Jeunes (0-20 ans), Femmes (F), PA

# Défavorisés : Adultes (41-80 ans), Hommes (M), AP

def groupes_bw(S):
    if S == "Patient Gender":
        return [0, 1]
    elif S == "View Position":
        return [1, 0]
    elif S == "Age Group":
        return [1, 0]


def uniform_sampling(D, S, Label):
    W = []
    groupes = groupes_bw(S)
    for s in groupes:
        for c in [0, 1]:
            num = ((D[S] == s).sum())*((D[Label]==c).sum())
            denom = (((D[S] == s) & (D[Label] == c)).sum()) * D.shape[0]
            poid = num / denom
            W.append(poid)

    nb_DN = ((D[S] == groupes[0]) & (D[Label] == 0)).sum()
    nb_DP = ((D[S] == groupes[0]) & (D[Label] == 1)).sum()
    nb_FN = ((D[S] == groupes[1]) & (D[Label] == 0)).sum()
    nb_FP = ((D[S] == groupes[1]) & (D[Label] == 1)).sum()

    nb_samples_DN = int(W[0]*nb_DN)
    nb_samples_DP = int(W[1]*nb_DP)
    nb_samples_FN = int(W[2]*nb_FN)
    nb_samples_FP = int(W[3]*nb_FP)

    samples_DN = D[(D[S] == groupes[0]) & (D[Label] == 0)].sample(n=nb_samples_DN, replace=True)  
    samples_DP = D[(D[S] == groupes[0]) & (D[Label] == 1)].sample(n=nb_samples_DP, replace=True)
    samples_FN = D[(D[S] == groupes[1]) & (D[Label] == 0)].sample(n=nb_samples_FN, replace=True)
    samples_FP = D[(D[S] == groupes[1]) & (D[Label] == 1)].sample(n=nb_samples_FP, replace=True)
    
    df_resampled = pd.concat([samples_DN, samples_DP, samples_FN, samples_FP])

    #print(nb_DN, nb_DP, nb_FN, nb_FP)
    return df_resampled





In [17]:
# Favorisés : Jeunes (0-20 ans), Femmes (F), PA
# Défavorisés : Adultes (41-80 ans), Hommes (M), AP

df_sexe = uniform_sampling(data, "Patient Gender", "Finding Labels")
df_age = uniform_sampling(data, "Patient Gender", "Finding Labels")


df_sexe.to_csv("HERRERA_NATIVI_VLADIMIR/metadata_uniform_genre.csv", index=False)
df_age.to_csv("HERRERA_NATIVI_VLADIMIR/metadata_uniform_age.csv", index=False)

### Creer une instance de StandardDataset de AIF360

In [27]:
import os
from aif360.datasets import StandardDataset

In [28]:
# Transformation du type de la colonne Image Index (string) en int
#  et création de la table de correspondance 

mapping_dict = {val: idx for idx, val in enumerate(data['Image Index'].unique())}
data['Image Index'] = data['Image Index'].map(mapping_dict)
mapping_table = pd.DataFrame(list(mapping_dict.items()), columns=['Ancien Image Index', 'Nouveau Index'])
#print(data_imgIdxInt.head())
#print(mapping_table.head())

In [29]:
data_train = data[data["train"] == True].copy()
data_val = data[data["train"] == False].copy()

In [30]:
# Get categorical column from one hot encoding (specitic to MEPSdataset)
# Here we create a dictionnary that links each categorical column name
# to the list of corresponding one hot encoded columns
categorical_columns_dic = {}
for col in data.columns:
    col_split = col.split("=")
    if len(col_split) > 1:
        cat_col = col_split[0]
        if not (cat_col in categorical_columns_dic.keys()):
            categorical_columns_dic[cat_col] = []
        categorical_columns_dic[cat_col].append(col)
categorical_features = categorical_columns_dic.keys()

In [31]:
MyDataset_train = StandardDataset(
    df=data_train,
    label_name="Finding Labels",
    favorable_classes=[0],
    protected_attribute_names=["Patient Gender", "Age Group"],
    privileged_classes=[[1],[0]], # a analyser 
    categorical_features=categorical_features,
    na_values=["?", "Unknown/Invalid"],
    custom_preprocessing=None,
    metadata=None,
)

MyDataset_val = StandardDataset(
    df=data_val,
    label_name="Finding Labels",
    favorable_classes=[0],
    protected_attribute_names=["Patient Gender","Age Group"],
    privileged_classes=[[1],[0]], # a analyser 
    categorical_features=categorical_features,
    na_values=["?", "Unknown/Invalid"],
    custom_preprocessing=None,
    metadata=None,
)

MyDataset = StandardDataset(
    df=data,
    label_name="Finding Labels",
    favorable_classes=[0],
    protected_attribute_names=["Patient Gender","Age Group"],
    privileged_classes=[[1],[0]], # a analyser 
    categorical_features=categorical_features,
    na_values=["?", "Unknown/Invalid"],
    custom_preprocessing=None,
    metadata=None,
)

## Preprocessing : Implémentations Reweighing AIF360

In [32]:
sens_ind = 0
sens_attr = MyDataset_train.protected_attribute_names[sens_ind]
unprivileged_groups = [
    {sens_attr: v}
    for v in MyDataset_train.unprivileged_protected_attributes[sens_ind]
]
privileged_groups = [
    {sens_attr: v}
    for v in MyDataset_train.privileged_protected_attributes[sens_ind]
]
sens_attr, unprivileged_groups, privileged_groups

('Patient Gender',
 [{'Patient Gender': np.float64(0.0)}],
 [{'Patient Gender': np.float64(1.0)}])

In [33]:
from aif360.algorithms.preprocessing import *

RW_gender = Reweighing(
    unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups
)

In [34]:
RW_gender.fit(MyDataset_train)
dataset_transf = RW_gender.transform(MyDataset)
dataset_transf_train = RW_gender.transform(MyDataset_train)
dataset_transf_val = RW_gender.transform(MyDataset_val)

dataset_transf_train , dataset_transf_train.instance_weights

(               instance weights    features                         \
                                                                      
                                 Image Index Follow-up # Patient ID   
 instance names                                                       
 0                      0.993727         0.0         0.0        6.0   
 1                      1.008043         1.0         0.0       25.0   
 3                      0.990955         3.0         0.0       72.0   
 4                      1.007274         4.0         0.0       90.0   
 5                      1.007274         5.0         1.0       90.0   
 ...                         ...         ...         ...        ...   
 5622                   1.007274      5621.0         0.0    30752.0   
 5623                   1.007274      5622.0         0.0    30772.0   
 5624                   0.990955      5623.0         1.0    30772.0   
 5625                   0.990955      5624.0         2.0    30772.0   
 5626 

####  Age RW : 

In [35]:
sens_ind_age = 1
sens_attr_age = MyDataset_train.protected_attribute_names[sens_ind_age]
unprivileged_groups_age = [
    {sens_attr_age: v}
    for v in MyDataset_train.unprivileged_protected_attributes[sens_ind_age]
]
privileged_groups_age = [
    {sens_attr_age: v}
    for v in MyDataset_train.privileged_protected_attributes[sens_ind_age]
]
sens_attr_age, unprivileged_groups_age, privileged_groups_age

('Age Group',
 [{'Age Group': np.float64(1.0)}],
 [{'Age Group': np.float64(0.0)}])

In [36]:
RW_age = Reweighing(
    unprivileged_groups=unprivileged_groups_age, privileged_groups=privileged_groups_age
)

RW_age.fit(MyDataset_train)
dataset_transf_age = RW_age.transform(MyDataset)

dataset_transf_age_train = RW_age.transform(MyDataset_train)
dataset_trans_age_val = RW_age.transform(MyDataset_val)

dataset_transf_age_train , dataset_transf_age_train.instance_weights

(               instance weights    features                         \
                                                                      
                                 Image Index Follow-up # Patient ID   
 instance names                                                       
 0                      0.997075         0.0         0.0        6.0   
 1                      1.003722         1.0         0.0       25.0   
 3                      1.003722         3.0         0.0       72.0   
 4                      0.997075         4.0         0.0       90.0   
 5                      0.997075         5.0         1.0       90.0   
 ...                         ...         ...         ...        ...   
 5622                   0.997075      5621.0         0.0    30752.0   
 5623                   1.005441      5622.0         0.0    30772.0   
 5624                   0.993207      5623.0         1.0    30772.0   
 5625                   0.993207      5624.0         2.0    30772.0   
 5626 

In [37]:
# Ajout des weights calculés avec aif360
data_c["weights_gender_aif360"] = dataset_transf.instance_weights
data_c["weights_age_aif360"] = dataset_transf_age.instance_weights
# Save the updated data_cset with the new columns
data_c.to_csv("HERRERA_NATIVI_VLADIMIR/metadata_c_with_weights.csv", index=False)
data_c


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,WEIGHTS,train,Age Group,poids_reweigth_gender,poids_reweigth_PA,poids_reweigth_age,weights_gender_aif360,weights_age_aif360
0,00000006_000.png,No Finding,0,6,81,M,PA,1,True,1,1.021751,0.919727,1.012577,0.993727,0.997075
1,00000025_000.png,Effusion,0,25,71,M,PA,1,True,1,0.975466,1.114974,0.985537,1.008043,1.003722
2,00000029_000.png,No Finding,0,29,59,F,PA,1,False,1,0.973446,0.919727,1.012577,1.007274,0.997075
3,00000072_000.png,Atelectasis,0,72,67,F,PA,1,True,1,1.033302,1.114974,0.985537,0.990955,1.003722
4,00000090_000.png,No Finding,0,90,67,F,PA,1,True,1,0.973446,0.919727,1.012577,1.007274,0.997075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5622,00030752_000.png,No Finding,0,30752,64,F,AP,1,True,1,0.973446,1.150693,1.012577,1.007274,0.997075
5623,00030772_000.png,No Finding,0,30772,26,F,AP,1,True,0,0.973446,1.150693,0.974216,1.007274,1.005441
5624,00030772_001.png,Consolidation,1,30772,26,F,AP,1,True,0,1.033302,0.866009,1.032278,0.990955,0.993207
5625,00030772_002.png,Consolidation,2,30772,26,F,AP,1,True,0,1.033302,0.866009,1.032278,0.990955,0.993207


## Disparate Impact Remover 

In [38]:
# Disparate Impact Remover : PATIENT GENDER 

DIR_gender = DisparateImpactRemover(repair_level=1.0, sensitive_attribute="Patient Gender")

DIR_gender.fit(MyDataset_train)

dataset_transf_dir_gender = DIR_gender.fit_transform(MyDataset)
dataset_transf_dir_gender_train = DIR_gender.fit_transform(MyDataset_train)
dataset_transf_dir_gender_val = DIR_gender.fit_transform(MyDataset_val)

dataset_transf_dir_gender_train

               instance weights    features                         \
                                                                     
                                Image Index Follow-up # Patient ID   
instance names                                                       
0                           1.0         0.0         0.0        6.0   
1                           1.0         1.0         0.0       25.0   
3                           1.0         0.0         0.0        6.0   
4                           1.0         1.0         0.0       25.0   
5                           1.0         5.0         0.0       25.0   
...                         ...         ...         ...        ...   
5622                        1.0      5615.0         0.0    30711.0   
5623                        1.0      5618.0         0.0    30772.0   
5624                        1.0      5619.0         0.0    30772.0   
5625                        1.0      5624.0         1.0    30772.0   
5626                

In [39]:
# Disparate Impact Remover : AGE GROUP

DIR_age = DisparateImpactRemover(repair_level=1.0, sensitive_attribute="Age Group")
DIR_age.fit(MyDataset_train)

dataset_transf_dir_age = DIR_age.fit_transform(MyDataset)
dataset_transf_dir_age_train = DIR_age.fit_transform(MyDataset_train)
dataset_transf_dir_age_val = DIR_age.fit_transform(MyDataset_val)

dataset_transf_dir_age_train

               instance weights    features                         \
                                                                     
                                Image Index Follow-up # Patient ID   
instance names                                                       
0                           1.0         0.0         0.0        6.0   
1                           1.0         0.0         0.0        6.0   
3                           1.0         3.0         0.0       72.0   
4                           1.0         3.0         0.0       72.0   
5                           1.0         5.0         1.0       72.0   
...                         ...         ...         ...        ...   
5622                        1.0      5619.0         0.0    30711.0   
5623                        1.0      5612.0         0.0    30687.0   
5624                        1.0      5613.0         1.0    30687.0   
5625                        1.0      5617.0         2.0    30687.0   
5626                

In [40]:
data_c["weights_gender_DIR_aif360"] = dataset_transf_dir_gender.instance_weights
data_c["weights_age_DIR_aif360"] = dataset_transf_dir_age.instance_weights

data_c.to_csv("HERRERA_NATIVI_VLADIMIR/metadata_c_with_weights.csv", index=False)
data_c

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,WEIGHTS,train,Age Group,poids_reweigth_gender,poids_reweigth_PA,poids_reweigth_age,weights_gender_aif360,weights_age_aif360,weights_gender_DIR_aif360,weights_age_DIR_aif360
0,00000006_000.png,No Finding,0,6,81,M,PA,1,True,1,1.021751,0.919727,1.012577,0.993727,0.997075,1.0,1.0
1,00000025_000.png,Effusion,0,25,71,M,PA,1,True,1,0.975466,1.114974,0.985537,1.008043,1.003722,1.0,1.0
2,00000029_000.png,No Finding,0,29,59,F,PA,1,False,1,0.973446,0.919727,1.012577,1.007274,0.997075,1.0,1.0
3,00000072_000.png,Atelectasis,0,72,67,F,PA,1,True,1,1.033302,1.114974,0.985537,0.990955,1.003722,1.0,1.0
4,00000090_000.png,No Finding,0,90,67,F,PA,1,True,1,0.973446,0.919727,1.012577,1.007274,0.997075,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5622,00030752_000.png,No Finding,0,30752,64,F,AP,1,True,1,0.973446,1.150693,1.012577,1.007274,0.997075,1.0,1.0
5623,00030772_000.png,No Finding,0,30772,26,F,AP,1,True,0,0.973446,1.150693,0.974216,1.007274,1.005441,1.0,1.0
5624,00030772_001.png,Consolidation,1,30772,26,F,AP,1,True,0,1.033302,0.866009,1.032278,0.990955,0.993207,1.0,1.0
5625,00030772_002.png,Consolidation,2,30772,26,F,AP,1,True,0,1.033302,0.866009,1.032278,0.990955,0.993207,1.0,1.0


# Post-processing


### On effectuera une regression logistique qui nos permettra d'obtenir les probabiltés de predictions

In [41]:

# Extract features and labels
X_train = data_train.drop(columns=['Finding Labels', 'Image Index', 'Patient ID'])
y_train = data_train['Finding Labels']

X_valid = data_val.drop(columns=['Finding Labels', 'Image Index', 'Patient ID'])
y_valid = data_val['Finding Labels']

# Extract instance weights from your dataframe
sample_weights_train = data_train['WEIGHTS']
sample_weights_valid = data_val['WEIGHTS']

# Create and train the model with instance weights
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='liblinear', random_state=42)
)

# Fit with sample weights
model.fit(
    X_train,
    y_train,
    **{'logisticregression__sample_weight': sample_weights_train}
)

# Make predictions
preds = model.predict(X_valid)

# Evaluate with sample weights
score = model.score(
    X_valid,
    y_valid,
    sample_weight=sample_weights_valid
)

print(f"Model validation score with weights: {score:.4f}")

Model validation score with weights: 0.5701


## Reject-option classification

In [45]:
from aif360.algorithms.postprocessing import RejectOptionClassification
from aif360.metrics import ClassificationMetric
import pandas as pd

def single_attribute_fairness_analysis(dataset_train, dataset_val, protected_attributes):
    """
    Perform ROC fairness adjustment for each protected attribute separately
    
    Args:
        dataset_train: AIF360 training dataset
        dataset_val: AIF360 validation dataset
        protected_attributes: List of protected attribute names to analyze
        
    Returns:
        Dictionary of fairness metrics per attribute
    """
    # 1. Train base model
    model = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)
    model.fit(dataset_train.features, dataset_train.labels.ravel())
    
    # 2. Get predicted scores
    val_scores = model.predict_proba(dataset_val.features)[:, 1]
    val_pred = dataset_val.copy(deepcopy=True)
    val_pred.scores = val_scores.reshape(-1, 1)
    
    results = {}
    
    for attr in protected_attributes:
        print(f"\n{'='*40}\nAnalyzing attribute: {attr}\n{'='*40}")
        
        # 3. Get attribute-specific groups
        attr_idx = dataset_val.protected_attribute_names.index(attr)
        privileged_group = [{attr: dataset_val.privileged_protected_attributes[attr_idx][0]}]
        unprivileged_group = [{attr: v} for v in dataset_val.unprivileged_protected_attributes[attr_idx]]
        
        # 4. Configure ROC
        roc = RejectOptionClassification(
            unprivileged_groups=unprivileged_group,
            privileged_groups=privileged_group,
            metric_name="Statistical parity difference",
            metric_ub=0.05,
            metric_lb=-0.05,
            low_class_thresh=0.01,
            high_class_thresh=0.99,
            num_class_thresh=50,
            num_ROC_margin=30,
        )
        
        # 5. Fit and predict
        roc.fit(dataset_val, val_pred)
        fair_pred = roc.predict(val_pred)
        
        # 6. Calculate metrics
        metric = ClassificationMetric(
            dataset_val,
            fair_pred,
            unprivileged_groups=unprivileged_group,
            privileged_groups=privileged_group
        )
        
        # Store results
        results[attr] = {
            'threshold': roc.classification_threshold,
            'margin': roc.ROC_margin,
            'spd': metric.statistical_parity_difference(),
            'eod': metric.equal_opportunity_difference(),
            'aod': metric.average_odds_difference()
        }
        
        # Print results
        print(f"\n{attr} fairness metrics:")
        print(f"  Optimal threshold: {results[attr]['threshold']:.4f}")
        print(f"  Statistical parity difference: {results[attr]['spd']:.4f}")
        print(f"  Equal opportunity difference: {results[attr]['eod']:.4f}")
        print(f"  Average odds difference: {results[attr]['aod']:.4f}")
    
    return results

# =================================================================
# Usage with your dataset
# =================================================================

# List of protected attributes to analyze separately
protected_attributes = ["Patient Gender", "Age Group"]

# Run analysis
metrics_results = single_attribute_fairness_analysis(
    MyDataset_train,
    MyDataset_val,
    protected_attributes
)

# Optional: Create combined fair predictions
final_predictions = MyDataset_val.copy(deepcopy=True)
for attr in protected_attributes:
    # Get attribute-specific predictions
    attr_idx = MyDataset_val.protected_attribute_names.index(attr)
    privileged_group = [{attr: MyDataset_val.privileged_protected_attributes[attr_idx][0]}]
    unprivileged_group = [{attr: v} for v in MyDataset_val.unprivileged_protected_attributes[attr_idx]]
    
    roc = RejectOptionClassification(
        unprivileged_groups=unprivileged_group,
        privileged_groups=privileged_group,
        metric_name="Statistical parity difference",
        metric_ub=0.05,
        metric_lb=-0.05
    ).fit(MyDataset_val, final_predictions)
    
    final_predictions = roc.predict(final_predictions)

print("\nFinal combined metrics:")
combined_metric = ClassificationMetric(
    MyDataset_val,
    final_predictions,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups
)
print(f"Statistical parity difference: {combined_metric.statistical_parity_difference():.4f}")
print(f"Equal opportunity difference: {combined_metric.equal_opportunity_difference():.4f}")


Analyzing attribute: Patient Gender

Patient Gender fairness metrics:
  Optimal threshold: 0.1300
  Statistical parity difference: 0.0115
  Equal opportunity difference: 0.0132
  Average odds difference: 0.0117

Analyzing attribute: Age Group

Age Group fairness metrics:
  Optimal threshold: 0.1500
  Statistical parity difference: 0.0030
  Equal opportunity difference: 0.0000
  Average odds difference: 0.0027



Unable to satisy fairness constraints




Final combined metrics:
Statistical parity difference: 0.1098
Equal opportunity difference: 0.0000


## Equalized-odds

# Calcul de Metrics


In [43]:
from aif360.sklearn.metrics import disparate_impact_ratio, base_rate

dir = disparate_impact_ratio(
    y_true=data["Finding Labels"], y_pred=preds_weights["preds"], prot_attr=data["Patient Gender"] , pos_label=1, sample_weight=data.WEIGHTS
)
br = base_rate(y_true=data["Finding Labels"] , pos_label=1, sample_weight=data.WEIGHTS)
dir, br

IndexError: Boolean index has wrong length: 5626 instead of 5627

In [None]:
dir = disparate_impact_ratio(
    y_true=data["Finding Labels"], y_pred=preds_weights["preds"], prot_attr=data["Age Group"] , pos_label=1, sample_weight=data.WEIGHTS
)
br = base_rate(y_true=data["Finding Labels"] , pos_label=1, sample_weight=data.WEIGHTS)
dir, br

(1.2696564700765505, np.float64(0.5414963568508975))