In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# Datasets
from aif360.datasets import MEPSDataset19
from aif360.explainers import MetricTextExplainer

# Fairness metrics
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
from sklearn.metrics import accuracy_score, balanced_accuracy_score


#graphiques
import plotly.graph_objects as go
from plotly.subplots import make_subplots


pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


In [2]:
# Folder holding the metadata export used for the rest of the study
data_dir = "HERRERA_NATIVI_VLADIMIR"

# Read both contributors' metadata tables (one CSV per folder)
metadata_Vlad, metadata_Javi = (
    pd.read_csv(chemin)
    for chemin in (
        "HERRERA_NATIVI_VLADIMIR/metadata.csv",
        "PENA_CASTANO_JAVIER/metadata.csv",
    )
)
# TODO: the professor mentioned using a TensorBoard

In [3]:
# Predictions: read the prediction log stored in expe_log/predsOrig.csv

preds_weights = pd.read_csv("expe_log/predsOrig.csv")

In [4]:
# Sanity check: print the (rows, columns) shape of both metadata tables
print(f"Vladi : {metadata_Vlad.shape} | Javi : {metadata_Javi.shape}")

Vladi : (5627, 13) | Javi : (5477, 12)


In [5]:
# Merge the two datasets (currently disabled)
#metadata_combined = pd.concat([metadata_Vlad, metadata_Javi])

# Drop duplicates, keeping the first occurrence
#metadata_combined = metadata_combined.drop_duplicates(subset="Image Index", keep="first")

# Show a preview
#print(metadata_combined.head())

# Save the merged dataset if needed
#metadata_combined.to_csv("metadata_combined.csv", index=False)

#print(f"Merged  : {metadata_combined.shape}")

In [6]:
# Drop the columns that are not needed for this initial study
colonnes_a_eliminer = ['OriginalImage[Width','Height]', 'OriginalImagePixelSpacing[x', 'y]']

# Build the working frame in one chain:
#   - remove the unused columns,
#   - discard patients recorded as older than 120 years,
#   - take an explicit copy, because later cells assign new column values on
#     `data`; without the copy pandas treats it as a slice of another frame
#     and emits SettingWithCopyWarning on those assignments.
data = (
    metadata_Vlad
    .drop(columns=colonnes_a_eliminer)
    .loc[lambda df: df['Patient Age'] <= 120]
    .copy()
)

# Analyse quantitative des données 

In [7]:
# Independent copy used only for the figures, so columns added for plotting do not end up in `data`
data_graphiques = data.copy() 

In [9]:
# Overview figure: distribution of follow-up number, age, gender and view position
fig = make_subplots(rows = 2, cols=2, subplot_titles = ["Follow Up", "Age", "Gender", "Position"])

# One histogram per panel: (column to plot, trace name, subplot row, subplot column)
panneaux = [
    ('Follow-up #', "follow up", 1, 1),
    ('Patient Age', "Age", 1, 2),
    ('Patient Gender', "Gender", 2, 1),
    ('View Position', "Position", 2, 2),
]
for colonne, nom_trace, ligne, colonne_fig in panneaux:
    fig.add_trace(
        go.Histogram(x=data_graphiques[colonne], name=nom_trace),
        row=ligne,
        col=colonne_fig,
    )

fig.update_layout(title = "Histograms")
fig.write_image("Histogrammes1.png")

In [10]:
# Flag every image as sick / not sick: anything other than 'No Finding' counts as sick
data_graphiques['is_sick'] = ~data_graphiques['Finding Labels'].str.contains('No Finding', regex=False)
data_graphiques['status'] = data_graphiques['is_sick'].map({True: 'Malade', False: 'Non malade'})

# Five age groups. The last bin is open-ended so that it really is "81+"
# (ages above 100 used to fall outside every bin and became NaN), and
# include_lowest=True keeps age 0 inside the first bin.
age_bins = [0, 20, 40, 60, 80, np.inf]
age_labels = ['0-20', '21-40', '41-60', '61-80', '81+']
data_graphiques['age_group'] = pd.cut(
    data_graphiques['Patient Age'],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)


def _proportions_par_statut(frame, colonne):
    """Count rows per (colonne, status) and normalise each status column.

    Returns a ``(counts, proportions)`` pair; in ``proportions`` every status
    column sums to 1, i.e. it is the distribution of `colonne` within that
    status.
    """
    # observed=False keeps empty categories (relevant for the categorical
    # age groups) and states the current pandas default explicitly.
    counts = frame.groupby([colonne, 'status'], observed=False).size().unstack(fill_value=0)
    return counts, counts.div(counts.sum(axis=0), axis=1)


def _ajouter_barres(figure, proportions, col, showlegend=None):
    """Add one bar trace per status present in `proportions` to subplot (1, col)."""
    for status, color in zip(['Malade', 'Non malade'], ['red', 'green']):
        if status in proportions.columns:
            figure.add_trace(
                go.Bar(
                    x=proportions.index,
                    y=proportions[status],
                    name=status,
                    marker_color=color,
                    showlegend=showlegend,
                ),
                row=1, col=col
            )


fig = make_subplots(rows=1, cols=3, subplot_titles=["Âge", "Genre", "Position"])

# 1. Age: only this panel contributes legend entries
age_counts, age_counts_normalized = _proportions_par_statut(data_graphiques, 'age_group')
_ajouter_barres(fig, age_counts_normalized, col=1)

# 2. Gender
gender_counts, gender_counts_normalized = _proportions_par_statut(data_graphiques, 'Patient Gender')
_ajouter_barres(fig, gender_counts_normalized, col=2, showlegend=False)

# 3. View position
position_counts, position_counts_normalized = _proportions_par_statut(data_graphiques, 'View Position')
_ajouter_barres(fig, position_counts_normalized, col=3, showlegend=False)

# Layout
fig.update_layout(
    title="Distribution normalisée par catégorie (Malades vs Non malades)",
    height=500,
    width=1200,
    bargap=0.1,
    bargroupgap=0.2,
)

fig.update_yaxes(title_text="Proportion", range=[0, 1])
fig.update_xaxes(title_text="Groupe d'âge", row=1, col=1)
fig.update_xaxes(title_text="Genre", row=1, col=2)
fig.update_xaxes(title_text="Position", row=1, col=3)

# Save the figure
fig.write_image("Histogrammes_normalises.png")





## Encodage des dataframes

In [12]:
# Encode the non-continuous columns as 0/1 integers

def _encode_binaire(serie, valeur_positive):
    """Return 1 where `serie` equals `valeur_positive`, 0 elsewhere.

    A series that is already integer-typed is returned unchanged. Without
    this guard, running the cell a second time would compare the encoded
    integers with the original string and silently turn the whole column
    into zeros.
    """
    if pd.api.types.is_integer_dtype(serie):
        return serie
    return (serie == valeur_positive).astype(int)


# Binary encoding of Patient Gender and View Position
data['Patient Gender'] = _encode_binaire(data['Patient Gender'], 'F')  # F -> 1
data['View Position'] = _encode_binaire(data['View Position'], 'AP')  # AP -> 1

# Finding Labels is reduced to sick / not sick: the study does not try to
# predict which disease is present.
data['Finding Labels'] = _encode_binaire(data['Finding Labels'], 'No Finding')  # No Finding -> 1

# Predictions use the same convention
preds_weights['preds'] = _encode_binaire(preds_weights['preds'], 'sain')  # sain -> 1

# Age: 1 marks the "old" group (age >= seuil_age), 0 the "young" group
seuil_age = 40
data['Age Group'] = np.where(data['Patient Age'] >= seuil_age, 1, 0)



In [9]:

# Save once the preprocessing is finished (currently disabled)
#data.to_csv(f"{data_dir}/metadata_encod.csv", index=False) # writes the encoded csv

# Pre processing

### Première méthode naïve de pre-processing : reweighting

In [46]:

# -------------------------------------------  Reweighting by gender  -------------------------------------------
# Encoding used here: Patient Gender 1 = F, 0 = M ; Finding Labels 0 = sick, 1 = 'No Finding'.
nbtotSexe =  (data["Patient Gender"]).count()
nbFemmes = (data["Patient Gender"] == 1).sum()
nbHommes = (data["Patient Gender"] == 0).sum()
nbMalades = (data["Finding Labels"] == 0).sum()
nbSains= nbtotSexe - nbMalades

# Marginal proportions
propH = nbHommes/nbtotSexe
propF = nbFemmes/nbtotSexe
propM = nbMalades/nbtotSexe
propS = 1 - propM

# Women
nbFemmesMalades = ((data["Patient Gender"] == 1) & (data["Finding Labels"] == 0)).sum()
nbFemmesSain= nbFemmes - nbFemmesMalades
propFMF = nbFemmesMalades / nbFemmes    # sick women among women
propFM = nbFemmesMalades / nbtotSexe    # sick women among everybody
# Healthy women among everybody. This used to be `1 - propFM`, which is the
# share of everyone who is not a sick woman (it also counts all the men).
propFS = nbFemmesSain / nbtotSexe
propFSF = 1 - propFMF                   # healthy women among women

# Men
nbHommesMalades = ((data["Patient Gender"] == 0) & (data["Finding Labels"] == 0)).sum()
nbHommesSain = nbHommes - nbHommesMalades
propHMH = nbHommesMalades / nbHommes    # sick men among men
propHM = nbHommesMalades / nbtotSexe    # sick men among everybody
# Healthy men among everybody (same correction as propFS)
propHS = nbHommesSain / nbtotSexe
propHSH = 1 - propHMH                   # healthy men among men

# Reweighting weight of a (group, label) cell:
#   (size of the group * size of the label class) / (size of the cell * total)
poidsFM = (nbFemmes * nbMalades) / (nbFemmesMalades * nbtotSexe)
poidsFS = (nbFemmes * nbSains) / (nbFemmesSain * nbtotSexe)

poidsHM = (nbHommes * nbMalades) / (nbHommesMalades * nbtotSexe)
poidsHS = (nbHommes * nbSains) / (nbHommesSain * nbtotSexe)

print(f"Poids calculées pour les sexes :\nFemme Malade : {poidsFM}\nFemme Saine : {poidsFS}\nHomme Malade : {poidsHM}\nHomme Sain : {poidsHS}\n")

Poids calculées pour les sexes :
Femme Malade : 1.033302225044209
Femme Saine : 0.9734456018962356
Homme Malade : 0.9754662395970608
Homme Sain : 1.0217508263870834



In [None]:
# ------------------------------------------- Reweighting by X-ray view position -------------------------------------------
# Encoding used here: View Position 1 = AP, 0 = PA ; Finding Labels 0 = sick, 1 = 'No Finding'.

# Totals and label counts
nbtotPosi = data["View Position"].count()
nbMalades = data["Finding Labels"].eq(0).sum()
nbSains = nbtotPosi - nbMalades

# Group sizes
nbAP = data["View Position"].eq(1).sum()
nbPA = data["View Position"].eq(0).sum()

# Marginal proportions
propPA = nbPA/nbtotPosi
propAP = nbAP/nbtotPosi
propM = nbMalades/nbtotPosi
propS = 1 - propM

# AP images: sick / healthy
nbAPMalades = (data["View Position"].eq(1) & data["Finding Labels"].eq(0)).sum()
nbAPSain = nbAP - nbAPMalades

# PA images: sick / healthy
nbPAMalades = (data["View Position"].eq(0) & data["Finding Labels"].eq(0)).sum()
nbPASain = nbPA - nbPAMalades

# Reweighting weight of a (group, label) cell:
#   (size of the group * size of the label class) / (size of the cell * total)
poidsAPM = (nbAP * nbMalades) / (nbAPMalades * nbtotPosi)
poidsAPS = (nbAP * nbSains) / (nbAPSain * nbtotPosi)

poidsPAM = (nbPA * nbMalades) / (nbPAMalades * nbtotPosi)
poidsPAS = (nbPA * nbSains) / (nbPASain * nbtotPosi)

print(f"Poids calculées pour les positions :\nPA Malade : {poidsPAM}\nPA Sain : {poidsPAS}\nAp Malade : {poidsAPM}\nAP Sain : {poidsAPS}\n")

Poids calculées pour les positions :
PA Malade : 0.7788565407014685
PA Sain : 1.3165105810550617
Ap Malade : 0.9741578820448354
AP Sain : 1.113680807051663



In [None]:
# ------------------------------------------- Reweighting by 'Patient Age' group -------------------------------------------
# Encoding used here: Age Group 1 = old, 0 = young ; Finding Labels 0 = sick, 1 = 'No Finding'.

# Totals and label counts
nbtotAge = data['Age Group'].count()
nbMalades = data["Finding Labels"].eq(0).sum()
nbSains = nbtotAge - nbMalades

# Group sizes
nbVieux = data['Age Group'].eq(1).sum()
nbJeunes = data['Age Group'].eq(0).sum()

# Marginal proportions
propJeunes = nbJeunes/nbtotAge
propVieux = nbVieux/nbtotAge
propM = nbMalades/nbtotAge
propS = 1 - propM

# Old patients: sick / healthy
nbVieuxMalades = (data["Age Group"].eq(1) & data["Finding Labels"].eq(0)).sum()
nbVieuxSain = nbVieux - nbVieuxMalades

# Young patients: sick / healthy
nbJeunesMalades = (data["Age Group"].eq(0) & data["Finding Labels"].eq(0)).sum()
nbJeunesSain = nbJeunes - nbJeunesMalades

# Reweighting weight of a (group, label) cell:
#   (size of the group * size of the label class) / (size of the cell * total)
poidsVieuxM = (nbVieux * nbMalades) / (nbVieuxMalades * nbtotAge)
poidsVieuxS = (nbVieux * nbSains) / (nbVieuxSain * nbtotAge)

poidsJeunesM = (nbJeunes * nbMalades) / (nbJeunesMalades * nbtotAge)
poidsJeunesS = (nbJeunes * nbSains) / (nbJeunesSain * nbtotAge)

print(f"Poids calculées pour les groupes d'ages :\nJeune Malade : {poidsJeunesM}\nJeune Sain : {poidsJeunesS}\nVieux Malade : {poidsVieuxM}\nVieux Sain : {poidsVieuxS}\n")

Poids calculées pour les groupes d'ages :
Jeune Malade : 0.8247558025414787
Jeune Sain : 1.2193846532093517
Vieux Malade : 0.8574555435845033
Vieux Sain : 1.1638221362171366



In [15]:
# Start again from the raw metadata (string-valued columns) and attach one
# reweighting column per sensitive attribute, initialised to a neutral 1.0.
# NOTE(review): unlike `data`, this frame is not filtered on 'Patient Age' <= 120,
# while the weights were computed on the filtered frame — confirm this is intended.
data_c = metadata_Vlad.drop(columns=colonnes_a_eliminer)
data_c['Age Group'] = np.where(data_c['Patient Age'] >= seuil_age, 1, 0)
for colonne_poids in ('poids_reweigth_gender', 'poids_reweigth_PA', 'poids_reweigth_age'):
    data_c[colonne_poids] = 1.0

# Healthy means exactly 'No Finding'; every other label counts as sick
est_sain = data_c['Finding Labels'] == "No Finding"

# (weight column, attribute column, attribute value, weight if sick, weight if healthy)
regles_poids = [
    ('poids_reweigth_gender', 'Patient Gender', "F", poidsFM, poidsFS),
    ('poids_reweigth_gender', 'Patient Gender', "M", poidsHM, poidsHS),
    ('poids_reweigth_PA', 'View Position', "AP", poidsAPM, poidsAPS),
    ('poids_reweigth_PA', 'View Position', "PA", poidsPAM, poidsPAS),
    ('poids_reweigth_age', 'Age Group', 1, poidsVieuxM, poidsVieuxS),
    ('poids_reweigth_age', 'Age Group', 0, poidsJeunesM, poidsJeunesS),
]
for colonne_poids, attribut, valeur, poids_malade, poids_sain in regles_poids:
    dans_groupe = data_c[attribut] == valeur
    data_c.loc[dans_groupe & ~est_sain, colonne_poids] = poids_malade
    data_c.loc[dans_groupe & est_sain, colonne_poids] = poids_sain

# Save the frame together with its new weight columns
data_c.to_csv("HERRERA_NATIVI_VLADIMIR/metadata_c_with_weights.csv", index=False)

# Print a sample to check that the weights were filled in
colonnes_apercu = [
    'Patient Gender', 'View Position', 'Age Group', 'Finding Labels',
    'poids_reweigth_gender', 'poids_reweigth_PA', 'poids_reweigth_age',
]
print(data_c[colonnes_apercu].head(10))

  Patient Gender View Position  Age Group                   Finding Labels  \
0              M            PA          1                       No Finding   
1              M            PA          1                         Effusion   
2              F            PA          1                       No Finding   
3              F            PA          1                      Atelectasis   
4              F            PA          1                       No Finding   
5              F            AP          1                       No Finding   
6              F            AP          1                       No Finding   
7              F            AP          1                         Effusion   
8              F            AP          1             Atelectasis|Effusion   
9              F            AP          1  Effusion|Emphysema|Infiltration   

   poids_reweigth_gender  poids_reweigth_PA  poids_reweigth_age  
0               1.151864           1.316511            1.163822  
1        

## Uniform-Sampling reweighting

In [53]:
# Favorisés : Jeunes (0-20 ans), Femmes (F), PA

# Défavorisés : Adultes (41-80 ans), Hommes (M), AP

def groupes_bw(S):
    """Return the ordered pair of encoded values for sensitive attribute `S`.

    The pair is a fresh two-element list for "Patient Gender", "View Position"
    and "Age Group"; any other name yields None.
    Presumably the order is [disadvantaged value, favoured value] — confirm
    against the callers.
    """
    # Rebuilt on every call so callers always get their own list object
    ordre_par_attribut = {
        "Patient Gender": [0, 1],
        "View Position": [1, 0],
        "Age Group": [1, 0],
    }
    return ordre_par_attribut.get(S)


def uniform_sampling(D, S, Label, random_state=None):
    """Resample `D` so that `S` and `Label` look statistically independent.

    For every (group value s, label c) cell, the target size is the count
    expected under independence, ``|D[S == s]| * |D[Label == c]| / |D|``
    rounded down, and that many rows are drawn with replacement from the cell.

    Parameters
    ----------
    D : pandas.DataFrame
        Frame containing the columns `S` and `Label`. `Label` is compared
        against the values 0 and 1.
    S : str
        Name of the sensitive-attribute column; must be known to ``groupes_bw``.
    Label : str
        Name of the label column.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator for the draws. None keeps the previous behaviour
        (pandas falls back to NumPy's global random state).

    Returns
    -------
    pandas.DataFrame
        The drawn rows, concatenated cell by cell in the order
        (groupes[0], 0), (groupes[0], 1), (groupes[1], 0), (groupes[1], 1).
        Original index labels are kept, so they may repeat.

    Raises
    ------
    ValueError
        If ``groupes_bw`` does not know `S`.
    """
    groupes = groupes_bw(S)
    if groupes is None:
        raise ValueError(f"Unsupported sensitive attribute: {S!r}")

    # One generator shared by all draws, so that an integer seed does not
    # replay the same random stream for every cell.
    rng = random_state
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)

    nb_total = D.shape[0]
    echantillons = []
    for s in groupes:
        masque_groupe = D[S] == s
        for c in [0, 1]:
            masque_label = D[Label] == c
            cellule = D[masque_groupe & masque_label]
            if cellule.empty:
                # Nothing to draw from; the weight would also divide by zero
                continue
            # Exact integer arithmetic: int(weight * cell_size) could fall one
            # short of the intended count because of float round-off.
            nb_attendu = (int(masque_groupe.sum()) * int(masque_label.sum())) // nb_total
            echantillons.append(
                cellule.sample(n=nb_attendu, replace=True, random_state=rng)
            )

    if not echantillons:
        # Every cell was empty: return an empty frame with the same columns
        return D.iloc[0:0]
    return pd.concat(echantillons)





In [54]:
# Favorisés : Jeunes (0-20 ans), Femmes (F), PA
# Défavorisés : Adultes (41-80 ans), Hommes (M), AP

# Resample `data` with uniform_sampling, using gender as the sensitive attribute and Finding Labels as the label
df_sexe = uniform_sampling(data, "Patient Gender", "Finding Labels")

# Display the resampled frame
df_sexe

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,WEIGHTS,train,Age Group
5551,00029976_005.png,0,5,29976,52,0,0,1,False,1
4882,00025237_000.png,0,0,25237,62,0,0,1,True,1
2202,00010896_001.png,0,1,10896,72,0,0,1,False,1
4989,00026197_004.png,0,4,26197,41,0,0,1,True,1
333,00002072_013.png,0,13,2072,10,0,1,1,True,0
...,...,...,...,...,...,...,...,...,...,...
5316,00028108_000.png,1,0,28108,25,1,0,1,False,0
2021,00010530_005.png,1,5,10530,36,1,0,1,True,0
3445,00016397_005.png,1,5,16397,45,1,1,1,True,1
5050,00026560_001.png,1,1,26560,44,1,1,1,True,1


### Créer une instance de StandardDataset d'AIF360

In [16]:
import os
from aif360.datasets import StandardDataset

In [17]:
# Replace the 'Image Index' strings by integer ids (numbered in order of first
# appearance) and keep a lookup table from the old value to the new id.
noms_images = data['Image Index'].unique()
mapping_dict = dict(zip(noms_images, range(len(noms_images))))
data['Image Index'] = data['Image Index'].map(mapping_dict)

mapping_table = pd.DataFrame(
    {
        'Ancien Image Index': list(mapping_dict.keys()),
        'Nouveau Index': list(mapping_dict.values()),
    }
)
print(mapping_table.head())

data

  Ancien Image Index  Nouveau Index
0   00000006_000.png              0
1   00000025_000.png              1
2   00000029_000.png              2
3   00000072_000.png              3
4   00000090_000.png              4


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,WEIGHTS,train,Age Group
0,0,1,0,6,81,0,0,1,True,1
1,1,0,0,25,71,0,0,1,True,1
2,2,1,0,29,59,1,0,1,False,1
3,3,0,0,72,67,1,0,1,True,1
4,4,1,0,90,67,1,0,1,True,1
...,...,...,...,...,...,...,...,...,...,...
5622,5622,1,0,30752,64,1,1,1,True,1
5623,5623,1,0,30772,26,1,1,1,True,0
5624,5624,0,1,30772,26,1,1,1,True,0
5625,5625,0,2,30772,26,1,1,1,True,0


In [18]:
# Rebuild the categorical columns from one-hot column names (specific to the MEPS dataset).
# A column named "<category>=<value>" is filed under "<category>"; columns
# without "=" are ignored.
categorical_columns_dic = {}
for col in data.columns:
    if "=" not in col:
        continue
    categorical_columns_dic.setdefault(col.split("=")[0], []).append(col)
categorical_features = categorical_columns_dic.keys()

In [None]:
# Wrap the encoded frame in an AIF360 StandardDataset, with gender as the protected attribute.
# NOTE(review): label 0 is declared favourable here, while the metric cells of this
# notebook use pos_label=1 — confirm which convention is intended.
MyDataset = StandardDataset(
    # data
    df=data,
    na_values=["?", "Unknown/Invalid"],
    categorical_features=categorical_features,
    instance_weights_name="WEIGHTS",
    # label
    label_name="Finding Labels",
    favorable_classes=[0],
    # protected attribute
    protected_attribute_names=["Patient Gender"],
    privileged_classes=[[1]],  # still to be analysed
    # no extra preprocessing or metadata
    custom_preprocessing=None,
    metadata=None,
)



## Preprocessing : implémentation du Reweighing d'AIF360

In [33]:
# Build the group dictionaries for the first protected attribute of MyDataset
sens_ind = 0
sens_attr = MyDataset.protected_attribute_names[sens_ind]

# One {attribute name: value} dictionary per unprivileged value
unprivileged_groups = []
for valeur in MyDataset.unprivileged_protected_attributes[sens_ind]:
    unprivileged_groups.append({sens_attr: valeur})

# One {attribute name: value} dictionary per privileged value
privileged_groups = []
for valeur in MyDataset.privileged_protected_attributes[sens_ind]:
    privileged_groups.append({sens_attr: valeur})

sens_attr, unprivileged_groups, privileged_groups

('Patient Gender',
 [{'Patient Gender': np.float64(0.0)}],
 [{'Patient Gender': np.float64(1.0)}])

In [34]:
# Import only the name that is used: a star import hides where names come
# from and can silently overwrite notebook variables.
from aif360.algorithms.preprocessing import Reweighing

# Reweighing pre-processor configured with the group dictionaries built above
RW = Reweighing(
    unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups
)

In [35]:
# Fit the reweighing on MyDataset, then apply it.
# NOTE(review): "train" and "val" are both obtained from the same MyDataset — confirm whether a split was intended.
RW.fit(MyDataset)
dataset_transf_train, dataset_transf_val = (
    RW.transform(MyDataset),
    RW.transform(MyDataset),
)

In [39]:
# Show the instance weights carried by the transformed dataset
dataset_transf_train.instance_weights

array([1.02189253, 0.97532303, 0.97327261, ..., 1.03351918, 1.03351918,
       1.02189253], shape=(5627,))

# Post-processing


## Reject-option classification

In [55]:
from aif360.algorithms.postprocessing.reject_option_classification import (
    RejectOptionClassification,
)

## Equalized-odds

### Calcul des métriques


In [22]:
from aif360.sklearn.metrics import disparate_impact_ratio, base_rate

# Disparate impact ratio of the predictions with respect to gender, and base rate of label 1.
# NOTE(review): the name `dir` shadows Python's builtin dir() for the rest of the session.
dir = disparate_impact_ratio(
    y_true=data["Finding Labels"],
    y_pred=preds_weights["preds"],
    prot_attr=data["Patient Gender"],
    pos_label=1,
    sample_weight=data.WEIGHTS,
)
br = base_rate(
    y_true=data["Finding Labels"],
    pos_label=1,
    sample_weight=data.WEIGHTS,
)
dir, br

(1.0879087689318454, np.float64(0.5414963568508975))

In [23]:
# Same metrics, this time with the age group as the protected attribute.
# NOTE(review): the name `dir` shadows Python's builtin dir() for the rest of the session.
dir = disparate_impact_ratio(
    y_true=data["Finding Labels"],
    y_pred=preds_weights["preds"],
    prot_attr=data["Age Group"],
    pos_label=1,
    sample_weight=data.WEIGHTS,
)
br = base_rate(
    y_true=data["Finding Labels"],
    pos_label=1,
    sample_weight=data.WEIGHTS,
)
dir, br

(1.2696564700765505, np.float64(0.5414963568508975))