## Instalar Librerías e Importar el Dataset

In [None]:
!pip install ucimlrepo
!pip install aif360
!pip install 'aif360[Reductions]'
!pip install 'aif360[inFairness]'

In [2]:
from ucimlrepo import fetch_ucirepo
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing import Reweighing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


In [3]:
# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

# metadata
print(adult.metadata)

# variable information
print(adult.variables)


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

## Preparar el Dataset

In [4]:
def clean_labels(label: str):
    return label.replace(".", "").replace(" ", "_")

X_df = pd.DataFrame(X)
y_df = pd.DataFrame(y)

y_df = y_df.map(clean_labels)

df = pd.concat([X_df, y_df], axis=1)
df.dropna(inplace=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
X_df_clean = df.drop(columns=['income'])
y_df_clean = df['income']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_df_clean, y_df_clean, test_size=0.3, stratify=y_df_clean, random_state=42)

In [None]:
X_train_pre = X_train.copy()
y_train_pre = y_train.copy()
X_test_pre = X_test.copy()
y_test_pre = y_test.copy()


X_train_in = X_train.copy()
y_train_in = y_train.copy()
X_test_in = X_test.copy()
y_test_in = y_test.copy()

X_train_post = X_train.copy()
y_train_post = y_train.copy()
X_test_post = X_test.copy()
y_test_post = y_test.copy()

In [8]:
# numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
# categorical_cols = df.select_dtypes(include=['object']).columns.drop('income')
# numeric_cols, categorical_cols

In [9]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns.drop('income')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ]
)

## Entrenamiento de Modelos
# Logistic Regression Model

In [10]:
logistic_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

logistic_pipeline.fit(X_train, y_train)
logistic_pred = logistic_pipeline.predict(X_test)
acc_lr = accuracy_score(y_test, logistic_pred)
print(f'Accuracy for Logistic Regression Model: {acc_lr:.2f}')

Accuracy for Logistic Regression Model: 0.85


# Random Forest Classifier Model

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

randomF_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

randomF_pipeline.fit(X_train, y_train)
randomF_pred = randomF_pipeline.predict(X_test)
acc_rf = accuracy_score(y_test, randomF_pred)
print(f'Accuracy for Random Forest Classifier Model: {acc_rf:.2f}')

Accuracy for Random Forest Classifier Model: 0.85


# K Neighbors Classifier Model

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])

knn_pipeline.fit(X_train, y_train)
knn_pred = knn_pipeline.predict(X_test)
acc_kn = accuracy_score(y_test, knn_pred)
print(f'Accuracy for K Neighbors Classifier Model: {acc_kn:.2f}')

Accuracy for K Neighbors Classifier Model: 0.83


# Métricas de Equidad
### Modelo escogido por mejor rendimiento: **Logistic Regression Model**
### Atributos sensibles a estudiar: **age** y **sex**

## Independencia (Demographic Parity)

In [13]:
best_pred = logistic_pred

In [14]:
# Sex map
mapping_sex = {"Female": 0, "Male": 1}
# Income map
mapping_income = {"<=50K": 0, ">50K": 1}

X_test_bias = X_test.copy()

def map_col(df, col, mapping) -> pd.DataFrame:
    df[col] = df[col].map(mapping)
    return df

X_test['age'] = X_test_bias['age'].apply(lambda x: 1 if x < 60 else 0)
X_test_bias = map_col(X_test_bias, "sex", mapping_sex)
X_test_bias = X_test_bias[['age', 'sex']]
X_test_bias['y_pred'] = randomF_pred
X_test_bias = map_col(X_test_bias, "y_pred", mapping_income)
X_test_bias.head()

Unnamed: 0,age,sex,y_pred
25618,45,1,1
42065,56,0,0
45990,56,0,0
38179,22,0,0
39821,17,1,0


In [15]:
!pip install aif360



In [16]:
from aif360.datasets import BinaryLabelDataset

df_aif = BinaryLabelDataset(
    df=X_test_bias,
    label_names=['y_pred'],
    protected_attribute_names=['age', 'sex'],
)

In [17]:
from aif360.metrics import BinaryLabelDatasetMetric

# Para 'age'
metric_age = BinaryLabelDatasetMetric(
    df_aif,
    privileged_groups=[{"age": 1}],
    unprivileged_groups=[{"age": 0}]
)
print("Disparate Impact for age:", metric_age.disparate_impact())

# Para 'sex'
metric_sex = BinaryLabelDatasetMetric(
    df_aif,
    privileged_groups=[{"sex": 1}],
    unprivileged_groups=[{"sex": 0}]
)
print("Disparate Impact for sex:", metric_sex.disparate_impact())

Disparate Impact for age: nan
Disparate Impact for sex: 0.31321716957076073


  return (self.num_positives(privileged=privileged)


In [18]:
# import numpy as np
# from scipy.sparse import issparse
# from aif360.metrics import ClassificationMetric

# # Crear el dataset con las predicciones del modelo
# df_aif_pred = df_aif.copy(deepcopy=True)
# df_aif_pred.labels = randomF_pred.reshape(-1, 1)

# # Definir grupos privilegiados y no privilegiados
# privileged_groups = [{'sex': 1.0, 'age': 1.0}]
# unprivileged_groups = [{'sex': 0.0, 'age': 0.0}]

# # Calcular métricas de clasificación
# classification_metric = ClassificationMetric(
#     df_aif,
#     df_aif_pred,
#     unprivileged_groups=unprivileged_groups,
#     privileged_groups=privileged_groups
# )

# # Calcular el Valor Predictivo Positivo (PPV)
# privileged_ppv = classification_metric.positive_predictive_value(privileged=True)
# unprivileged_ppv = classification_metric.positive_predictive_value(privileged=False)

# # Calcular la diferencia de Predictive Parity
# predictive_parity_difference = privileged_ppv - unprivileged_ppv

# # Calcular la precisión
# accuracy = accuracy_score(y_test, randomF_pred)

# # Mostrar resultados
# print(f"Accuracy: {accuracy}")
# print(f"Privileged PPV: {privileged_ppv}")
# print(f"Unprivileged PPV: {unprivileged_ppv}")
# print(f"Predictive Parity Difference: {predictive_parity_difference}")

In [19]:
# import numpy as np
# from scipy.sparse import issparse
# from aif360.metrics import ClassificationMetric

# # Crear el dataset con las predicciones del modelo
# df_aif_pred = df_aif.copy(deepcopy=True)
# print(df_aif_pred)
# df_aif_pred.labels = randomF_pred.reshape(-1, 1)

# classification_metric_age = ClassificationMetric(
#     df_aif,
#     df_aif_pred,
#     privileged_groups=[{"age": 1}],
#     unprivileged_groups=[{"age": 0}]
# )
# # Calcular el Valor Predictivo Positivo (PPV)
# privileged_ppv_age = classification_metric_age.positive_predictive_value(privileged=True)
# unprivileged_ppv_age = classification_metric_age.positive_predictive_value(privileged=False)

# predictive_parity_difference_age = privileged_ppv_age - unprivileged_ppv_age
# print(f"Privileged PPV age: {privileged_ppv_age}")
# print(f"Unprivileged PPV age: {unprivileged_ppv_age}")
# print(f"Predictive Parity Difference age: {predictive_parity_difference_age}")

# classification_metric_sex = ClassificationMetric(
#     df_aif,
#     df_aif_pred,
#     privileged_groups=[{"sex": 1}],
#     unprivileged_groups=[{"sex": 0}]
# )
# # Calcular el Valor Predictivo Positivo (PPV)
# privileged_ppv_sex = classification_metric_sex.positive_predictive_value(privileged=True)
# unprivileged_ppv_sex = classification_metric_sex.positive_predictive_value(privileged=False)


# predictive_parity_difference_sex = privileged_ppv_sex - unprivileged_ppv_sex

# # Calcular la precisión
# accuracy = accuracy_score(y_test, randomF_pred)

# # Mostrar resultados
# print(f"Accuracy: {accuracy}")
# print(f"Privileged PPV: {privileged_ppv_sex}")
# print(f"Unprivileged PPV: {unprivileged_ppv_sex}")
# print(f"Predictive Parity Difference: {predictive_parity_difference_sex}")

## Separación (Equalized Odds)

In [20]:
def map_col(df, col, mapping) -> pd.DataFrame:
    df[col] = df[col].map(mapping)
    return df

def calculate_tpr_fpr_sex(data, subgroup):
    group_data = data[data['sex'] == subgroup]
    true_positive = np.sum((group_data['y_pred'] == 1) & (group_data['y_true'] == 1))
    false_positive = np.sum((group_data['y_pred'] == 1) & (group_data['y_true'] == 0))
    total_positive = np.sum(group_data['y_true'] == 1)
    total_negative = np.sum(group_data['y_true'] == 0)

    tpr = true_positive / total_positive if total_positive > 0 else 0
    fpr = false_positive / total_negative if total_negative > 0 else 0

    return tpr, fpr

def calculate_tpr_fpr_age(data, subgroup):
    group_data = data[data['age'] == subgroup]
    true_positive = np.sum((group_data['y_pred'] == 1) & (group_data['y_true'] == 1))
    false_positive = np.sum((group_data['y_pred'] == 1) & (group_data['y_true'] == 0))
    total_positive = np.sum(group_data['y_true'] == 1)
    total_negative = np.sum(group_data['y_true'] == 0)

    tpr = true_positive / total_positive if total_positive > 0 else 0
    fpr = false_positive / total_negative if total_negative > 0 else 0

    return tpr, fpr

In [21]:
# Map to convert to binary
# Sex map
mapping_sex = {"Female": 0, "Male": 1}
# Income map
mapping_income = {"<=50K": 0, ">50K": 1}

In [22]:
X_test_bias_ed = X_test.copy()

X_test_bias_ed['age'] = X_test_bias_ed['age'].apply(lambda x: 1 if x < 60 else 0) # Age map to binary
X_test_bias_ed = map_col(X_test_bias_ed, "sex", mapping_sex) # Sex map to binary
X_test_bias_ed = X_test_bias_ed[['age', 'sex']]
X_test_bias_ed.loc[:, 'y_pred'] = logistic_pred
X_test_bias_ed = map_col(X_test_bias_ed, "y_pred", mapping_income) # Income map to binary y_pred
X_test_bias_ed['y_true'] = y_test.values
X_test_bias_ed = map_col(X_test_bias_ed, "y_true", mapping_income) # Income map to binary y_true
X_test_bias_ed.head()

Unnamed: 0,age,sex,y_pred,y_true
25618,1,1,1,1
42065,1,0,0,1
45990,1,0,0,0
38179,1,0,0,0
39821,1,1,0,0


### Equalized Odds for **age** atribute
Map: [**0: Older Adults**, **1: Young Adults**]

In [23]:
for age in [0, 1]:
    tpr, fpr = calculate_tpr_fpr_age(X_test_bias_ed, age)
    age_label = 'Older Adults' if age == 0 else 'Young Adults'
    print(f"Age Group: {age_label}")
    print(f"  True Positive Rate (TPR): {tpr:.2f}")
    print(f"  False Positive Rate (FPR): {fpr:.2f}")

Age Group: Older Adults
  True Positive Rate (TPR): 0.00
  False Positive Rate (FPR): 0.00
Age Group: Young Adults
  True Positive Rate (TPR): 0.61
  False Positive Rate (FPR): 0.07


### Equalized Odds for **sex** atribute
Map: [**0: Female**, **1: Male**]

In [24]:
for sex in [0, 1]:
    tpr, fpr = calculate_tpr_fpr_sex(X_test_bias_ed, sex)
    sex_label = 'Female' if sex == 0 else 'Male'
    print(f"Sex Group: {sex_label}")
    print(f"  True Positive Rate (TPR): {tpr:.2f}")
    print(f"  False Positive Rate (FPR): {fpr:.2f}")

Sex Group: Female
  True Positive Rate (TPR): 0.49
  False Positive Rate (FPR): 0.02
Sex Group: Male
  True Positive Rate (TPR): 0.63
  False Positive Rate (FPR): 0.10


## Suficiencia (Predictive Parity)

In [25]:
# # Instalar librerías necesarias
# !pip install aif360

# import pandas as pd
# import numpy as np
# from scipy.sparse import issparse
# from sklearn.model_selection import train_test_split
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# from aif360.datasets import BinaryLabelDataset
# from aif360.metrics import ClassificationMetric

# # Cargar el dataset desde el enlace proporcionado
# url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# columns = [
#     "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
#     "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
#     "hours-per-week", "native-country", "income"
# ]
# df = pd.read_csv(url, header=None, names=columns, na_values=" ?", skipinitialspace=True)
# df.dropna(inplace=True)

# # Procesar la columna 'income' para tener etiquetas 0 y 1
# df['income'] = df['income'].apply(lambda x: 1 if x.strip() == '>50K' else 0)

# # Convertir 'sex' a valores binarios
# df['sex'] = df['sex'].apply(lambda x: 1 if x.strip() == 'Male' else 0)

# # Separar características y objetivo
# X = df.drop(columns=['income'])
# y = df['income']

# # Mantener atributos sensibles separados
# sensitive_features = ['age', 'sex']
# sensitive_data = X[sensitive_features]

# # Eliminar atributos sensibles de X
# X = X.drop(columns=sensitive_features)

# # Identificar columnas numéricas y categóricas
# numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
# categorical_cols = X.select_dtypes(include=['object']).columns

# # Preprocesador
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numeric_cols),
#         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
#     ]
# )

# # Dividir los datos
# X_train, X_test, y_train, y_test, sensitive_train, sensitive_test = train_test_split(
#     X, y, sensitive_data, test_size=0.3, random_state=42, stratify=y
# )

# # Crear el pipeline
# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])

# # Entrenar el modelo
# pipeline.fit(X_train, y_train)

# # Realizar predicciones
# y_pred = pipeline.predict(X_test)

# # Transformar X_test
# X_test_transformed = pipeline['preprocessor'].transform(X_test)

# # Verificar si la matriz transformada es dispersa y convertirla a densa si es necesario
# if issparse(X_test_transformed):
#     X_test_transformed = X_test_transformed.toarray()

# # Crear el DataFrame con los datos transformados
# feature_names = pipeline['preprocessor'].get_feature_names_out()
# X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names, index=X_test.index)

# # Diagnosticar valores faltantes en los datos sensibles y etiquetas
# print("Valores faltantes en sensitive_test:", sensitive_test.isna().sum())
# print("Valores faltantes en y_test:", y_test.isna().sum())

# # Combinar con atributos sensibles y etiquetas
# df_test = pd.concat([X_test_transformed, sensitive_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)

# # Diagnosticar valores faltantes después de combinar
# print("Valores faltantes después de combinar:", df_test.isna().sum())

# # Eliminar filas con valores faltantes
# df_test.dropna(inplace=True)

# # Verificar que no haya valores faltantes después de limpiar
# print("Valores faltantes después de limpiar:", df_test.isna().sum())

# # Filtrar predicciones para coincidir con las filas restantes en df_test
# filtered_indices = df_test.index
# y_pred_filtered = y_pred[filtered_indices]

# # Crear el atributo protegido 'age_group'
# df_test['age_group'] = df_test['age'].apply(lambda x: 1.0 if x > 30 else 0.0)

# # Crear el BinaryLabelDataset
# binary_dataset = BinaryLabelDataset(
#     favorable_label=1,
#     unfavorable_label=0,
#     df=df_test,
#     label_names=['income'],
#     protected_attribute_names=['sex', 'age_group']
# )

# # Crear el dataset con las predicciones del modelo
# binary_dataset_pred = binary_dataset.copy(deepcopy=True)
# binary_dataset_pred.labels = y_pred_filtered.reshape(-1, 1)

# # Definir grupos privilegiados y no privilegiados
# privileged_groups = [{'sex': 1.0, 'age_group': 1.0}]
# unprivileged_groups = [{'sex': 0.0, 'age_group': 0.0}]

# # Calcular métricas de clasificación
# classification_metric = ClassificationMetric(
#     binary_dataset,
#     binary_dataset_pred,
#     unprivileged_groups=unprivileged_groups,
#     privileged_groups=privileged_groups
# )

# # Calcular el Valor Predictivo Positivo (PPV)
# privileged_ppv = classification_metric.positive_predictive_value(privileged=True)
# unprivileged_ppv = classification_metric.positive_predictive_value(privileged=False)

# # Calcular la diferencia de Predictive Parity
# predictive_parity_difference = privileged_ppv - unprivileged_ppv

# # Calcular la precisión
# accuracy = accuracy_score(y_test.loc[filtered_indices], y_pred_filtered)

# # Mostrar resultados
# print(f"Accuracy: {accuracy}")
# print(f"Privileged PPV: {privileged_ppv}")
# print(f"Unprivileged PPV: {unprivileged_ppv}")
# print(f"Predictive Parity Difference: {predictive_parity_difference}")

# Mitigación de Sesgos
### Pre-procesamiento: **Reweighing**
### In-procesamiento:
### Post-procesamiento:

In [48]:
from typing import List

## Pre-procesamiento: **Reweighing**

In [None]:
# X_train_pre = X_train.copy()
# y_train_pre = y_train.copy()
# X_test_pre = X_test.copy()
# y_test_pre = y_test.copy()

# X_train_pre = map_col(X_train_pre, "sex", mapping_sex)
# X_test_pre = map_col(X_test_pre, "sex", mapping_sex)
# X_train_pre['age'] = X_train_pre['age'].apply(lambda x: 1 if x < 60 else 0)
# X_test_pre['age'] = X_test_pre['age'].apply(lambda x: 1 if x < 60 else 0)
# y_train_pre = y_train_pre.map({'<=50K': 0, '>50K': 1})
# y_test_pre = y_test_pre.map({'<=50K': 0, '>50K': 1})

In [None]:
# cat_cols = X_train_pre.select_dtypes(include=['object']).columns
# X_train_pre = pd.get_dummies(X_train_pre, columns=cat_cols, drop_first=True)
# X_test_pre = pd.get_dummies(X_test_pre, columns=cat_cols, drop_first=True)
# diff_columns=X_train_pre.columns.symmetric_difference(X_test_pre.columns)
# X_train_pre = X_train_pre.drop(columns=diff_columns)

In [None]:
# data = BinaryLabelDataset(df=X_train_pre.join(y_train_pre),
#                           label_names=['income'],
#                           protected_attribute_names=['sex', 'age'])

# reweigher = Reweighing(unprivileged_groups=[{'sex': 0}, {'age': 1}],
#                         privileged_groups=[{'sex': 1}, {'age': 0}])
# reweighed_data = reweigher.fit_transform(data)

# weights = reweighed_data.instance_weights

In [None]:
# Entrenamiento del modelo (se puede eliminar ya que el pre-procesamiento llega
# hasta obtener los pesos "weights")

# logistic_model_pre = LogisticRegression(max_iter=5000, random_state=42)
# logistic_model_pre.fit(X_train_pre, y_train_pre, sample_weight=weights)
# y_pred_logistic_pre = logistic_model_pre.predict(X_test_pre)
# acc_lr_pre = accuracy_score(y_test_pre, y_pred_logistic_pre)
# print(f'Accuracy for Logistic Regression Model with Pre-processing: {acc_lr_pre:.2f}')

Accuracy for Logistic Regression Model with Pre-processing: 0.85


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
X_train_pre = map_col(X_train_pre, "sex", mapping_sex)
X_test_pre = map_col(X_test_pre, "sex", mapping_sex)
X_train_pre['age'] = X_train_pre['age'].apply(lambda x: 1 if x < 60 else 0)
X_test_pre['age'] = X_test_pre['age'].apply(lambda x: 1 if x < 60 else 0)
y_train_pre = y_train_pre.map({'<=50K': 0, '>50K': 1})
y_test_pre = y_test_pre.map({'<=50K': 0, '>50K': 1})

cat_cols = X_train_pre.select_dtypes(include=['object']).columns
X_train_pre = pd.get_dummies(X_train_pre, columns=cat_cols, drop_first=True)
X_test_pre = pd.get_dummies(X_test_pre, columns=cat_cols, drop_first=True)
diff_columns=X_train_pre.columns.symmetric_difference(X_test_pre.columns)
X_train_pre = X_train_pre.drop(columns=diff_columns)

In [49]:
from aif360.algorithms.preprocessing import Reweighing
from aif360.datasets import BinaryLabelDataset

def reweightingProcessing(
    train_aif_df: BinaryLabelDataset,
    sensitive_features: List[str]
):
    """
    Aplica el algoritmo de preprocesamiento Reweighing para ajustar los pesos de las instancias.

    Parámetros:
    - train_aif_df (BinaryLabelDataset): Datos de entrenamiento.
    - sensitive_features (List[str]): Lista de características sensibles (e.g., ['age', 'sex']).

    Retorno:
    - reweighted_train_aif_df (BinaryLabelDataset): Datos reponderados con los nuevos pesos aplicados.
    """
    reweighing_processors = []
    reweighted_train_aif_df = train_aif_df.copy()

    for sensitive_feature in sensitive_features:
        # Configurar grupos privilegiados y no privilegiados
        privileged_groups = [{sensitive_feature: 1}]
        unprivileged_groups = [{sensitive_feature: 0}]
        
        # Crear el procesador de reweighting
        reweighing_processor = Reweighing(
            privileged_groups=privileged_groups,
            unprivileged_groups=unprivileged_groups
        )
        print(f"Aplicando reweighting para {sensitive_feature}...")
        
        # Aplicar el ajuste de pesos
        reweighted_train_aif_df = reweighing_processor.fit_transform(reweighted_train_aif_df)
        reweighing_processors.append(reweighing_processor)

    return reweighted_train_aif_df


In [50]:
bld_train_pre = BinaryLabelDataset(df=X_train_pre.join(y_train_pre),
                          label_names=['income'],
                          protected_attribute_names=['sex', 'age'])


reweighted_pre_train = reweightingProcessing(
    train_aif_df=bld_train_pre,  # Datos de entrenamiento
    sensitive_features=['age', 'sex']  # Características sensibles
)

Aplicando reweighting para age...
Aplicando reweighting para sex...


In [51]:
# Mostrar los primeros 10 pesos de las instancias
print("Pesos ajustados de las primeras 10 instancias:")
print(reweighted_pre_train.instance_weights[:10])


Pesos ajustados de las primeras 10 instancias:
[1.09493593 0.8507266  0.8507266  0.78650541 1.09493593 1.09493593
 1.09493593 0.8507266  1.09493593 0.8507266 ]


## In-procesamiento: **Inserte Técnica Aquí**

In [None]:
X_train_in = map_col(X_train_in, "sex", mapping_sex)
X_test_in = map_col(X_test_in, "sex", mapping_sex)
X_train_in['age'] = X_train_in['age'].apply(lambda x: 1 if x < 60 else 0)
X_test_in['age'] = X_test_in['age'].apply(lambda x: 1 if x < 60 else 0)
y_train_in = y_train_in.map({'<=50K': 0, '>50K': 1})
y_test_in = y_test_in.map({'<=50K': 0, '>50K': 1})

cat_cols = X_train_in.select_dtypes(include=['object']).columns
X_train_in = pd.get_dummies(X_train_in, columns=cat_cols, drop_first=True)
X_test_in = pd.get_dummies(X_test_in, columns=cat_cols, drop_first=True)
diff_columns=X_train_in.columns.symmetric_difference(X_test_in.columns)
X_train_in = X_train_in.drop(columns=diff_columns)

In [53]:
from typing import List
from aif360.algorithms.inprocessing import AdversarialDebiasing
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import ClassificationMetric
import tensorflow.compat.v1 as tf

def adversarialDebiasingProcessing(
    train_aif_df: BinaryLabelDataset,
    test_aif_df: BinaryLabelDataset,
    sensitive_features: List[str],
    num_epochs: int = 50,
    batch_size: int = 128,
    adversary_loss_weight: float = 0.1
):
    """
    Función para entrenar un modelo debiasado usando AdversarialDebiasing.

    Parámetros:
    - train_aif_df (BinaryLabelDataset): Datos de entrenamiento.
    - test_aif_df (BinaryLabelDataset): Datos de prueba.
    - sensitive_features (List[str]): Lista de características sensibles (e.g., ['age', 'sex']).
    - num_epochs (int): Número de épocas para entrenar.
    - batch_size (int): Tamaño de lote durante el entrenamiento.
    - adversary_loss_weight (float): Peso del adversario para reducir el sesgo.

    Retorno:
    - predicted_test_aif_df (BinaryLabelDataset): Predicciones debiasadas en el conjunto de prueba.
    """

    sess = tf.Session()
    debiased_models = []
    predicted_test_aif_df = test_aif_df.copy()

    for sensitive_feature in sensitive_features:
        # Configurar grupos privilegiados y no privilegiados
        privileged_groups = [{sensitive_feature: 1}]
        unprivileged_groups = [{sensitive_feature: 0}]
        
        # Crear y entrenar el modelo debiasado
        debiased_model = AdversarialDebiasing(
            privileged_groups=privileged_groups,
            unprivileged_groups=unprivileged_groups,
            scope_name=f'debiased_classifier_{sensitive_feature}',
            sess=sess,
            num_epochs=num_epochs,
            batch_size=batch_size,
            adversary_loss_weight=adversary_loss_weight
        )
        print(f"Entrenando modelo debiasado para {sensitive_feature}...")
        debiased_model.fit(train_aif_df)
        debiased_models.append(debiased_model)

        # Generar predicciones
        predicted_test_aif_df = debiased_model.predict(predicted_test_aif_df)

    sess.close()
    return predicted_test_aif_df


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
bld_train_in = BinaryLabelDataset(df=X_train_in.join(y_train_in),
                          label_names=['income'],
                          protected_attribute_names=['sex', 'age'])


bld_test_in = BinaryLabelDataset(df=X_test_in.join(y_test_in),
                          label_names=['income'],
                          protected_attribute_names=['sex', 'age'])

In [None]:
predicted_test_inprocessing = adversarialDebiasingProcessing(
    train_aif_df=bld_train_in,  # Datos de entrenamiento
    test_aif_df=bld_test_in,    # Datos de prueba
    sensitive_features=['age', 'sex'],  # Atributos sensibles
    num_epochs=50,  # Configuración del número de épocas
    batch_size=128,  # Tamaño de lote
    adversary_loss_weight=0.1  # Peso del adversario
)

## Post-procesamiento: **Inserte Técnica Aquí**

In [None]:
X_train_post = map_col(X_train_post, "sex", mapping_sex)
X_test_post = map_col(X_test_post, "sex", mapping_sex)
X_train_post['age'] = X_train_post['age'].apply(lambda x: 1 if x < 60 else 0)
X_test_post['age'] = X_test_post['age'].apply(lambda x: 1 if x < 60 else 0)
y_train_post = y_train_post.map({'<=50K': 0, '>50K': 1})
y_test_post = y_test_post.map({'<=50K': 0, '>50K': 1})

cat_cols = X_train_post.select_dtypes(include=['object']).columns
X_train_post = pd.get_dummies(X_train_post, columns=cat_cols, drop_first=True)
X_test_post = pd.get_dummies(X_test_post, columns=cat_cols, drop_first=True)
diff_columns=X_train_post.columns.symmetric_difference(X_test_post.columns)
X_train_post = X_train_post.drop(columns=diff_columns)

In [None]:
bld_train_post = BinaryLabelDataset(df=X_train_post.join(y_train_post),
                          label_names=['income'],
                          protected_attribute_names=['sex', 'age'])


bld_test_post = BinaryLabelDataset(df=X_test_post.join(y_test_post),
                          label_names=['income'],
                          protected_attribute_names=['sex', 'age'])

In [None]:

#Código Post-Procesamiento
from aif360.algorithms.postprocessing import EqOddsPostprocessing

# Cramos una función que se encargue de reajustar las predicciones
def eqOddsPredictionProccesing(
    test_aif_df: BinaryLabelDataset,
    test_pred_aif_df: BinaryLabelDataset,
    sensitive_features: List[str],
):

    eq_odds_processers = []
    for sensitive_feature in sensitive_features:
        eq_odds_processers.append(
            EqOddsPostprocessing(
                unprivileged_groups=[{sensitive_feature: 0}],
                privileged_groups=[{sensitive_feature: 1}],
                seed=42
            )
        )

    actual_pred_aif_df = test_pred_aif_df.copy()

    for eq_odds_processer in eq_odds_processers:
        eq_odds_processer: EqOddsPostprocessing
        eq_odds_processer.fit(test_aif_df, actual_pred_aif_df)
        actual_pred_aif_df = eq_odds_processer.predict(actual_pred_aif_df)

    return actual_pred_aif_df

In [None]:
post_processed_preds = eqOddsPredictionProccesing(
    bld_train_post,
    bld_test_post,
    ['age', 'sex']
)

# Para 'age'
metric_age = ClassificationMetric(
    bld_test_post,
    post_processed_preds,
    unprivileged_groups=[{"age": 0}],
    privileged_groups=[{"age": 1}]
)

# Para 'sex'
metric_sex = ClassificationMetric(
    bld_test_post,
    post_processed_preds,
    unprivileged_groups=[{"sex": 0}],
    privileged_groups=[{"sex": 1}]
)

print("Disparate Impact for age:", metric_age.disparate_impact())
print("Dispate Impact for sex:", metric_sex.disparate_impact())

# Medición de Mitigación de Sesgos

## Combinación 1: Pre-procesamiento + In-Procesamiento

In [32]:
## Código que combine las dos técnicas

### Independencia (Demographic Parity)

### Separación (Equalized Odds)

In [33]:
X_test_fitted_pre = X_test_pre.copy()

X_test_fitted_pre = X_test_fitted_pre[['age', 'sex']]
X_test_fitted_pre.loc[:, 'y_pred'] = y_pred_logistic_pre
X_test_fitted_pre['y_true'] = y_test_pre.values
X_test_fitted_pre.head()

Unnamed: 0,age,sex,y_pred,y_true
25618,1,1,1,1
42065,1,0,0,1
45990,1,0,0,0
38179,1,0,0,0
39821,1,1,0,0


In [34]:
for age in [0, 1]:
    tpr, fpr = calculate_tpr_fpr_age(X_test_fitted_pre, age)
    age_label = 'Older Adults' if age == 0 else 'Young Adults'
    print(f"Age Group: {age_label}")
    print(f"  True Positive Rate (TPR): {tpr:.2f}")
    print(f"  False Positive Rate (FPR): {fpr:.2f}")

Age Group: Older Adults
  True Positive Rate (TPR): 0.57
  False Positive Rate (FPR): 0.08
Age Group: Young Adults
  True Positive Rate (TPR): 0.54
  False Positive Rate (FPR): 0.05


In [35]:
for sex in [0, 1]:
    tpr, fpr = calculate_tpr_fpr_sex(X_test_fitted_pre, sex)
    sex_label = 'Female' if sex == 0 else 'Male'
    print(f"Sex Group: {sex_label}")
    print(f"  True Positive Rate (TPR): {tpr:.2f}")
    print(f"  False Positive Rate (FPR): {fpr:.2f}")

Sex Group: Female
  True Positive Rate (TPR): 0.47
  False Positive Rate (FPR): 0.02
Sex Group: Male
  True Positive Rate (TPR): 0.56
  False Positive Rate (FPR): 0.08


### Suficiencia (Predictive Parity)

## Combinación 2: In-procesamiento + Post-Procesamiento

In [36]:
## Código que combine las dos técnicas

### Independencia (Demographic Parity)

In [37]:
# Código de Independencia

### Separación (Equalized Odds)

In [38]:
# Código de Separación

### Suficiencia (Predictive Parity)

In [39]:
# Código de Suficiencia

## Combinación 3: Pre-procesamiento + Post-Procesamiento

In [40]:
## Código que combine las dos técnicas

### Independencia (Demographic Parity)

In [41]:
# Código de Independencia

### Separación (Equalized Odds)

In [42]:
# Código de Separación

### Suficiencia (Predictive Parity)

In [43]:
# Código de Suficiencia

## Combinación 4: Pre-procesamiento + In-Procesamiento + Post-Procesamiento

In [44]:
## Código que combine las dos técnicas

### Independencia (Demographic Parity)

In [45]:
# Código de Independencia

### Separación (Equalized Odds)

In [46]:
# Código de Separación

### Suficiencia (Predictive Parity)

In [47]:
# Código de Suficiencia