This notebook presents the implementation of FairGenerate.

**"FairGenerate: Enhancing Fairness Through Synthetic Data Generation and Two-Fold Biased Labels Removal"**

FairGenerate is a novel preprocessing method designed to mitigate imbalanced data and biased labels in training datasets.

If you use this work, please cite our TOSEM 2025 paper:

@article{10.1145/3730579,
author = {Joshi, Hem Chandra and Kumar, Sandeep},
title = {FairGenerate: Enhancing Fairness Through Synthetic Data Generation and Two-Fold Biased Labels Removal},
year = {2025},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
issn = {1049-331X},
url = {https://doi.org/10.1145/3730579},
doi = {10.1145/3730579},
note = {Just Accepted},
journal = {ACM Trans. Softw. Eng. Methodol.},
month = apr,
keywords = {ML software, Software fairness, Bias mitigation, Imbalanced Data, Biased Labels}
}

In [1]:
import numpy as np
import random
import pandas as pd

# Learner
from sklearn.neighbors import NearestNeighbors as NN

#Function to find the nearest neighbours of a randomly chosen row
def get_ngbr(df, knn):
    """Draw one random row of ``df`` and fetch two of its neighbours via ``knn``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the parent row is sampled from; neighbour positions returned by
        ``knn`` are resolved against this same frame with ``iloc``.
    knn : object
        Anything exposing ``kneighbors(X, n_neighbors, return_distance=True)``
        that returns ``(distances, positions)``.
        # presumably fitted on ``df`` itself, so the positions are valid here - confirm at call site

    Returns
    -------
    tuple
        ``(distance, parent_candidate, candidate_1, candidate_2)`` where
        ``distance`` is the raw distance array from ``knn`` and the candidates
        are the rows at the 2nd and 3rd returned positions.
    """
    last_position = df.shape[0] - 1
    picked = random.randint(0, last_position)
    parent_candidate = df.iloc[picked]

    # The query must be a single-row 2-D array.
    query = parent_candidate.values.reshape(1, -1)
    distance, neighbour_positions = knn.kneighbors(query, 3, return_distance=True)

    # Hit 0 is skipped (presumably the query row itself - confirm); hits 1 and 2 are used.
    second_hit = neighbour_positions[0][1]
    third_hit = neighbour_positions[0][2]
    return distance, parent_candidate, df.iloc[second_hit], df.iloc[third_hit]


def generate_samples(no_of_samples,df,df_name,protected_attribute):
    """Return the rows of ``df`` followed by ``no_of_samples`` synthetic rows.

    Every synthetic row is derived from a randomly drawn parent row and two of
    its neighbours (see ``get_ngbr``). Depending on how the two neighbour
    distances compare, the new row is built either from the parent and the
    first neighbour or from the two neighbours:

    * ``bool``/``str`` values are picked at random from the chosen pair;
    * all other values are moved a fraction ``f = 0.3`` of the way from the
      first member of the pair towards the second.

    Parameters
    ----------
    no_of_samples : int
        Number of synthetic rows to create; zero or negative creates none.
    df : pandas.DataFrame
        Source rows. They are not modified.
    df_name, protected_attribute :
        Not used by the function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Original rows first, synthetic rows after, with the columns of ``df``
        and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If synthetic rows are requested but ``df`` has fewer than three rows
        (a parent plus two neighbours are required).
    """
    column_name = df.columns.tolist()
    total_data = df.values.tolist()

    if no_of_samples > 0:
        if df.shape[0] < 3:
            raise ValueError(
                "generate_samples needs at least 3 rows to pick a parent and "
                "two neighbours, got %d" % df.shape[0]
            )

        # Only fit the neighbour index when rows actually have to be generated.
        knn = NN(n_neighbors=5,algorithm='auto').fit(df)
        f = .3

        for _ in range(no_of_samples):
            distance, parent_candidate, child_candidate_1, child_candidate_2 = get_ngbr(df, knn)

            # The distance test is the same for every column of this sample,
            # so evaluate it once per sample instead of once per column.
            x1 = distance[0][1]
            x2 = distance[0][2]
            x3 = abs(x2 - x1)
            blend_with_parent = x1 <= x3

            mutant = []
            for key, parent_value in parent_candidate.items():
                first_child = child_candidate_1[key]
                if isinstance(parent_value, (bool, str)):
                    # Categorical value: copy it from one member of the pair.
                    if blend_with_parent:
                        mutant.append(np.random.choice([parent_value, first_child]))
                    else:
                        mutant.append(np.random.choice([first_child, child_candidate_2[key]]))
                else:
                    # Numeric value: interpolate between the pair.
                    if blend_with_parent:
                        mutant.append(parent_value + f * (first_child - parent_value))
                    else:
                        # abs() is applied on this branch only, as in the original formula.
                        mutant.append(abs(first_child + f * (child_candidate_2[key] - first_child)))
            total_data.append(mutant)

    # Naming the columns at construction also keeps them when there are no rows.
    return pd.DataFrame(total_data, columns=column_name)
 

In [2]:
#Importing Library

# %%
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

# %%
#Import libraries

import pandas as pd
import numpy as np
import os

#classifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler

#Fairness Metrics
from aif360.datasets import BinaryLabelDataset, StructuredDataset
from aif360.algorithms.preprocessing import Reweighing
from aif360.metrics import ClassificationMetric
from aif360.metrics import BinaryLabelDatasetMetric

import time
import calendar
import copy

#To split-training and testing dataset
from sklearn.model_selection import train_test_split

pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


In [3]:
#Situation Testing Code

def situation(clf,X_train,y_train,keyword):
    """Drop training rows whose prediction changes when ``keyword`` is flipped.

    The column ``keyword`` is flipped (1 becomes 0, anything else becomes 1),
    ``clf`` predicts on both the original and the flipped frame, and only rows
    whose two predictions agree are kept.

    Parameters
    ----------
    clf : object
        Fitted model exposing ``predict``.
    X_train : pandas.DataFrame
        Feature frame; it is left unmodified.
    y_train : array-like or pandas.Series
        Labels for ``X_train``.
    keyword : str
        Name of the column to flip.

    Returns
    -------
    tuple
        ``(X_rest, y_rest, point_removed)``: the kept feature rows, their
        labels as a Series named ``'y'``, and the percentage of rows removed
        (rounded to 4 decimals).
    """
    X_flip = X_train.copy()
    X_flip[keyword] = np.where(X_flip[keyword]==1, 0, 1)

    pred_original = np.array(clf.predict(X_train))
    pred_flipped = np.array(clf.predict(X_flip))
    same = (pred_original == pred_flipped).astype(int)  # 1 where flipping changed nothing

    # Attach the helper columns to a copy, so the caller's frame does not
    # silently gain 'same' and 'y' columns.
    tagged = X_train.assign(same=same, y=y_train)
    kept = tagged[tagged['same']==1]
    y_rest = kept['y']
    X_rest = kept.drop(columns=['same','y'])

    removed_count = X_train.shape[0] - X_rest.shape[0]
    point_removed = np.round(removed_count / X_train.shape[0] * 100, 4)
    print("Removed Points:",point_removed,"% || ", removed_count)

    return X_rest,y_rest,point_removed


In [4]:
# Experiment driver: for every (dataset, protected attribute) pair, train a
# logistic-regression baseline and a FairGenerate-processed model on 20
# different train/test splits, then write the median metrics to a CSV file.

dataset_names=['Compas','Compas','German','Heart','Student','Bank','Meps15','Meps16','Default','Adult','Adult']
protected_attributes=['sex','race','sex','age','sex','age','race','race','sex','sex','race']

# Directory that receives one summary CSV per (dataset, protected attribute).
RESULTS_DIR = "Results_ST1_Synthetic_Data"

# (class label, protected-attribute value) pairs. The order is used for the
# printed counts and for breaking ties between equally large subgroups.
SUBGROUPS = [(0, 0), (0, 1), (1, 0), (1, 1)]

# Metric keys, in the row order of the summary table.
METRIC_KEYS = ["recall", "precision", "accuracy", "f1score", "aod", "eod", "spd", "di"]


def _subgroup(frame, label, attr_value, protected_attribute):
    """Rows of ``frame`` with the given class label and protected-attribute value."""
    return frame[(frame['Probability'] == label) & (frame[protected_attribute] == attr_value)]


def _score_on_test(clf, X_test, dataset_orig_test, protected_attribute):
    """Score ``clf`` on the held-out split; returns a dict keyed by ``METRIC_KEYS``.

    The privileged / unprivileged values are whatever the aif360 dataset
    reports for ``protected_attribute``. Fairness metrics are returned as
    absolute values (disparate impact as ``|1 - DI|``).
    """
    dataset_t = BinaryLabelDataset(favorable_label=1.0,
                                   unfavorable_label=0.0,
                                   df=dataset_orig_test,
                                   label_names=['Probability'],
                                   protected_attribute_names=[protected_attribute])

    # Same dataset, but carrying the model's predictions as labels.
    dataset_pred = dataset_t.copy()
    dataset_pred.labels = clf.predict(X_test)

    attr = dataset_t.protected_attribute_names[0]
    idx = dataset_t.protected_attribute_names.index(attr)
    privileged_groups = [{attr: dataset_pred.privileged_protected_attributes[idx][0]}]
    unprivileged_groups = [{attr: dataset_pred.unprivileged_protected_attributes[idx][0]}]

    class_metrics = ClassificationMetric(dataset_t, dataset_pred,
                                         unprivileged_groups=unprivileged_groups,
                                         privileged_groups=privileged_groups)

    recall = class_metrics.recall()
    precision = class_metrics.precision()
    return {
        "recall": recall,
        "precision": precision,
        "accuracy": class_metrics.accuracy(),
        "f1score": (2 * recall * precision) / (precision + recall),
        "aod": np.abs(class_metrics.average_odds_difference()),
        "eod": np.abs(class_metrics.equal_opportunity_difference()),
        "spd": np.abs(class_metrics.statistical_parity_difference()),
        "di": np.abs(1 - class_metrics.disparate_impact()),
    }


def _balance_subgroups(train_frame, dataset_name, protected_attribute):
    """Grow every subgroup of ``train_frame`` to the size of the largest one.

    The first subgroup (in ``SUBGROUPS`` order) that has the maximum size is
    kept untouched; each other subgroup is passed through ``generate_samples``.
    The protected attribute is cast to ``str`` for generation, so that
    ``generate_samples`` copies it instead of interpolating it, and is cast
    back to ``float`` afterwards.
    """
    groups = {key: _subgroup(train_frame, key[0], key[1], protected_attribute) for key in SUBGROUPS}
    sizes = {key: len(groups[key]) for key in SUBGROUPS}

    print("Before Synthetic Data Generation")
    print(*[sizes[key] for key in SUBGROUPS])

    maximum = max(sizes.values())
    majority_key = next(key for key in SUBGROUPS if sizes[key] == maximum)

    parts = []
    for key in SUBGROUPS:
        if key == majority_key:
            continue
        # Copy before editing so the slice of train_frame is not written to.
        group = groups[key].copy()
        group[protected_attribute] = group[protected_attribute].astype(str)
        grown = generate_samples(maximum - sizes[key], group, dataset_name, protected_attribute)
        grown[protected_attribute] = grown[protected_attribute].astype(float)
        parts.append(grown)
    parts.append(groups[majority_key])

    # DataFrame.append was removed in pandas 2.0; pd.concat works on every
    # version. ignore_index avoids the duplicate row labels that stacking
    # several freshly indexed frames would otherwise produce.
    balanced = pd.concat(parts, ignore_index=True)

    print("After Synthetic Data Generation")
    print(*[len(_subgroup(balanced, label, value, protected_attribute)) for label, value in SUBGROUPS])

    return balanced


for raw_name, protected_attribute in zip(dataset_names, protected_attributes):

    dataset_name = raw_name.lower()
    folder_name = dataset_name + "_" + protected_attribute

    test_size_input=0.20
    shuffle_value=True
    learner='LGR'

    original = {key: [] for key in METRIC_KEYS}
    processed = {key: [] for key in METRIC_KEYS}

    # Load and normalise once per dataset; the result is identical for every seed.
    # NOTE(review): the scaler is fitted on the full dataset before the
    # train/test split, so test-set ranges influence the scaling. Left as is.
    dataset_orig = pd.read_csv("preprocessed_dataset/"+dataset_name+"_processed.csv")
    scaler = MinMaxScaler()
    dataset_orig = pd.DataFrame(scaler.fit_transform(dataset_orig),columns = dataset_orig.columns)

    for random_seed in range(1,21):

        # generate_samples draws from both `random` and `np.random`; seed them
        # so that the synthetic rows are reproducible for a given run number.
        random.seed(random_seed)
        np.random.seed(random_seed)

        # NOTE(review): nothing in this cell writes into this directory.
        global_timestamp = calendar.timegm(time.gmtime())
        os.makedirs(f"generated_data/{folder_name}/{global_timestamp}", exist_ok=True)

        print("\nRunning:",random_seed,"---------")

        #Split dataset into training and test set
        dataset_orig_train, dataset_orig_test = train_test_split(dataset_orig, test_size=test_size_input, shuffle=shuffle_value, random_state=random_seed)
        print(dataset_orig_train.shape)
        print(dataset_orig_test.shape)

        feature_columns = dataset_orig_train.columns != 'Probability'
        X_train = dataset_orig_train.loc[:, feature_columns]
        y_train = dataset_orig_train['Probability']
        X_test = dataset_orig_test.loc[:, feature_columns]

        ##========================Without any bias mitigation techniques==================

        clf_lr = LogisticRegression(random_state=random_seed)
        clf_lr.fit(X_train,y_train)
        for key, value in _score_on_test(clf_lr, X_test, dataset_orig_test, protected_attribute).items():
            original[key].append(value)

        #================= FairGenerate Applies Below ===============

        #Step 1 - Data Cleaning.
        clf1 = LogisticRegression(random_state=random_seed)
        clf1.fit(X_train, y_train)

        print("Situation Testing Before Synthetic Data Generation......")
        X_train, y_train, _ = situation(clf1, X_train, y_train, protected_attribute)

        # Model trained on the cleaned data; used for the second situation test.
        clf2 = LogisticRegression(random_state=random_seed)
        clf2.fit(X_train, y_train)

        dataset_orig_train = X_train.copy()
        dataset_orig_train['Probability'] = y_train

        #Step 2 - Data Balancing
        balanced = _balance_subgroups(dataset_orig_train, dataset_name, protected_attribute)
        X_train, y_train = balanced.loc[:, balanced.columns != 'Probability'], balanced['Probability']

        #---------------- Step 3 : Fair-Situation Testing -----------------
        print("Situation Testing After Synthetic Data Generation......")
        X_train, y_train, _ = situation(clf2, X_train, y_train, protected_attribute)

        clf_lr = LogisticRegression(random_state=random_seed)
        clf_lr.fit(X_train,y_train)
        for key, value in _score_on_test(clf_lr, X_test, dataset_orig_test, protected_attribute).items():
            processed[key].append(value)

    # Median of each metric over the 20 runs, before and after FairGenerate.
    metrics = {
        "Metric": ["Recall", "Precision", "Accuracy", "F1 Score", "AOD", "EOD", "SPD", "DI"],
        "Original": [round(np.median(original[key]), 4) for key in METRIC_KEYS],
        "Processed": [round(np.median(processed[key]), 4) for key in METRIC_KEYS],
    }

    # Create DataFrame
    df = pd.DataFrame(metrics)
    df=df.T
    # The results directory is not part of the repository layout; create it on demand.
    os.makedirs(RESULTS_DIR, exist_ok=True)
    df.to_csv(RESULTS_DIR+"/fair_generate_"+dataset_name+" "+protected_attribute+" "+learner+".csv",index=True)
    # Display DataFrame
    print(df)



Running: 1 ---------
(5771, 6)
(1443, 6)
Situation Testing Before Synthetic Data Generation......
Removed Points: 14.157 % ||  817
Before Synthetic Data Generation
1849 341 2134 630
After Synthetic Data Generation
2134 2134 2134 2134
Situation Testing After Synthetic Data Generation......
Removed Points: 0.0 % ||  0

Running: 2 ---------
(5771, 6)
(1443, 6)
Situation Testing Before Synthetic Data Generation......
Removed Points: 14.2783 % ||  824
Before Synthetic Data Generation
1846 337 2128 636
After Synthetic Data Generation
2128 2128 2128 2128
Situation Testing After Synthetic Data Generation......
Removed Points: 6.438 % ||  548

Running: 3 ---------
(5771, 6)
(1443, 6)
Situation Testing Before Synthetic Data Generation......
Removed Points: 17.7266 % ||  1023
Before Synthetic Data Generation
1754 308 2055 631
After Synthetic Data Generation
2055 2055 2055 2055
Situation Testing After Synthetic Data Generation......
Removed Points: 1.0706 % ||  88

Running: 4 ---------
(5771, 6)


KeyboardInterrupt: 