In [4]:
import os
import pandas as pd
import numpy as np
import random
import json

random.seed(42)

# Pre-Process Income

In [5]:
def pre_process_income(df):
    """
    Pre-process the income dataset to make it ready for the simulation
    In this function we consider "SEX" as the sensitive value and "PINCP" as the target value.

    Args:
        data: the raw data
        years_list: the list of years to be considered
        states_list: the list of states to be considered

    Returns:
        Returns a list of pre-processed data for each state, if multiple years are
        selected, the data are concatenated.
        We return three lists:
        - The first list contains a pandas dataframe of features for each state
        - The second list contains a pandas dataframe of labels for each state
        - The third list contains a pandas dataframe of groups for each state
        The values in the list are numpy array of the dataframes
    """

    categorical_columns = ["COW", "SCHL"]  # , "RAC1P"]
    continuous_columns = ["AGEP", "WKHP", "OCCP", "POBP", "RELP"]

    # get the target and sensitive attributes
    target_attributes = df[">50K"]
    sensitive_attributes = df["SEX"]

    # convert the columns to one-hot encoding
    df = pd.get_dummies(df, columns=categorical_columns)

    # normalize the continuous columns between 0 and 1
    for col in continuous_columns:
        df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

    return pd.DataFrame(df)


def pre_process_single_datasets(df):
    dataframe = pd.DataFrame()
    label = pd.DataFrame()
    group = pd.DataFrame()
    second_group = pd.DataFrame()
    dataframes = []
    labels = []
    groups = []
    second_groups = []

    target_attributes = df[">50K"]
    sensitive_attributes = df["SEX"]
    second_sensitive_attributes = df["MAR"]
    target_attributes = target_attributes.astype(int)

    sensitive_attributes = [1 if item == 1 else 0 for item in sensitive_attributes]

    second_sensitive_attributes = [
        1 if item == 1 else 0 for item in second_sensitive_attributes
    ]

    df = df.drop([">50K"], axis=1)

    # concatenate the dataframes
    dataframe = pd.concat([dataframe, df])
    # convert the labels and groups to dataframes
    label = pd.concat([label, pd.DataFrame(target_attributes)])
    group = pd.concat([group, pd.DataFrame(sensitive_attributes)])
    second_group = pd.concat([second_group, pd.DataFrame(second_sensitive_attributes)])

    assert len(dataframe) == len(label) == len(group) == len(second_group)
    dataframes.append(dataframe.to_numpy())
    labels.append(label.to_numpy())
    groups.append(group.to_numpy())
    second_groups.append(second_group.to_numpy())
    return dataframes, labels, groups, second_groups

In [6]:
folder = "../data/"
list_files = !ls {folder}
unfair_dfs = []
print(list_files)

states = ["AK", "CT"]

partitions_names = []

for state in states:
    partitions = set()
    for file in list_files:
        if file.endswith(".csv"):
            partition = int(file.split("_")[-1].split(".")[0])
            if partition not in partitions:
                partitions_names.append(f"{state}_{partition}")
                partitions.add(partition)
                train = pd.read_csv(f"../data/{state}_train_{partition}.csv")
                val = pd.read_csv(f"../data/{state}_val_{partition}.csv")
                concatenated_data = pd.concat([train, val])
                unfair_dfs.append(concatenated_data)
                df = pd.read_csv(f"../data/{state}_test_{partition}.csv")
                unfair_dfs.append(df)

['2018', 'AK_test_0.csv', 'AK_test_1.csv', 'AK_test_2.csv', 'AK_test_3.csv', 'AK_test_4.csv', 'AK_test_5.csv', 'AK_test_6.csv', 'AK_test_7.csv', 'AK_test_8.csv', 'AK_test_9.csv', 'AK_train_0.csv', 'AK_train_1.csv', 'AK_train_2.csv', 'AK_train_3.csv', 'AK_train_4.csv', 'AK_train_5.csv', 'AK_train_6.csv', 'AK_train_7.csv', 'AK_train_8.csv', 'AK_train_9.csv', 'AK_val_0.csv', 'AK_val_1.csv', 'AK_val_2.csv', 'AK_val_3.csv', 'AK_val_4.csv', 'AK_val_5.csv', 'AK_val_6.csv', 'AK_val_7.csv', 'AK_val_8.csv', 'AK_val_9.csv', 'CT_test_0.csv', 'CT_test_1.csv', 'CT_test_2.csv', 'CT_test_3.csv', 'CT_test_4.csv', 'CT_test_5.csv', 'CT_test_6.csv', 'CT_test_7.csv', 'CT_test_8.csv', 'CT_test_9.csv', 'CT_train_0.csv', 'CT_train_1.csv', 'CT_train_2.csv', 'CT_train_3.csv', 'CT_train_4.csv', 'CT_train_5.csv', 'CT_train_6.csv', 'CT_train_7.csv', 'CT_train_8.csv', 'CT_train_9.csv', 'CT_val_0.csv', 'CT_val_1.csv', 'CT_val_2.csv', 'CT_val_3.csv', 'CT_val_4.csv', 'CT_val_5.csv', 'CT_val_6.csv', 'CT_val_7.csv', 'CT

In [7]:
partitions_names

['AK_0',
 'AK_1',
 'AK_2',
 'AK_3',
 'AK_4',
 'AK_5',
 'AK_6',
 'AK_7',
 'AK_8',
 'AK_9',
 'CT_0',
 'CT_1',
 'CT_2',
 'CT_3',
 'CT_4',
 'CT_5',
 'CT_6',
 'CT_7',
 'CT_8',
 'CT_9']

In [8]:
concatenated_df = pd.concat(unfair_dfs, ignore_index=True)
concatenated_df["PINCP"] = [1 if item == True else 0 for item in concatenated_df["PINCP"]]

# rename the column PINCP to >50K
concatenated_df.rename(columns={"PINCP": ">50K"}, inplace=True)

In [9]:
# Apply one-hot encoding
pre_processed_df = pre_process_income(concatenated_df)

split_dfs = []
start_idx = 0
for df in unfair_dfs:
    end_idx = start_idx + len(df)
    split_dfs.append(pre_processed_df.iloc[start_idx:end_idx])
    start_idx = end_idx

for index in range(0, len(split_dfs), 2):
    train_state = split_dfs[index]
    test_state = split_dfs[index + 1]

    (
        train_data,
        train_labels,
        train_groups,
        train_second_groups,
    ) = pre_process_single_datasets(train_state)
    (
        test_data,
        test_labels,
        test_groups,
        test_second_groups,
    ) = pre_process_single_datasets(test_state)

    print(index // 2, train_data[0].shape, test_data[0].shape)

    if not os.path.exists(
        f"../data/FL_data/federated/{index // 2}"
    ):
        os.makedirs(f"../data/FL_data/federated/{index // 2}")
        # save partitions_names 
    json_file = {index:data for index, data in enumerate(partitions_names)}
    with open(f"../data/FL_data/federated/partitions_names.json", "w") as f:
        json.dump(json_file, f)
    np.save(
        f"../data/FL_data/federated/{index // 2}/income_dataframes_{index // 2}_train.npy",
        train_data[0],
    )
    np.save(
        f"../data/FL_data/federated/{index // 2}/income_labels_{index // 2}_train.npy",
        train_labels[0],
    )
    np.save(
        f"../data/FL_data/federated/{index // 2}/income_groups_{index // 2}_train.npy",
        train_groups[0],
    )
    np.save(
        f"../data/FL_data/federated/{index // 2}/income_second_groups_{index // 2}_train.npy",
        train_second_groups[0],
    )


    np.save(
        f"../data/FL_data/federated/{index // 2}/income_dataframes_{index // 2}_test.npy",
        test_data[0],
    )
    np.save(
        f"../data/FL_data/federated/{index // 2}/income_labels_{index // 2}_test.npy",
        test_labels[0],
    )
    np.save(
        f"../data/FL_data/federated/{index // 2}/income_groups_{index // 2}_test.npy",
        test_groups[0],
    )
    np.save(
        f"../data/FL_data/federated/{index // 2}/income_second_groups_{index // 2}_test.npy",
        test_second_groups[0],
    )

0 (303, 40) (54, 40)
1 (302, 40) (53, 40)
2 (301, 40) (53, 40)
3 (301, 40) (53, 40)
4 (301, 40) (53, 40)
5 (301, 40) (53, 40)
6 (301, 40) (53, 40)
7 (301, 40) (53, 40)
8 (301, 40) (53, 40)
9 (301, 40) (53, 40)
10 (1682, 40) (297, 40)
11 (1682, 40) (297, 40)
12 (1682, 40) (297, 40)
13 (1682, 40) (297, 40)
14 (1682, 40) (297, 40)
15 (1682, 40) (297, 40)
16 (1682, 40) (297, 40)
17 (1681, 40) (296, 40)
18 (1681, 40) (296, 40)
19 (1680, 40) (296, 40)


In [10]:
train_data[0]

array([[0.45569620253164556, 1.0, 0.23421588594704684, ..., True, False,
        False],
       [0.012658227848101266, 5.0, 0.40835030549898166, ..., False,
        False, False],
       [0.569620253164557, 1.0, 0.4318737270875764, ..., False, False,
        False],
       ...,
       [0.27848101265822783, 1.0, 0.5081466395112016, ..., True, False,
        False],
       [0.569620253164557, 1.0, 0.3304480651731161, ..., False, False,
        False],
       [0.17721518987341772, 1.0, 0.36568228105906314, ..., False, False,
        False]], dtype=object)

In [12]:
import numpy as np
np.load("../data/FL_data/federated/0/income_dataframes_0_train.npy", allow_pickle=True)

array([[0.6075949367088608, 1.0, 0.36578411405295314, ..., False, False,
        False],
       [0.21518987341772153, 5.0, 0.5244399185336049, ..., False, False,
        False],
       [0.4050632911392405, 1.0, 0.23421588594704684, ..., True, False,
        False],
       ...,
       [0.3670886075949367, 1.0, 0.40020366598778007, ..., False, False,
        False],
       [0.24050632911392406, 1.0, 0.5613034623217923, ..., False, False,
        False],
       [0.5189873417721519, 5.0, 0.47963340122199594, ..., False, False,
        False]], dtype=object)