In [21]:
import os
import pandas as pd
import numpy as np
import random
import json

random.seed(42)

# Pre-Process Income

In [22]:
def pre_process_income(df):
    """
    One-hot encode the categorical columns and min-max scale the continuous ones.

    The scaling statistics (min / max) are computed over the frame that is
    passed in, so calling this on a concatenation of several partitions scales
    all of them with the same bounds and yields the same dummy columns for all.

    Args:
        df: pandas DataFrame containing at least the columns "COW", "SCHL",
            "AGEP", "WKHP", "OCCP", "POBP" and "RELP". It is not modified.

    Returns:
        A new pandas DataFrame in which "COW" and "SCHL" are replaced by
        integer 0/1 indicator columns and the continuous columns are rescaled
        to the range [0, 1]. All other columns are passed through unchanged.
    """

    categorical_columns = ["COW", "SCHL"] #, "RAC1P"]
    # NOTE(review): OCCP, POBP and RELP look like category codes but are scaled
    # as plain numbers here - confirm this is intended.
    continuous_columns = ["AGEP", "WKHP", "OCCP", "POBP", "RELP"]

    # convert the columns to one-hot encoding (get_dummies returns a new frame,
    # so the caller's DataFrame is left untouched)
    df = pd.get_dummies(df, columns=categorical_columns, dtype=int)

    # normalize the continuous columns between 0 and 1
    for col in continuous_columns:
        col_min = df[col].min()
        col_range = df[col].max() - col_min
        if col_range == 0:
            # A constant column would divide by zero and become all-NaN;
            # map it to 0.0 instead.
            df[col] = 0.0
        else:
            df[col] = (df[col] - col_min) / col_range

    return df


def pre_process_single_datasets(df):
    """
    Split one pre-processed partition into feature, label and group arrays.

    Args:
        df: pandas DataFrame holding the columns ">50K", "SEX", "MAR" and
            "RAC1P" next to the feature columns. It is not modified.

    Returns:
        A 5-tuple ``(dataframes, labels, groups, second_groups, third_groups)``
        of one-element lists, each wrapping a numpy array with one row per
        row of ``df``:
        - dataframes[0]: every column of ``df`` except ">50K" (the sensitive
          columns "SEX", "MAR" and "RAC1P" stay in the feature matrix)
        - labels[0]: ">50K" cast to int, shape (n, 1)
        - groups[0]: 1 where "SEX" == 1, else 0, shape (n, 1)
        - second_groups[0]: 1 where "MAR" == 1, else 0, shape (n, 1)
        - third_groups[0]: 1 where "RAC1P" (cast to int) == 1, else 0,
          shape (n, 1)
    """

    def _as_column(series):
        # Downstream files store each vector as an (n, 1) column array.
        return series.to_numpy().reshape(-1, 1)

    label = _as_column(df[">50K"].astype(int))

    # Binarise each sensitive attribute: value 1 -> 1, anything else -> 0.
    group = _as_column((df["SEX"] == 1).astype("int64"))
    second_group = _as_column((df["MAR"] == 1).astype("int64"))
    third_group = _as_column((df["RAC1P"].astype(int) == 1).astype("int64"))

    # Only the target is removed; drop() returns a copy so `df` is untouched.
    # df.drop(['RAC1P_1.0', 'RAC1P_2.0'], axis=1, inplace=True)
    dataframe = df.drop(columns=[">50K"]).to_numpy()

    assert (
        len(dataframe)
        == len(label)
        == len(group)
        == len(second_group)
        == len(third_group)
    )

    # Each result is wrapped in a single-element list, as callers index [0].
    return [dataframe], [label], [group], [second_group], [third_group]

In [None]:
from sklearn.model_selection import train_test_split

folder = "../data/cross_silo_attribute_final/"
# folder = "../data/cross_silo_value_final/"

# Pure-Python directory listing (sorted, hidden entries skipped) in place of
# the `!ls` shell escape, so the cell does not depend on an external command.
list_files = sorted(name for name in os.listdir(folder) if not name.startswith("."))
unfair_dfs = []
print(list_files)

# The order of this list fixes the numeric id of every partition written later.
states = [
    'CT', 'RI', 'VT', 'TX', 'GA', 'PR', 'OH', 'NE', 'HI', 'MO',
    'PA', 'DE', 'WV', 'MD', 'AZ', 'LA', 'WA', 'TN', 'MA', 'NJ',
    'ME', 'SC', 'MI', 'OK', 'IL', 'FL', 'UT', 'AK', 'WI', 'NH',
    'VA', 'SD', 'MS', 'ND', 'NC', 'AL', 'IA', 'ID', 'WY', 'NV',
    'NM', 'NY', 'CA', 'AR', 'MN', 'OR', 'MT', 'KY', 'KS', 'IN',
    'CO',
]

partitions_names = []

for state in states:
    partitions = set()
    for file in list_files:
        # Only this state's own "<state>_<partition>.csv" files define its
        # partitions; ids found in other states' files must not be assumed
        # to exist for this state.
        if not (file.endswith(".csv") and file.startswith(f"{state}_")):
            continue
        partition = int(file.split("_")[-1].split(".")[0])
        if partition in partitions:
            continue
        partitions.add(partition)
        partitions_names.append(f"{state}_{partition}")

        partition_df = pd.read_csv(f"{folder}{state}_{partition}.csv")
        # split the partition csv into train and test; random_state makes the
        # split reproducible (random.seed() does not affect scikit-learn).
        train, test = train_test_split(partition_df, test_size=0.2, random_state=42)

        # unfair_dfs alternates train, test, train, test, ... - later cells
        # rely on this pairing.
        unfair_dfs.append(train)
        unfair_dfs.append(test)

partitions_names

['AK_0.csv', 'AL_0.csv', 'AR_0.csv', 'AZ_0.csv', 'CA_0.csv', 'CO_0.csv', 'CT_0.csv', 'DE_0.csv', 'FL_0.csv', 'FL_data', 'GA_0.csv', 'HI_0.csv', 'IA_0.csv', 'ID_0.csv', 'IL_0.csv', 'IN_0.csv', 'KS_0.csv', 'KY_0.csv', 'LA_0.csv', 'MA_0.csv', 'MD_0.csv', 'ME_0.csv', 'MI_0.csv', 'MN_0.csv', 'MO_0.csv', 'MS_0.csv', 'MT_0.csv', 'NC_0.csv', 'ND_0.csv', 'NE_0.csv', 'NH_0.csv', 'NJ_0.csv', 'NM_0.csv', 'NV_0.csv', 'NY_0.csv', 'OH_0.csv', 'OK_0.csv', 'OR_0.csv', 'PA_0.csv', 'PR_0.csv', 'RI_0.csv', 'SC_0.csv', 'SD_0.csv', 'TN_0.csv', 'TX_0.csv', 'UT_0.csv', 'VA_0.csv', 'VT_0.csv', 'WA_0.csv', 'WI_0.csv', 'WV_0.csv', 'WY_0.csv']


['CT_0',
 'RI_0',
 'VT_0',
 'TX_0',
 'GA_0',
 'PR_0',
 'OH_0',
 'NE_0',
 'HI_0',
 'MO_0',
 'PA_0',
 'DE_0',
 'WV_0',
 'MD_0',
 'AZ_0',
 'LA_0',
 'WA_0',
 'TN_0',
 'MA_0',
 'NJ_0',
 'ME_0',
 'SC_0',
 'MI_0',
 'OK_0',
 'IL_0',
 'FL_0',
 'UT_0',
 'AK_0',
 'WI_0',
 'NH_0',
 'VA_0',
 'SD_0',
 'MS_0',
 'ND_0',
 'NC_0',
 'AL_0',
 'IA_0',
 'ID_0',
 'WY_0',
 'NV_0',
 'NM_0',
 'NY_0',
 'CA_0',
 'AR_0',
 'MN_0',
 'OR_0',
 'MT_0',
 'KY_0',
 'KS_0',
 'IN_0',
 'CO_0']

In [24]:
# Stack all train/test frames into one table with a fresh 0..n-1 index.
concatenated_df = pd.concat(unfair_dfs, ignore_index=True)

# Turn the target into 0/1 integers: 1 where PINCP equals True, 0 otherwise.
concatenated_df["PINCP"] = concatenated_df["PINCP"].eq(True).astype("int64")

# rename the column PINCP to >50K
concatenated_df = concatenated_df.rename(columns={"PINCP": ">50K"})

In [25]:
# Remove the "__index_level_0__" column (presumably a serialized index left
# over from an earlier export - confirm). Reassigning with errors="ignore"
# keeps this cell safe to re-run; the in-place drop raised KeyError the
# second time.
concatenated_df = concatenated_df.drop(columns=["__index_level_0__"], errors="ignore")

In [26]:
# Display the combined table of all partitions (train and test rows stacked).
concatenated_df

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,>50K
0,22.0,1.0,16.0,2.0,4055.0,9.0,16.0,12.0,1.0,2.0,0
1,17.0,1.0,14.0,2.0,4110.0,6.0,2.0,6.0,1.0,1.0,0
2,51.0,1.0,19.0,1.0,3500.0,303.0,1.0,40.0,2.0,1.0,0
3,31.0,1.0,24.0,1.0,1650.0,25.0,0.0,50.0,2.0,1.0,0
4,63.0,7.0,16.0,1.0,440.0,9.0,0.0,70.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
1626653,59.0,1.0,20.0,2.0,5940.0,110.0,0.0,40.0,2.0,1.0,0
1626654,61.0,1.0,19.0,2.0,9130.0,8.0,0.0,40.0,1.0,1.0,0
1626655,44.0,1.0,21.0,1.0,440.0,18.0,1.0,45.0,1.0,1.0,0
1626656,39.0,1.0,21.0,1.0,4700.0,10.0,0.0,30.0,2.0,1.0,0


In [27]:
# Row count of the first entry of unfair_dfs, i.e. the train split of the first partition.
len(unfair_dfs[0])

15828

In [28]:
# Row count of the second entry of unfair_dfs, i.e. the test split of the first partition.
len(unfair_dfs[1])

3957

In [29]:
# Apply one-hot encoding
pre_processed_df = pre_process_income(concatenated_df)

split_dfs = []
start_idx = 0
for df in unfair_dfs:
    end_idx = start_idx + len(df)
    split_dfs.append(pre_processed_df.iloc[start_idx:end_idx])
    start_idx = end_idx


In [30]:
federated_dir = f"{folder}FL_data/federated"
os.makedirs(federated_dir, exist_ok=True)

# save partitions_names: the id -> name mapping does not depend on the loop
# below, so it is written once instead of on every iteration.
json_file = dict(enumerate(partitions_names))
with open(f"{federated_dir}/partitions_names.json", "w") as f:
    json.dump(json_file, f)

# split_dfs alternates train, test, ... so each step of 2 is one partition.
for index in range(0, len(split_dfs), 2):
    partition_id = index // 2
    train_state = split_dfs[index]
    test_state = split_dfs[index + 1]
    print(len(train_state), len(test_state))
    (
        train_data,
        train_labels,
        train_groups,
        train_second_groups,
        train_third_groups,
    ) = pre_process_single_datasets(train_state)
    (
        test_data,
        test_labels,
        test_groups,
        test_second_groups,
        test_third_groups,
    ) = pre_process_single_datasets(test_state)

    print(partition_id, train_data[0].shape, test_data[0].shape)

    partition_dir = f"{federated_dir}/{partition_id}"
    os.makedirs(partition_dir, exist_ok=True)

    # File-name stem -> (train result, test result); every result is a
    # one-element list whose first item is the array to store.
    arrays_by_kind = {
        "dataframes": (train_data, test_data),
        "labels": (train_labels, test_labels),
        "groups": (train_groups, test_groups),
        "second_groups": (train_second_groups, test_second_groups),
        "third_groups": (train_third_groups, test_third_groups),
    }
    for kind, (train_arrays, test_arrays) in arrays_by_kind.items():
        np.save(
            f"{partition_dir}/income_{kind}_{partition_id}_train.npy",
            train_arrays[0],
        )
        np.save(
            f"{partition_dir}/income_{kind}_{partition_id}_test.npy",
            test_arrays[0],
        )

15828 3957
0 (15828, 40) (3957, 40)
4569 1143
1 (4569, 40) (1143, 40)
3008 753
2 (3008, 40) (753, 40)
102981 25746
3 (102981, 40) (25746, 40)
40378 10095
4 (40378, 40) (10095, 40)
7092 1774
5 (7092, 40) (1774, 40)
46666 11667
6 (46666, 40) (11667, 40)
8628 2157
7 (8628, 40) (2157, 40)
6184 1547
8 (6184, 40) (1547, 40)
23646 5912
9 (23646, 40) (5912, 40)
51803 12951
10 (51803, 40) (12951, 40)
3660 916
11 (3660, 40) (916, 40)
6245 1562
12 (6245, 40) (1562, 40)
26433 6609
13 (26433, 40) (6609, 40)
26621 6656
14 (26621, 40) (6656, 40)
16354 4089
15 (16354, 40) (4089, 40)
30444 7612
16 (30444, 40) (7612, 40)
25729 6433
17 (25729, 40) (6433, 40)
32091 8023
18 (32091, 40) (8023, 40)
38224 9557
19 (38224, 40) (9557, 40)
5601 1401
20 (5601, 40) (1401, 40)
19903 4976
21 (19903, 40) (4976, 40)
37666 9417
22 (37666, 40) (9417, 40)
13848 3462
23 (13848, 40) (3462, 40)
51248 12812
24 (51248, 40) (12812, 40)
79140 19785
25 (79140, 40) (19785, 40)
13069 3268
26 (13069, 40) (3268, 40)
2836 710
27 (2836

In [31]:
# train_data still holds the value assigned in the last iteration of the save loop (the final partition).
train_data[0].shape

(25044, 40)

In [32]:
# Sanity check: reload the saved training features of partition 0. The file
# holds a plain numeric array, so pickle support is left disabled.
np.load(f"{folder}FL_data/federated/0/income_dataframes_0_train.npy")

array([[0.06329114, 2.        , 0.41191446, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 2.        , 0.41751527, ..., 0.        , 0.        ,
        0.        ],
       [0.43037975, 1.        , 0.35539715, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.17721519, 2.        , 0.4287169 , ..., 0.        , 0.        ,
        0.        ],
       [0.5443038 , 2.        , 0.53258656, ..., 0.        , 0.        ,
        0.        ],
       [0.55696203, 1.        , 0.53258656, ..., 0.        , 0.        ,
        0.        ]])

In [33]:
# Sanity check: reload the saved training labels of partition 0 (plain
# integer array, so pickle support is left disabled).
np.load(f"{folder}FL_data/federated/0/income_labels_0_train.npy")

array([[0],
       [0],
       [0],
       ...,
       [0],
       [1],
       [0]])

In [34]:
# Sanity check: reload the saved training "SEX" group flags of partition 0
# (plain integer array, so pickle support is left disabled).
np.load(f"{folder}FL_data/federated/0/income_groups_0_train.npy")

array([[1],
       [1],
       [0],
       ...,
       [0],
       [1],
       [0]])

In [35]:
# Sanity check: reload the saved training "MAR" group flags of partition 0
# (plain integer array, so pickle support is left disabled).
np.load(f"{folder}FL_data/federated/0/income_second_groups_0_train.npy")

array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [1]])

In [36]:
# Sanity check: reload the saved test "MAR" group flags of partition 0
# (plain integer array, so pickle support is left disabled).
np.load(f"{folder}FL_data/federated/0/income_second_groups_0_test.npy")

array([[1],
       [0],
       [0],
       ...,
       [0],
       [1],
       [0]])

In [37]:
# Sanity check: reload the saved training "RAC1P" group flags of partition 0
# (plain integer array, so pickle support is left disabled).
np.load(f"{folder}FL_data/federated/0/income_third_groups_0_train.npy")

array([[0],
       [1],
       [1],
       ...,
       [0],
       [1],
       [1]])

In [38]:
# Sanity check: reload the saved test "RAC1P" group flags of partition 0
# (plain integer array, so pickle support is left disabled).
np.load(f"{folder}FL_data/federated/0/income_third_groups_0_test.npy")

array([[1],
       [1],
       [0],
       ...,
       [1],
       [0],
       [1]])

In [39]:
# Sanity check: reload the saved test "SEX" group flags of partition 0
# (plain integer array, so pickle support is left disabled).
np.load(f"{folder}FL_data/federated/0/income_groups_0_test.npy")

array([[0],
       [0],
       [1],
       ...,
       [0],
       [1],
       [0]])

In [None]:
# Sanity check: reload the saved test labels of partition 0 (plain integer
# array, so pickle support is left disabled).
np.load(f"{folder}FL_data/federated/0/income_labels_0_test.npy")

array([[1],
       [0],
       [0],
       ...,
       [1],
       [1],
       [0]])

: 