In [76]:
import os

import category_encoders as ce
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale

import utils

In [77]:
def initial_preprocess(name):
    """Load the raw CSV for ``name`` and return a cleaned DataFrame.

    Only ``"adult_income"`` is handled; for any other ``name`` the function
    returns ``None``.

    For ``"adult_income"`` the file ``data/adult_income.csv`` is read, the
    hyphenated column names are replaced by underscore names, rows whose
    ``occupation`` equals ``"?"`` are removed, and the ``final_weight`` and
    ``educational_num`` columns are dropped. The original row index labels of
    the surviving rows are kept (the index is not reset).
    """
    if name != "adult_income":
        return None

    DATA_PATH = "data"
    renamed_columns = {
        "fnlwgt": "final_weight",
        "educational-num": "educational_num",
        "marital-status": "marital_status",
        "capital-gain": "capital_gain",
        "capital-loss": "capital_loss",
        "hours-per-week": "hours_per_week",
        "native-country": "native_country",
    }

    raw = pd.read_csv(f"{DATA_PATH}/{name}.csv")
    cleaned = raw.rename(columns=renamed_columns)
    # "?" marks an unknown occupation in this file; those rows are discarded.
    cleaned = cleaned[cleaned["occupation"] != "?"]
    return cleaned.drop(columns=["final_weight", "educational_num"])

In [96]:
def get_dataset(
    name,
    save_transformed=False,
    random_state=42,
    test_size=0.2,
    return_original_dataframe=False,
    return_dataframe=False,
    return_dataloader=True,
    train_batch_size=64,
    test_batch_size=64,
    df=None,
):
    """Build encoded, scaled train/test data for the dataset ``name``.

    Only ``"adult_income"`` is handled; any other ``name`` returns ``None``.

    Parameters
    ----------
    name : str
        Dataset identifier.
    save_transformed : bool
        If true, write the encoded (unscaled) features and labels to
        ``data/<name>/X_unscaled.csv`` and ``data/<name>/y.csv``. This step is
        only reached when ``return_dataframe`` is false.
    random_state, test_size
        Forwarded to ``train_test_split``.
    return_original_dataframe : bool
        If true, return the cleaned, un-encoded frame and stop.
    return_dataframe : bool
        If true, return ``(X, y)`` (one-hot encoded, unscaled) and stop.
    return_dataloader : bool
        If false, return ``(train, test)`` as ``TensorDataset`` objects
        instead of data loaders.
    train_batch_size, test_batch_size : int
        Batch sizes of the returned loaders.
    df : pandas.DataFrame or None
        Pre-cleaned frame to use instead of loading via
        ``initial_preprocess``. The one-hot encoder is fitted on this frame,
        so the resulting columns depend on the categories present in it.

    Returns
    -------
    By default the 6-tuple ``(train_loader, test_loader, n_train, n_test,
    n_features, n_numeric_columns)``; see the ``return_*`` flags for the
    alternatives.
    """
    DATA_PATH = "data"

    if name != "adult_income":
        return None

    if df is None:
        df = initial_preprocess(name)
    if return_original_dataframe:
        return df

    categorical_columns = [
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "native_country",
    ]
    encoder = ce.OneHotEncoder(cols=categorical_columns, use_cat_names=True)
    encoded = encoder.fit_transform(df)
    # Binary targets/attributes: Female -> 1, ">50K" -> 1, everything else -> 0.
    encoded["gender"] = (encoded["gender"] == "Female").astype("int64")
    encoded["income"] = (encoded["income"] == ">50K").astype("int64")
    X = encoded.drop("income", axis=1)
    y = encoded["income"]

    # Put the originally numeric columns first so they can be addressed as a
    # leading block; all remaining (encoded) columns follow in their order.
    num_cols = df.select_dtypes(np.number).columns.tolist()
    numeric_names = set(num_cols)
    other_cols = [col for col in X.columns if col not in numeric_names]
    X = X.reindex(columns=num_cols + other_cols)

    if return_dataframe:
        return X, y
    if save_transformed:
        os.makedirs(f"{DATA_PATH}/{name}/", exist_ok=True)
        X.to_csv(f"{DATA_PATH}/{name}/X_unscaled.csv")
        y.to_csv(f"{DATA_PATH}/{name}/y.csv")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Work on copies so the column assignments below never write into a view
    # of X (avoids SettingWithCopyWarning).
    X_train = X_train.copy()
    X_test = X_test.copy()

    if num_cols:
        # Local import keeps this fix self-contained in the cell.
        from sklearn.preprocessing import MinMaxScaler

        # Fit the scaling on the training split only and reuse it for the
        # test split. Scaling each split with its own min/max maps the same
        # raw value to different scaled values in train and test.
        scaler = MinMaxScaler()
        X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
        X_test[num_cols] = scaler.transform(X_test[num_cols])

    yt_train = torch.tensor(y_train.values)
    Xt_train = torch.tensor(X_train.values).float()
    yt_test = torch.tensor(y_test.values)
    Xt_test = torch.tensor(X_test.values).float()
    train = torch.utils.data.TensorDataset(Xt_train, yt_train)
    test = torch.utils.data.TensorDataset(Xt_test, yt_test)
    if not return_dataloader:
        return train, test

    train_loader = torch.utils.data.DataLoader(
        train, batch_size=train_batch_size, shuffle=True
    )
    test_loader = torch.utils.data.DataLoader(
        test, batch_size=test_batch_size, shuffle=False
    )

    return (
        train_loader,
        test_loader,
        len(train),
        len(test),
        X_train.shape[1],
        len(num_cols),
    )

In [99]:
def get_split_dataset(
    name, xA, random_state=42, test_size=0.2, train_batch_size=64, test_batch_size=64, 
):
    """Build separate train/test loaders for each value of the attribute ``xA``.

    Only ``name == "adult_income"`` with ``xA == "gender"`` is supported. Any
    other ``name`` returns ``None``.

    The cleaned frame is split into its ``"Female"`` and ``"Male"`` rows and
    each part is passed to ``get_dataset`` through its ``df`` argument.

    Returns
    -------
    tuple
        12 values: ``(train_loader, test_loader, len_train, len_test,
        feature_dim, num_cols)`` for the female rows, followed by the same six
        values for the male rows.

    Raises
    ------
    ValueError
        If ``name`` is ``"adult_income"`` and ``xA`` is not ``"gender"``.

    NOTE(review): ``get_dataset`` fits its one-hot encoder on the frame it is
    given, so the two groups are encoded independently. A category that is
    absent from one group produces no column there, and equal ``feature_dim``
    values do not guarantee that the columns line up between groups.
    """
    if name != "adult_income":
        return None
    if xA != "gender":
        raise ValueError(f"For {name}: expected xA = gender but received {xA}")

    # With return_original_dataframe=True, get_dataset returns one DataFrame,
    # not an (X, y) pair, so it must be bound to a single local name.
    df = get_dataset(name, return_original_dataframe=True)

    results = []
    for group in ("Female", "Male"):
        results.extend(
            get_dataset(
                name,
                random_state=random_state,
                test_size=test_size,
                train_batch_size=train_batch_size,
                test_batch_size=test_batch_size,
                df=df[df["gender"] == group],
            )
        )
    return tuple(results)

In [100]:
# Smoke test of the get_split_dataset defined above; the result is a 12-tuple
# (six female values, then six male values).
# NOTE(review): the per-column counts printed below are not emitted by any
# statement in the current definitions — presumably from an earlier revision.
x = get_split_dataset('adult_income', 'gender')

age 74
workclass 7
education 16
marital_status 7
occupation 14
relationship 6
race 5
gender 2
capital_gain 121
capital_loss 97
hours_per_week 96
native_country 42
income 2
------------------
age 72
workclass 7
education 16
marital_status 7
occupation 13
relationship 6
race 5
gender 1
capital_gain 101
capital_loss 75
hours_per_week 81
native_country 42
income 2
------------------
age 73
workclass 7
education 16
marital_status 7
occupation 14
relationship 6
race 5
gender 1
capital_gain 110
capital_loss 85
hours_per_week 94
native_country 41
income 2
------------------


In [101]:
# Cleaned but un-encoded frame ("?" occupations removed, final_weight and
# educational_num dropped); used by the exploratory cells that follow.
df = get_dataset('adult_income', return_original_dataframe=True)

In [104]:
# Female-only rows of the cleaned frame.
df1 = df[df['gender'] == 'Female']

In [108]:
# Distinct occupation labels in the full cleaned frame.
df['occupation'].unique()

array(['Machine-op-inspct', 'Farming-fishing', 'Protective-serv',
       'Other-service', 'Prof-specialty', 'Craft-repair', 'Adm-clerical',
       'Exec-managerial', 'Tech-support', 'Sales', 'Priv-house-serv',
       'Transport-moving', 'Handlers-cleaners', 'Armed-Forces'],
      dtype=object)

In [107]:
# Number of distinct occupations among the female rows (compare with the
# full-frame list above). Use len() rather than calling the dunder directly.
len(df1['occupation'].unique())

13

In [115]:
# NOTE(review): this cast raises (see the ValueError below) because df still
# holds raw string columns, e.g. the value 'Private'. The cell breaks
# Restart & Run All; remove it or restrict the cast to numeric columns.
df4 = df.astype(float)

ValueError: could not convert string to float: 'Private'

In [111]:
# Rows whose occupation is 'Armed-Forces'.
df3 = df[df['occupation'] == 'Armed-Forces']

In [113]:
# Genders present among the Armed-Forces rows; the output shows only 'Male',
# i.e. this occupation does not occur in the female subset.
df3['gender'].unique()

array(['Male'], dtype=object)

In [130]:
# Encoded features and labels from the utils module's get_dataset (not the
# function defined in this notebook).
# NOTE(review): the shape/flag/column list printed below looks like debug
# output from utils — confirm and silence it there.
X, y = utils.get_dataset('adult_income', return_dataframe=True)

(46033, 13)
True
['age', 'capital_gain', 'capital_loss', 'hours_per_week', 'workclass_Private', 'workclass_Local-gov', 'workclass_Self-emp-not-inc', 'workclass_Federal-gov', 'workclass_State-gov', 'workclass_Self-emp-inc', 'workclass_Without-pay', 'education_11th', 'education_HS-grad', 'education_Assoc-acdm', 'education_Some-college', 'education_10th', 'education_Prof-school', 'education_7th-8th', 'education_Bachelors', 'education_Masters', 'education_Doctorate', 'education_5th-6th', 'education_Assoc-voc', 'education_9th', 'education_12th', 'education_1st-4th', 'education_Preschool', 'marital_status_Never-married', 'marital_status_Married-civ-spouse', 'marital_status_Widowed', 'marital_status_Separated', 'marital_status_Divorced', 'marital_status_Married-spouse-absent', 'marital_status_Married-AF-spouse', 'occupation_Machine-op-inspct', 'occupation_Farming-fishing', 'occupation_Protective-serv', 'occupation_Other-service', 'occupation_Prof-specialty', 'occupation_Craft-repair', 'occupa

In [133]:
# Encoded gender column; presumably 1 = Female as in the get_dataset defined
# above — confirm against utils.
X['gender']

0        0
1        0
2        0
3        0
5        0
        ..
48837    1
48838    0
48839    1
48840    0
48841    1
Name: gender, Length: 46033, dtype: int64

In [131]:
# Boolean row mask: True where the encoded gender equals 1.
index = X['gender'] == 1

In [134]:
# Rows of X selected by the mask.
tt = X[index]

In [136]:
# Display the mask itself (index labels are the original row labels of X).
index

0        False
1        False
2        False
3        False
5        False
         ...  
48837     True
48838    False
48839     True
48840    False
48841     True
Name: gender, Length: 46033, dtype: bool

In [129]:
# Distinct values of the encoded gender column.
X['gender'].unique()

array([0, 1])

In [132]:
# The mask has one entry per row of X.
len(index)

46033

In [137]:
# NOTE(review): this rebinds get_split_dataset to the utils implementation,
# shadowing the function defined earlier in this notebook; later calls use
# the utils version. Prefer a single definition and import it in the top cell.
from utils import get_split_dataset

In [138]:
# Uses the utils version imported in the previous cell.
# NOTE(review): rebinds tt, which held a DataFrame in an earlier cell, to the
# returned tuple.
tt = get_split_dataset('adult_income', 'gender')

(46033, 13)
True


In [139]:
# Position 4 is feature_dim_female in the tuple layout defined in this
# notebook; assumes utils.get_split_dataset returns the same layout — confirm.
tt[4]

102

In [140]:
# Position 3 is len_test_female in the notebook's tuple layout (same
# assumption about the utils version).
tt[3]

2984

In [141]:
# Position -2 is feature_dim_male in the notebook's tuple layout. It equals
# the female value above, but equal widths do not show that the one-hot
# columns of the two groups are the same.
tt[-2]

102