In [28]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import resample

# --- Load the UCI Adult data ------------------------------------------------
# The file is read with header=None, so the column names are supplied here.
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
    names=columns,
)

# Collapse race into two groups. The comparison literal carries a leading
# space (presumably because the file separates fields with ", ").
is_black = data['race'] == ' Black'
data['race'] = np.where(is_black, 'Black', 'Non-black')

# Binary target: 1 for ' >50K', 0 for everything else.
earns_over_50k = data['income'] == ' >50K'
data['income'] = np.where(earns_over_50k, 1, 0)

# --- Features / target ------------------------------------------------------
y = data['income']
raw_features = data.drop(columns='income')

cat_cols = ['workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'native-country']
num_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain',
            'capital-loss', 'hours-per-week']

# Categorical columns -> dense one-hot matrix.
enc = OneHotEncoder(handle_unknown='ignore')
X_cat = enc.fit_transform(raw_features[cat_cols]).toarray()

# Numerical columns -> zero mean / unit variance.
scaler = StandardScaler()
X_num = scaler.fit_transform(raw_features[num_cols])

# Final design matrix: numerical block first, one-hot block second.
# feature_names follows the same column order.
X = np.hstack((X_num, X_cat))
feature_names = num_cols + enc.get_feature_names_out().tolist()

# Positions of the two group-indicator columns inside the encoded matrix X.
RACE_BLACK_COL = 59  # feature_names[59] == 'race_Black'
SEX_GROUP_COL = 61   # NOTE(review): presumably 'sex_ Female' -- confirm with feature_names[61]

# Share of a group's positive (y == 1) rows that is retained; the rest are dropped.
POSITIVE_KEEP_FRACTION = 0.2

# Fixed seed so the subsampled dataset is identical on every Restart & Run All.
SUBSAMPLE_SEED = 0


def drop_group_positives(X, y, group_col, keep_fraction, rng):
    """Discard most positive-label rows of one group.

    Rows where ``X[:, group_col] == 1`` and ``y == 1`` are randomly thinned so
    that only ``int(keep_fraction * count)`` of them remain. Every other row
    is kept unchanged.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix.
    y : pandas.Series
        Labels, positionally aligned with the rows of ``X``.
    group_col : int
        Column of ``X`` holding the 0/1 group indicator.
    keep_fraction : float
        Fraction of the group's positive rows to retain.
    rng : numpy.random.Generator
        Source of randomness for the sampling.

    Returns
    -------
    (numpy.ndarray, pandas.Series)
        The reduced ``X`` and ``y``. Untouched rows come first (original
        order), followed by the retained group positives.
    """
    # Compare on plain arrays so the mask is positional, independent of y's index.
    group_positive = (X[:, group_col] == 1) & (np.asarray(y) == 1)

    untouched_idx = np.flatnonzero(~group_positive)
    candidate_idx = np.flatnonzero(group_positive)

    n_retained = int(keep_fraction * candidate_idx.size)
    retained_idx = rng.choice(candidate_idx, size=n_retained, replace=False)

    selected_idx = np.concatenate([untouched_idx, retained_idx])
    return X[selected_idx, :], y.iloc[selected_idx]


# NOTE: this cell overwrites X and y, so re-running it without re-running the
# encoding cell thins the groups again.
rng = np.random.default_rng(SUBSAMPLE_SEED)
X, y = drop_group_positives(X, y, RACE_BLACK_COL, POSITIVE_KEEP_FRACTION, rng)
X, y = drop_group_positives(X, y, SEX_GROUP_COL, POSITIVE_KEEP_FRACTION, rng)


In [24]:
# Inspect the (rows, columns) shape of the feature matrix X at this point.
X.shape

(31364, 105)

In [25]:
from sklearn.utils import resample

# Rows that belong to either under-represented group, i.e. one-hot column 59
# or 61 is set. feature_names[59] is 'race_Black'.
# NOTE(review): column 61 is presumably 'sex_ Female' -- confirm with feature_names[61].
in_minority_group = (X[:, 61] == 1) | (X[:, 59] == 1)

# One-hot indicators are exactly 0 or 1, so the complement of the mask is the
# set of rows where both columns are 0.
minority_X, minority_y = X[in_minority_group], y[in_minority_group]
majority_X, majority_y = X[~in_minority_group], y[~in_minority_group]

# Upsample the minority rows (with replacement) to the size of the majority.
# random_state is fixed so the notebook reproduces under Restart & Run All.
minority_X_upsampled, minority_y_upsampled = resample(
    minority_X,
    minority_y,
    replace=True,
    n_samples=len(majority_X),
    random_state=42,
)

# Combine the majority rows with the upsampled minority rows.
# NOTE(review): this duplication happens before train_test_split, so copies of
# the same row can land in both the training and the test set.
X = np.vstack((majority_X, minority_X_upsampled))
y = pd.concat([majority_y, minority_y_upsampled])

In [26]:
# Inspect the (rows, columns) shape of X after the group upsampling cell.
X.shape

(40442, 105)

In [29]:


# Hold out the test set BEFORE class balancing. Oversampling the full dataset
# first would place copies of the same row on both sides of the split and
# inflate test metrics.
# NOTE(review): rows duplicated by the earlier group-upsampling cell can still
# appear in both splits; that needs the split to move further upstream.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Balance the label classes of the training portion only; the test set keeps
# its natural class distribution.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# The rows added by the oversampler are not guaranteed to be interleaved with
# the original ones, so shuffle the training set with a fixed seed.
shuffle_order = np.random.default_rng(42).permutation(len(X_resampled))
X_train = X_resampled[shuffle_order]
if hasattr(y_resampled, 'iloc'):
    y_train = y_resampled.iloc[shuffle_order]
else:
    y_train = y_resampled[shuffle_order]

# Save training and testing data
np.save('X_train.npy', X_train)
np.save('y_train.npy', y_train)
np.save('X_test.npy', X_test)
np.save('y_test.npy', y_test)

# Column names, in the same order as the columns of X_train / X_test.
np.save("feature_names.npy", np.array(feature_names))

In [12]:
# Look up which feature sits at column 59, the index used for the group masks.
feature_names[59]

'race_Black'

In [20]:
# Length of column 59 of X_train, i.e. the number of training rows.
# NOTE(review): the recorded output looks stale -- this cell's execution count
# (20) precedes the data-preparation cells (25, 28, 29), and X already has
# 40442 rows before oversampling. Re-run after Restart & Run All.
X_train[:, 59].shape

(26048,)

In [21]:
# Length of column 60 of X_train, i.e. the number of training rows.
# NOTE(review): the recorded output looks stale -- this cell's execution count
# (21) precedes the data-preparation cells (25, 28, 29). Re-run after
# Restart & Run All.
X_train[:, 60].shape

(26048,)