In [1]:
from sklearn.datasets import fetch_openml
import pandas as pd

# Download version 2 of the OpenML "adult" dataset as pandas objects:
# X receives the feature table, y the target column.
X, y = fetch_openml(name="adult", version=2, return_X_y=True, as_frame=True)

# Number of missing entries in each feature column.
X.isnull().sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
dtype: int64

In [2]:
# Make target binary: 1 for '>50K', 0 for '<=50K'.
#
# The labels are validated before conversion. Without this check, re-running
# the cell after y has already been turned into 0/1 would compare the strings
# "0"/"1" against ">50K" and silently relabel every row as 0.
EXPECTED_LABELS = {">50K", "<=50K"}

income_labels = y.astype(str)
unexpected_labels = set(income_labels.unique()) - EXPECTED_LABELS
if unexpected_labels:
    raise ValueError(
        f"Unexpected target labels {sorted(unexpected_labels)}; "
        f"expected only {sorted(EXPECTED_LABELS)}. "
        "If this cell was already executed, re-run the data-loading cell first."
    )

y = (income_labels == ">50K").astype(int)

In [3]:
# Carve the data into one training part and one validation part.
# The fixed seed makes the partition repeatable from run to run.
from sklearn.model_selection import train_test_split

SEED = 42
TEST_SIZE = 1.0 / 3.0  # share of the rows held out for validation

# Stratify on the target so both parts keep the class balance.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=SEED,
    test_size=TEST_SIZE,
)

In [4]:
# Show the row labels (index) carried by the training features after the split.
X_train.index

Index([43630, 40796, 33798, 17454, 41161, 14063, 34730,  9186,  4559,  9488,
       ...
       12204, 47400,   969, 26955, 43143,  3820, 20602, 22129, 43502, 39897],
      dtype='int64', length=32561)

In [5]:
# Display the validation feature frame produced by the split.
X_val

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
38758,25,Private,130397,10th,6,Never-married,Farming-fishing,Unmarried,Amer-Indian-Eskimo,Male,0,0,40,United-States
36393,20,Private,110597,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,30,United-States
37685,37,Private,238959,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States
2221,28,Local-gov,104329,Some-college,10,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,50,United-States
32902,53,Federal-gov,105788,Bachelors,13,Divorced,Exec-managerial,Unmarried,Black,Female,0,0,50,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1931,43,Federal-gov,136105,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,Black,Female,0,1848,40,United-States
29561,23,Private,388811,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,United-States
44384,52,Self-emp-not-inc,183146,12th,8,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States
8746,34,Private,154120,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,60,United-States


In [6]:

# Write every part of the split to its own CSV file in the working directory.
# index=True stores the row labels as the first column of each file.
_split_outputs = [
    (X_train, 'X_train.csv'),
    (y_train, 'y_train.csv'),
    (X_val, 'X_val.csv'),
    (y_val, 'y_val.csv'),
]
for _split, _csv_name in _split_outputs:
    _split.to_csv(_csv_name, index=True)

print('Saved X_train.csv, y_train.csv, X_val.csv, y_val.csv')

Saved X_train.csv, y_train.csv, X_val.csv, y_val.csv


In [7]:
# Reload the training features from 'X_train.csv', using the file's first
# column as the row labels.
# NOTE(review): this replaces the in-memory X_train. CSV does not store dtypes,
# so column dtypes of the reloaded frame may differ from the original split —
# confirm that downstream code is fine with that.
X_train = pd.read_csv('X_train.csv', index_col=0)
print(X_train.index)
print(y_train.index)

# The printed indices are truncated to a few entries, so a visual comparison
# cannot catch a mismatch in the middle; compare every label (and their order).
assert X_train.index.equals(y_train.index), (
    "Row labels of the reloaded X_train do not match y_train"
)

Index([43630, 40796, 33798, 17454, 41161, 14063, 34730,  9186,  4559,  9488,
       ...
       12204, 47400,   969, 26955, 43143,  3820, 20602, 22129, 43502, 39897],
      dtype='int64', length=32561)
Index([43630, 40796, 33798, 17454, 41161, 14063, 34730,  9186,  4559,  9488,
       ...
       12204, 47400,   969, 26955, 43143,  3820, 20602, 22129, 43502, 39897],
      dtype='int64', length=32561)
