In [3]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. Load the UCI Adult dataset
# Column names are passed in explicitly, so the first line of the file is
# read as data rather than as a header. The last name, 'income', is the
# column later used as the prediction target.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]
# skipinitialspace=True discards the whitespace that follows each comma.
df = pd.read_csv(url, sep=',', names=columns, skipinitialspace=True)

# NOTE: the number reported as "features" is the total column count,
# which includes the 'income' target column.
n_samples, n_columns = df.shape
print(f" Dataset loaded: {n_samples} samples, {n_columns} features")

# 2. Handle missing values (represented by '?')
# Every '?' cell becomes NaN, then any row containing a NaN is discarded.
df = df.replace('?', np.nan).dropna()

# 3. Define features and target
# X keeps every column except 'income'.
X = df.drop(columns='income')
# y is 1 where the whitespace-stripped label equals '>50K', otherwise 0.
y = df['income'].map(lambda label: int(label.strip() == '>50K'))

# 4. Define categorical and numerical columns manually
# The two lists are spelled out by hand rather than inferred from dtypes;
# together they name every column except the 'income' target.
num_cols = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
cat_cols = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Report how many columns fall into each group.
print(f"Categorical columns: {len(cat_cols)}")
print(f"Numerical columns: {len(num_cols)}")

# 5. Create preprocessing pipeline
# Numerical columns are standardized; categorical columns are one-hot
# encoded. handle_unknown='ignore' lets the encoder transform categories
# it did not see while fitting instead of raising an error.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# 6. Train/test split
# 20% of the rows are held out for testing. stratify=y splits in a
# class-stratified fashion, and the fixed random_state makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# 7. Fit and transform
# The preprocessor is fitted on the training rows only; the test rows are
# transformed with the already-fitted preprocessor.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

train_shape = X_train_processed.shape
test_shape = X_test_processed.shape
print(f" Processed shapes → Train: {train_shape}, Test: {test_shape}")

# 8. Save processed data and preprocessor
# Everything is stored as a single 5-tuple in this order:
# (X_train_processed, X_test_processed, y_train, y_test, preprocessor).
os.makedirs('data', exist_ok=True)
prepared = (X_train_processed, X_test_processed, y_train, y_test, preprocessor)
joblib.dump(prepared, 'data/adult_prepared.pkl')

print("\n Data preprocessing complete — saved to data/adult_prepared.pkl")


 Dataset loaded: 32561 samples, 15 features
Categorical columns: 8
Numerical columns: 6
 Processed shapes → Train: (24129, 104), Test: (6033, 104)

 Data preprocessing complete — saved to data/adult_prepared.pkl
