In [28]:
# -----------------------------
# Load Packages
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy import sparse


In [29]:
# -----------------------------
# Load the datasets
# -----------------------------
# Read each CSV and discard the saved index column ("Unnamed: 0") in the same
# expression; errors="ignore" turns the drop into a no-op when the column is
# not present in the file.
# NOTE(review): both paths are absolute and machine-specific.
train_df = pd.read_csv(
    r'C:/Users/conre/OneDrive/Desktop/Coding Folder/.csv files/project_adult.csv'
).drop(columns=["Unnamed: 0"], errors="ignore")
valid_df = pd.read_csv(
    r'C:/Users/conre/OneDrive/Desktop/Coding Folder/.csv files/project_validation_inputs.csv'
).drop(columns=["Unnamed: 0"], errors="ignore")

# Preview the untouched training data (first 10 rows)
print("\nOriginal Training Data (first 10 rows):")
print(train_df.head(10))




Original Training Data (first 10 rows):
   age         workclass  fnlwgt     education  education-num  \
0   33         Local-gov  198183     Bachelors             13   
1   36           Private   86459     Assoc-voc             11   
2   58  Self-emp-not-inc  203039           9th              5   
3   21           Private  180190     Assoc-voc             11   
4   27           Private  279872  Some-college             10   
5   44           Private  175485     Bachelors             13   
6   33           Private   67006          10th              6   
7   62  Self-emp-not-inc   75478     Bachelors             13   
8   20           Private  374116       HS-grad              9   
9   33           Private   23871       HS-grad              9   

       marital-status         occupation    relationship   race     sex  \
0       Never-married     Prof-specialty   Not-in-family  White  Female   
1  Married-civ-spouse    Exec-managerial         Husband  White    Male   
2           Separa

In [30]:

# Separate features and target (keep ALL rows—both <=50K and >50K)
X_train_raw = train_df.drop(columns=["income"])
y_raw = train_df["income"]            # kept for later parts; Part 1 focuses on X
X_valid_raw = valid_df.copy()         # validation has no label

# Identify numeric and categorical feature columns.
# The two lists are built as a strict partition (numeric vs. everything else)
# instead of matching hard-coded dtype names: a column whose dtype is not
# exactly int64/float64/object (e.g. int32, a nullable integer, bool, or a
# dedicated string dtype that some pandas versions may infer for text) would
# otherwise land in neither list and silently disappear from the features.
numeric_cols = X_train_raw.select_dtypes(include="number").columns.tolist()
categorical_cols = X_train_raw.select_dtypes(exclude="number").columns.tolist()

print(f"\nNumeric columns ({len(numeric_cols)}): {numeric_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")



Numeric columns (6): ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Categorical columns (8): ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']


# Part 1a: Handle missing values

In [31]:
# Swap the literal "?" placeholders for real NaN so the imputers below treat
# those cells as missing.
X_train_raw = X_train_raw.replace("?", np.nan)
X_valid_raw = X_valid_raw.replace("?", np.nan)


def _impute_block(strategy, cols, train_frame, valid_frame):
    """Fit a SimpleImputer on the training columns and apply it to both splits.

    Returns a tuple (fitted imputer, imputed train frame, imputed valid frame).
    Each returned frame keeps the column names in `cols` and the row index of
    the frame it was computed from.
    """
    imputer = SimpleImputer(strategy=strategy)
    # Fit on the training split only; validation reuses the learned fill values.
    train_filled = imputer.fit_transform(train_frame[cols])
    valid_filled = imputer.transform(valid_frame[cols])
    return (
        imputer,
        pd.DataFrame(train_filled, columns=cols, index=train_frame.index),
        pd.DataFrame(valid_filled, columns=cols, index=valid_frame.index),
    )


# Numeric columns are filled with the median
num_imputer, X_train_num_imp, X_valid_num_imp = _impute_block(
    "median", numeric_cols, X_train_raw, X_valid_raw
)

# Categorical columns are filled with the most frequent value (mode)
cat_imputer, X_train_cat_imp, X_valid_cat_imp = _impute_block(
    "most_frequent", categorical_cols, X_train_raw, X_valid_raw
)

# Put the numeric and categorical halves back side by side (values are
# imputed only; nothing is encoded or scaled yet)
X_train_imp = pd.concat([X_train_num_imp, X_train_cat_imp], axis=1)
X_valid_imp = pd.concat([X_valid_num_imp, X_valid_cat_imp], axis=1)

print("\nAFTER Part 1a (imputed, not encoded/scaled) — first 10 rows:")
print(X_train_imp[numeric_cols + categorical_cols].head(10))



AFTER Part 1a (imputed, not encoded/scaled) — first 10 rows:
    age    fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0  33.0  198183.0           13.0           0.0           0.0            50.0   
1  36.0   86459.0           11.0           0.0        1887.0            50.0   
2  58.0  203039.0            5.0           0.0           0.0            40.0   
3  21.0  180190.0           11.0           0.0           0.0            46.0   
4  27.0  279872.0           10.0           0.0           0.0            40.0   
5  44.0  175485.0           13.0           0.0           0.0            12.0   
6  33.0   67006.0            6.0           0.0           0.0            45.0   
7  62.0   75478.0           13.0           0.0           0.0            40.0   
8  20.0  374116.0            9.0           0.0           0.0            40.0   
9  33.0   23871.0            9.0           0.0           0.0            30.0   

          workclass     education      marital-status    

# Part 1b: Encode categorical features (One-Hot) 

In [32]:
# The `sp` alias is used throughout this cell for the sparse-matrix helpers.
from scipy import sparse as sp

# Build the encoder so it works across scikit-learn versions: try the
# 'sparse_output' keyword first and fall back to the older 'sparse' keyword
# when the constructor rejects it.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

# Learn the categories from the imputed training block (Part 1a) and encode
# both splits with the same fitted encoder.
cat_train_enc = ohe.fit_transform(X_train_cat_imp)
cat_valid_enc = ohe.transform(X_valid_cat_imp)

# Guarantee sparse inputs for the hstack below: anything that came back dense
# is wrapped as CSR, anything already sparse is left untouched.
cat_train_enc = cat_train_enc if sp.issparse(cat_train_enc) else sp.csr_matrix(cat_train_enc)
cat_valid_enc = cat_valid_enc if sp.issparse(cat_valid_enc) else sp.csr_matrix(cat_valid_enc)

# Encoded categorical feature names (e.g., 'workclass_Private', ...) with a
# 'cat__' prefix; numeric names get a 'num__' prefix.
cat_feature_names_raw = ohe.get_feature_names_out(input_features=categorical_cols)
cat_feature_names = [f"cat__{raw_name}" for raw_name in cat_feature_names_raw]
num_feature_names = [f"num__{col}" for col in numeric_cols]

# Numeric values from Part 1a, deliberately left unscaled here (scaling is
# done in Part 1c).
num_train = X_train_num_imp.values
num_valid = X_valid_num_imp.values

# Single design matrix per split: numeric block (dense -> sparse) followed by
# the one-hot categorical block.
train_blocks = [sp.csr_matrix(num_train), cat_train_enc]
valid_blocks = [sp.csr_matrix(num_valid), cat_valid_enc]
X_train_enc_unscaled = sp.hstack(train_blocks, format="csr")
X_valid_enc_unscaled = sp.hstack(valid_blocks, format="csr")

feature_names_unscaled = num_feature_names + cat_feature_names

print("\nShape AFTER Part 1b (encoded, unscaled):", X_train_enc_unscaled.shape)

# Only the first 10 rows are densified, purely so they can be displayed
first_rows_dense = X_train_enc_unscaled[:10].toarray()
X_train_part1b_preview = pd.DataFrame(first_rows_dense, columns=feature_names_unscaled)
print("\nAFTER Part 1b (encoded, unscaled) — first 10 rows:")
print(X_train_part1b_preview.head(10))



Shape AFTER Part 1b (encoded, unscaled): (26048, 105)

AFTER Part 1b (encoded, unscaled) — first 10 rows:
   num__age  num__fnlwgt  num__education-num  num__capital-gain  \
0      33.0     198183.0                13.0                0.0   
1      36.0      86459.0                11.0                0.0   
2      58.0     203039.0                 5.0                0.0   
3      21.0     180190.0                11.0                0.0   
4      27.0     279872.0                10.0                0.0   
5      44.0     175485.0                13.0                0.0   
6      33.0      67006.0                 6.0                0.0   
7      62.0      75478.0                13.0                0.0   
8      20.0     374116.0                 9.0                0.0   
9      33.0      23871.0                 9.0                0.0   

   num__capital-loss  num__hours-per-week  cat__workclass_Federal-gov  \
0                0.0                 50.0                         0.0   
1        

# Part 1c: Standardize numerical features 

In [33]:
# Fit the scaler on the training numerics only, then reuse the fitted scaler
# for the validation numerics.
scaler = StandardScaler()
num_train_scaled = scaler.fit_transform(X_train_num_imp.values)
num_valid_scaled = scaler.transform(X_valid_num_imp.values)


def _stack_with_categoricals(numeric_block, categorical_block):
    """Return a CSR matrix: numeric block (made sparse) followed by the one-hot block."""
    return sparse.hstack(
        [sparse.csr_matrix(numeric_block), categorical_block],
        format="csr",
    )


# Final design matrices: scaled numeric columns + one-hot categorical columns
X_train_final = _stack_with_categoricals(num_train_scaled, cat_train_enc)
X_valid_final = _stack_with_categoricals(num_valid_scaled, cat_valid_enc)

# Column order matches the stacking order above: numeric names first
feature_names_final = [*num_feature_names, *cat_feature_names]

print("\nShape AFTER Part 1c (final preprocessed):", X_train_final.shape)

# Densify just the first 10 rows of the FINAL training features for display
first_rows_final = X_train_final[:10].toarray()
X_train_part1c_preview = pd.DataFrame(first_rows_final, columns=feature_names_final)
print("\nAFTER Part 1c (final preprocessed) — first 10 rows:")
print(X_train_part1c_preview.head(10))



Shape AFTER Part 1c (final preprocessed): (26048, 105)

AFTER Part 1c (final preprocessed) — first 10 rows:
   num__age  num__fnlwgt  num__education-num  num__capital-gain  \
0 -0.408756     0.080051            1.133702          -0.145715   
1 -0.188857    -0.981653            0.357049          -0.145715   
2  1.423734     0.126197           -1.972910          -0.145715   
3 -1.288351    -0.090935            0.357049          -0.145715   
4 -0.848554     0.856334           -0.031277          -0.145715   
5  0.397539    -0.135646            1.133702          -0.145715   
6 -0.408756    -1.166513           -1.584583          -0.145715   
7  1.716932    -1.086004            1.133702          -0.145715   
8 -1.361651     1.751927           -0.419604          -0.145715   
9 -0.408756    -1.576421           -0.419604          -0.145715   

   num__capital-loss  num__hours-per-week  cat__workclass_Federal-gov  \
0          -0.217998             0.779460                         0.0   
1      