In [None]:

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle
import os

In [None]:

# Read the train/test splits written out by the EDA notebook.
data_dir = '../data/processed'

# Feature matrices are kept as DataFrames so columns can be selected by name later.
X_train = pd.read_csv(f'{data_dir}/X_train.csv')
X_test = pd.read_csv(f'{data_dir}/X_test.csv')

# Targets are stored as CSVs; squeeze() collapses each loaded frame to a Series.
y_train_frame = pd.read_csv(f'{data_dir}/y_train.csv')
y_test_frame = pd.read_csv(f'{data_dir}/y_test.csv')
y_train = y_train_frame.squeeze()
y_test = y_test_frame.squeeze()

# Quick sanity check on the loaded training split.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

X_train shape: (156156, 8)
y_train shape: (156156,)


In [None]:
# Column groups used by the preprocessing pipelines below
# (chosen from the data-dictionary exploration).

# Numerical features: imputed with the median, then standardized.
numerical_features = [
    'NACCAGE',
    'EDUC',
    'SMOKYRS',
]

# Categorical features: imputed with the most frequent value, then one-hot encoded.
categorical_features = [
    'TOBAC30',
    'ALCOCCAS',
    'ALCFREQ',
    'MARISTAT',
    'NACCLIVS',
]

In [None]:
#CREATE PREPROCESSING PIPELINES 

def to_numeric(df):
    """Coerce every column to numeric and replace missing-value codes with NaN.

    Non-numeric text (e.g. 'missing') is coerced to NaN. Sentinel codes are
    then blanked out per column: the legacy code list from 01_eda.ipynb is
    used, minus any code that is a legitimate measurement for that column.
    Previously the same list was applied to every column, which turned real
    values such as an age of 88 or 99, or 9 years of education/smoking, into
    NaN and let the median imputer overwrite them.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns to clean. Column names are used to look up exemptions.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``df``, numeric, with sentinel codes as NaN.
    """
    # Both tables live inside the function so the FunctionTransformer that
    # wraps it carries no dependency on an extra notebook-level global.
    legacy_codes = [-4, 9, 88, 99, 888, 999, 9999]  # na_values from 01_eda.ipynb

    # Codes from the legacy list that are real values for a given column and
    # must therefore be kept.
    # NOTE(review): 88 and 99 stay missing for EDUC/SMOKYRS on the assumption
    # that they are "not applicable"/"unknown" codes - confirm against the
    # data dictionary used in 01_eda.ipynb.
    valid_values_by_column = {
        'NACCAGE': {9, 88, 99},  # ages, not codes
        'EDUC': {9},             # 9 years of education
        'SMOKYRS': {9},          # 9 years of smoking
    }

    def _clean_column(col):
        values = pd.to_numeric(col, errors='coerce')
        keep = valid_values_by_column.get(col.name, set())
        codes = [code for code in legacy_codes if code not in keep]
        # mask() sets matching entries to NaN and leaves everything else alone.
        return values.mask(values.isin(codes))

    return df.apply(_clean_column)

# A helper function to force columns to string (for categories).
# This avoids the 'cannot cast str to int64' error in the encoder.
def to_string(df):
    """Render every value as a string while keeping missing values missing.

    A plain ``astype(str)`` turns NaN into the literal text 'nan', so the
    SimpleImputer that follows never sees anything to impute and 'nan' ends
    up as its own one-hot category. It also renders a float column (which is
    what an integer-coded column becomes once it contains NaN) as '1.0',
    which does not match a category fitted as '1'. Both are handled here.

    NOTE(review): sentinel codes such as -4 and 9 are NOT treated as missing
    here; they remain ordinary categories (the notebook output shows features
    like 'TOBAC30_-4' and 'TOBAC30_9'). Confirm that this is intended.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Categorical columns; non-DataFrame input is wrapped in a DataFrame.

    Returns
    -------
    pandas.DataFrame
        Same shape as the input, string values, NaN where input was missing.
    """
    frame = df if isinstance(df, pd.DataFrame) else pd.DataFrame(df)

    def _stringify(col):
        text = col.astype(str)
        if pd.api.types.is_float_dtype(col):
            # 1.0 -> '1' so float-typed codes match integer-typed ones.
            text = text.str.replace(r'\.0$', '', regex=True)
        # Restore missing entries that astype(str) turned into text.
        return text.where(col.notna(), np.nan)

    return frame.apply(_stringify)


# Numeric branch: coerce to numbers / blank out sentinel codes, fill the
# remaining gaps with the column median, then standardize.
numeric_steps = [
    ('to_numeric', FunctionTransformer(to_numeric, feature_names_out='one-to-one')),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: cast codes to strings, fill gaps with the most frequent
# value, then one-hot encode. Categories unseen at fit time are ignored
# (all-zero row) and the encoder returns a dense array.
categorical_steps = [
    ('to_string', FunctionTransformer(to_string, feature_names_out='one-to-one')),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# CREATE THE COLUMN TRANSFORMER
# Route each column group to its own pipeline. Any column that appears in
# neither list is passed through unchanged (remainder='passthrough').
column_specs = [
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]

preprocessor = ColumnTransformer(
    transformers=column_specs,
    remainder='passthrough',
)

In [None]:
# FIT AND TRANSFORM THE DATA 
# The preprocessor is fitted on the training split only; the test split is
# transformed with the already-fitted preprocessor (no refit on test data).

print("Applying preprocessor to training data...")
X_train_processed = preprocessor.fit_transform(X_train)

print("Applying preprocessor to test data...")
X_test_processed = preprocessor.transform(X_test)

# Report the shapes of both processed matrices.
print(f"X_train_processed shape: {X_train_processed.shape}")
print(f"X_test_processed shape: {X_test_processed.shape}")

Applying preprocessor to training data...
Applying preprocessor to test data...
X_train_processed shape: (156156, 32)
X_test_processed shape: (39040, 32)


In [None]:
# Get the output feature names (scaled numerics + one-hot encoded categories)
feature_names = preprocessor.get_feature_names_out()

# Strip only the leading "<transformer>__" prefix (e.g. "num__", "cat__").
# Splitting once keeps any later "__" that belongs to the column name itself;
# splitting on every "__" and taking the last piece would truncate such names.
clean_feature_names = [name.split('__', 1)[-1] for name in feature_names]

print("Processed Feature Names:")
print(clean_feature_names)

Processed Feature Names:
['NACCAGE', 'EDUC', 'SMOKYRS', 'TOBAC30_-4', 'TOBAC30_0', 'TOBAC30_1', 'TOBAC30_9', 'ALCOCCAS_-4', 'ALCOCCAS_0', 'ALCOCCAS_1', 'ALCOCCAS_9', 'ALCFREQ_-4', 'ALCFREQ_0', 'ALCFREQ_1', 'ALCFREQ_2', 'ALCFREQ_3', 'ALCFREQ_4', 'ALCFREQ_8', 'ALCFREQ_9', 'MARISTAT_1', 'MARISTAT_2', 'MARISTAT_3', 'MARISTAT_4', 'MARISTAT_5', 'MARISTAT_6', 'MARISTAT_9', 'NACCLIVS_1', 'NACCLIVS_2', 'NACCLIVS_3', 'NACCLIVS_4', 'NACCLIVS_5', 'NACCLIVS_9']


In [None]:
# SAVE PROCESSED DATA FOR MODELING
# Persist the processed arrays, the targets, the fitted preprocessor and the
# cleaned feature-name list.

processed_dir = '../data/processed'

# Processed feature matrices as .npy files
array_outputs = [
    (f'{processed_dir}/X_train_processed.npy', X_train_processed),
    (f'{processed_dir}/X_test_processed.npy', X_test_processed),
]
for array_path, array in array_outputs:
    np.save(array_path, array)

# Target variables (written without the index column)
target_outputs = [
    (f'{processed_dir}/y_train.csv', y_train),
    (f'{processed_dir}/y_test.csv', y_test),
]
for target_path, target in target_outputs:
    target.to_csv(target_path, index=False)

# Fitted preprocessor and feature names as pickles.
# NOTE: the preprocessor wraps the notebook-defined to_numeric / to_string
# functions, and pickle stores functions by reference, so those two names must
# be defined/importable wherever preprocessor.pkl is loaded.
pickle_outputs = [
    (f'{processed_dir}/preprocessor.pkl', preprocessor),
    (f'{processed_dir}/feature_names.pkl', clean_feature_names),
]
for pickle_path, artifact in pickle_outputs:
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)

print(f"Processed data and artifacts saved to {processed_dir}")

Processed data and artifacts saved to ../data/processed
