In [65]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin


In [66]:
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Convert percentage strings such as ``"12.5%"`` into fractions (``0.125``).

    The class lives at module level (not inside ``preprocess_data``) so that a
    pipeline containing it can be pickled; function-local classes cannot be.

    Parameters
    ----------
    preprocess_features : list of str
        Names of the columns that hold percentage strings. Names that are not
        present in the frame passed to ``transform`` are skipped.
    """

    def __init__(self, preprocess_features):
        self.preprocess_features = preprocess_features

    def fit(self, X, y=None):
        """Nothing is learned from ``X``; returns ``self`` for pipeline use."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the percentage columns as float fractions.

        ``X`` must expose ``.columns`` (i.e. be a pandas DataFrame). Missing
        values stay missing so that the downstream imputer can fill them.
        A non-numeric entry raises ``ValueError``.
        """
        X_processed = X.copy()
        for col in self.preprocess_features:
            if col in X_processed.columns:
                raw = X_processed[col]
                # regex=False: '%' is a literal character; being explicit
                # avoids depending on the pandas-version-specific default.
                stripped = raw.astype(str).str.replace('%', '', regex=False)
                # Mask on the *original* values so None / pd.NA (which
                # astype(str) turns into the text 'None' / '<NA>') become NaN
                # instead of crashing the float conversion.
                X_processed[col] = stripped.where(raw.notna()).astype(float) / 100
        return X_processed


def preprocess_data(data, target='Heart Disease Mortality', test_size=0.2, random_state=42):
    """Split ``data`` into train/test sets and build a matching preprocessor.

    Feature columns are sorted into three groups:

    * numeric dtype -> median imputation + standard scaling;
    * non-numeric with at least one value containing ``'%'`` -> parsed into a
      fraction, then median imputation + standard scaling;
    * every other non-numeric column -> most-frequent imputation + one-hot
      encoding (categories unseen during fit are ignored at transform time).

    NOTE(review): identifier-like text columns are one-hot encoded like any
    other categorical column; drop them before calling if that is not wanted.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame containing the target column and the features. It is not
        modified.
    target : str, default 'Heart Disease Mortality'
        Name of the column to predict.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test, preprocessor
        The four split partitions and an **unfitted** ``ColumnTransformer``.
        Fit it on the training partition only (e.g. inside a ``Pipeline``).

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    ValueError
        If ``data`` has no feature columns besides the target.
    """
    if target not in data.columns:
        raise KeyError(f"Target column {target!r} not found in data")

    # Rows without a target value cannot be used for supervised training and
    # would make the regressor fail on NaN; dropna returns a new frame.
    data = data.dropna(subset=[target])

    feature_columns = [col for col in data.columns if col != target]

    cat_features = []
    num_features = []
    preprocess_features = []

    # Classify on "is numeric" rather than "dtype == object" so that
    # category / string-dtype columns are not routed to the numeric branch.
    for col in feature_columns:
        series = data[col]
        if pd.api.types.is_numeric_dtype(series):
            num_features.append(col)
        elif series.astype(str).str.contains('%', regex=False).any():
            preprocess_features.append(col)
        else:
            cat_features.append(col)

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    percentage_transformer = Pipeline(steps=[
        ('custom', CustomPreprocessor(preprocess_features)),
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Only register a transformer for groups that actually have columns.
    transformers = []
    if num_features:
        transformers.append(('num', numeric_transformer, num_features))
    if cat_features:
        transformers.append(('cat', categorical_transformer, cat_features))
    if preprocess_features:
        transformers.append(('pct', percentage_transformer, preprocess_features))

    if not transformers:
        raise ValueError("data has no feature columns besides the target")

    preprocessor = ColumnTransformer(transformers=transformers)

    X = data[feature_columns]
    y = data[target]

    # Split before any fitting so test rows never influence the preprocessor.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test, preprocessor

In [67]:
# Load the merged CSV; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv("../cleaned_data/merged_data.csv")

In [68]:
# Display the column index of the loaded frame.
df.columns

Index(['County', 'State', 'Total', 'Less than $10,000', '$10,000 to $14,999',
       '$15,000 to $24,999', '$25,000 to $34,999', '$35,000 to $49,999',
       '$50,000 to $74,999', '$75,000 to $99,999', '$100,000 to $149,999',
       '$150,000 to $199,999', '$200,000 or more', 'Median income',
       'Mean income', 'Heart Disease Mortality', 'Sex', 'ethnicity'],
      dtype='object')

In [69]:
# Unpack the train/test partitions and the preprocessing transformer
# returned by preprocess_data.
(
    X_me_train,
    X_me_test,
    y_me_train,
    y_me_test,
    preprocessor,
) = preprocess_data(df)

In [70]:
# Chain the preprocessing transformer with a linear regression model so both
# are fitted and applied as one estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

In [71]:
# Fit the preprocessing steps and the regressor on the training partition.
pipeline.fit(X_me_train, y_me_train)

In [72]:
# Predict on the training partition first, then on the held-out partition.
y_hat_me_train, y_hat_me_test = map(pipeline.predict, (X_me_train, X_me_test))

In [73]:
# Mean squared error on the training and on the test partition.
train_me_mse = mean_squared_error(y_true=y_me_train, y_pred=y_hat_me_train)
test_me_mse = mean_squared_error(y_true=y_me_test, y_pred=y_hat_me_test)

# One value per line: training MSE, then test MSE.
print(train_me_mse, test_me_mse, sep="\n")


7929.71670272723
8719.422830204812


In [74]:
# Error metrics computed on the test partition only.
mae = mean_absolute_error(y_true=y_me_test, y_pred=y_hat_me_test)
r2 = r2_score(y_true=y_me_test, y_pred=y_hat_me_test)
mse = mean_squared_error(y_true=y_me_test, y_pred=y_hat_me_test)
rmse = np.sqrt(mse)  # square root puts the error back in the target's units

In [75]:
# Report the test-set metrics, one per line.
print(
    f"  - RMSE: {rmse:.2f}",
    f"  - MAE: {mae:.2f}",
    f"  - R² Score: {r2:.4f}",
    sep="\n",
)

  - RMSE: 93.38
  - MAE: 62.16
  - R² Score: 0.6504
