In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR
warnings.filterwarnings('ignore')



In [2]:
def load_data(filepath):
    """Read a CSV file into a DataFrame and report its dimensions.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    frame = pd.read_csv(filepath)
    dimensions = frame.shape  # (n_rows, n_columns)
    print(f"Data shape: {dimensions}")
    return frame



In [3]:


def preprocess_data(data, target_col='Heart Disease Mortality', test_size=0.2, random_state=42):
    """Select the model features, build the preprocessing transformer and split the data.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``target_col`` and every feature column listed in
        ``num_features`` / ``cat_features`` below.
    target_col : str, default 'Heart Disease Mortality'
        Name of the column used as the regression target.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, preprocessor)``. ``preprocessor``
        is an *unfitted* ``ColumnTransformer``; it is meant to be fitted on the
        training split only (e.g. as the first step of a ``Pipeline``).

    Raises
    ------
    KeyError
        If the target column or any feature column is absent from ``data``.
    """
    cat_features = ['LocationAbbr', 'GeographicLevel', 'Sex', 'ethnicity']
    num_features = ['Year', 'Y_lat', 'X_lon']
    feature_cols = num_features + cat_features

    # Fail early with a single message naming every missing column, instead of
    # discovering them one at a time further down.
    missing = [col for col in [target_col, *feature_cols] if col not in data.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    y = data[target_col]
    X = data[feature_cols]

    # Categorical columns: fill gaps with the most frequent value, then one-hot
    # encode. Categories unseen during fit are ignored at transform time.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Numeric columns: fill gaps with the median, then standardise.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_features),
            ('cat', categorical_transformer, cat_features)
        ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test, preprocessor



In [4]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test, preprocessor, models=None):
    """Fit several regressors behind the same preprocessing recipe and score each on the test split.

    Parameters
    ----------
    X_train, X_test
        Feature tables passed to ``Pipeline.fit`` and ``Pipeline.predict``.
    y_train, y_test
        Targets matching ``X_train`` and ``X_test``.
    preprocessor
        Unfitted scikit-learn transformer used as the first pipeline step.
        A fresh clone is made for every model, so the object passed in is
        never fitted or modified by this function.
    models : dict or None, default None
        Mapping of display name -> unfitted regressor. When ``None`` the
        default candidates defined below are used. Supplied estimators are
        cloned, so the caller's objects are left unfitted.

    Returns
    -------
    dict
        ``{name: {'RMSE': float, 'MAE': float, 'R2': float, 'Model': Pipeline}}``
        where ``'Model'`` is the fitted pipeline for that candidate.
    """
    if models is None:
        models = {
            'Linear Regression': LinearRegression(),
            'Ridge Regression': Ridge(),
            'Lasso Regression': Lasso(),
            'Random Forest': RandomForestRegressor(random_state=42),
            'Gradient Boosting': GradientBoostingRegressor(random_state=42),
            'SVR': SVR()
        }

    results = {}

    for name, model in models.items():
        print(f"\nTrain {name}")

        # Pipeline.fit fits its steps in place. Cloning gives every stored
        # pipeline its own preprocessor, so refitting one of the returned
        # models later cannot silently alter the others.
        pipeline = Pipeline(steps=[
            ('preprocessor', clone(preprocessor)),
            ('model', clone(model))
        ])

        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        results[name] = {
            'RMSE': rmse,
            'MAE': mae,
            'R2': r2,
            'Model': pipeline
        }

        print(f"{name} performence:")
        print(f"  - RMSE: {rmse:.2f}")
        print(f"  - MAE: {mae:.2f}")
        print(f"  - R² Score: {r2:.4f}")

    return results



In [5]:

# Input CSV, given as a relative path.
data_path = "heart_disease_mortality_cleaned.csv"

try:
    data = load_data(data_path)
except FileNotFoundError:
    print(f"Error: {data_path} is not found.")
    # Re-raise so execution stops here. Falling through would either hit a
    # NameError on the next line or silently reuse a stale `data` left over
    # from an earlier run of this cell.
    raise

X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data)

results = train_and_evaluate_models(X_train, X_test, y_train, y_test, preprocessor)

# Pick the entry with the highest 'R2' value recorded in `results`.
best_model_name = max(results, key=lambda k: results[k]['R2'])
print(f"\nBest Model: {best_model_name} (R² = {results[best_model_name]['R2']:.4f})")


Data shape: (34430, 13)

Train Linear Regression
Linear Regression performence:
  - RMSE: 100.17
  - MAE: 67.02
  - R² Score: 0.6116

Train Ridge Regression
Ridge Regression performence:
  - RMSE: 100.14
  - MAE: 67.00
  - R² Score: 0.6118

Train Lasso Regression
Lasso Regression performence:
  - RMSE: 106.21
  - MAE: 72.53
  - R² Score: 0.5633

Train Random Forest
Random Forest performence:
  - RMSE: 77.58
  - MAE: 40.61
  - R² Score: 0.7670

Train Gradient Boosting
Gradient Boosting performence:
  - RMSE: 93.58
  - MAE: 61.37
  - R² Score: 0.6610

Train SVR
SVR performence:
  - RMSE: 102.49
  - MAE: 65.95
  - R² Score: 0.5934

Best Model: Random Forest (R² = 0.7670)


sdf
