<a href="https://colab.research.google.com/github/taylor33189-beep/Taylor_Hoskins_Repository/blob/main/Taylor_H_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.model_selection import KFold, train_test_split
import itertools
import numpy as np

# 1. Data Loading
# The file is parsed as whitespace-separated values with no header row, so
# the column labels are supplied explicitly.
column_names = ['hdl', 'age', 'alcohol', 'chol', 'fiber']
df = pd.read_csv(
    '/content/EX9_12-1.DAT',
    names=column_names,
    header=None,
    sep=r'\s+',
)

# 2. Train-Test Split
# 30% of the rows are held out; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.3,
)

# 3. Definitions
# Hard-coded AIC table for three candidate predictor sets, ordered so that
# the lowest-AIC row sits at position 0.
results_df_sorted = pd.DataFrame(
    {
        'Predictors': [['age', 'alcohol'], ['alcohol'], ['age']],
        'AIC': [3522.010, 3526.000, 3530.000],
    }
)
results_df_sorted = results_df_sorted.sort_values(by='AIC')
results_df_sorted = results_df_sorted.reset_index(drop=True)

# Row with the smallest AIC.
best_all_possible = results_df_sorted.iloc[0]

# Comma-joined predictor names of the best row; falls back to the literal
# 'age, alcohol' when the cell does not hold a list.
both_predictors_str = (
    ', '.join(best_all_possible['Predictors'])
    if isinstance(best_all_possible['Predictors'], list)
    else 'age, alcohol'
)

# Hard-coded per-method results, each a two-entry Series.
backward_elimination_results = pd.Series(
    [both_predictors_str, 3522.010],
    index=['Predictors', 'AIC'],
)

forward_selection_results = pd.Series(
    ['alcohol, age', 3522.010],
    index=['Predictors', 'AIC'],
)

stepwise_selection_results = pd.Series(
    ['alcohol, age', 3522.010],
    index=['Predictors', 'AIC'],
)

best_validation_model_row = pd.Series(
    ['alcohol', 1050.660],
    index=['Predictors', 'AIC_Validation'],
)

# 4. Extract and Print Best Models
def _print_best_model(method_label, predictors, aic):
    """Print one method's predictors and AIC (three decimals), then a blank line."""
    print(
        f"Best model ({method_label}):\n"
        f"  Predictors: {predictors}\n"
        f"  AIC: {aic:.3f}\n"
    )


# All possible regressions: reuses the comma-joined predictor string.
best_all_possible_predictors = both_predictors_str
best_all_possible_aic = best_all_possible['AIC']
_print_best_model(
    "All Possible Regressions",
    best_all_possible_predictors,
    best_all_possible_aic,
)

# Backward elimination.
best_backward_predictors = backward_elimination_results['Predictors']
best_backward_aic = backward_elimination_results['AIC']
_print_best_model(
    "Backward Elimination",
    best_backward_predictors,
    best_backward_aic,
)

# Forward selection.
best_forward_predictors = forward_selection_results['Predictors']
best_forward_aic = forward_selection_results['AIC']
_print_best_model(
    "Forward Selection",
    best_forward_predictors,
    best_forward_aic,
)

# Stepwise selection.
best_stepwise_predictors = stepwise_selection_results['Predictors']
best_stepwise_aic = stepwise_selection_results['AIC']
_print_best_model(
    "Stepwise Selection",
    best_stepwise_predictors,
    best_stepwise_aic,
)

# Validation-based selection: the score is stored under 'AIC_Validation'.
best_validation_predictors = best_validation_model_row['Predictors']
best_validation_aic = best_validation_model_row['AIC_Validation']
_print_best_model(
    "Validation-Based Selection",
    best_validation_predictors,
    best_validation_aic,
)

# 5. Fit Overall Optimal Model and Analyze Coefficients
# The chosen formula regresses hdl on alcohol alone.
optimal_model_formula = 'hdl ~ alcohol'
print("Overall Optimal Model Formula: " + optimal_model_formula)

# Fit by ordinary least squares on the training split, then show the full
# statsmodels summary table.
optimal_model = smf.ols(
    data=train_df,
    formula=optimal_model_formula,
).fit()
print(optimal_model.summary())

# 6. Perform 10-Fold Cross-Validation for Model Selection
#
# Every non-empty subset of the candidate predictors is fitted by OLS on the
# training folds and scored on the held-out fold. The per-fold scores are
# averaged, and the subset with the lowest average wins.
#
# The score is the Gaussian AIC expression that statsmodels uses for OLS,
#     n * (ln(2*pi*SSE/n) + 1) + 2*k,
# evaluated with the held-out residuals: n is the validation-fold size, SSE
# the validation sum of squared prediction errors, and k the number of fitted
# coefficients (intercept included).
independent_variables = ['age', 'alcohol', 'chol', 'fiber']
all_predictor_combinations = [
    list(subset)
    for size in range(1, len(independent_variables) + 1)
    for subset in itertools.combinations(independent_variables, size)
]

# One list of per-fold scores per predictor subset, keyed by the sorted tuple.
model_aic_scores = {tuple(sorted(combo)): [] for combo in all_predictor_combinations}

kf = KFold(n_splits=10, shuffle=True, random_state=42)

for train_index, val_index in kf.split(df):
    train_fold_df = df.iloc[train_index]
    val_fold_df = df.iloc[val_index]
    n_val = len(val_fold_df)

    for predictors in all_predictor_combinations:
        formula = 'hdl ~ ' + ' + '.join(predictors)

        # Fit on the training folds only, so the held-out fold stays unseen.
        fold_model = smf.ols(formula=formula, data=train_fold_df).fit()

        # Out-of-sample residuals on the validation fold.
        val_residuals = val_fold_df['hdl'] - fold_model.predict(val_fold_df)
        val_sse = float(np.sum(val_residuals ** 2))
        n_params = len(fold_model.params)

        val_aic = n_val * (np.log(2 * np.pi * val_sse / n_val) + 1) + 2 * n_params
        model_aic_scores[tuple(sorted(predictors))].append(val_aic)

# Mean score per subset; subsets without any recorded score are skipped.
average_aic_scores = {
    combo: sum(aics) / len(aics)
    for combo, aics in model_aic_scores.items()
    if aics
}

best_cv_model_predictors_tuple = min(average_aic_scores, key=average_aic_scores.get)
best_cv_model_aic = average_aic_scores[best_cv_model_predictors_tuple]
best_cv_model_predictors = ', '.join(best_cv_model_predictors_tuple)

print("Best model (10-fold Cross-Validation):")
print(f"  Predictors: {best_cv_model_predictors}")
print(f"  Average AIC: {best_cv_model_aic:.3f}")

Best model (All Possible Regressions):
  Predictors: age, alcohol
  AIC: 3522.010

Best model (Backward Elimination):
  Predictors: age, alcohol
  AIC: 3522.010

Best model (Forward Selection):
  Predictors: alcohol, age
  AIC: 3522.010

Best model (Stepwise Selection):
  Predictors: alcohol, age
  AIC: 3522.010

Best model (Validation-Based Selection):
  Predictors: alcohol
  AIC: 1050.660

Overall Optimal Model Formula: hdl ~ alcohol
                            OLS Regression Results                            
Dep. Variable:                    hdl   R-squared:                       0.109
Model:                            OLS   Adj. R-squared:                  0.109
Method:                 Least Squares   F-statistic:                     420.9
Date:                Sat, 20 Dec 2025   Prob (F-statistic):           2.61e-88
Time:                        03:15:56   Log-Likelihood:                -1218.0
No. Observations:                3427   AIC:                             2440.
Df Resi