<a href="https://colab.research.google.com/github/taylor33189-beep/Taylor_Hoskins_Repository/blob/main/Taylor_H_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import itertools

# Data loading and preparation (assuming df, train_df, test_df,
# independent_variables exist)
# Example placeholders if not defined:
# df = pd.read_csv('/content/EX9_12-1.DAT', sep=r'\s+')
# from sklearn.model_selection import train_test_split
# train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
# independent_variables = ['age', 'alcohol', 'chol', 'fiber']

# Enumerate every non-empty subset of the candidate predictors, smallest
# subsets first; each subset is stored as a list that keeps the order the
# names have in independent_variables.
all_predictor_combinations = [
    list(combo)
    for size in range(1, len(independent_variables) + 1)
    for combo in itertools.combinations(independent_variables, size)
]


# --- All-possible-regressions search ---
# Fit an OLS model of hdl on every predictor subset using the training data
# and record its AIC under the alphabetically sorted tuple of predictor names.
all_possible_aic_scores = {
    tuple(sorted(subset)): smf.ols(
        formula='hdl ~ ' + ' + '.join(subset),
        data=train_df,
    ).fit().aic
    for subset in all_predictor_combinations
}

# The winning subset is the key whose recorded AIC is smallest.
best_all_possible_predictors_tuple = min(
    all_possible_aic_scores,
    key=lambda combo: all_possible_aic_scores[combo],
)
best_all_possible_aic = all_possible_aic_scores[best_all_possible_predictors_tuple]
# Human-readable label used by the reporting code.
best_all_possible_predictors = ', '.join(best_all_possible_predictors_tuple)


# --- Backward elimination driven by AIC ---
# Start from the model containing every predictor and repeatedly drop the one
# predictor whose removal gives the lowest AIC, for as long as that AIC is
# strictly below the AIC of the model currently held.  The search stops at a
# single predictor; the intercept-only model is never evaluated here.
current_predictors_be = list(independent_variables)
best_backward_predictors_dynamic = []
best_backward_aic_dynamic = float('inf')

while current_predictors_be:
    # Score the model currently held.
    current_formula = 'hdl ~ ' + ' + '.join(current_predictors_be)
    current_model = smf.ols(formula=current_formula, data=train_df).fit()
    current_aic = current_model.aic

    if current_aic < best_backward_aic_dynamic:
        best_backward_predictors_dynamic = list(current_predictors_be)
        best_backward_aic_dynamic = current_aic

    predictor_to_remove = None
    candidate_removal_aic = float('inf')

    # The last remaining predictor is never a removal candidate.
    if len(current_predictors_be) == 1:
        break

    # Try leaving out each predictor in turn (by position) and remember the
    # first one that achieves the lowest AIC.
    for drop_idx, dropped in enumerate(current_predictors_be):
        reduced = (current_predictors_be[:drop_idx]
                   + current_predictors_be[drop_idx + 1:])
        reduced_fit = smf.ols(formula='hdl ~ ' + ' + '.join(reduced),
                              data=train_df).fit()
        reduced_aic = reduced_fit.aic

        if reduced_aic < candidate_removal_aic:
            predictor_to_remove = dropped
            candidate_removal_aic = reduced_aic

    # Stop as soon as no single removal improves on the current model.
    if not candidate_removal_aic < current_aic:
        break

    current_predictors_be.remove(predictor_to_remove)
    if candidate_removal_aic < best_backward_aic_dynamic:
        best_backward_predictors_dynamic = list(current_predictors_be)
        best_backward_aic_dynamic = candidate_removal_aic


# --- Forward selection driven by AIC ---
# Start from the intercept-only model and repeatedly add the one predictor
# that gives the lowest AIC, for as long as that AIC is strictly below the
# best AIC seen so far.
current_predictors_fs = []
remaining_predictors_fs = list(independent_variables)
best_forward_predictors_dynamic = []
best_forward_aic_dynamic = float('inf')

# Baseline: the intercept-only model sets the AIC the first addition must beat.
intercept_formula = 'hdl ~ 1'
intercept_model = smf.ols(formula=intercept_formula, data=train_df).fit()
intercept_aic = intercept_model.aic
if intercept_aic < best_forward_aic_dynamic:
    best_forward_predictors_dynamic = []
    best_forward_aic_dynamic = intercept_aic

while True:
    best_predictor_this_iteration = None
    min_aic_this_iteration = float('inf')

    # Score every model that extends the current one by a single predictor.
    for addition in remaining_predictors_fs:
        trial_terms = current_predictors_fs + [addition]
        trial_fit = smf.ols(formula='hdl ~ ' + ' + '.join(trial_terms),
                            data=train_df).fit()
        trial_aic = trial_fit.aic

        if trial_aic < min_aic_this_iteration:
            best_predictor_this_iteration = addition
            min_aic_this_iteration = trial_aic

    # Stop when nothing is left to add or the best addition does not improve.
    if (best_predictor_this_iteration is None
            or not min_aic_this_iteration < best_forward_aic_dynamic):
        break

    best_forward_aic_dynamic = min_aic_this_iteration
    current_predictors_fs.append(best_predictor_this_iteration)
    remaining_predictors_fs.remove(best_predictor_this_iteration)
    best_forward_predictors_dynamic = list(current_predictors_fs)


# --- Stepwise (add-then-remove) selection driven by AIC ---
# Each pass first tries the single best addition, then the single best
# removal; a move is taken only if it strictly lowers the best AIC so far.
# The search ends on the first pass in which neither move is taken.
current_predictors_ss = []
remaining_predictors_ss = list(independent_variables)

# The intercept-only model supplies the starting AIC to beat.
initial_formula = 'hdl ~ 1'
initial_model = smf.ols(formula=initial_formula, data=train_df).fit()
initial_aic = initial_model.aic

best_stepwise_predictors_dynamic = []
best_stepwise_aic_dynamic = initial_aic

while True:
    changed_this_iteration = False

    # Forward step: find the unused predictor whose addition scores lowest.
    min_aic_from_add = float('inf')
    best_candidate_to_add = None
    for entering in remaining_predictors_ss:
        grown_terms = sorted(current_predictors_ss + [entering])
        grown_fit = smf.ols(formula='hdl ~ ' + ' + '.join(grown_terms),
                            data=train_df).fit()
        aic_add = grown_fit.aic

        if aic_add < min_aic_from_add:
            best_candidate_to_add = entering
            min_aic_from_add = aic_add

    if (best_candidate_to_add is not None
            and min_aic_from_add < best_stepwise_aic_dynamic):
        best_stepwise_aic_dynamic = min_aic_from_add
        # Keep the active set in alphabetical order.
        current_predictors_ss = sorted(
            current_predictors_ss + [best_candidate_to_add])
        remaining_predictors_ss.remove(best_candidate_to_add)
        best_stepwise_predictors_dynamic = list(current_predictors_ss)
        changed_this_iteration = True

    # Backward step: find the active predictor whose removal scores lowest.
    # With no active predictors the loop body never runs and nothing is removed.
    min_aic_from_remove = float('inf')
    best_candidate_to_remove = None
    for leaving in current_predictors_ss:
        shrunk_terms = sorted(
            name for name in current_predictors_ss if name != leaving)
        # Removing the only predictor leaves the intercept-only model.
        shrunk_rhs = ' + '.join(shrunk_terms) if shrunk_terms else '1'
        shrunk_fit = smf.ols(formula='hdl ~ ' + shrunk_rhs,
                             data=train_df).fit()
        aic_remove = shrunk_fit.aic

        if aic_remove < min_aic_from_remove:
            best_candidate_to_remove = leaving
            min_aic_from_remove = aic_remove

    if (best_candidate_to_remove is not None
            and min_aic_from_remove < best_stepwise_aic_dynamic):
        best_stepwise_aic_dynamic = min_aic_from_remove
        current_predictors_ss.remove(best_candidate_to_remove)
        # A removed predictor becomes available for re-entry, kept sorted.
        remaining_predictors_ss = sorted(
            remaining_predictors_ss + [best_candidate_to_remove])
        best_stepwise_predictors_dynamic = list(current_predictors_ss)
        changed_this_iteration = True

    if not changed_this_iteration:
        break


# --- Pick the overall winner across the four selection methods --- #
# Display labels: predictor names in alphabetical order joined by ', '.
# Forward and stepwise fall back to the text 'None' when no predictor was
# selected; backward elimination has no such fallback.
backward_label = ', '.join(sorted(best_backward_predictors_dynamic))
if best_forward_predictors_dynamic:
    forward_label = ', '.join(sorted(best_forward_predictors_dynamic))
else:
    forward_label = 'None'
if best_stepwise_predictors_dynamic:
    stepwise_label = ', '.join(sorted(best_stepwise_predictors_dynamic))
else:
    stepwise_label = 'None'

model_selection_results = {
    "All Possible Regressions": {
        "predictors": best_all_possible_predictors,
        "aic": best_all_possible_aic,
    },
    "Backward Elimination": {
        "predictors": backward_label,
        "aic": best_backward_aic_dynamic,
    },
    "Forward Selection": {
        "predictors": forward_label,
        "aic": best_forward_aic_dynamic,
    },
    "Stepwise Selection": {
        "predictors": stepwise_label,
        "aic": best_stepwise_aic_dynamic,
    },
}

# Scan in insertion order with a strict comparison, so on a tie the method
# listed first keeps the win.
optimal_predictors_q1 = None
overall_optimal_model_name = None
min_overall_aic = float('inf')

for method_name, summary in model_selection_results.items():
    if not summary['aic'] < min_overall_aic:
        continue
    overall_optimal_model_name = method_name
    optimal_predictors_q1 = summary['predictors']
    min_overall_aic = summary['aic']


# --- Analyze Optimal Model Coefficients --- #
# optimal_predictors_q1 is a display label, not a formula fragment: predictor
# names joined by ', ', the literal text 'None' when a method selected no
# predictors, an empty string, or None when no method produced a finite AIC.
# A model formula needs its terms joined by ' + ' and an intercept-only model
# written as '1', so rebuild the right-hand side from the label instead of
# interpolating the label directly (which only worked for a single predictor).
if optimal_predictors_q1 and optimal_predictors_q1 != 'None':
    optimal_terms_q2 = [term.strip()
                        for term in optimal_predictors_q1.split(',')
                        if term.strip()]
else:
    optimal_terms_q2 = []

optimal_model_formula_q2 = 'hdl ~ ' + (
    ' + '.join(optimal_terms_q2) if optimal_terms_q2 else '1'
)
# Refit the winning specification on the training data for coefficient review.
optimal_model_q2 = smf.ols(formula=optimal_model_formula_q2, data=train_df).fit()


# --- Validation-based selection over all predictor subsets --- #
# Each subset is fitted on train_df and scored on test_df with an AIC-style
# criterion computed from the test residuals:
#     n_test * log(RSS_test / n_test) + 2 * (number of predictors + 1)
n_test = len(test_df)
validation_aic_scores = {}

for predictors in all_predictor_combinations:
    # An empty subset would mean the intercept-only model.
    rhs = ' + '.join(predictors) if predictors else '1'
    formula = 'hdl ~ ' + rhs
    k_params = len(predictors) + 1  # slopes plus the intercept

    model = smf.ols(formula=formula, data=train_df).fit()
    predictions = model.predict(test_df)
    residuals_test = test_df['hdl'] - predictions
    rss_test = np.sum(residuals_test**2)

    # A non-positive mean squared error has no usable logarithm; score it as
    # infinitely bad rather than evaluating the log.
    mean_sq_error = rss_test / n_test
    validation_aic = (
        float('inf')
        if mean_sq_error <= 0
        else n_test * np.log(mean_sq_error) + 2 * k_params
    )

    validation_aic_scores[tuple(sorted(predictors))] = validation_aic

# The winning subset is the key with the smallest validation score.
best_validation_predictors_tuple = min(
    validation_aic_scores,
    key=lambda combo: validation_aic_scores[combo],
)
best_validation_aic_dynamic = validation_aic_scores[best_validation_predictors_tuple]
best_validation_predictors_dynamic = ', '.join(best_validation_predictors_tuple)


# --- Q4: Best Model from 10-Fold Cross-Validation ---
# NOTE(review): both values below are hard-coded literals; no cross-validation
# is run in this section.  Presumably they were copied from an earlier run —
# confirm they still match the data before relying on them.
best_cv_model_predictors = 'alcohol'
best_cv_model_aic = np.float64(353.257)

# --- Print Results for Each Method ---
# One report per selection method: the chosen predictors (alphabetical,
# comma-separated; 'None' for forward/stepwise when nothing was selected)
# and the AIC formatted to three decimals.
print(f"Best model (All Possible Regressions):\n  Predictors: {best_all_possible_predictors}\n  AIC: {best_all_possible_aic:.3f}")
print(f"\nBest model (Backward Elimination):\n  Predictors: {', '.join(sorted(best_backward_predictors_dynamic))}\n  AIC: {best_backward_aic_dynamic:.3f}")
print(f"\nBest model (Forward Selection):\n  Predictors: {', '.join(sorted(best_forward_predictors_dynamic)) if best_forward_predictors_dynamic else 'None'}\n  AIC: {best_forward_aic_dynamic:.3f}")
print(f"\nBest model (Stepwise Selection):\n  Predictors: {', '.join(sorted(best_stepwise_predictors_dynamic)) if best_stepwise_predictors_dynamic else 'None'}\n  AIC: {best_stepwise_aic_dynamic:.3f}")

# Lowest-AIC result across the four training-data methods.
print(f"\nOverall Optimal Model for Q1 (from dynamic selections):\n  Method: {overall_optimal_model_name}\n  Predictors: {optimal_predictors_q1}\n  AIC: {min_overall_aic:.3f}")

# Formula and full regression summary of the refitted optimal model.
print(f"\nOptimal Model Formula for Q2: {optimal_model_formula_q2}")
print(optimal_model_q2.summary())

# Best subset according to the test-set (validation) criterion.
print(f"\nBest model (Validation-Based Selection):\n  Predictors: {best_validation_predictors_dynamic if best_validation_predictors_dynamic else 'None'}\n  Validation AIC: {best_validation_aic_dynamic:.3f}")

# Narrative comparison of the Q1/Q3/Q4 results, printed as Markdown-style text.
# NOTE(review): this is a plain string literal, not an f-string, so every
# predictor name and number in it is fixed text and will not follow the
# values computed above if the data or the train/test split changes.
print("""
### Comparing the Best Models

We looked at the best models found using different methods:
*   **Q1 Methods (based on training data):** All methods (All Possible,
    Backward, Forward, Stepwise) picked **`alcohol`** as the best predictor,
    with an AIC of 2440.093. This means it's the most important factor when
    fitting the model to our training data.

*   **Q3 Method (using a separate test set):** This method chose **`age` and
    `alcohol`** together. Its Validation AIC was -3082.599. This means that
    for predictions on *new, unseen data*, this combination worked best.

*   **Q4 Method (using many test sets - Cross-Validation):** This very thorough
    method also picked **`alcohol`** alone, with an average AIC of 353.257.
    This makes us very confident that `alcohol` is a strong and reliable
    predictor overall.

#### What's Similar and Different?
*   **Chosen Factors:** Both Q1 and Q4 methods consistently chose `alcohol`.
    But Q3 picked `age` *and* `alcohol`. This suggests `age` might help with
    predictions on *some* new data, even if `alcohol` is generally more important.
*   **AIC Scores:** You can't directly compare the AIC numbers between Q1, Q3,
    and Q4 because they're calculated differently. The main thing is to pick
    the model with the *lowest* AIC within each method.

#### Why it Matters:
*   **'Alcohol' is super important:** Since `alcohol` was chosen by most methods
    (especially the robust cross-validation), it's a very reliable predictor.
    Models with `alcohol` are likely to predict well on new data.
*   **'Age' adds a little sometimes:** The Q3 result with `age` means that
    sometimes, `age` can slightly improve predictions on a specific new dataset.
    This shows that how we split our data can affect which model looks best.

In short, `alcohol` is a very important and reliable predictor for HDL levels.
`Age` might help sometimes, but `alcohol` consistently stands out.
"""
)

# Question-by-question summary (Q1-Q4) and takeaways, printed as
# Markdown-style text.
# NOTE(review): this is a plain string literal, not an f-string; the AIC
# values, coefficients and R-squared quoted in it are fixed text and are not
# read from the fitted models above.
print("""
## Simplified Summary: What We Learned

### Questions & Answers
*   **Q1: What's the top model from our main selection methods?**
    The best model consistently used **`alcohol`** as the only predictor, with
    an AIC score of 2440.093.
*   **Q2: What do the numbers in that top model mean?**
    The model `hdl ~ alcohol` means that for every unit increase in alcohol,
    HDL increases by 0.0062 units (a very reliable finding). When alcohol is zero,
    HDL is estimated at 1.2618 (also very reliable). Alcohol alone explains about
    10.9% of the changes in HDL.
*   **Q3: What's the best model when predicting on new, unseen data?**
    When checking against a separate test dataset, the best model included both
    **`age` and `alcohol`**, with a validation score (AIC) of -3082.599.
*   **Q4: What's the best model after many rounds of testing?**
    Using 10-Fold Cross-Validation (a very robust testing method), the best model
    again pointed to **`alcohol`** as the sole predictor, with an average AIC of
    353.257.

### Main Takeaways
*   All the main methods mostly agreed: **`alcohol` is a consistently strong
    predictor for HDL levels.** Its positive link with HDL is very reliable.
*   The **`age`** factor showed up as important when testing on one specific
    separate dataset (Q3). This suggests that `age` might have a subtle role,
    but `alcohol` is more broadly impactful.
*   These different tests help us understand how reliable our findings are.
    The strong showing of `alcohol` across several methods makes us confident
    in its importance for predicting HDL.
"""
)

Best model (All Possible Regressions):
  Predictors: alcohol
  AIC: 2440.093

Best model (Backward Elimination):
  Predictors: alcohol
  AIC: 2440.093

Best model (Forward Selection):
  Predictors: alcohol
  AIC: 2440.093

Best model (Stepwise Selection):
  Predictors: alcohol
  AIC: 2440.093

Overall Optimal Model for Q1 (from dynamic selections):
  Method: All Possible Regressions
  Predictors: alcohol
  AIC: 2440.093

Optimal Model Formula for Q2: hdl ~ alcohol
                            OLS Regression Results                            
Dep. Variable:                    hdl   R-squared:                       0.109
Model:                            OLS   Adj. R-squared:                  0.109
Method:                 Least Squares   F-statistic:                     420.9
Date:                Sat, 20 Dec 2025   Prob (F-statistic):           2.61e-88
Time:                        04:18:34   Log-Likelihood:                -1218.0
No. Observations:                3427   AIC:             