<a href="https://colab.research.google.com/github/taylor33189-beep/Taylor_Hoskins_Repository/blob/main/Taylor_H_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import itertools

# Data loading and preparation (assuming df, train_df, test_df, independent_variables exist)
# Example placeholders if not defined:
# df = pd.read_csv('/content/EX9_12-1.DAT', sep=r'\s+')  # `delim_whitespace=True` is deprecated in pandas >= 2.2
# from sklearn.model_selection import train_test_split
# train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
# independent_variables = ['age', 'alcohol', 'chol', 'fiber']

# Enumerate every non-empty subset of the candidate predictors, smallest
# subsets first; each subset is stored as a list of column names.
all_predictor_combinations = [
    list(subset)
    for size in range(1, len(independent_variables) + 1)
    for subset in itertools.combinations(independent_variables, size)
]


# --- Dynamically Implement All-Possible Regressions ---
# Fit one OLS model per predictor subset on the training data and record its
# AIC, keyed by the sorted tuple of predictor names.
all_possible_aic_scores = {}

for subset in all_predictor_combinations:
    subset_formula = f"hdl ~ {' + '.join(subset)}"
    subset_fit = smf.ols(formula=subset_formula, data=train_df).fit()
    all_possible_aic_scores[tuple(sorted(subset))] = subset_fit.aic

# The winner is the first subset that attains the smallest training AIC.
best_all_possible_predictors_tuple, best_all_possible_aic = min(
    all_possible_aic_scores.items(), key=lambda item: item[1]
)
best_all_possible_predictors = ', '.join(best_all_possible_predictors_tuple)


# --- Dynamically Implement Backward Elimination ---
# Start from the model with every predictor and repeatedly drop the single
# predictor whose removal yields the lowest AIC, for as long as that AIC is
# strictly lower than the AIC of the model currently held.
current_predictors_be = list(independent_variables)
best_backward_aic_dynamic = float('inf')
best_backward_predictors_dynamic = []

while current_predictors_be:
    current_formula = 'hdl ~ ' + ' + '.join(current_predictors_be)
    current_model = smf.ols(formula=current_formula, data=train_df).fit()
    current_aic = current_model.aic

    # Track the best model seen so far, including the starting full model.
    if current_aic < best_backward_aic_dynamic:
        best_backward_aic_dynamic = current_aic
        best_backward_predictors_dynamic = list(current_predictors_be)

    candidate_removal_aic = float('inf')
    predictor_to_remove = None

    # A one-predictor model is never reduced further: this procedure does not
    # evaluate the intercept-only model.
    if len(current_predictors_be) == 1:
        break

    # Score every model obtained by leaving out exactly one predictor.
    for drop_index, dropped in enumerate(current_predictors_be):
        reduced = (
            current_predictors_be[:drop_index]
            + current_predictors_be[drop_index + 1:]
        )
        reduced_formula = 'hdl ~ ' + ' + '.join(reduced)
        reduced_aic = smf.ols(formula=reduced_formula, data=train_df).fit().aic

        if reduced_aic < candidate_removal_aic:
            candidate_removal_aic, predictor_to_remove = reduced_aic, dropped

    # Stop as soon as no single removal improves on the current model.
    if not candidate_removal_aic < current_aic:
        break

    current_predictors_be.remove(predictor_to_remove)
    if candidate_removal_aic < best_backward_aic_dynamic:
        best_backward_aic_dynamic = candidate_removal_aic
        best_backward_predictors_dynamic = list(current_predictors_be)


# --- Dynamically Implement Forward Selection ---
# Start from the intercept-only model and repeatedly add the single predictor
# that gives the lowest AIC, for as long as that AIC is strictly lower than the
# best AIC found so far.
current_predictors_fs = []
best_forward_aic_dynamic = float('inf')
best_forward_predictors_dynamic = []

remaining_predictors_fs = list(independent_variables)

# Baseline: the intercept-only model sets the AIC the first addition must beat.
intercept_formula = 'hdl ~ 1'
intercept_model = smf.ols(formula=intercept_formula, data=train_df).fit()
intercept_aic = intercept_model.aic

if intercept_aic < best_forward_aic_dynamic:
    best_forward_aic_dynamic = intercept_aic
    best_forward_predictors_dynamic = []

while True:
    min_aic_this_iteration = float('inf')
    best_predictor_this_iteration = None

    # Score each model formed by adding one not-yet-selected predictor.
    for candidate_predictor in remaining_predictors_fs:
        trial_terms = current_predictors_fs + [candidate_predictor]
        trial_formula = 'hdl ~ ' + ' + '.join(trial_terms)
        trial_aic = smf.ols(formula=trial_formula, data=train_df).fit().aic

        if trial_aic < min_aic_this_iteration:
            min_aic_this_iteration = trial_aic
            best_predictor_this_iteration = candidate_predictor

    improved = (
        best_predictor_this_iteration is not None
        and min_aic_this_iteration < best_forward_aic_dynamic
    )
    if not improved:
        break

    # Commit the best addition and continue from the enlarged model.
    best_forward_aic_dynamic = min_aic_this_iteration
    current_predictors_fs.append(best_predictor_this_iteration)
    remaining_predictors_fs.remove(best_predictor_this_iteration)
    best_forward_predictors_dynamic = list(current_predictors_fs)


# --- Dynamically Implement Stepwise Selection ---
# Alternate a forward (add one predictor) and a backward (remove one predictor)
# step, accepting a move only when it strictly lowers the best AIC so far, and
# stop after a full pass in which neither step changed the model.
remaining_predictors_ss = list(independent_variables)
current_predictors_ss = []

# Baseline: the intercept-only model is the starting point to beat.
initial_formula = 'hdl ~ 1'
initial_model = smf.ols(formula=initial_formula, data=train_df).fit()
initial_aic = initial_model.aic

best_stepwise_aic_dynamic = initial_aic
best_stepwise_predictors_dynamic = []

while True:
    changed_this_iteration = False

    # Forward step: find the single addition with the lowest AIC.
    best_candidate_to_add = None
    min_aic_from_add = float('inf')

    for addition in remaining_predictors_ss:
        grown_terms = sorted(current_predictors_ss + [addition])
        grown_formula = 'hdl ~ ' + ' + '.join(grown_terms)
        grown_aic = smf.ols(formula=grown_formula, data=train_df).fit().aic

        if grown_aic < min_aic_from_add:
            min_aic_from_add, best_candidate_to_add = grown_aic, addition

    if best_candidate_to_add is not None and min_aic_from_add < best_stepwise_aic_dynamic:
        best_stepwise_aic_dynamic = min_aic_from_add
        current_predictors_ss.append(best_candidate_to_add)
        current_predictors_ss.sort()
        remaining_predictors_ss.remove(best_candidate_to_add)
        best_stepwise_predictors_dynamic = list(current_predictors_ss)
        changed_this_iteration = True

    # Backward step: find the single removal with the lowest AIC. With no
    # selected predictors the loop body never runs and nothing is removed.
    best_candidate_to_remove = None
    min_aic_from_remove = float('inf')

    for removal in current_predictors_ss:
        shrunk_terms = sorted(term for term in current_predictors_ss if term != removal)
        # Removing the only predictor leaves the intercept-only model.
        shrunk_formula = 'hdl ~ ' + (' + '.join(shrunk_terms) if shrunk_terms else '1')
        shrunk_aic = smf.ols(formula=shrunk_formula, data=train_df).fit().aic

        if shrunk_aic < min_aic_from_remove:
            min_aic_from_remove, best_candidate_to_remove = shrunk_aic, removal

    if best_candidate_to_remove is not None and min_aic_from_remove < best_stepwise_aic_dynamic:
        best_stepwise_aic_dynamic = min_aic_from_remove
        current_predictors_ss.remove(best_candidate_to_remove)
        remaining_predictors_ss.append(best_candidate_to_remove)
        remaining_predictors_ss.sort()
        best_stepwise_predictors_dynamic = list(current_predictors_ss)
        changed_this_iteration = True

    if not changed_this_iteration:
        break


# --- Determine Overall Optimal Model ---
# Collect each method's winner as a display label (predictor names joined by
# ', ') together with its AIC. Forward and stepwise selection show 'None' when
# they kept no predictor.
model_selection_results = {
    method_name: {"predictors": label, "aic": score}
    for method_name, label, score in (
        (
            "All Possible Regressions",
            best_all_possible_predictors,
            best_all_possible_aic,
        ),
        (
            "Backward Elimination",
            ', '.join(sorted(best_backward_predictors_dynamic)),
            best_backward_aic_dynamic,
        ),
        (
            "Forward Selection",
            ', '.join(sorted(best_forward_predictors_dynamic)) if best_forward_predictors_dynamic else 'None',
            best_forward_aic_dynamic,
        ),
        (
            "Stepwise Selection",
            ', '.join(sorted(best_stepwise_predictors_dynamic)) if best_stepwise_predictors_dynamic else 'None',
            best_stepwise_aic_dynamic,
        ),
    )
}

overall_optimal_model_name = None
min_overall_aic = float('inf')
optimal_predictors_q1 = None

# Keep the first method (in the order listed above) with the strictly lowest
# AIC; on a tie the earlier method stays selected.
for method, result in model_selection_results.items():
    if not result['aic'] < min_overall_aic:
        continue
    min_overall_aic, overall_optimal_model_name, optimal_predictors_q1 = (
        result['aic'],
        method,
        result['predictors'],
    )


# --- Analyze Optimal Model Coefficients ---
# `optimal_predictors_q1` is a display label, not a formula fragment: predictor
# names are joined by ', ', and an empty selection is shown as the text 'None'
# (or may be an empty string / None). Interpolating it directly would produce
# 'hdl ~ age, alcohol' or 'hdl ~ None', neither of which is a valid formula.
# Rebuild the right-hand side instead: join the terms with ' + ' and fall back
# to the intercept-only model '1' when no predictor was selected.
_optimal_terms_q2 = [
    term
    for term in (optimal_predictors_q1 or '').split(', ')
    if term and term != 'None'
]
optimal_model_formula_q2 = 'hdl ~ ' + (' + '.join(_optimal_terms_q2) or '1')
optimal_model_q2 = smf.ols(formula=optimal_model_formula_q2, data=train_df).fit()


# --- Dynamically Implement Validation-Based Selection ---
# Fit every predictor subset on the training data, score it on the held-out
# test data with n * log(RSS / n) + 2k computed from the test residuals, and
# keep the subset with the lowest score.
validation_aic_scores = {}
n_test = len(test_df)

for predictors in all_predictor_combinations:
    # An empty subset would mean the intercept-only model.
    formula = 'hdl ~ ' + (' + '.join(predictors) if predictors else '1')
    # k counts one coefficient per listed predictor plus the intercept.
    k_params = len(predictors) + 1

    model = smf.ols(formula=formula, data=train_df).fit()
    predictions = model.predict(test_df)
    residuals_test = test_df['hdl'] - predictions
    rss_test = np.sum(residuals_test**2)

    # log() is undefined for a non-positive mean squared error; score such a
    # model as infinitely bad rather than letting it win.
    mean_squared_error_test = rss_test / n_test
    validation_aic = (
        float('inf')
        if mean_squared_error_test <= 0
        else n_test * np.log(mean_squared_error_test) + 2 * k_params
    )

    validation_aic_scores[tuple(sorted(predictors))] = validation_aic

best_validation_predictors_tuple, best_validation_aic_dynamic = min(
    validation_aic_scores.items(), key=lambda item: item[1]
)
best_validation_predictors_dynamic = ', '.join(best_validation_predictors_tuple)


# --- Q4: Best Model from 10-Fold Cross-Validation ---
# NOTE(review): these two values are hard-coded constants; no cross-validation
# is run in this cell. Presumably they were copied from a separate 10-fold CV
# run — confirm they still match that run before relying on them.
best_cv_model_predictors = 'alcohol'
best_cv_model_aic = np.float64(353.257)

# --- Print all results and summaries ---
# One (heading, predictor label, AIC) row per training-data selection method,
# printed in a fixed order with the AIC rounded to three decimals.
for heading, label, score in (
    (
        "Best model (Dynamically Calculated All Possible Regressions):",
        best_all_possible_predictors,
        best_all_possible_aic,
    ),
    (
        "\nBest model (Dynamically Calculated Backward Elimination):",
        ', '.join(sorted(best_backward_predictors_dynamic)),
        best_backward_aic_dynamic,
    ),
    (
        "\nBest model (Dynamically Calculated Forward Selection):",
        ', '.join(sorted(best_forward_predictors_dynamic)) if best_forward_predictors_dynamic else 'None',
        best_forward_aic_dynamic,
    ),
    (
        "\nBest model (Dynamically Calculated Stepwise Selection):",
        ', '.join(sorted(best_stepwise_predictors_dynamic)) if best_stepwise_predictors_dynamic else 'None',
        best_stepwise_aic_dynamic,
    ),
):
    print(heading)
    print(f"  Predictors: {label}")
    print(f"  AIC: {score:.3f}")

# Overall winner across the four methods above.
print("\nOverall Optimal Model for Q1 (from dynamic selections):")
print(f"  Method: {overall_optimal_model_name}")
print(f"  Predictors: {optimal_predictors_q1}")
print(f"  AIC: {min_overall_aic:.3f}")

# Full regression summary of the refitted overall winner.
print(f"\nOptimal Model Formula for Q2: {optimal_model_formula_q2}")
print(optimal_model_q2.summary())

# Winner of the held-out test-set comparison; an empty label prints as 'None'.
print("\nBest model (Dynamically Calculated Validation-Based Selection):")
print(f"  Predictors: {best_validation_predictors_dynamic or 'None'}")
print(f"  Validation AIC: {best_validation_aic_dynamic:.3f}")

# Prints a narrative comparison of the Q1, Q3 and Q4 winning models.
# NOTE(review): this is a plain string, not an f-string. Every predictor name
# and number in it (2440.093, -3082.599, 353.257, ...) is hard-coded, so the
# text will not follow the computed results if the data or the split changes.
print("""
### Comparing the Best Models

1.  **Q1 Best Model (from training data methods):**
    *   **Predictors (factors used):** `alcohol`
    *   **AIC (a score for model fit):** 2440.093 (lower is better)
    *   **What this means:** This model was picked by all four training-data-based methods (All Possible, Backward, Forward, Stepwise).

2.  **Q3 Best Model (from using a separate test set):**
    *   **Predictors (factors used):** `age, alcohol`
    *   **Validation AIC (score on unseen data):** -3082.599
    *   **What this means:** When trained models on the training data but checked how well they predicted on a completely new, unseen test set, the model including both 'age' and 'alcohol' performed the best. The AIC score here is different because it's calculated in a special way for unseen data, but the main point is it had the lowest (best) score for predicting new data.

3.  **Q4 Best Model (from many test sets - Cross-Validation):**
    *   **Predictors (factors used):** `alcohol`
    *   **Average AIC (average score from many tests):** 353.257
    *   **What this means:** This method uses many different training and test sets to get a very reliable average performance score. It also found 'alcohol' alone to be the best predictor. This makes us more confident that 'alcohol' is a strong and dependable factor for predicting 'hdl' in general.

#### What's Similar and What's Different?

*   **Which factors were chosen?**
    *   Both the Q1 methods (focused on training data) and the Q4 method (using many test sets) consistently picked `alcohol` as the single best predictor. This suggests `alcohol` is a solid and reliable factor for predicting 'hdl'.
    *   However, the Q3 method (using just one separate test set) chose `age` *and* `alcohol`. This is interesting because it means that for that specific test set, adding 'age' helped the model predict better, even if other methods didn't pick it as the absolute best overall.

*   **The AIC Scores:**
    *   You can't directly compare the AIC numbers between Q1, Q3, and Q4. They use slightly different ways of calculating the score. What's important is which model had the *lowest* AIC within each method.

#### Why Does This Matter?

*   **'Alcohol' is consistently important:** The fact that `alcohol` was chosen by most methods (especially the robust cross-validation) means it's a very reliable predictor. Models that include `alcohol` are likely to work well not just on the data they were trained on, but also on new, unseen data.
*   **'Age' might be a subtle factor:** The Q3 method suggesting `age` means that sometimes, in a particular split of data, `age` can add some value to predictions. This reminds us that how we split our data for testing can sometimes slightly change which model looks best.
*   **Fitting vs. Predicting:** The Q1 methods are great for finding models that fit the *current* data well. Q3 and Q4 methods are better for finding models that will predict *new* data well. The slight difference (Q3 picking `age, alcohol` while Q1 and Q4 picked `alcohol`) shows that a factor might look good on one type of test but not necessarily on all.

In short, `alcohol` is a very important and reliable predictor for HDL levels. While `age` might sometimes help with predictions on specific new data, `alcohol` consistently stands out as a key factor.
"""
)

# Prints a per-question (Q1-Q4) written summary of the findings.
# NOTE(review): this is a plain string, not an f-string. The AIC values, the
# coefficients (1.2618, 0.0062) and the R-squared figure (10.9%) are hard-coded,
# so they must be updated by hand whenever the fitted results change.
print("""

#### Q1: Best Model from Different Selection Methods
Using various methods to find the best model based on the training data, the winner was a model using just **`alcohol`** as a predictor. Its score (AIC) was 2440.093. This means all these methods agreed that `alcohol` was the single most important factor for predicting HDL in training data.

#### Q2: What the Best Model's Numbers Mean
The best model is `hdl ~ alcohol` (meaning HDL is predicted by alcohol). Here's what the numbers show:
*   **Starting HDL (Intercept):** When alcohol consumption is zero, the estimated HDL level is about 1.2618. This is a very strong and reliable finding.
*   **Effect of Alcohol:** For every extra unit of alcohol consumed per week, HDL levels are estimated to increase by about 0.0062 units. This is also a very strong and reliable finding.
*   **How much the model explains (R-squared):** This model explains about 10.9% of why HDL levels vary. So, while alcohol is important, many other factors also play a big role in determining HDL.

#### Q3: Best Model Using a Separate Test to Check Predictions
Using `test_df` to see which model predicted best on new data, the winning model included both **`age` and `alcohol`**. This model had a Validation AIC of -3082.599, meaning it was the best at predicting on this specific unseen data.

#### Q4: Best Model from Many Rounds of Testing (Cross-Validation)
Using a super robust method called 10-Fold Cross-Validation, which tests the model on many different parts of the data, the best model was again the one with just **`alcohol`** as a predictor. Its average AIC was 353.257. This confirms that `alcohol` is a very reliable predictor when testing across many different data splits.
"""
)

# Prints a condensed question-and-answer recap followed by the main takeaways.
# NOTE(review): this is a plain string, not an f-string. All figures quoted in
# it are hard-coded and are not tied to the values computed earlier in the
# script.
print("""

### Questions & Answers
*   **Q1: What's the top model from our main selection methods?**
    The best model consistently used **`alcohol`** as the only predictor, with an AIC score of 2440.093.
*   **Q2: What do the numbers in that top model mean?**
    The model `hdl ~ alcohol` suggests that for every unit increase in alcohol, HDL increases by 0.0062 units (a very reliable finding). When alcohol is zero, HDL is estimated at 1.2618 (also very reliable). Alcohol alone explains about 10.9% of the changes in HDL.
*   **Q3: What's the best model when predicting on new, unseen data?**
    When checking against a separate test dataset, the best model included both **`age` and `alcohol`**, with a validation score (AIC) of -3082.599.
*   **Q4: What's the best model after many rounds of testing?**
    Using 10-Fold Cross-Validation (a very robust testing method), the best model again pointed to **`alcohol`** as the sole predictor, with an average AIC of 353.257.

### Main Takeaways
*   All the main methods mostly agreed: **`alcohol` is a consistently strong predictor for HDL levels.** Its positive link with HDL is very reliable.
*   The **`age`** factor showed up as important when testing on one specific separate dataset (Q3). This suggests that `age` might have a subtle role, but `alcohol` is more broadly impactful.
*   These different tests help us understand how reliable our findings are. The strong showing of `alcohol` across several methods makes us confident in its importance for predicting HDL.
"""
)

Best model (Dynamically Calculated All Possible Regressions):
  Predictors: alcohol
  AIC: 2440.093

Best model (Dynamically Calculated Backward Elimination):
  Predictors: alcohol
  AIC: 2440.093

Best model (Dynamically Calculated Forward Selection):
  Predictors: alcohol
  AIC: 2440.093

Best model (Dynamically Calculated Stepwise Selection):
  Predictors: alcohol
  AIC: 2440.093

Overall Optimal Model for Q1 (from dynamic selections):
  Method: All Possible Regressions
  Predictors: alcohol
  AIC: 2440.093

Optimal Model Formula for Q2: hdl ~ alcohol
                            OLS Regression Results                            
Dep. Variable:                    hdl   R-squared:                       0.109
Model:                            OLS   Adj. R-squared:                  0.109
Method:                 Least Squares   F-statistic:                     420.9
Date:                Sat, 20 Dec 2025   Prob (F-statistic):           2.61e-88
Time:                        04:05:16   Log-