<a href="https://colab.research.google.com/github/tkacha467/end-to-end-ml-projects/blob/main/auto_mpg_forward_feature_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# ==========================================================
# 1. IMPORT REQUIRED LIBRARIES
# ==========================================================
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score


In [4]:
# ==========================================================
# 2. LOAD DATASET
# ==========================================================
# Mount Google Drive so the CSV stored there is readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Location of the Auto MPG CSV on the mounted drive; edit this one line to
# point the notebook at a different copy of the data.
DATA_PATH = '/content/drive/MyDrive/Marwadi_uiniversity/SEM-II/ML/auto-mpg.csv'

# pandas is already imported as `pd` in the imports cell, so it is not
# re-imported here.
df = pd.read_csv(DATA_PATH)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0.1,Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0
1,1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0
2,2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0
3,3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0
4,4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0


In [5]:
# ==========================================================
# 3. DATA PREPROCESSING
# ==========================================================
# Replace '?' values (missing indicators) with NaN.
# replace() returns a new frame, so the object produced by the load cell is
# not mutated in place.
df = df.replace('?', np.nan)

# Convert 'horsepower' column to numeric (it may be stored as string).
# errors='coerce' turns any other unparseable token into NaN instead of
# raising, so those rows are removed by dropna() below just like '?' rows.
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')

# Remove rows containing missing values
df = df.dropna()

# Drop non-numeric column ('car name') if present
# Linear Regression requires numeric input
# errors='ignore' makes the drop a no-op when the column is absent.
df = df.drop(columns=['car name'], errors='ignore')

In [6]:
# ==========================================================
# 4. DEFINE FEATURES (X) AND TARGET (y)
# ==========================================================
# X → independent variables (car specifications)
# y → dependent variable (mpg to predict)

# The loaded CSV carries saved row-index columns ('Unnamed: 0',
# 'Unnamed: 0.1'). They are row counters, not car specifications, so they
# are excluded from the candidate features together with the target.
index_artifacts = [col for col in df.columns if str(col).startswith('Unnamed')]

X = df.drop(columns=['mpg'] + index_artifacts)
y = df['mpg']

# Split dataset into:
# 80% training data
# 20% testing data
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42   # ensures reproducibility
)

In [7]:
# ==========================================================
# 5. FORWARD FEATURE SELECTION USING CROSS-VALIDATION
# ==========================================================
# Greedy procedure:
#   - begin with an empty feature set;
#   - each round, score every unused feature when appended to the current set;
#   - accept the round's top scorer only if it beats the best mean CV R² so far;
#   - stop at the first round without improvement, or when nothing is left.

remaining_features = list(X.columns)  # candidates not yet accepted
selected_features = []                # accepted features, in selection order
best_score = -np.inf                  # highest mean CV R² reached so far

print("Forward Feature Selection Process:\n")

while len(remaining_features) > 0:

    # Mean 5-fold CV R² on the training split for each candidate extension
    round_results = []
    for candidate in remaining_features:
        trial_columns = [*selected_features, candidate]
        fold_r2 = cross_val_score(
            LinearRegression(),
            X_train[trial_columns],
            y_train,
            cv=5,
            scoring='r2',
        )
        round_results.append((np.mean(fold_r2), candidate))

    # Descending sort on (score, name): the first entry is this round's winner
    round_top_score, best_feature = sorted(round_results, reverse=True)[0]

    # No gain over the running best: selection is finished
    if not (round_top_score > best_score):
        break

    best_score = round_top_score
    selected_features.append(best_feature)
    remaining_features.remove(best_feature)

    print(f"Added: {best_feature}, CV R2: {best_score:.4f}")


Forward Feature Selection Process:

Added: weight, CV R2: 0.6738
Added: model year, CV R2: 0.7983
Added: origin, CV R2: 0.8020


In [8]:
# ==========================================================
# 6. TRAIN FINAL MODEL WITH SELECTED FEATURES
# ==========================================================

# Restrict both splits to the columns chosen by forward selection
train_inputs = X_train[selected_features]
test_inputs = X_test[selected_features]

# fit() returns the estimator itself, so construction and training chain
final_model = LinearRegression().fit(train_inputs, y_train)

# Predictions for the held-out test split
y_pred = final_model.predict(test_inputs)

# R² of those predictions against the true test targets
final_r2 = r2_score(y_test, y_pred)

In [10]:
# Label each fitted coefficient with its feature name for interpretation
coefficients = pd.Series(data=final_model.coef_, index=selected_features)
print("\nFeature Coefficients:", coefficients, sep="\n")


Feature Coefficients:
weight       -0.006126
model year    0.795262
origin        1.081389
dtype: float64


In [9]:
# ==========================================================
# 7. DISPLAY RESULTS
# ==========================================================

# Features accepted by forward selection, in the order they were added
print("\nFinal Selected Features:", selected_features, sep="\n")

# R² score stored in final_r2, formatted to four decimals
print(f"\nFinal Test R2 Score: {final_r2:.4f}")


Final Selected Features:
['weight', 'model year', 'origin']

Final Test R2 Score: 0.8409
