<a href="https://colab.research.google.com/github/tkacha467/end-to-end-ml-projects/blob/main/auto_mpg_backward_elimination.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================================================
# 1. IMPORT LIBRARIES
# ==========================================================
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [None]:
# ==========================================================
# 2. LOAD DATA
# ==========================================================
# Colab-only import, kept next to its single use so the imports cell stays
# importable outside Colab. pandas is already imported in the imports cell,
# so it is not imported again here.
from google.colab import drive

# Location of the dataset inside the mounted Google Drive.
DATA_PATH = '/content/drive/MyDrive/Marwadi_uiniversity/SEM-II/ML/auto-mpg.csv'

drive.mount('/content/drive')

df = pd.read_csv(DATA_PATH)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0.1,Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0
1,1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0
2,2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0
3,3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0
4,4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0


In [None]:

# ==========================================================
# 3. DATA CLEANING
# ==========================================================

# Columns that must never reach the feature matrix:
#   * 'car name'   - free-text label, not numeric (dropped only if present)
#   * 'Unnamed: *' - the name pandas assigns to CSV columns with an empty
#     header; in the displayed head they are plain 0, 1, 2, ... row counters,
#     i.e. bookkeeping rather than a property of the car, so they must not
#     be offered to the model as predictors of mpg.
non_feature_columns = [
    col for col in df.columns
    if col == 'car name' or str(col).startswith('Unnamed:')
]

# One chained expression instead of several inplace mutations: re-running the
# cell gives the same result, and `df` is rebound exactly once.
df = (
    df
    .replace('?', np.nan)   # '?' is treated as a missing-value marker
    .assign(horsepower=lambda frame: pd.to_numeric(frame['horsepower']))
    .dropna()               # drop every row that still has a missing value
    .drop(columns=non_feature_columns)
)


# ==========================================================
# 4. DEFINE FEATURES & TARGET
# ==========================================================
# The target is a single column; every other column is a feature.
target_column = 'mpg'
feature_columns = [col for col in df.columns if col != target_column]

y = df[target_column]
X = df[feature_columns]


# ==========================================================
# 5. TRAIN-TEST SPLIT
# ==========================================================
# Hold out 20% of the rows for the final evaluation; the fixed random_state
# makes the shuffle reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


# ==========================================================
# 6. FEATURE SCALING (IMPORTANT)
# ==========================================================
scaler = StandardScaler()

# The scaler is fitted on the training rows only and then re-used for the
# test rows, so no test-set statistics influence the transformation.
#
# StandardScaler returns a bare NumPy array, so the result is wrapped back
# into a DataFrame. Both the column names AND the original row index are
# carried over: without `index=` the frames would get a fresh 0..n-1 index
# while y_train / y_test keep the shuffled original labels, and any
# label-based pandas operation combining them would silently misalign rows.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)


# ==========================================================
# 7. BACKWARD FEATURE ELIMINATION USING CV
# ==========================================================

def mean_cv_r2(feature_subset):
    """Return the mean 5-fold cross-validated R2 of a linear regression.

    Parameters
    ----------
    feature_subset : list
        Column names of ``X_train_scaled`` to fit the model on.

    Returns
    -------
    float
        Average of the five per-fold R2 scores, computed on the training
        data only (``X_train_scaled`` / ``y_train``).
    """
    fold_scores = cross_val_score(
        LinearRegression(),
        X_train_scaled[feature_subset],
        y_train,
        cv=5,
        scoring='r2'
    )
    return np.mean(fold_scores)


selected_features = list(X.columns)

print("Backward Feature Elimination Process:\n")

# Score the complete feature set first. Starting from -inf instead would make
# the first removal unconditional, even when dropping that feature lowers the
# cross-validated R2 compared with keeping everything.
best_score = mean_cv_r2(selected_features)
print(f"Baseline ({len(selected_features)} features), CV R2: {best_score:.4f}")

while len(selected_features) > 1:

    # For every remaining feature, score the model that omits just that one.
    candidates = [
        (
            mean_cv_r2([kept for kept in selected_features if kept != feature]),
            feature,
        )
        for feature in selected_features
    ]

    # The best candidate is the removal that leaves the highest CV R2.
    current_best_score, feature_to_remove = max(candidates)

    # Stop as soon as no single removal strictly improves on the current set.
    if current_best_score <= best_score:
        break

    best_score = current_best_score
    selected_features.remove(feature_to_remove)
    print(f"Removed: {feature_to_remove}, CV R2: {best_score:.4f}")


# ==========================================================
# 8. TRAIN FINAL MODEL
# ==========================================================
# Refit on the full training set, restricted to the features that survived
# elimination. `fit` returns the estimator itself, so it can be chained.
final_train_matrix = X_train_scaled[selected_features]
final_model = LinearRegression().fit(final_train_matrix, y_train)


# ==========================================================
# 9. TEST SET EVALUATION
# ==========================================================
# Score the held-out rows once, using the same surviving feature columns
# that the final model was fitted on.
final_test_matrix = X_test_scaled[selected_features]
y_pred = final_model.predict(final_test_matrix)
final_r2 = r2_score(y_test, y_pred)

# The newline separator reproduces the two separate print calls exactly.
print("\nFinal Selected Features:", selected_features, sep="\n")
print(f"\nFinal Test R2 Score: {final_r2:.4f}")


Backward Feature Elimination Process:

Removed: Unnamed: 0, CV R2: 0.8006
Removed: cylinders, CV R2: 0.8010
Removed: acceleration, CV R2: 0.8013
Removed: displacement, CV R2: 0.8016
Removed: horsepower, CV R2: 0.8020

Final Selected Features:
['weight', 'model year', 'origin']

Final Test R2 Score: 0.8409
