<a href="https://colab.research.google.com/github/vimalthomas/deeplearning/blob/main/MPGModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""MPG Model - Feature Engineering & Training"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor

# ------------------------------
# Load and Preprocess Data
# ------------------------------
uploaded = files.upload()  # Upload dataset file

Saving auto-mpg.data to auto-mpg (1).data


In [None]:

columns = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year", "origin", "car_name"]
df = pd.read_fwf("auto-mpg.data", names=columns, na_values="?")

# Convert horsepower to numeric
df["horsepower"] = pd.to_numeric(df["horsepower"], errors="coerce")
df["horsepower"].fillna(df["horsepower"].mean(), inplace=True)  # Fill missing values

# Encode categorical features
df = pd.get_dummies(df, columns=["origin"], prefix="origin")
df = pd.get_dummies(df, columns=["cylinders"], prefix="cyl")

# Normalize numerical features
scaler = MinMaxScaler()
num_cols = ["displacement", "horsepower", "weight", "acceleration"]
df[num_cols] = scaler.fit_transform(df[num_cols])

# Drop unneeded columns
df.drop(columns=["car_name"], inplace=True)

# ------------------------------
# Feature Engineering
# ------------------------------
df["power_weight_ratio"] = df["horsepower"] / df["weight"]
df["torque_estimate"] = df["displacement"] * df["horsepower"]
df["acceleration_squared"] = df["acceleration"] ** 2
df["car_age"] = df["model_year"].max() - df["model_year"]
df["log_weight"] = np.log(df["weight"])
df["log_horsepower"] = np.log(df["horsepower"])
df["log_displacement"] = np.log(df["displacement"])
df["horsepower_acceleration"] = df["horsepower"] * df["acceleration"]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["horsepower"].fillna(df["horsepower"].mean(), inplace=True)  # Fill missing values
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
display(df)

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model_year,origin_1,origin_2,origin_3,cyl_3,...,cyl_6,cyl_8,power_weight_ratio,torque_estimate,acceleration_squared,car_age,log_weight,log_horsepower,log_displacement,horsepower_acceleration
0,18.0,0.617571,0.456522,0.536150,0.238095,70,True,False,False,False,...,False,True,0.851482,0.281935,0.056689,12,-0.623342,-0.784119,-0.481961,0.108696
1,15.0,0.728682,0.646739,0.589736,0.208333,70,True,False,False,False,...,False,True,1.096658,0.471267,0.043403,12,-0.528080,-0.435812,-0.316518,0.134737
2,18.0,0.645995,0.565217,0.516870,0.178571,70,True,False,False,False,...,False,True,1.093539,0.365128,0.031888,12,-0.659964,-0.570545,-0.436964,0.100932
3,16.0,0.609819,0.565217,0.516019,0.238095,70,True,False,False,False,...,False,True,1.095342,0.344680,0.056689,12,-0.661611,-0.570545,-0.494593,0.134576
4,17.0,0.604651,0.510870,0.520556,0.148810,70,True,False,False,False,...,False,True,0.981393,0.308898,0.022144,12,-0.652858,-0.671641,-0.503104,0.076022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,0.186047,0.217391,0.333711,0.452381,82,True,False,False,False,...,False,False,0.651435,0.040445,0.204649,0,-1.097479,-1.526056,-1.681759,0.098344
394,44.0,0.074935,0.032609,0.146583,0.988095,82,False,True,False,False,...,False,False,0.222458,0.002444,0.976332,0,-1.920160,-3.423176,-2.591129,0.032220
395,32.0,0.173127,0.206522,0.193365,0.214286,82,True,False,False,False,...,False,False,1.068038,0.035754,0.045918,0,-1.643173,-1.577350,-1.753732,0.044255
396,28.0,0.134367,0.179348,0.286929,0.630952,82,True,False,False,False,...,False,False,0.625059,0.024098,0.398101,0,-1.248519,-1.718428,-2.007181,0.113160


In [None]:
# Remove less important features
df.drop(columns=["cyl_5", "cyl_3", "weight_squared", "displacement"], inplace=True)

# Handle infinity values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(df.mean(), inplace=True)  # Fill NaNs with column means

# ------------------------------
# Feature Selection (Lasso & Decision Tree)
# ------------------------------
X = df.drop(columns=["mpg"])
y = df["mpg"]

# Standardize for Lasso
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Lasso Regression Feature Selection
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
feature_importance = pd.Series(lasso.coef_, index=X.columns).sort_values(ascending=False)
dropped_features = feature_importance[feature_importance == 0].index.tolist()

KeyError: "['weight_squared'] not found in axis"

In [None]:

# Decision Tree Feature Selection
tree = DecisionTreeRegressor(max_depth=3, random_state=42)
tree.fit(X_train, y_train)
tree_importance = pd.Series(tree.feature_importances_, index=X.columns).sort_values(ascending=False)

# Drop Lasso-identified unimportant features
X_final = X.drop(columns=dropped_features)
X_train_final, X_test_final, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

# Train Decision Tree on refined features
tree_final = DecisionTreeRegressor(max_depth=3, random_state=42)
tree_final.fit(X_train_final, y_train)

# ------------------------------
# Variance Inflation Factor (Multicollinearity Check)
# ------------------------------
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
features_to_drop = vif_data[vif_data["VIF"] > 5]["Feature"].tolist()
df_cleaned = df.drop(columns=features_to_drop)


In [None]:


# ------------------------------
# Train-Validation-Test Split
# ------------------------------
np.random.seed(42)
shuffled_indices = np.random.permutation(len(df_cleaned))
train_size = int(0.7 * len(df_cleaned))
val_size = int(0.15 * len(df_cleaned))
test_size = len(df_cleaned) - train_size - val_size

train_indices = shuffled_indices[:train_size]
val_indices = shuffled_indices[train_size:train_size + val_size]
test_indices = shuffled_indices[train_size + val_size:]

train_x, train_y = df_cleaned.iloc[train_indices].drop(columns=["mpg"]).values, df_cleaned.iloc[train_indices]["mpg"].values
val_x, val_y = df_cleaned.iloc[val_indices].drop(columns=["mpg"]).values, df_cleaned.iloc[val_indices]["mpg"].values
test_x, test_y = df_cleaned.iloc[test_indices].drop(columns=["mpg"]).values, df_cleaned.iloc[test_indices]["mpg"].values

train_y = train_y.reshape(-1, 1)
val_y = val_y.reshape(-1, 1)
test_y = test_y.reshape(-1, 1)


In [None]:
# Clone the GitHub repository (if required)
!git clone https://github.com/vimalthomas/deeplearning.git
%cd deeplearning/
# Run the specific notebook/script
%run "Multilayerperceptron_drop.ipynb"


In [None]:


# ------------------------------
# Model Training: Multi-Layer Perceptron (MLP)
# ------------------------------
mlp = MultilayerPerceptron([
    Layer(train_x.shape[1], 128, Relu(), dropout_rate=0.02),
    Layer(128, 64, Relu(), dropout_rate=0.02),
    Layer(64, 32, Relu(), dropout_rate=0.02),
    Layer(32, 1, Linear())
])

loss_function = SquaredError()
train_losses, val_losses = mlp.train(
    train_x=train_x, train_y=train_y,
    val_x=val_x, val_y=val_y,
    loss_func=loss_function,
    learning_rate=0.001, batch_size=32, epochs=1000, model_type='regression'
)


In [None]:
# ------------------------------
# Loss Visualization
# ------------------------------
plt.figure(figsize=(8, 5))
epochs = np.arange(1, 1001)
plt.plot(epochs, train_losses, label='Training Loss', marker='o', linestyle='-')
plt.plot(epochs, val_losses, label='Validation Loss', marker='s', linestyle='--')
plt.xlabel('Epochs')
plt.ylabel('Loss (MSE)')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid(True)
plt.show()

In [None]:




# ------------------------------
# Model Evaluation on Test Set
# ------------------------------
def evaluate_test_set(model, test_x, test_y):
    y_pred_test = model.forward(test_x)

    mse_test = np.mean((test_y - y_pred_test) ** 2)
    mae_test = np.mean(np.abs(test_y - y_pred_test))
    ss_total = np.sum((test_y - np.mean(test_y)) ** 2)
    ss_residual = np.sum((test_y - y_pred_test) ** 2)
    r2_test = 1 - (ss_residual / ss_total) if ss_total != 0 else 0

    print(f"Test Loss (MSE): {mse_test:.4f}")
    print(f"Test Loss (MAE): {mae_test:.4f}")
    print(f"Test R² Score: {r2_test:.4f}")

    sample_size = 10
    sample_indices = np.random.choice(len(test_x), size=sample_size, replace=False)
    sample_true = test_y[sample_indices].flatten()
    sample_pred = y_pred_test[sample_indices].flatten()

    results_df = pd.DataFrame({"True MPG": sample_true, "Predicted MPG": sample_pred})
    return mse_test, mae_test, r2_test, results_df

evaluate_test_set(mlp, test_x, test_y)
