In [1]:
#### Step 1: Load and Prepare the Data

import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load diabetes dataset
diabetes = load_diabetes()
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = diabetes.target

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Sample of unscaled training data:")
print(X_train.head())

Sample of unscaled training data:
          age       sex       bmi        bp        s1        s2        s3  \
17   0.070769  0.050680  0.012117  0.056301  0.034206  0.049416 -0.039719   
66  -0.009147  0.050680 -0.018062 -0.033213 -0.020832  0.012152 -0.072854   
137  0.005383 -0.044642  0.049840  0.097615 -0.015328 -0.016345 -0.006584   
245 -0.027310 -0.044642 -0.035307 -0.029770 -0.056607 -0.058620  0.030232   
31  -0.023677 -0.044642 -0.065486 -0.081413 -0.038720 -0.053610  0.059685   

           s4        s5        s6  
17   0.034309  0.027364 -0.001078  
66   0.071210  0.000272  0.019633  
137 -0.002592  0.017036 -0.013504  
245 -0.039493 -0.049872 -0.129483  
31  -0.076395 -0.037129 -0.042499  


In [2]:
# Step 2: Exaggerate Scale Differences
""" To make the effect more pronounced, let’s artificially scale some features (e.g., multiply `bmi` by 1000 and `bp` by 100). """


# Exaggerate scales
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled['bmi'] = X_train['bmi'] * 1000
X_test_scaled['bmi'] = X_test['bmi'] * 1000
X_train_scaled['bp'] = X_train['bp'] * 100
X_test_scaled['bp'] = X_test['bp'] * 100

print("Sample of training data with exaggerated scales:")
print(X_train_scaled.head())

Sample of training data with exaggerated scales:
          age       sex        bmi        bp        s1        s2        s3  \
17   0.070769  0.050680  12.116851  5.630090  0.034206  0.049416 -0.039719   
66  -0.009147  0.050680 -18.061887 -3.321323 -0.020832  0.012152 -0.072854   
137  0.005383 -0.044642  49.840274  9.761511 -0.015328 -0.016345 -0.006584   
245 -0.027310 -0.044642 -35.306880 -2.977038 -0.056607 -0.058620  0.030232   
31  -0.023677 -0.044642 -65.485618 -8.141314 -0.038720 -0.053610  0.059685   

           s4        s5        s6  
17   0.034309  0.027364 -0.001078  
66   0.071210  0.000272  0.019633  
137 -0.002592  0.017036 -0.013504  
245 -0.039493 -0.049872 -0.129483  
31  -0.076395 -0.037129 -0.042499  


In [3]:
# Step 3: Train Model Without Scaling (Baseline)

# Train Linear Regression without scaling
lr_unscaled = LinearRegression()
lr_unscaled.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred_unscaled = lr_unscaled.predict(X_test_scaled)
mse_unscaled = mean_squared_error(y_test, y_pred_unscaled)
r2_unscaled = r2_score(y_test, y_pred_unscaled)

print("Performance without Scaling:")
print(f"Mean Squared Error: {mse_unscaled:.2f}")

#R² values between 0 and 1 indicate the degree of fit, with higher values suggesting a better fit.
print(f"R² Score: {r2_unscaled:.2f}")

Performance without Scaling:
Mean Squared Error: 2900.19
R² Score: 0.45


In [4]:
#### Step 4: Apply Scaling (StandardScaler)
""" Now, let’s apply **Standard Scaling** (zero mean, unit variance) and retrain the model. """


from sklearn.preprocessing import StandardScaler

# Apply StandardScaler
scaler = StandardScaler()
X_train_standard = scaler.fit_transform(X_train_scaled)
X_test_standard = scaler.transform(X_test_scaled)

# Convert back to DataFrame for readability (optional)
X_train_standard_df = pd.DataFrame(X_train_standard, columns=X_train.columns)
print("Sample of standardized training data:")
print(X_train_standard_df.head())

# Train Linear Regression with scaled data
lr_scaled = LinearRegression()
lr_scaled.fit(X_train_standard, y_train)

# Predict and evaluate
y_pred_scaled = lr_scaled.predict(X_test_standard)
mse_scaled = mean_squared_error(y_test, y_pred_scaled)
r2_scaled = r2_score(y_test, y_pred_scaled)

print("Performance with Standard Scaling:")
print(f"Mean Squared Error: {mse_scaled:.2f}")
print(f"R² Score: {r2_scaled:.2f}")

Sample of standardized training data:
        age       sex       bmi        bp        s1        s2        s3  \
0  1.498365  1.061370  0.219902  1.138874  0.728473  1.055893 -0.824451   
1 -0.228858  1.061370 -0.419366 -0.710591 -0.424929  0.272425 -1.529791   
2  0.085182 -0.942179  1.018987  1.992473 -0.309589 -0.326699 -0.119111   
3 -0.621409 -0.942179 -0.784662 -0.639458 -1.174640 -1.215508  0.664600   
4 -0.542899 -0.942179 -1.423930 -1.706457 -0.799784 -1.110167  1.291569   

         s4        s5        s6  
0  0.711038  0.547482 -0.061449  
1  1.484286 -0.019757  0.367236  
2 -0.062210  0.331237 -0.318660  
3 -0.835458 -1.069682 -2.719299  
4 -1.608706 -0.802859 -0.918820  
Performance with Standard Scaling:
Mean Squared Error: 2900.19
R² Score: 0.45


In [7]:
# Step 5: Apply Normalization (MinMaxScaler)
# Let’s also try **Min-Max Scaling** (to [0, 1]) for comparison.


from sklearn.preprocessing import MinMaxScaler

# Apply MinMaxScaler
minmax_scaler = MinMaxScaler()
X_train_minmax = minmax_scaler.fit_transform(X_train_scaled)
X_test_minmax = minmax_scaler.transform(X_test_scaled)

print(X_test_minmax)

# Train Linear Regression with Min-Max scaled data
lr_minmax = LinearRegression()
lr_minmax.fit(X_train_minmax, y_train)

# Predict and evaluate
y_pred_minmax = lr_minmax.predict(X_test_minmax)
mse_minmax = mean_squared_error(y_test, y_pred_minmax)
r2_minmax = r2_score(y_test, y_pred_minmax)

print("Performance with Min-Max Scaling:")
print(f"Mean Squared Error: {mse_minmax:.2f}")
print(f"R² Score: {r2_minmax:.2f}")

[[ 0.7         0.          0.33189655  0.3943662   0.89005236  0.76593625
   0.42857143  0.42313117  0.61044614  0.48484848]
 [ 0.91666667  0.          0.50431034  0.54929577  0.31937173  0.31474104
   0.36363636  0.14104372  0.3988557   0.42424242]
 [ 0.78333333  1.          0.34051724  0.4084507   0.80628272  0.52290837
   0.55844156  0.28208745  0.81090947  0.43939394]
 [ 0.93333333  0.          0.56465517  0.78408451  0.62303665  0.48406375
   0.09090909  0.83215797  0.86542174  0.72727273]
 [ 0.55        1.          0.27586207  0.45070423  0.56020942  0.53685259
   0.33766234  0.42313117  0.46575871  0.46969697]
 [ 0.53333333  0.          0.25862069  0.35211268  0.79057592  0.66733068
   0.61038961  0.28208745  0.40317315  0.53030303]
 [ 0.45        1.          1.0387931   0.52112676  0.52879581  0.4750996
   0.28571429  0.42313117  0.61514971  0.62121212]
 [ 0.38333333  1.          0.5387931   0.54929577  0.83246073  0.65039841
   0.36363636  0.42313117  0.77124504  0.72727273]
 