## 2. Machine Learning for Regression


In [6]:
import pandas as pd
import numpy as np

## 2.2 Data preparation

In [7]:
# URL of the car fuel-efficiency CSV (raw file in the alexeygrigorev/datasets GitHub repo).
# The next cell interpolates this variable into the wget command as `$data`.
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'

In [8]:
!wget $data 

--2025-10-09 12:06:45--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.4’


2025-10-09 12:06:45 (4,72 MB/s) - ‘car_fuel_efficiency.csv.4’ saved [874188/874188]



In [9]:
# Load the downloaded CSV from the working directory into a DataFrame.
# NOTE(review): wget never overwrites an existing file (repeat downloads get a numeric
# suffix), so this un-suffixed path may refer to an earlier download.
df = pd.read_csv('car_fuel_efficiency.csv')

In [10]:
# Normalise column labels: lower-case them and replace spaces with underscores.
df.columns = [label.lower().replace(' ', '_') for label in df.columns]

In [None]:
# Missing-values overview: list every column that contains at least one null.
has_nulls = df.isnull().any()
missing_cols = df.columns[has_nulls]
print("Columns with missing values:", missing_cols.tolist())

# Columns used in the rest of this notebook:
#   'engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg'
# Of these, only 'horsepower' shows up in the printed list.

Columns with missing values: ['num_cylinders', 'horsepower', 'acceleration', 'num_doors']


In [None]:
# Median (50th percentile) of 'horsepower'.
# Values that cannot be parsed as numbers are coerced to NaN and skipped.
if 'horsepower' not in df.columns:
    print("Column 'horsepower' not found.")
else:
    hp_numeric = pd.to_numeric(df['horsepower'], errors='coerce')
    median_hp = float(hp_numeric.median(skipna=True))
    print('Median horsepower:', median_hp)
# Printed result for this dataset: 149.0

Median horsepower: 149.0


In [18]:
# Build the working dataset: keep the requested columns that exist, drop rows with any
# missing value in them, then shuffle deterministically (seed 42).
requested_cols = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]

available = set(df.columns)
existing_cols = [col for col in requested_cols if col in available]
missing_requested = [col for col in requested_cols if col not in available]

if missing_requested:
    print('Requested columns not found and will be skipped:', missing_requested)

if len(existing_cols) == 0:
    raise ValueError('None of the requested columns are present')

# Select -> drop incomplete rows -> shuffle -> renumber rows from 0.
filtered = (
    df[existing_cols]
    .dropna(subset=existing_cols)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

print('Filtered columns:', existing_cols)
print('Filtered rows:', len(filtered))


Filtered columns: ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
Filtered rows: 8996


In [19]:
# Positional 60% / 20% / 20% split of the already-shuffled `filtered` frame.
from math import floor

n = len(filtered)

n_train, n_val = floor(n * 0.6), floor(n * 0.2)
# Whatever the two floors leave over is assigned to the test split.
n_test = n - n_train - n_val

val_end = n_train + n_val
train = filtered.iloc[:n_train].reset_index(drop=True)
val = filtered.iloc[n_train:val_end].reset_index(drop=True)
test = filtered.iloc[val_end:].reset_index(drop=True)

split_sizes = {'n': n, 'train': len(train), 'val': len(val), 'test': len(test)}
print(split_sizes)


{'n': 8996, 'train': 5397, 'val': 1799, 'test': 1800}


In [22]:
# Imputation experiment for 'horsepower' (fill with 0 vs. with the training mean):
# this part resolves the target/feature columns and builds the seed-42 train/val split.
from typing import List
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Target column: the first candidate name that df actually contains.
target_candidates: List[str] = ['fuel_efficiency_mpg', 'combined_mpg', 'highway_mpg', 'city_mpg']
present_targets = [name for name in target_candidates if name in df.columns]
if not present_targets:
    raise ValueError('No target column found among: ' + ', '.join(target_candidates))
target_col = present_targets[0]

# Feature columns: the candidates present in df; 'horsepower' is mandatory.
feature_candidates: List[str] = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
features: List[str] = [name for name in feature_candidates if name in df.columns]
if 'horsepower' not in features:
    raise ValueError("'horsepower' column not found in features")

# Rows missing the target or any non-horsepower feature are dropped; rows missing only
# 'horsepower' are kept so they can be imputed later. Then shuffle with seed 42.
non_hp_features = [name for name in features if name != 'horsepower']
subset = (
    df[features + [target_col]]
    .dropna(subset=[target_col] + non_hp_features)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

# Positional 60/20/20 split; the last ~20% (the test part) is not materialised here.
N = len(subset)
n_train = int(N * 0.6)
n_val = int(N * 0.2)

train_df = subset.iloc[:n_train].reset_index(drop=True)
val_df = subset.iloc[n_train:n_train + n_val].reset_index(drop=True)

def train_eval_with_imputation(train_in: 'pd.DataFrame', val_in: 'pd.DataFrame') -> float:
    """Fit a LinearRegression on ``train_in`` and return its RMSE on ``val_in``.

    Both frames are used as given: no imputation happens here, so the caller
    must already have filled any missing values. The feature columns and the
    target column are read from the notebook-level ``features`` and
    ``target_col`` variables.

    Args:
        train_in: training rows containing the feature and target columns.
        val_in: validation rows with the same columns.

    Returns:
        Root-mean-squared error of the predictions on ``val_in``.
    """
    regressor = LinearRegression().fit(
        train_in[features].values,
        train_in[target_col].values,
    )
    predictions = regressor.predict(val_in[features].values)
    # Square root taken by hand rather than via the `squared` keyword, for compatibility.
    return mean_squared_error(val_in[target_col].values, predictions) ** 0.5

# Option A: replace missing horsepower with 0 in both splits.
train_zero, val_zero = train_df.copy(), val_df.copy()
for frame in (train_zero, val_zero):
    frame['horsepower'] = frame['horsepower'].fillna(0)
rmse_zero = train_eval_with_imputation(train_zero, val_zero)

# Option B: replace missing horsepower with the mean computed on the training split only,
# and apply that same value to the validation split.
train_mean_value = pd.to_numeric(train_df['horsepower'], errors='coerce').mean()
train_mean, val_mean = train_df.copy(), val_df.copy()
for frame in (train_mean, val_mean):
    frame['horsepower'] = pd.to_numeric(frame['horsepower'], errors='coerce').fillna(train_mean_value)
rmse_mean = train_eval_with_imputation(train_mean, val_mean)

rmse_summary = {'rmse_zero': round(rmse_zero, 2), 'rmse_mean': round(rmse_mean, 2)}
print(rmse_summary)
better_option = 'mean' if rmse_mean < rmse_zero else 'zero'
print('Better option:', better_option)


{'rmse_zero': 0.52, 'rmse_mean': 0.46}
Better option: mean


In [24]:
# Regularized linear regression (Ridge): fill NAs with 0, then tune the regularization
# strength r by RMSE on the validation split.
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

# This cell reuses train_df/val_df/features/target_col from the imputation-experiment cell.
# The asserts only check that the names exist: if train_df/val_df have been rebound since
# (e.g. by a cell run out of order), re-run the imputation-experiment cell first so the
# tuning is done on its seed-42 split.
assert 'train_df' in globals() and 'val_df' in globals(), 'Expected train_df and val_df from previous step'
assert 'features' in globals() and 'target_col' in globals(), 'Expected features and target_col from previous step'

# Work on copies; coerce every feature to numeric and fill NAs with 0 as required.
train_r = train_df.copy()
val_r = val_df.copy()
for c in features:
    train_r[c] = pd.to_numeric(train_r[c], errors='coerce').fillna(0)
    val_r[c] = pd.to_numeric(val_r[c], errors='coerce').fillna(0)

X_train = train_r[features].values
y_train = train_r[target_col].values
X_val = val_r[features].values
y_val = val_r[target_col].values

rs = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_by_r = {}
for r in rs:
    # r == 0 is plain ordinary least squares. scikit-learn advises against Ridge(alpha=0)
    # for numerical reasons and recommends LinearRegression for that case.
    model = LinearRegression() if r == 0 else Ridge(alpha=r)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # Square root taken by hand rather than via the `squared` keyword, for compatibility.
    rmse_by_r[r] = mean_squared_error(y_val, y_pred) ** 0.5

print('RMSE by r:', rmse_by_r)
# min() returns the first key with the lowest RMSE; rs is ascending, so an exact tie
# resolves to the smallest r.
best_r = min(rmse_by_r, key=rmse_by_r.get)
print('Best r:', best_r)


RMSE by r: {0: 0.5171866956560172, 0.01: 0.5171866956637374, 0.1: 0.517186695733219, 1: 0.5171866964280346, 5: 0.5171866995160863, 10: 0.5171867033761105, 100: 0.5171867728488829}
Best r: 0


In [28]:
# Seed sensitivity: repeat the shuffle/split with seeds 0..9, zero-impute, fit a
# LinearRegression, and report how much the validation RMSE varies (standard deviation).
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

assert 'features' in globals() and 'target_col' in globals(), 'Expected features and target_col from previous step'

# Same row filter as the earlier cells: drop rows missing the target or any
# non-horsepower feature; rows missing only 'horsepower' stay and get imputed below.
non_hp_features = [col for col in features if col != 'horsepower']
base_subset = df[features + [target_col]].dropna(subset=[target_col] + non_hp_features)

seeds = list(range(10))
rmse_scores = []

for seed in seeds:
    data_seed = base_subset.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    N = len(data_seed)
    n_train, n_val = int(N * 0.6), int(N * 0.2)

    train_s = data_seed.iloc[:n_train].reset_index(drop=True)
    val_s = data_seed.iloc[n_train:n_train + n_val].reset_index(drop=True)

    # Coerce each feature to numeric, then fill every resulting NaN with 0.
    for part in (train_s, val_s):
        for c in features:
            part[c] = pd.to_numeric(part[c], errors='coerce').fillna(0)

    X_train, y_train = train_s[features].values, train_s[target_col].values
    X_val, y_val = val_s[features].values, val_s[target_col].values

    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    rmse_scores.append(rmse)

std_rmse = float(np.std(rmse_scores))
print('RMSE per seed:', [round(score, 2) for score in rmse_scores])
print('STD of RMSE:', std_rmse)


RMSE per seed: [0.52, 0.52, 0.52, 0.52, 0.51, 0.53, 0.53, 0.51, 0.51, 0.51]
STD of RMSE: 0.007126319128873935


In [27]:
# Final eval: shuffle with seed=9, train on train+val with zero-imputed features,
# fit Ridge with r=0.001, and report RMSE on the held-out test split.
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

assert 'features' in globals() and 'target_col' in globals(), 'Expected features and target_col from previous step'

# Same row filter as the earlier cells: drop rows missing the target or any
# non-horsepower feature; rows missing only 'horsepower' stay and get imputed below.
# The seed-9 data gets its own names (final_subset / train_full / test_df) so this cell
# does not rebind `subset`, `train_df` or `val_df`, which the Ridge-tuning cell reads and
# expects to be the seed-42 split.
non_hp_features = [c for c in features if c != 'horsepower']
final_subset = (
    df[features + [target_col]]
    .dropna(subset=[target_col] + non_hp_features)
    .sample(frac=1.0, random_state=9)
    .reset_index(drop=True)
)

# Positional 60/20/20 split. Train and val are adjacent slices, so their union is simply
# the first n_train + n_val rows.
N = len(final_subset)
n_train = int(N * 0.6)
n_val = int(N * 0.2)

train_full = final_subset.iloc[:n_train + n_val].reset_index(drop=True)
test_df = final_subset.iloc[n_train + n_val:].reset_index(drop=True)

# Zero-impute features (coerce to numeric first).
for c in features:
    train_full[c] = pd.to_numeric(train_full[c], errors='coerce').fillna(0)
    test_df[c] = pd.to_numeric(test_df[c], errors='coerce').fillna(0)

X_train = train_full[features].values
y_train = train_full[target_col].values
X_test = test_df[features].values
y_test = test_df[target_col].values

model = Ridge(alpha=0.001)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Square root taken by hand rather than via the `squared` keyword, for compatibility.
rmse_test = (mean_squared_error(y_test, y_pred)) ** 0.5
print('Test RMSE (r=0.001, seed=9):', round(float(rmse_test), 2))


Test RMSE (r=0.001, seed=9): 0.52
