In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

In [2]:
# Source CSV, read straight from GitHub on every run.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"

df = pd.read_csv(DATA_URL)
df

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [3]:
# Keep the four numeric predictors plus the target; every later cell works
# on this narrower frame.
columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
df_filtered = df.loc[:, columns]

In [4]:
## EDA: does fuel_efficiency_mpg have a long tail? No, it does not.
target_column = 'fuel_efficiency_mpg'
summary = df_filtered[target_column].describe()
print(summary)
# Sample skewness of the target.
print(f"Result: {df_filtered['fuel_efficiency_mpg'].skew():.2f}")

# Optional visual check, left disabled:
# plt.hist(df['fuel_efficiency_mpg'], bins=50)
# plt.xlabel('Fuel Efficiency MPG')
# plt.ylabel('Frequency')
# plt.title('Distribution of Fuel Efficiency')
# plt.show()

count    9704.000000
mean       14.985243
std         2.556468
min         6.200971
25%        13.267459
50%        15.006037
75%        16.707965
max        25.967222
Name: fuel_efficiency_mpg, dtype: float64
Result: -0.01


### Question 1 - Missing values

In [5]:
# Count of NaN entries per selected column.
missing_values = df_filtered.isna().sum()
missing_values

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

### Question 2 - Median (50%) for "horsepower"

In [6]:
# Median (50th percentile) of the horsepower column.
median_horsepower = df_filtered.horsepower.median()
median_horsepower

np.float64(149.0)

In [12]:
## Data Preparation

def prepare_X(df, fill_value):
    """Build the design matrix used by the linear models.

    Missing ``horsepower`` entries are replaced with ``fill_value``, the
    ``fuel_efficiency_mpg`` target column is removed, and a column of ones
    (the bias term) is placed in front of the remaining feature columns.
    The frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``horsepower`` and ``fuel_efficiency_mpg`` columns.
    fill_value : scalar
        Value substituted for missing ``horsepower`` entries.

    Returns
    -------
    numpy.ndarray
        Array with ``len(df)`` rows: the ones column followed by the features.
    """
    filled = df.assign(horsepower=df.horsepower.fillna(fill_value))
    features = filled.drop(columns='fuel_efficiency_mpg').to_numpy()
    bias = np.ones(features.shape[0])
    return np.column_stack([bias, features])

def train_linear_regression(X, y):
    """Fit ordinary least squares with the normal equation.

    ``X`` is used exactly as given: no bias column is added here, so callers
    pass a matrix that already contains one (as ``prepare_X`` produces).

    Parameters
    ----------
    X : numpy.ndarray
        Two-dimensional design matrix.
    y : numpy.ndarray
        Target values, one per row of ``X``.

    Returns
    -------
    numpy.ndarray
        Weight vector ``(X^T X)^{-1} X^T y``, one entry per column of ``X``.
    """
    gram = np.dot(X.T, X)
    gram_inv = np.linalg.inv(gram)
    return np.dot(np.dot(gram_inv, X.T), y)

def train_linear_regression_reg(X, y, r=0.0):
    """Fit ridge-regularised least squares with the normal equation.

    ``r`` is added to every diagonal entry of ``X^T X`` before inversion, so
    each coefficient is penalised, including the one that belongs to a bias
    column already present in ``X``. No bias column is added here.

    Parameters
    ----------
    X : numpy.ndarray
        Two-dimensional design matrix.
    y : numpy.ndarray
        Target values, one per row of ``X``.
    r : float, default 0.0
        Regularisation strength; ``0`` reduces to plain least squares.

    Returns
    -------
    numpy.ndarray
        Weight vector ``(X^T X + r I)^{-1} X^T y``.
    """
    gram = np.dot(X.T, X)
    penalty = r * np.eye(gram.shape[0])
    gram_inv = np.linalg.inv(gram + penalty)
    return np.dot(np.dot(gram_inv, X.T), y)

# RMSE
def rmse(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``."""
    squared_error = np.square(y - y_pred)
    return np.sqrt(squared_error.mean())

def get_rmse_for_fill(df_train, df_val, fill_value):
    """Validation RMSE of the plain linear model for one horsepower fill value.

    Both frames go through ``prepare_X`` with the same ``fill_value``; the
    model is fitted on ``df_train`` and scored on ``df_val``.

    Relies on ``train_linear_regression`` returning a single weight vector
    whose entries line up with the columns produced by ``prepare_X``.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Frames containing the feature columns and ``fuel_efficiency_mpg``.
    fill_value : scalar
        Replacement for missing ``horsepower`` values.

    Returns
    -------
    float
        RMSE of the predictions on ``df_val``.
    """
    target = 'fuel_efficiency_mpg'

    weights = train_linear_regression(
        prepare_X(df_train, fill_value),
        df_train[target].values,
    )
    predictions = prepare_X(df_val, fill_value).dot(weights)

    return rmse(df_val[target].values, predictions)

### Question 3 - Filling NAs

In [8]:
# Reproducible shuffle of the row positions (seed 42).
n = len(df_filtered)
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

df_shuffled = df_filtered.iloc[idx].reset_index(drop=True)

# 60% / 20% / 20% split; the test part takes whatever rows remain.
n_train = int(n * 0.6)
n_val = int(n * 0.2)
val_end = n_train + n_val

df_train = df_shuffled.iloc[:n_train]
df_val = df_shuffled.iloc[n_train:val_end]
df_test = df_shuffled.iloc[val_end:]

print(f'Training: {len(df_train)}, Validation: {len(df_val)}, Testing: {len(df_test)}\n')

# The imputation mean is taken from the training part only.
mean_train = df_train.horsepower.mean()

score_zero = get_rmse_for_fill(df_train, df_val, fill_value=0)
score_mean = get_rmse_for_fill(df_train, df_val, fill_value=mean_train)

print(f'With 0: {score_zero:.2f}')
print(f'With mean: {score_mean:.2f}')

# Report the fill strategy with the lower validation RMSE.
verdict = (
    '\nAnswer: With 0' if score_zero < score_mean
    else '\nAnswer: With mean' if score_mean < score_zero
    else '\nAnswer: Both are equally good'
)
print(verdict)

Training: 5822, Validation: 1940, Testing: 1942

With 0: 0.52
With mean: 0.46

Answer: With mean


### Question 4 - Best regularization

In [13]:
def train_linear_regression_reg(X, y, r=0.0):
    """Closed-form ridge regression: ``w = (X^T X + r I)^{-1} X^T y``.

    ``X`` is taken as given (no bias column is added), and ``r`` is applied
    to every diagonal entry, so all coefficients are penalised.

    Parameters
    ----------
    X : numpy.ndarray
        Two-dimensional design matrix.
    y : numpy.ndarray
        Target values, one per row of ``X``.
    r : float, default 0.0
        Regularisation strength.

    Returns
    -------
    numpy.ndarray
        Weight vector with one entry per column of ``X``.
    """
    XT = X.T
    gram = XT.dot(X)
    regularized = gram + r * np.eye(gram.shape[0])
    return np.linalg.inv(regularized).dot(XT).dot(y)

# Design matrices with missing horsepower replaced by 0.
# NOTE: this reads whichever df_train / df_val are currently bound at module
# level, so run it directly after the Question 3 split.
X_train, X_val = (prepare_X(part, 0) for part in (df_train, df_val))

y_train, y_val = (
    part.fuel_efficiency_mpg.values for part in (df_train, df_val)
)

# Candidate regularisation strengths, smallest first.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

for r in r_values:
    w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = X_val.dot(w)
    score = rmse(y_val, y_pred)
    print(f'r={r:6.2f}, rmse={score:.2f}')

r=  0.00, rmse=0.51
r=  0.01, rmse=0.51
r=  0.10, rmse=0.52
r=  1.00, rmse=0.52
r=  5.00, rmse=0.52
r= 10.00, rmse=0.52
r=100.00, rmse=0.52


### Question 5 - RMSE Standard Deviation

In [10]:
def rmse_for_seed(seed, fill_value=0):
    """Validation RMSE of the plain linear model for one shuffle seed.

    The rows of ``df_filtered`` are shuffled with ``seed``, split 60/20/20,
    and the first two parts are handed to ``get_rmse_for_fill``. Every split
    variable is local to this function, so the module-level ``df_train`` /
    ``df_val`` used by the earlier questions are not overwritten, and
    ``train_linear_regression`` is not redefined with a different return type.

    Parameters
    ----------
    seed : int
        Seed passed to ``np.random.seed`` before shuffling.
    fill_value : scalar, default 0
        Replacement for missing ``horsepower`` values.

    Returns
    -------
    float
        RMSE on the validation part of the split.
    """
    n = len(df_filtered)
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)
    shuffled = df_filtered.iloc[idx].reset_index(drop=True)

    n_train = int(n * 0.6)
    n_val = int(n * 0.2)
    train_part = shuffled.iloc[:n_train]
    val_part = shuffled.iloc[n_train:n_train + n_val]

    return get_rmse_for_fill(train_part, val_part, fill_value)


# Validation RMSE for each of the ten seeds, then their spread.
scores = []

for s in range(10):
    score = rmse_for_seed(s)
    scores.append(score)

    print(f'seed {s}: {score:.4f}')

print(f'std = {np.std(scores):.3f}')

seed 0: 0.5211
seed 1: 0.5218
seed 2: 0.5230
seed 3: 0.5161
seed 4: 0.5112
seed 5: 0.5287
seed 6: 0.5322
seed 7: 0.5095
seed 8: 0.5149
seed 9: 0.5131
std = 0.007


### Question 6 - Evaluation on test 

In [11]:
def evaluate_on_test(seed=9, r=0.001, fill_value=0):
    """Train on train+validation and return the RMSE on the test part.

    The rows of ``df_filtered`` are shuffled with ``seed`` and split 60/20/20.
    The first two parts together form the training set; the remainder is the
    test set. The helpers defined earlier in the notebook (``prepare_X``,
    ``train_linear_regression_reg``, ``rmse``) are reused instead of being
    redefined here, so the model evaluated is the same formulation that was
    tuned in Question 4: the design matrix carries the bias column and ``r``
    is applied to every coefficient.

    All split variables are local, so the module-level ``df_train`` /
    ``df_val`` / ``df_test`` from the earlier questions are left untouched.

    Parameters
    ----------
    seed : int, default 9
        Seed passed to ``np.random.seed`` before shuffling.
    r : float, default 0.001
        Regularisation strength.
    fill_value : scalar, default 0
        Replacement for missing ``horsepower`` values.

    Returns
    -------
    float
        RMSE of the predictions on the test part.
    """
    n = len(df_filtered)
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)
    shuffled = df_filtered.iloc[idx].reset_index(drop=True)

    # Train and validation are contiguous in the shuffled frame, so their
    # concatenation is simply the first n_train + n_val rows.
    n_full_train = int(n * 0.6) + int(n * 0.2)
    full_train = shuffled.iloc[:n_full_train]
    test_part = shuffled.iloc[n_full_train:]

    X_full_train = prepare_X(full_train, fill_value)
    y_full_train = full_train.fuel_efficiency_mpg.values

    X_test = prepare_X(test_part, fill_value)
    y_test = test_part.fuel_efficiency_mpg.values

    w = train_linear_regression_reg(X_full_train, y_full_train, r=r)
    return rmse(y_test, X_test.dot(w))


# Shuffle with seed 9, train with r=0.001, predict on test.
score = evaluate_on_test(seed=9, r=0.001)
print(f'rmse = {score:.2f}')

rmse = 0.52
