In [151]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

In [152]:
df = pd.read_csv("car_fuel_efficiency.csv")

## **Q1**
#### There's one column with missing values. What is it?

- 'engine_displacement'
- 'horsepower'
- 'vehicle_weight'
- 'model_year'

In [153]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9202 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB


answer: horsepower

## **Q2**
#### What's the median (50% percentile) for variable 'horsepower'?

In [154]:
df["horsepower"].median()

149.0

## **Prepare and split the dataset**

#### - Shuffle the dataset (the filtered one you created above), use seed 42.
#### - Split your data in train/val/test sets, with 60%/20%/20% distribution.

#### Use the same code as in the lectures

In [155]:
df_shuffled = df.sample(frac=1, random_state=42)

In [156]:
n = len(df_shuffled)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
n_test = n - n_train - n_val

df_train = df_shuffled.iloc[:n_train]
df_val = df_shuffled.iloc[n_train:n_train+n_val]
df_test = df_shuffled.iloc[n_train+n_val:]

In [157]:
y_train = df_train['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values

X_train = df_train.drop('fuel_efficiency_mpg', axis=1)
X_val = df_val.drop('fuel_efficiency_mpg', axis=1)

## **Q3**
#### - We need to deal with missing values for the column from Q1.
#### - We have two options: fill it with 0 or with the mean of this variable.
#### - Try both options. For each, train a linear regression model without regularization using the code from the lessons.
#### - For computing the mean, use the training only!
#### - Use the validation dataset to evaluate the models and compare the RMSE of each option.
#### - Round the RMSE scores to 2 decimal digits using round(score, 2)

#### Which option gives better RMSE?

In [158]:
X_train_zero = X_train.copy()
X_val_zero = X_val.copy()

X_train_mean = X_train.copy()
X_val_mean = X_val.copy()

X_train_zero['horsepower'] = X_train_zero['horsepower'].fillna(0)
X_val_zero['horsepower'] = X_val_zero['horsepower'].fillna(0)

mean_hp = X_train['horsepower'].mean()
X_train_mean['horsepower'] = X_train_mean['horsepower'].fillna(mean_hp)
X_val_mean['horsepower'] = X_val_mean['horsepower'].fillna(mean_hp)

X_train_zero = X_train_zero.fillna(X_train_zero.mean(numeric_only=True))
X_val_zero = X_val_zero.fillna(X_train_zero.mean(numeric_only=True))
X_train_mean = X_train_mean.fillna(X_train_mean.mean(numeric_only=True))
X_val_mean = X_val_mean.fillna(X_train_mean.mean(numeric_only=True))

In [159]:
model_zero = LinearRegression()
model_zero.fit(X_train_zero.select_dtypes(include=[np.number]), y_train)
y_pred_zero = model_zero.predict(X_val_zero.select_dtypes(include=[np.number]))
rmse_zero = mean_squared_error(y_val, y_pred_zero, squared=False)

model_mean = LinearRegression()
model_mean.fit(X_train_mean.select_dtypes(include=[np.number]), y_train)
y_pred_mean = model_mean.predict(X_val_mean.select_dtypes(include=[np.number]))
rmse_mean = mean_squared_error(y_val, y_pred_mean, squared=False)

print("RMSE (fill with 0):", round(rmse_zero, 2))
print("RMSE (fill with mean):", round(rmse_mean, 2))

if round(rmse_zero, 2) < round(rmse_mean, 2):
    print("Better option: Fill with 0")
elif round(rmse_mean, 2) < round(rmse_zero, 2):
    print("Better option: Fill with mean")
else:
    print("Both are equally good")

RMSE (fill with 0): 0.46
RMSE (fill with mean): 0.39
Better option: Fill with mean




## **Q4**
#### 1. Now let's train a regularized linear regression.
#### 2. For this question, fill the NAs with 0.
#### 3. Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].
#### 4. Use RMSE to evaluate the model on the validation dataset.
#### 5. Round the RMSE scores to 2 decimal digits.
#### 6. Which r gives the best RMSE?

#### If multiple options give the same best RMSE, select the smallest r.

In [160]:
X_train_0 = X_train.fillna(0)
X_val_0 = X_val.fillna(0)

X_train_0 = pd.get_dummies(X_train_0, drop_first=True)
X_val_0 = pd.get_dummies(X_val_0, drop_first=True)

X_val_0 = X_val_0.reindex(columns=X_train_0.columns, fill_value=0)

In [161]:
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

In [162]:
results = {}

for r in r_values:
    model = Ridge(alpha=r)
    model.fit(X_train_0, y_train)
    y_pred = model.predict(X_val_0)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    results[r] = round(rmse, 2)

results

{0: 0.51, 0.01: 0.51, 0.1: 0.51, 1: 0.51, 5: 0.51, 10: 0.51, 100: 0.51}

In [163]:
best_r = min(results, key=results.get)
print("Best r:", best_r, "→ RMSE:", results[best_r])

Best r: 0 → RMSE: 0.51


## **Q5**

#### 1. We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
#### 2. Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
#### 3. For each seed, do the train/validation/test split with 60%/20%/20% distribution.
#### 4. Fill the missing values with 0 and train a model without regularization.
#### 5. For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
#### 6. What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
#### 7. Round the result to 3 decimal digits (round(std, 3))

In [164]:
rmse_scores = []

seeds = [0,1,2,3,4,5,6,7,8,9]

for seed in seeds:
    df_shuffled = df.sample(frac=1, random_state=seed)
    
    n = len(df_shuffled)
    n_train = int(0.6 * n)
    n_val = int(0.2 * n)
    
    df_train = df_shuffled.iloc[:n_train]
    df_val = df_shuffled.iloc[n_train:n_train+n_val]
    df_test = df_shuffled.iloc[n_train+n_val:]

    df_train_0 = df_train.fillna(0)
    df_val_0 = df_val.fillna(0)

    X_train = df_train_0.drop(columns=['fuel_efficiency_mpg'])
    y_train = df_train_0['fuel_efficiency_mpg']
    X_val = df_val_0.drop(columns=['fuel_efficiency_mpg'])
    y_val = df_val_0['fuel_efficiency_mpg']

    X_train = pd.get_dummies(X_train, drop_first=True)
    X_val = pd.get_dummies(X_val, drop_first=True)
    X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

std = np.std(rmse_scores)
print("RMSE per seed:", rmse_scores)
print("Standard deviation:", round(std, 3))

RMSE per seed: [0.5093576292583686, 0.5109172467498379, 0.5114451480461776, 0.506579263729307, 0.4982068079989254, 0.5156193854317062, 0.5195245878381457, 0.49626277833285615, 0.5019844968110262, 0.49958637214952595]
Standard deviation: 0.007


## **Q6**

#### 1. Split the dataset like previously, use seed 9.

#### 2. Combine train and validation datasets.

#### 3. Fill the missing values with 0 and train a model with r=0.001.

#### 4. What's the RMSE on the test dataset?

In [165]:
seed = 9
df_shuffled = df.sample(frac=1, random_state=seed)

n = len(df_shuffled)
n_train = int(0.6 * n)
n_val = int(0.2 * n)

df_train = df_shuffled.iloc[:n_train].copy()
df_val   = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test  = df_shuffled.iloc[n_train + n_val:].copy()

df_trainval = pd.concat([df_train, df_val], ignore_index=True)

df_trainval = df_trainval.fillna(0)
df_test = df_test.fillna(0)

y_trainval = df_trainval['fuel_efficiency_mpg'].values
X_trainval = df_trainval.drop(columns=['fuel_efficiency_mpg'])

y_test = df_test['fuel_efficiency_mpg'].values
X_test = df_test.drop(columns=['fuel_efficiency_mpg'])

X_trainval = pd.get_dummies(X_trainval, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

X_test = X_test.reindex(columns=X_trainval.columns, fill_value=0)

model = Ridge(alpha=0.001)
model.fit(X_trainval, y_trainval)

y_pred_test = model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("RMSE on test:", round(rmse_test, 3))

RMSE on test: 0.502
