In [1]:
import math

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the car fuel-efficiency dataset; the relative path is resolved against
# the directory the kernel is running in.
raw_data = pd.read_csv("car_fuel_efficiency.csv")

In [3]:
# Preview the first rows to check the available columns and spot missing values.
raw_data.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [None]:
#QUESTION 1

In [4]:
# Number of missing values in each column.
raw_data.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [None]:
#QUESTION 2

In [5]:
# Median horsepower; pandas skips the missing entries by default.
raw_data["horsepower"].median()

np.float64(149.0)

In [None]:
#QUESTION 3

In [36]:
# Features and target shared by the modelling cells below.
columns = ["engine_displacement", "horsepower", "vehicle_weight", "model_year"]
target = "fuel_efficiency_mpg"
X = raw_data[columns]
y = raw_data[target]

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (0.25 * 0.8 = 0.2 of the full data) as the validation set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# The first call already shuffled the rows with a fixed seed, so the validation
# slice is cut without reshuffling and stays deterministic.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=False)


In [38]:
# Case 1: fill missing values with zero

X_train_1 = X_train.fillna(0)
X_val_1 = X_val.fillna(0)

# Unregularized linear regression, scored by RMSE on the validation split.
LR = LinearRegression()
LR.fit(X_train_1, y_train)
y_pred = LR.predict(X_val_1)
rmse = round(mean_squared_error(y_val, y_pred) ** 0.5, 2)
print(rmse)

0.52


In [39]:
# Case 2: fill missing values with the column means

# The means are computed on the training split only and reused for validation,
# so no validation statistics leak into the imputation.
mean = X_train.mean()
X_train_2 = X_train.fillna(mean)
X_val_2 = X_val.fillna(mean)

# Same unregularized model and metric as the zero-fill case, for a like-for-like
# comparison of the two imputation strategies.
LR = LinearRegression()
LR.fit(X_train_2, y_train)
y_pred = LR.predict(X_val_2)
rmse = round(mean_squared_error(y_val, y_pred) ** 0.5, 2)
print(rmse)

0.46


In [None]:
#QUESTION 4

In [28]:
# Sweep the ridge penalty on the zero-filled training/validation features.
# NOTE(review): the RMSE is rounded to two decimals before printing, and the
# recorded run shows the same value for every alpha; print more digits if the
# candidates need to be told apart.
r = [0, 0.01, 1, 10, 100]
for r_ in r:
    LR = Ridge(alpha=r_)
    LR.fit(X_train_1, y_train)
    y_pred = LR.predict(X_val_1)
    rmse = round(mean_squared_error(y_val, y_pred) ** 0.5, 2)
    print(f"rmse is {rmse} for alpha = {r_}")

rmse is 0.52 for alpha = 0
rmse is 0.52 for alpha = 0.01
rmse is 0.52 for alpha = 1
rmse is 0.52 for alpha = 10
rmse is 0.52 for alpha = 100


In [None]:
#QUESTION 5

In [33]:
def _validation_rmse_for_seed(seed):
    """Return the unrounded validation RMSE of a zero-filled linear model.

    The data is split 60/20/20 using ``seed`` for the shuffle, missing values
    are replaced with 0, and an unregularized LinearRegression is fitted on the
    training part. Only local names are used, so the notebook-wide
    X_train / X_val / y_train / y_val from the earlier split are not overwritten.
    """
    X_full_train, _, y_full_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    X_tr, X_va, y_tr, y_va = train_test_split(
        X_full_train, y_full_train, test_size=0.25, shuffle=False
    )
    model = LinearRegression()
    model.fit(X_tr.fillna(0), y_tr)
    return mean_squared_error(y_va, model.predict(X_va.fillna(0))) ** 0.5


seed_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Keep full precision here: rounding each score before taking the standard
# deviation would quantize the values and distort the spread.
all_rmses = np.array([_validation_rmse_for_seed(seed) for seed in seed_values])

for seed, score in zip(seed_values, all_rmses):
    # Rounded for display only.
    print(f"rmse is {round(score, 2)} for seed = {seed}")

print(f"The standart deviation of the all scores is {round(np.std(all_rmses), 3)}")

rmse is 0.52 for seed = 0
rmse is 0.53 for seed = 1
rmse is 0.51 for seed = 2
rmse is 0.52 for seed = 3
rmse is 0.53 for seed = 4
rmse is 0.51 for seed = 5
rmse is 0.53 for seed = 6
rmse is 0.51 for seed = 7
rmse is 0.51 for seed = 8
rmse is 0.52 for seed = 9
The standart deviation of the all scores is 0.008


In [None]:
#QUESTION 6

In [35]:
# Final evaluation with seed 9: the 80% portion is not split further here, so
# the model is fitted on everything except the held-out 20% test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=9
)
X_train, X_test = X_train.fillna(0), X_test.fillna(0)

# Lightly regularized ridge model, scored by RMSE on the test set.
ridge = Ridge(alpha=0.001).fit(X_train, y_train)
y_pred = ridge.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
rmse = round(test_mse ** 0.5, 2)
print(rmse)

0.52
