In [65]:
import pandas as pd

In [66]:
# Read the car fuel-efficiency CSV straight from its raw GitHub URL.
df = pd.read_csv(
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
)
# Preview the first five rows (the default for head()).
df.head(5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [38]:
# Restrict the frame to the four model features plus the target column.
interested_columns = [
    'engine_displacement', 'horsepower', 'vehicle_weight', 'model_year',
    'fuel_efficiency_mpg',
]
df = df.loc[:, interested_columns]
# Count of missing values per retained column.
df.isna().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [39]:
# Median horsepower over the whole dataset (pandas skips NaN by default).
df['horsepower'].median()

np.float64(149.0)

In [107]:
import numpy as np

# Feature columns fed to the model; prepare_x() selects exactly these.
base = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
]

def create_validation_frame(input_data_set, seed=42):
    """Shuffle the rows of a DataFrame and split them into train/val/test.

    Validation and test each receive ``int(n * 0.2)`` rows; the training
    part receives whatever remains (roughly 60%).

    Parameters
    ----------
    input_data_set : pandas.DataFrame
        Frame to split. It is not modified.
    seed : int, default 42
        Seed for the row permutation.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``, keeping the original index labels.
    """
    n = len(input_data_set)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test

    # A private RandomState seeded with `seed` yields the same permutation as
    # np.random.seed(seed) followed by np.random.shuffle, but leaves the
    # process-wide generator untouched for any other code that relies on it.
    idx = np.arange(n)
    np.random.RandomState(seed).shuffle(idx)

    val_end = n_train + n_val
    df_train = input_data_set.iloc[idx[:n_train]]
    df_val = input_data_set.iloc[idx[n_train:val_end]]
    df_test = input_data_set.iloc[idx[val_end:]]
    return df_train, df_val, df_test
    
def train_linear_regression(X, Y):
    """Fit a linear model with an intercept using the normal equation.

    A column of ones is prepended to ``X`` and the weights are computed as
    ``inv(X^T X) X^T Y``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per sample.
    Y : numpy.ndarray
        Target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(w0, w)``: the intercept and the array of feature weights.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])
    gram = design.T.dot(design)
    gram_inv = np.linalg.inv(gram)
    coefs = gram_inv.dot(design.T).dot(Y)
    bias, weights = coefs[0], coefs[1:]
    return bias, weights

def rmse(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``.

    Both arguments are expected to be array-likes supporting element-wise
    subtraction and a ``.mean()`` method (e.g. numpy arrays).
    """
    squared_residuals = (y - y_pred) ** 2
    return np.sqrt(squared_residuals.mean())

def prepare_x(df, fillna_with=0):
    """Build the feature matrix for the columns listed in ``base``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``base`` columns.
    fillna_with : scalar, default 0
        Value substituted for every missing entry in the selected columns.

    Returns
    -------
    numpy.ndarray
        The selected, NaN-filled columns as a 2-D array.
    """
    return df[base].fillna(fillna_with).values

In [125]:
# Shuffle and split with the default seed, then build log1p-transformed targets.
df_train, df_val, df_test = create_validation_frame(df)

y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)

# Imputation value for missing horsepower (the only column with NaNs).
# It is taken from the training rows only, so nothing leaks from val/test,
# and from the feature itself rather than from the target column.
mean_value = df_train.horsepower.mean()

# Drop the target from the feature frames; drop() returns new frames instead
# of mutating the slices handed back by the split.
df_train = df_train.drop(columns='fuel_efficiency_mpg')
df_val = df_val.drop(columns='fuel_efficiency_mpg')
df_test = df_test.drop(columns='fuel_efficiency_mpg')

In [126]:
# Baseline: missing values replaced with 0, plain (unregularized) regression.
X_train = prepare_x(df_train, fillna_with=0)
w0, w = train_linear_regression(X_train, y_train)

# Score on the validation split, prepared the same way as the training data.
X_val = prepare_x(df_val, fillna_with=0)
y_pred = X_val.dot(w) + w0
score = rmse(y_val, y_pred)
round(score, 2)

np.float64(0.04)

In [127]:
# Variant: impute missing horsepower with the training-set horsepower mean.
# The mean is computed here from the feature itself (not from the target) and
# from training rows only, so the cell does not depend on an earlier variable.
horsepower_mean = df_train['horsepower'].mean()

X_train = prepare_x(df_train, horsepower_mean)
w0, w = train_linear_regression(X_train, y_train)

# Validation rows must be imputed with the same value as the training rows;
# reusing a zero-filled X_val would evaluate the model on differently
# preprocessed data.
X_val = prepare_x(df_val, horsepower_mean)
y_pred = w0 + X_val.dot(w)
score = rmse(y_val, y_pred)
round(score, 2)

np.float64(0.04)

In [128]:
def train_linear_regression_reg(X, Y, r=0.001):
    """Fit a linear model with an intercept, adding ``r`` to the Gram diagonal.

    A column of ones is prepended to ``X``; the weights are
    ``inv(X^T X + r * I) X^T Y``. The intercept's diagonal entry is
    regularized too, because ``r`` is added to every diagonal element.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per sample.
    Y : numpy.ndarray
        Target values, one per row of ``X``.
    r : float, default 0.001
        Regularization strength added to the diagonal.

    Returns
    -------
    tuple
        ``(w0, w)``: the intercept and the array of feature weights.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])
    gram = design.T.dot(design)
    regularized = gram + r * np.identity(gram.shape[0])
    coefs = np.linalg.inv(regularized).dot(design.T).dot(Y)
    bias, weights = coefs[0], coefs[1:]
    return bias, weights

In [70]:
# The feature matrices do not depend on r, so build them once instead of on
# every iteration of the sweep.
X_train = prepare_x(df_train)
X_val = prepare_x(df_val)

# Try each regularization strength and report the intercept and validation RMSE.
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w0, w = train_linear_regression_reg(X_train, y_train, r)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    print(r, w0, round(score, 5))

0 3.593421327008845 0.03861
0.01 3.1036742495036447 0.03863
0.1 1.3939019563361459 0.03924
1 0.21415509485652717 0.04012
5 0.04497602525332003 0.04028
10 0.022630081669347605 0.0403
100 0.002276727731185574 0.04032


In [129]:
# Measure how much the validation RMSE moves when only the split seed changes.
scores = []
for seed in range(10):
    df_train, df_val, df_test = create_validation_frame(df, seed)

    # pop() returns the target column and removes it from the frame in one step.
    y_train = np.log1p(df_train.pop('fuel_efficiency_mpg').values)
    y_val = np.log1p(df_val.pop('fuel_efficiency_mpg').values)
    y_test = np.log1p(df_test.pop('fuel_efficiency_mpg').values)

    # Zero-fill missing values, fit without regularization, score on validation.
    X_train = prepare_x(df_train, 0)
    X_val = prepare_x(df_val, 0)
    w0, w = train_linear_regression(X_train, y_train)
    y_pred = X_val.dot(w) + w0
    score = rmse(y_val, y_pred)
    scores.append(score)

# Spread of the per-seed scores.
std = np.std(scores)
round(std, 3)

np.float64(0.001)

In [130]:
# Final evaluation: split with seed 9, fit on train+val, score on the test split.
df_train, df_val, df_test = create_validation_frame(df, 9)

# NOTE(review): targets are raw mpg here, while the earlier cells train on
# np.log1p(mpg) - confirm this is intended; the RMSE values are not comparable.
# pop() returns the target column and removes it from the frame in one step.
y_train = df_train.pop('fuel_efficiency_mpg').values
y_val = df_val.pop('fuel_efficiency_mpg').values
y_test = df_test.pop('fuel_efficiency_mpg').values

# Merge the train and validation parts, keeping features and targets aligned.
df_full = pd.concat([df_train, df_val])
Y_train_full = np.concatenate((y_train, y_val))
X_train_full = prepare_x(df_full, 0)
w0, w = train_linear_regression_reg(X_train_full, Y_train_full, r=0.001)

X_test = prepare_x(df_test, 0)
y_pred = X_test.dot(w) + w0
rmse(y_test, y_pred)

np.float64(0.5156261299167999)