## Regression Homework

In [1]:
import pandas as pd
import numpy as np

## Data Preparation

In [50]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv  --wget is not working, so downloaded directly

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')
data.head(3)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341


In [3]:
data.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [5]:
columns_used =['engine_displacement',
'horsepower',
'vehicle_weight',
'model_year',
'fuel_efficiency_mpg']
df = data[columns_used]
df.head(1)

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729


In [6]:
# Question - 1
df.isnull().sum() 

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [None]:
# Question - 2
df['horsepower'].describe()

count    8996.000000
mean      149.657292
std        29.879555
min        37.000000
25%       130.000000
50%       149.000000
75%       170.000000
max       271.000000
Name: horsepower, dtype: float64

In [56]:
df.dtypes

engine_displacement      int64
horsepower             float64
vehicle_weight         float64
model_year               int64
fuel_efficiency_mpg    float64
dtype: object

## Split data 60,20,20

In [None]:
#This function will split the data based on the seed number
def split_data(seed_value):

    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test

    idx = np.arange(n)
    np.random.seed(seed_value)
    np.random.shuffle(idx)

    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
    y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
    y_test = np.log1p(df_test.fuel_efficiency_mpg.values)

    del df_train['fuel_efficiency_mpg']
    del df_val['fuel_efficiency_mpg']
    del df_test['fuel_efficiency_mpg']

    return df_train, df_val, df_test, y_train, y_val, y_test

In [31]:
df_train, df_val, df_test, y_train, y_val, y_test = split_data(42)
print(len(df_train), len(df_val), len(df_test))

5824 1940 1940


In [32]:
df_train.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year
0,220,144.0,2535.887591,2009
1,160,141.0,2741.170484,2019
2,230,155.0,2471.880237,2017
3,150,206.0,3748.164469,2015
4,300,111.0,2135.716359,2006


## Linear Regression

In [22]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [23]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [75]:
#This function will fill the nan and train in linear regression
def train_linear_model(df_train, df_val, y_train, y_val, fillvalue=0):
    # Fill with 0 and find RSME
    X_train = df_train.fillna(fillvalue).values
    X_val = df_val.fillna(fillvalue).values

    w0, w = train_linear_regression(X_train, y_train)

    y_pred = w0 + X_val.dot(w)
    # y_pred
    rmse1 = rmse(y_val, y_pred)
    #print('RMSE : ', rmse1)
    return rmse1

In [76]:
# Question - 3

#Fill with 0 and find RMSE
rmse_0 = round(train_linear_model(df_train, df_val, y_train, y_val, 0), 2)
print(f'RMSE when filled with 0: {rmse_0}')

#Fill with mean and find RMSE
mean1 = round(df_train['horsepower'].mean(), 1)
print(f'mean: {mean1}')
rmse_mean = round(train_linear_model(df_train, df_val, y_train, y_val, mean1), 2)
print(f'RMSE when filled with mean: {rmse_mean}')


RMSE when filled with 0: 0.04
mean: 149.7
RMSE when filled with mean: 0.04


## Regularized Linear Regression

In [50]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [71]:
def train_linear_model_reg(df_train, df_val, y_train, y_val):
    r1 = [0, 0.01, 0.1, 1, 5, 10, 100]
    X_train = df_train.fillna(0).values
    X_val = df_val.fillna(0).values
    for r in r1:
        w0, w = train_linear_regression_reg(X_train, y_train, r)
        #print(f'W0 : {w0}, w = {len(w)}')
        y_pred = w0 + X_val.dot(w)
        rmse1 = rmse(y_val, y_pred)
        print(f'r : {r} ----> RMSE : {rmse1} ')


In [72]:
# Question - 4
train_linear_model_reg(df_train, df_val, y_train, y_val)

r : 0 ----> RMSE : 0.038607646441651766 
r : 0.01 ----> RMSE : 0.03862981662449219 
r : 0.1 ----> RMSE : 0.03923784881651058 
r : 1 ----> RMSE : 0.04012201415387104 
r : 5 ----> RMSE : 0.04027838804734784 
r : 10 ----> RMSE : 0.04029958138531605 
r : 100 ----> RMSE : 0.040318994094792664 


In [78]:
# Question - 5
r1 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmses =[]
for r in r1:
    df_train, df_val, df_test, y_train, y_val, y_test = split_data(r)
    rmse1 = train_linear_model(df_train, df_val, y_train, y_val)
    print(f'{r} - {rmse1}')
    #print(rmse1)
    rmses.append(rmse1)
print(f'STD: {round(np.std(rmses), 3)}')

0 - 0.03801775537104819
1 - 0.039278853338995105
2 - 0.03944653052696821
3 - 0.038727637039551656
4 - 0.03727535850045066
5 - 0.039384388340472844
6 - 0.03890763931474179
7 - 0.03837971626919168
8 - 0.04018986975307275
9 - 0.038607646441651766
STD: 0.001


In [95]:
# Question 6
r=0.001
df_train, df_val, df_test, y_train, y_val, y_test = split_data(9)
X_train = pd.concat([df_train, df_val], ignore_index= True)
y_train = np.concatenate((y_train, y_val))
X_test = df_test.fillna(0).values
X_train = X_train.fillna(0).values

w0, w = train_linear_regression_reg(X_train, y_train, r)
y_pred = w0 + X_test.dot(w)
rmse1 = rmse(y_test, y_pred)
print(f'r : {r} ----> RMSE : {rmse1} ')



r : 0.001 ----> RMSE : 0.03919613644482336 
