In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from tqdm import tqdm

In [2]:
data = pd.read_csv('./Dataset/data.csv')

In [3]:
# as Make and Model do not provide any essential information towards price, we will drop them
X = data.drop(columns=['MSRP', 'Make', 'Model']) 
Y = data['MSRP'] # MSRP is the car price
Y = Y.values.reshape(Y.shape[0], 1)
print(X.shape, Y.shape)

(11914, 13) (11914, 1)


In [4]:
# Changing the categorial values into numerial values so we can run optimization over them
X = pd.get_dummies(X, dummy_na=False, columns=['Engine Fuel Type', 'Transmission Type', 
                                               'Driven_Wheels', 'Vehicle Size', 'Vehicle Style', 'Market Category'])
# Filling all the null values, if any, with 0
X.fillna(0, inplace=True)
print(X.shape)

(11914, 116)


In [5]:
""" Dividing the dataset into train and test split"""
train_X = X[:-int(0.1*X.shape[0])]
train_Y = Y[:-int(0.1*Y.shape[0])]

test_X = X[-int(0.1*X.shape[0]):]
test_Y = Y[-int(0.1*Y.shape[0]):]

print('Training Data shape\n X: {}\tY:{}'.format(train_X.shape, train_Y.shape))
print('Testing Data shape\n X: {}\tY:{}'.format(test_X.shape, test_Y.shape))

Training Data shape
 X: (10723, 116)	Y:(10723, 1)
Testing Data shape
 X: (1191, 116)	Y:(1191, 1)


In [6]:
ols = linear_model.LinearRegression()

In [7]:
model = ols.fit(train_X, train_Y)

In [8]:
sklearn_y_pred = model.predict(test_X)

In [9]:
# Randomly initializing the parameters
m = 0.1*np.random.random_sample((X.shape[1], 1))
b = 0.1*np.random.random_sample()
print(m.shape)

(116, 1)


In [10]:
def multivariant_gradient_descent(slope, intercept, learning_rate, num_iter, X, Y):
    """ A vectorised implementation of gradient descent for multivariate linear regression"""
    n = X.shape[0]
    for i in tqdm(range(num_iter)):
        
        # Calculating gradient according to the sum of squared error loss function
        grad_slope = n**-1 * np.dot(X.T, (np.dot(X, slope) + intercept - Y))
        grad_intercept = n**-1 * np.sum((np.dot(X, slope) + intercept - Y), axis=0)

        # updating parameters
        slope = slope - (learning_rate*grad_slope)
        intercept = intercept - (learning_rate*grad_intercept)
    return slope, intercept

In [11]:
# Running optimization on training data using the gradient descent script written above
slope, intercept = multivariant_gradient_descent(m, b, 2*1e-7, 10000, train_X, train_Y)

100%|██████████| 10000/10000 [03:12<00:00, 51.95it/s]


In [12]:
# Running predictions on the test data
my_own_function_y_pred = np.dot(test_X, slope) + intercept
my_own_function_y_pred = my_own_function_y_pred.astype(dtype=int)

In [13]:
# Calculating error on prediction made my the function I wrote
my_fun_error = 1/test_Y.shape[0] * np.sum((my_own_function_y_pred - test_Y)**2) 

# Calculating error on prediction made my the sklearn model
sklearn_error = 1/test_Y.shape[0] * np.sum((sklearn_y_pred - test_Y)**2)

In [14]:
print('Sum of squared error on the test data using sklearn: {}'.format(sklearn_error))
print('Sum of squared error on the test data using written gradient descent: {}'.format(my_fun_error))

Sum of squared error on the test data using sklearn: 5651103919.657405
Sum of squared error on the test data using written gradient descent: 6246049948.120067
