In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
# Load the raw car-price table from the CSV that ships alongside the notebook.
data = pd.read_csv(filepath_or_buffer='./Dataset/data.csv')

In [3]:
# Peek at the first five rows of the car price prediction dataset.
data.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [4]:
# Features: every column except the target (MSRP) and the Make / Model columns,
# which are deliberately left out of the model.
X = data.drop(['MSRP', 'Make', 'Model'], axis='columns')
# Target: MSRP (the car price), shaped as an (n_rows, 1) column vector.
Y = data['MSRP'].values.reshape(-1, 1)
print(X.shape, Y.shape)

(11914, 13) (11914, 1)


In [5]:
# One-hot encode the categorical columns so the optimisation only sees numeric
# values, then replace any remaining nulls with 0.
X = pd.get_dummies(
    X,
    columns=[
        'Engine Fuel Type', 'Transmission Type', 'Driven_Wheels',
        'Vehicle Size', 'Vehicle Style', 'Market Category',
    ],
    dummy_na=False,
).fillna(0)
print(X.shape)

(11914, 116)


In [6]:
""" Dividing the dataset into train and test split"""
# Hold out the last 10% of rows as the test set and train on everything before it.
# The boundary is a positive index on purpose: a negative slice such as X[:-k]
# silently yields an EMPTY training set (and X[-k:] the whole table) when k == 0,
# which happens whenever there are fewer than 10 rows.
# NOTE(review): rows are taken in file order without shuffling; the preview of the
# data shows consecutive rows from a single make, so the held-out tail may not be
# representative of the training rows -- confirm this is intended.
assert X.shape[0] == Y.shape[0], 'X and Y must have the same number of rows'
split_at = X.shape[0] - int(0.1*X.shape[0])

train_X = X.iloc[:split_at]   # positional row slice of the feature frame
train_Y = Y[:split_at]

test_X = X.iloc[split_at:]
test_Y = Y[split_at:]

print('Training Data shape\n X: {}\tY:{}'.format(train_X.shape, train_Y.shape))
print('Testing Data shape\n X: {}\tY:{}'.format(test_X.shape, test_Y.shape))

Training Data shape
 X: (10723, 116)	Y:(10723, 1)
Testing Data shape
 X: (1191, 116)	Y:(1191, 1)


In [7]:
# Randomly initialise the parameters with small values in [0, 0.1).
# NumPy's global generator is seeded first so the starting point -- and every
# number derived from it further down -- is identical on each fresh run.
SEED = 42
np.random.seed(SEED)
m = 0.1*np.random.random_sample((X.shape[1], 1))  # one weight per feature column
b = 0.1*np.random.random_sample()                 # scalar intercept
print(m.shape)

(116, 1)


In [8]:
def multivariant_gradient_descent(slope, intercept, learning_rate, num_iter, X, Y):
    """Fit multivariate linear regression with vectorised batch gradient descent.

    Each iteration takes one step against the gradient of the halved mean
    squared error ``1/(2n) * sum((X @ slope + intercept - Y) ** 2)``, computed
    over all ``n`` rows of ``X``.

    Parameters
    ----------
    slope : numpy.ndarray
        Initial weights; expected to be a column of shape ``(n_features, 1)``.
    intercept : float or numpy.ndarray
        Initial bias term.
    learning_rate : float
        Step size applied to both gradients.
    num_iter : int
        Number of gradient steps to take.
    X : array-like
        Feature matrix with one row per sample (ndarray or DataFrame).
        It is converted to a float ndarray once, before the loop.
    Y : array-like
        Targets, expected to be shaped so that ``X @ slope + intercept - Y``
        broadcasts element-wise (a column of shape ``(n, 1)`` for a column
        ``slope``).

    Returns
    -------
    tuple
        ``(slope, intercept)`` after ``num_iter`` updates. With ``num_iter == 0``
        both inputs are returned unchanged; otherwise ``intercept`` has the shape
        of the residuals summed over axis 0 (``(1,)`` for column-shaped ``Y``).
    """
    # Convert once, outside the loop: a DataFrame or object-dtype input would
    # otherwise be re-converted inside every np.dot call and could propagate an
    # object dtype into the learned parameters.
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    n = X.shape[0]

    for _ in tqdm(range(num_iter)):
        # The prediction error is shared by both gradients, so compute it once.
        residual = np.dot(X, slope) + intercept - Y

        grad_slope = np.dot(X.T, residual) / n
        grad_intercept = np.sum(residual, axis=0) / n

        # Step against the gradient.
        slope = slope - (learning_rate*grad_slope)
        intercept = intercept - (learning_rate*grad_intercept)
    return slope, intercept

In [9]:
# Fit the model on the training split with the gradient-descent routine defined above,
# starting from the randomly initialised weights `m` and intercept `b`.
slope, intercept = multivariant_gradient_descent(
    slope=m,
    intercept=b,
    learning_rate=2*1e-7,
    num_iter=10000,
    X=train_X,
    Y=train_Y,
)

100%|██████████| 10000/10000 [02:38<00:00, 63.28it/s]


In [10]:
# Show the intercept before and after training, one per line.
print(
    'Initial intercept: {}'.format(b),
    'Intercept after optimization: {}'.format(intercept),
    sep='\n',
)

Initial intercept: 0.07815428876090505
Intercept after optimization: [0.04938421]


In [11]:
# Apply the fitted linear model to the held-out rows; the integer cast drops the
# fractional part of each predicted price.
pred = (np.dot(test_X, slope) + intercept).astype(int)
print(pred) # Predicted car prices

[[38098]
 [38098]
 [38098]
 ...
 [61357]
 [61326]
 [34842]]


In [12]:
# True price values of the held-out rows, for comparison with the predictions above.
print(test_Y)

[[34515]
 [38015]
 [34365]
 ...
 [50620]
 [50920]
 [28995]]
