In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
data = pd.read_csv('./Dataset/data.csv')

In [3]:
data.head() # peeking into the car price prediction dataset

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [4]:
data.nunique()

Make                   48
Model                 915
Year                   28
Engine Fuel Type       10
Engine HP             356
Engine Cylinders        9
Transmission Type       5
Driven_Wheels           4
Number of Doors         3
Market Category        71
Vehicle Size            3
Vehicle Style          16
highway MPG            59
city mpg               69
Popularity             48
MSRP                 6049
dtype: int64

In [5]:
# as Make and Model do not provide any essential information towards price, we will drop them
X = data.drop(columns=['MSRP', 'Make', 'Model', 'Market Category']) 
Y = data['MSRP'] # MSRP is the car price
Y = Y.values.reshape(Y.shape[0], 1)
print(X.shape, Y.shape)

(11914, 12) (11914, 1)


In [6]:
# Changing the categorial values into numerial values so we can run optimization over them
X = pd.get_dummies(X, dummy_na=False, columns=['Engine Fuel Type', 'Transmission Type', 
                                               'Driven_Wheels', 'Vehicle Size', 'Vehicle Style'])
# Filling all the null values, if any, with 0
X.fillna(0, inplace=True)
print(X.shape)

(11914, 45)


In [7]:
items = list(X)
print('Number of item to normalize: {}'.format(len(items)))

Number of item to normalize: 45


In [8]:
for j in items:
    max_val = np.max(X[j])
    min_val = np.min(X[j])
    for i in tqdm(range(X.shape[0])):
        X.loc[i, j] = (X.loc[i, j] - min_val) / (max_val - min_val)

100%|██████████| 11914/11914 [00:05<00:00, 2138.29it/s]
100%|██████████| 11914/11914 [00:04<00:00, 2477.23it/s]
100%|██████████| 11914/11914 [00:04<00:00, 2466.41it/s]
100%|██████████| 11914/11914 [00:04<00:00, 2519.74it/s]
100%|██████████| 11914/11914 [00:05<00:00, 2097.40it/s]
100%|██████████| 11914/11914 [00:05<00:00, 2096.29it/s]
100%|██████████| 11914/11914 [00:05<00:00, 2203.49it/s]
100%|██████████| 11914/11914 [00:05<00:00, 2276.46it/s]
100%|██████████| 11914/11914 [00:05<00:00, 2119.40it/s]
100%|██████████| 11914/11914 [00:05<00:00, 2109.78it/s]
100%|██████████| 11914/11914 [00:05<00:00, 2378.02it/s]
100%|██████████| 11914/11914 [00:05<00:00, 2349.95it/s]
100%|██████████| 11914/11914 [00:05<00:00, 2174.80it/s]
100%|██████████| 11914/11914 [00:05<00:00, 2115.31it/s]
100%|██████████| 11914/11914 [00:06<00:00, 1945.64it/s]
100%|██████████| 11914/11914 [00:04<00:00, 2563.18it/s]
100%|██████████| 11914/11914 [00:04<00:00, 2658.07it/s]
100%|██████████| 11914/11914 [00:04<00:00, 2689.

In [9]:
X.head()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Number of Doors,highway MPG,city mpg,Popularity,Engine Fuel Type_diesel,Engine Fuel Type_electric,Engine Fuel Type_flex-fuel (premium unleaded recommended/E85),...,Vehicle Style_Convertible,Vehicle Style_Convertible SUV,Vehicle Style_Coupe,Vehicle Style_Crew Cab Pickup,Vehicle Style_Extended Cab Pickup,Vehicle Style_Passenger Minivan,Vehicle Style_Passenger Van,Vehicle Style_Regular Cab Pickup,Vehicle Style_Sedan,Vehicle Style_Wagon
0,0.777778,0.334665,0.375,0.5,0.040936,0.092308,0.692131,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.777778,0.2997,0.375,0.5,0.046784,0.092308,0.692131,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.777778,0.2997,0.375,0.5,0.046784,0.1,0.692131,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.777778,0.22977,0.375,0.5,0.046784,0.084615,0.692131,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.777778,0.22977,0.375,0.5,0.046784,0.084615,0.692131,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
""" Dividing the dataset into train and test split"""
train_X = X[:-int(0.1*X.shape[0])]
train_Y = Y[:-int(0.1*Y.shape[0])]

test_X = X[-int(0.1*X.shape[0]):]
test_Y = Y[-int(0.1*Y.shape[0]):]

print('Training Data shape\n X: {}\tY:{}'.format(train_X.shape, train_Y.shape))
print('Testing Data shape\n X: {}\tY:{}'.format(test_X.shape, test_Y.shape))

Training Data shape
 X: (10723, 45)	Y:(10723, 1)
Testing Data shape
 X: (1191, 45)	Y:(1191, 1)


In [11]:
# Randomly initializing the parameters
m = 0.1*np.random.random_sample((X.shape[1], 1))
b = 0.1*np.random.random_sample()
print(m.shape)

(45, 1)


In [12]:
def multivariant_gradient_descent(slope, intercept, learning_rate, num_iter, X, Y):
    """ A vectorised implementation of gradient descent for multivariate linear regression"""
    n = X.shape[0]
    for i in tqdm(range(num_iter)):
        
        # Calculating gradient according to the sum of squared error loss function
        grad_slope = n**-1 * np.dot(X.T, (np.dot(X, slope) + intercept - Y))
        grad_intercept = n**-1 * np.sum((np.dot(X, slope) + intercept - Y), axis=0)

        # updating parameters
        slope = slope - (learning_rate*grad_slope)
        intercept = intercept - (learning_rate*grad_intercept)
    return slope, intercept

In [13]:
# Running optimization on training data using the gradient descent script written above
slope, intercept = multivariant_gradient_descent(m, b, 1e-3, 1000, train_X, train_Y)

100%|██████████| 10000/10000 [01:36<00:00, 103.48it/s]


In [14]:
print('Initial intercept: {}'.format(b))
print('Intercept after optimization: {}'.format(intercept))

Initial intercept: 0.03972126967379826
Intercept after optimization: [10742.77066674]


In [18]:
# Running predictions on the test data
pred = np.dot(test_X, slope) + intercept
pred = pred.astype(dtype=int)
print(pred) # Predicted car prices

[[38411]
 [29724]
 [38411]
 ...
 [82375]
 [44105]
 [15614]]


In [19]:
print(test_Y) # True price values

[[34515]
 [38015]
 [34365]
 ...
 [50620]
 [50920]
 [28995]]
