In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('cars24-car-price-clean.csv')
df.head()

Unnamed: 0,selling_price,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
0,-1.111046,-0.801317,1.195828,0.045745,-1.310754,-1.15778,0.801317,-0.433854,-1.125683,1.248892,-0.098382,-0.985275,-0.020095,-0.056917,1.024622,0.495818,0.444503,-0.424728
1,-0.223944,0.45003,-0.737872,-0.140402,-0.537456,-0.360203,-0.45003,-0.327501,-0.333227,1.248892,-0.098382,-0.985275,-0.020095,-0.056917,1.024622,0.495818,0.444503,-0.424728
2,-0.915058,-1.42699,0.035608,-0.582501,-0.537456,-0.404885,1.42699,-0.327501,-0.789807,1.248892,-0.098382,-0.985275,-0.020095,-0.056917,1.024622,0.495818,0.444503,-0.424728
3,-0.892365,-0.801317,-0.409143,0.32962,-0.921213,-0.693085,0.801317,-0.433854,-0.905265,1.248892,-0.098382,-0.985275,-0.020095,-0.056917,1.024622,0.495818,0.444503,-0.424728
4,-0.182683,0.137194,-0.544502,0.760085,0.042999,0.010435,-0.137194,-0.246579,-0.013096,-0.80071,-0.098382,1.014945,-0.020095,-0.056917,-0.97597,0.495818,0.444503,-0.424728


In [4]:
df.drop(["make","model"], axis=1, inplace=True)

In [14]:
X = df[df.columns.drop('selling_price')]
Y = df['selling_price']

In [15]:
X = X.to_numpy()
Y = Y.to_numpy()

In [16]:
X.shape

(19820, 15)

In [18]:
Y.shape

(19820,)

In [None]:
def hypothesis(x,theta):
    y_ = 0.0
    n = x.shape[0]
    for i in range(n):
        y_ += (theta[i]*x[i])
    return y_

def error(X,y,theta):
    e = 0.0
    m = x.shape[0]

    for i in range(m):
        y_ = hypothesis(X[i],theta)
        e += (y[i] - y_)**2

    return e/m

def gradient(X,y,theta):
    m,n = X.shape
    grad = np.zeros((n,))

    #for all values of j
    for j in range(n):
        # sum over all examples
        for i in range(m):
            y_ = hypothesis(X[i],theta)
            grad[j] += (y_ - y[i])*X[i][j]
    return grad/m

def gradient_descent(X,y,learning_rate=0.1,max_epochs=100):
    m,n = X.shape
    theta = np.zeros((n,))
    error_list = []

    for i in range(max_epochs):
        e = error(X,y,theta)
        error_list.append(e)
        #gradient
        grad = gradient(X,y,theta)
        for j in range(n):
            theta[j] = theta[j] - learning_rate*grad[j]
    return theta,error_list

            

In [None]:
import time
start = time.time()
theta,error_list = gradient_descent(X,Y)
end = time.time()
print("Time taken is ", end-start)

In [23]:
# vectorization form
import time
def hypothesis(X,theta):
    return np.dot(X,theta)

def error(X,y,theta):
    e = 0.0
    m = X.shape[0]
    y_ = hypothesis(X,theta)
    e =  np.sum((y-y_)**2)
    return e/m

def gradient(X,y,theta):
    y_= hypothesis(X,theta)
    grad = np.dot(X.T,(y_ - y))
    m = X.shape[0]
    return grad/m

def gradient_descent(X,y,learning_rate = 0.1, max_iters=500):
    n = X.shape[1]
    theta = np.zeros((n,))
    error_list = []

    for i in range(max_iters):
        e = error(X,y,theta)
        error_list.append(e)

        #Gradient descent
        grad = gradient(X,y,theta)
        theta = theta - learning_rate*grad

    return theta, error_list


In [25]:
start = time.time()
theta,error_list = gradient_descent(X,Y)
end = time.time()
print("Time taken by vectorized code", end-start)

Time taken by vectorized code 0.23256397247314453


In [26]:
print(theta)

[ 0.17767871 -0.04641991 -0.13312412  0.08580425  0.44049269 -0.17767871
 -0.04154607 -0.00820066  0.07130438  0.03893641  0.00174728 -0.12646906
 -0.21554591 -0.03148962 -0.03838659]


In [36]:
df = pd.read_csv('cars24-car-price-clean.csv')
df.head()

Unnamed: 0,selling_price,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
0,-1.111046,-0.801317,1.195828,0.045745,-1.310754,-1.15778,0.801317,-0.433854,-1.125683,1.248892,-0.098382,-0.985275,-0.020095,-0.056917,1.024622,0.495818,0.444503,-0.424728
1,-0.223944,0.45003,-0.737872,-0.140402,-0.537456,-0.360203,-0.45003,-0.327501,-0.333227,1.248892,-0.098382,-0.985275,-0.020095,-0.056917,1.024622,0.495818,0.444503,-0.424728
2,-0.915058,-1.42699,0.035608,-0.582501,-0.537456,-0.404885,1.42699,-0.327501,-0.789807,1.248892,-0.098382,-0.985275,-0.020095,-0.056917,1.024622,0.495818,0.444503,-0.424728
3,-0.892365,-0.801317,-0.409143,0.32962,-0.921213,-0.693085,0.801317,-0.433854,-0.905265,1.248892,-0.098382,-0.985275,-0.020095,-0.056917,1.024622,0.495818,0.444503,-0.424728
4,-0.182683,0.137194,-0.544502,0.760085,0.042999,0.010435,-0.137194,-0.246579,-0.013096,-0.80071,-0.098382,1.014945,-0.020095,-0.056917,-0.97597,0.495818,0.444503,-0.424728


In [54]:
from sklearn.linear_model import LinearRegression
model = LinearRegression

X1 = df[df.columns.drop('selling_price')]
Y1= df['selling_price']

In [55]:
X1.head()

Unnamed: 0,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
0,-0.801317,1.195828,0.045745,-1.310754,-1.15778,0.801317,-0.433854,-1.125683,1.248892,-0.098382,-0.985275,-0.020095,-0.056917,1.024622,0.495818,0.444503,-0.424728
1,0.45003,-0.737872,-0.140402,-0.537456,-0.360203,-0.45003,-0.327501,-0.333227,1.248892,-0.098382,-0.985275,-0.020095,-0.056917,1.024622,0.495818,0.444503,-0.424728
2,-1.42699,0.035608,-0.582501,-0.537456,-0.404885,1.42699,-0.327501,-0.789807,1.248892,-0.098382,-0.985275,-0.020095,-0.056917,1.024622,0.495818,0.444503,-0.424728
3,-0.801317,-0.409143,0.32962,-0.921213,-0.693085,0.801317,-0.433854,-0.905265,1.248892,-0.098382,-0.985275,-0.020095,-0.056917,1.024622,0.495818,0.444503,-0.424728
4,0.137194,-0.544502,0.760085,0.042999,0.010435,-0.137194,-0.246579,-0.013096,-0.80071,-0.098382,1.014945,-0.020095,-0.056917,-0.97597,0.495818,0.444503,-0.424728


In [56]:
X1.shape

(19820, 17)

In [57]:
Y1.shape

(19820,)

In [58]:
model.fit(X1,Y1)

TypeError: fit() missing 1 required positional argument: 'y'