<a id="table-of-contents"></a>

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Loading and Reading the Data

In [2]:
df = pd.read_csv('../input/cardataset/data.csv')
df.head()

# Data Cleaning

In [3]:
df.columns = df.columns.str.lower().str.replace(" ", "_")
df.head()

# Making List of Categorical Columns

In [4]:
# Names of every column whose dtype is `object` (the string-valued columns).
is_object = df.dtypes == 'object'
strings = df.dtypes[is_object].index.tolist()
strings


# Equivalent alternatives:
# list(df.select_dtypes(include = 'O').columns)
# [col for col in df.columns if df[col].dtype == 'object']

# Cleaning Categorical Data 

In [5]:
# Normalise every string column the same way the column names were normalised:
# lower-case first, then spaces replaced by underscores.
for col in strings:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(" ", "_")

df.head()

# EDA

# Unique values and their numbers

In [6]:
# For each column: its name, the first five distinct values, and the number
# of distinct values, followed by a blank separator.
for col in df.columns:
    column = df[col]
    print(col)
    print(column.unique()[:5])
    print(column.nunique())
    print('\n')

# Price Distribution

In [7]:
sns.histplot(df['msrp'], bins = 50);

In [8]:
sns.histplot(df['msrp'][df['msrp'] < 100000], bins = 50);

# Log Transformation of MSRP (log1p)

In [9]:
# np.log1p(x) computes log(1 + x); the histogram below shows the transformed
# prices instead of the raw ones.
price_logs = np.log1p(df['msrp'])
sns.histplot(price_logs, bins = 50);

# Missing Values

In [10]:
# Count missing values per column once, then show them as text and as a bar chart.
missing = df.isnull().sum()
print(missing)
missing.plot(kind = 'bar')

# Splitting Data Into Validation, Test, and Training Sets

In [11]:
# Split sizes: 20% validation, 20% test, the remainder for training.
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Shuffle the row positions with a fixed seed so the split is reproducible.
np.random.seed(2)
idx = np.arange(n)
np.random.shuffle(idx)

# Slice the shuffled positions into three consecutive ranges and give each
# resulting frame a fresh 0..len-1 index.
val_end = n_train + n_val
df_train = df.iloc[idx[:n_train]].reset_index(drop = True)
df_val = df.iloc[idx[n_train:val_end]].reset_index(drop = True)
df_test = df.iloc[idx[val_end:]].reset_index(drop = True)

# Targets are log1p(msrp). `pop` returns the column and removes it from the
# frame, so the target cannot be used as a feature later on.
y_train = np.log1p(df_train.pop('msrp').values)
y_val = np.log1p(df_val.pop('msrp').values)
y_test = np.log1p(df_test.pop('msrp').values)

len(df_train), len(df_val), len(df_test)

# Linear Regression

In [12]:
xi = [453, 11, 86]   # feature values of one example

w0 = 0               # base prediction
w = [1, 1, 1]        # one weight per feature


def linear_regression(xi):
    """Return w0 + sum_j w[j] * xi[j] for a single example.

    Reads the module-level ``w0`` and ``w``; ``w[j]`` is paired with ``xi[j]``.
    """
    total = w0
    for j, value in enumerate(xi):
        total = total + w[j] * value
    return total


linear_regression(xi)

In [13]:
w0 = 7.17

w =  [0.01, 0.04, 0.002]

linear_regression(xi)

# Linear Regression Vector Form

# Generalized Linear Regression

In [14]:
def dot(xi,w):
    """Return the dot product of ``xi`` and ``w``.

    Iterates over the positions of ``xi`` and pairs each element with the
    element of ``w`` at the same position. The accumulator starts at 0.0, so
    an empty ``xi`` yields 0.0.
    """
    acc = 0.0
    for j, x in enumerate(xi):
        acc = acc + x * w[j]
    return acc

# First form: bias kept separate from the weighted sum.
# NOTE: this definition is replaced by the one below as soon as the cell runs.
def linear_regression(xi):
    """Return w0 plus the dot product of xi with the module-level weights w."""
    return w0 + dot(xi,w)

# Second form: fold the bias into the weight vector as its first element ...
w_new = [w0] + w
def linear_regression(xi):
    """Return the dot product of [1] + xi with w_new (bias as first weight)."""
    # ... and prepend a constant 1 to the features so it multiplies the bias.
    xi = [1] + xi
    return dot(xi,w_new)

linear_regression(xi)

# Linear Regression with Multiple Variables

In [15]:
w0 = 7.17
w = [0.01, 0.04, 0.002]
w_new = [w0] + w          # bias first, then one weight per feature

# Each example already starts with a constant 1 that multiplies the bias.
x1 = [1, 148, 24, 1385]
x2 = [1, 132, 25, 2031]
x10 = [1, 453, 11, 86]

X = np.array([x1, x2, x10])


def linear_regression(X):
    """Return one prediction per row of ``X`` as the matrix-vector product X . w_new.

    Reads the module-level ``w_new``.
    """
    predictions = X.dot(w_new)
    return predictions


linear_regression(X)

# Training Linear Regression - Normal Equation

In [16]:
# Toy feature matrix: nine examples with three features each.
X = [
    [148, 24, 1385],
    [132, 25, 2031],
    [453, 11, 86],
    [158, 24, 185],
    [172, 25, 201],
    [413, 11, 86],
    [38, 54, 185],
    [142, 25, 431],
    [453, 31, 86]
]

X = np.array(X)


# Including a bias term (a leading column of ones):
# ones = np.ones(X.shape[0])
# X = np.column_stack([ones, X])
# X

# Toy targets, one per row of X.
y = [10000, 20000, 15000, 20050, 10000, 20000, 15000, 25000, 120]



In [17]:
# Prepend a column of ones so that the first solved weight really is the bias.
# Without it, w_full has one entry per feature only, and reading w_full[0] as
# w0 would mislabel the first feature's weight as the intercept.
# A new name (X_b) is used so re-running this cell never stacks a second column.
ones = np.ones(X.shape[0])
X_b = np.column_stack([ones, X])

# Gram matrix and its inverse.
XTX = X_b.T.dot(X_b)

XTX_inv = np.linalg.inv(XTX)

# Sanity check: XTX times its inverse should be (numerically) the identity.
# XTX.dot(XTX_inv).round(1)
XTX.dot(XTX_inv)

# Normal equation: w = (X^T X)^-1 X^T y
w_full = XTX_inv.dot(X_b.T).dot(y)

w0 = w_full[0]    # bias (weight of the ones column)
w = w_full[1:]    # one weight per original feature

w0, w

In [18]:
def train_linear_regression(X,y):
    """Fit linear regression with the normal equation.

    A column of ones is prepended to ``X`` so a bias is learned alongside the
    feature weights, then ``(X^T X)^-1 X^T y`` is evaluated.

    Returns a tuple ``(w0, w)``: the bias and the array of feature weights.
    """
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows), X])

    gram = design.T.dot(design)
    gram_inv = np.linalg.inv(gram)
    weights = gram_inv.dot(design.T).dot(y)

    bias, coefs = weights[0], weights[1:]
    return bias, coefs

# Car Price Baseline Model

In [19]:
# Not the last expression of the cell, so this value is not displayed.
df_train.dtypes

# Numeric columns used for the baseline model.
base = ['engine_hp','engine_cylinders','highway_mpg', 'city_mpg', 'popularity']

# Raw values with no missing-value handling yet; the following cells count
# the nulls in these columns and rebuild X_train with fillna(0).
X_train = df_train[base].values

train_linear_regression(X_train,y_train)

In [20]:
df_train[base].isnull().sum()

In [21]:
X_train = df_train[base].fillna(0).values

In [22]:
# Fit on the training split and predict the same rows back.
w0, w = train_linear_regression(X_train,y_train)

y_pred =  w0 + X_train.dot(w)

# Overlay predicted and actual log-prices. The `label=` arguments only show
# up once a legend is drawn, so add one explicitly.
sns.histplot(y_pred, color='red', label='prediction', alpha = 0.5, bins = 33)
sns.histplot(y_train, color='blue',label='target', alpha = 0.5, bins = 33)
plt.legend();

# RMSE

In [23]:
def rmse(y,y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Computed as sqrt(mean((y - y_pred) ** 2)).
    """
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())

In [24]:
rmse(y_train,y_pred)

# Validating the Model using RMSE

In [25]:
# Numeric columns used by the baseline model.
base = ['engine_hp','engine_cylinders','highway_mpg', 'city_mpg', 'popularity']

def prepare_X(df):
    """Return the ``base`` columns of ``df`` as a NumPy array, missing values replaced by 0."""
    return df[base].fillna(0).values

In [26]:
X_train = prepare_X(df_train)                            # Feature matrix for the training split

# Single unpacking; the original `w0,w = w0, w = ...` repeated the target list.
w0, w = train_linear_regression(X_train, y_train)        # Fit bias and weights on the training split

X_val = prepare_X(df_val)                                 # Feature matrix for the validation split
y_pred = w0 + X_val.dot(w)                                # Predictions for the validation split

rmse(y_val, y_pred)                                       # RMSE on the validation split (log1p(msrp) scale)

# Feature Engineering

In [27]:
# Reference year for the `age` feature, taken once from the full dataset so
# every frame passed to prepare_X is measured against the same year.
REFERENCE_YEAR = df['year'].max()


def prepare_X(df, reference_year=None):
    """Return the ``base`` columns plus an ``age`` column as a NumPy array.

    Parameters
    ----------
    df : DataFrame containing the ``base`` columns and ``year``.
    reference_year : year that ``age`` is measured from; defaults to
        ``REFERENCE_YEAR`` (the latest year in the full dataset).

    Missing values are replaced by 0. The input frame is not modified.
    """
    if reference_year is None:
        reference_year = REFERENCE_YEAR

    df = df.copy()
    # Using the frame's own max year would give train/validation/test frames
    # different baselines and would make a one-row frame always have age 0.
    df['age'] = reference_year - df['year']
    features = base + ['age']
    return df[features].fillna(0).values

In [28]:
X_train = prepare_X(df_train)

In [29]:
X_train = prepare_X(df_train)                            # Feature matrix for the training split

# Single unpacking; the original `w0,w = w0, w = ...` repeated the target list.
w0, w = train_linear_regression(X_train, y_train)        # Fit bias and weights on the training split

X_val = prepare_X(df_val)                                 # Feature matrix for the validation split
y_pred = w0 + X_val.dot(w)                                # Predictions for the validation split

rmse(y_val, y_pred)                                       # RMSE on the validation split (log1p(msrp) scale)

In [30]:
# Overlay predicted and actual validation log-prices. The `label=` arguments
# only become visible once a legend is drawn.
sns.histplot(y_pred, color='red', label='prediction', alpha = 0.5, bins = 33)
sns.histplot(y_val, color='blue', label='target', alpha = 0.5, bins = 33)
plt.legend();

# Categorical Variables

In [31]:
makes = list(df['make'].value_counts().head().index)
makes

In [32]:
# Reference year for the `age` feature, taken once from the full dataset so
# every frame passed to prepare_X is measured against the same year.
REFERENCE_YEAR = df['year'].max()


def prepare_X(df, reference_year=None):
    """Return base columns, age, door-count indicators and make indicators.

    Parameters
    ----------
    df : DataFrame containing the ``base`` columns, ``year``,
        ``number_of_doors`` and ``make``.
    reference_year : year that ``age`` is measured from; defaults to
        ``REFERENCE_YEAR`` (the latest year in the full dataset).

    Indicator columns are 0/1 integers: one per door count in (2, 3, 4) and
    one per entry of the module-level ``makes`` list. Missing values are
    replaced by 0. The input frame is not modified.
    """
    if reference_year is None:
        reference_year = REFERENCE_YEAR

    df = df.copy()
    features = base.copy()

    # Using the frame's own max year would give train/validation/test frames
    # different baselines and would make a one-row frame always have age 0.
    df['age'] = reference_year - df['year']
    features.append('age')

    for v in [2, 3, 4]:
        name = 'num_doors_%s' % v
        df[name] = (df['number_of_doors'] == v).astype('int')
        features.append(name)

    for m in makes:
        name = 'make_%s' % m
        df[name] = (df['make'] == m).astype('int')
        features.append(name)

    return df[features].fillna(0).values

In [33]:
# Build both feature matrices up front; neither depends on the fit.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Fit on the training split only.
w0, w = train_linear_regression(X_train, y_train)

# Score on the validation split (RMSE on the log1p(msrp) scale).
y_pred = X_val.dot(w) + w0
rmse(y_val, y_pred)

In [34]:
categorical_variables = ['make','engine_fuel_type', 'transmission_type' , 'driven_wheels', 'market_category', 
'vehicle_size', 'vehicle_style' ]           

In [35]:
# Maps each categorical column name to a list of values taken from the first
# rows of value_counts() (head() with its default size) on the full dataset.
categories = {}

for c in categorical_variables:
    # value_counts() orders values by frequency; keep only the leading ones.
    categories[c] = list(df[c].value_counts().head().index)


In [36]:
categories.items()

In [37]:
# Reference year for the `age` feature, taken once from the full dataset so
# every frame passed to prepare_X is measured against the same year.
REFERENCE_YEAR = df['year'].max()


def prepare_X(df, reference_year=None):
    """Return base columns, age, door-count indicators and categorical indicators.

    Parameters
    ----------
    df : DataFrame containing the ``base`` columns, ``year``,
        ``number_of_doors`` and every column named in ``categories``.
    reference_year : year that ``age`` is measured from; defaults to
        ``REFERENCE_YEAR`` (the latest year in the full dataset).

    Indicator columns are 0/1 integers: one per door count in (2, 3, 4) and
    one ``<column>_<value>`` per (column, value) pair in the module-level
    ``categories`` dict. Missing values are replaced by 0. The input frame is
    not modified.
    """
    if reference_year is None:
        reference_year = REFERENCE_YEAR

    df = df.copy()
    features = base.copy()

    # Using the frame's own max year would give train/validation/test frames
    # different baselines and would make a one-row frame (as used for the
    # single-car prediction) always have age 0.
    df['age'] = reference_year - df['year']
    features.append('age')

    for v in [2, 3, 4]:
        name = 'num_doors_%s' % v
        df[name] = (df['number_of_doors'] == v).astype('int')
        features.append(name)

    for c, values in categories.items():
        for v in values:
            name = "%s_%s" % (c, v)
            df[name] = (df[c] == v).astype('int')
            features.append(name)

    return df[features].fillna(0).values

In [38]:
# Build both feature matrices up front; neither depends on the fit.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Fit on the training split only (no regularization here).
w0, w = train_linear_regression(X_train, y_train)

# Score on the validation split (RMSE on the log1p(msrp) scale).
y_pred = X_val.dot(w) + w0
rmse(y_val, y_pred)

# Regularization

In [39]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit linear regression with the normal equation plus a diagonal term.

    A column of ones is prepended to ``X``; ``r`` times the identity is added
    to ``X^T X`` before inversion (the diagonal entry for the ones column is
    included).

    Returns a tuple ``(w0, w)``: the bias and the array of feature weights.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    n_weights = design.shape[1]
    gram = design.T.dot(design) + r * np.eye(n_weights)

    weights = np.linalg.inv(gram).dot(design.T).dot(y)
    return weights[0], weights[1:]

In [40]:
# Feature matrices for both splits.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Fit with r=0.01 on the training split, then score on validation.
w0, w = train_linear_regression_reg(X_train, y_train, r=0.01)
y_pred = X_val.dot(w) + w0
rmse(y_val, y_pred)

# Tuning the Model

In [41]:
# The feature matrices do not depend on r, so build them once instead of on
# every iteration.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Try each regularization strength: fit on train, score on validation, and
# print r, the learned bias and the validation RMSE.
for r in [0.0, 0.00001, 0.0001, 0.001, 0.1, 1, 10]:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)

    print(r, w0, score)

In [42]:
# Regularization strength used for the final model.
r = 0.001

X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

w0, w = train_linear_regression_reg(X_train, y_train, r=r)

# Validation RMSE for this r (log1p(msrp) scale).
y_pred = X_val.dot(w) + w0
score = rmse(y_val, y_pred)
score

# Using the model

In [43]:
df_full_train = pd.concat([df_train, df_val])

In [44]:
df_full_train = df_full_train.reset_index(drop=True)

In [45]:
X_full_train = prepare_X(df_full_train)
y_full_train = np.concatenate([y_train, y_val])

In [46]:
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

In [47]:
# Score the model trained on train+validation against the held-out test split.
X_test = prepare_X(df_test)
y_pred = X_test.dot(w) + w0

score = rmse(y_test, y_pred)
score

In [48]:
car = df_test.iloc[100].to_dict()
car

In [49]:
df_small = pd.DataFrame([car])
df_small

In [50]:
X_small = prepare_X(df_small)

In [51]:
# X_small has a single row, so take the only element of the prediction vector.
# The value is still on the log1p(msrp) scale.
y_pred = (w0 + X_small.dot(w))[0]
y_pred

In [52]:
np.expm1(y_pred)

In [53]:
np.expm1(y_test[100])