# Vehicle Dataset - Linear Regression


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import random as r
import joblib as j

from sklearn.model_selection import train_test_split
from sklearn.linear_model import  LinearRegression
from sklearn.metrics import  r2_score, mean_absolute_error, mean_squared_error


In [None]:
# Read the vehicle dataset CSV into a DataFrame and print a summary of its
# columns, dtypes and non-null counts.

dataset_path = './datasets/car details v4.csv'
data_df = pd.read_csv(dataset_path)
data_df.info()

In [None]:
# extract power 
def extract_power(power_str):
    """Return the number that precedes 'bhp' in *power_str* as a float.

    Everything before the first 'bhp' is stripped of surrounding whitespace
    and parsed with ``float``. If the text contains no 'bhp', the whole
    string is parsed.

    Returns:
        The parsed value, or ``None`` when *power_str* is not a string
        (e.g. a missing value) or the leading text is not a valid number.
    """
    try:
        # Extract the numeric value before 'bhp'
        return float(power_str.split('bhp')[0].strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError: value has no .split (None, NaN, numbers).
        # TypeError: .split exists but rejects a str separator (e.g. bytes).
        # ValueError: the leading text is not numeric.
        # Anything else (KeyboardInterrupt, etc.) is deliberately not swallowed.
        return None
    
# extract torque
def extract_torque(torque_str):
    """Return the number that precedes 'Nm' in *torque_str* as a float.

    Everything before the first 'Nm' is stripped of surrounding whitespace
    and parsed with ``float``. If the text contains no 'Nm', the whole
    string is parsed.

    Returns:
        The parsed value, or ``None`` when *torque_str* is not a string
        (e.g. a missing value) or the leading text is not a valid number.
    """
    try:
        # Extract the numeric value before 'Nm'
        return float(torque_str.split('Nm')[0].strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError: value has no .split (None, NaN, numbers).
        # TypeError: .split exists but rejects a str separator (e.g. bytes).
        # ValueError: the leading text is not numeric.
        # Anything else (KeyboardInterrupt, etc.) is deliberately not swallowed.
        return None

def convert_power_to_kw(bhp):
    """Convert a power value from bhp to kilowatts.

    ``None`` is passed through unchanged; any other value is multiplied by
    the bhp-to-kW factor.
    """
    if bhp is None:
        return None
    return bhp * 0.745699872

def convert_torque_to_lbft(nm):
    """Convert a torque value from newton-metres to pound-feet.

    ``None`` is passed through unchanged; any other value is multiplied by
    the Nm-to-lb-ft factor.
    """
    if nm is None:
        return None
    return nm * 0.737562149


In [None]:
# Transform data by converting Max Power and Max Torque into continuous values

# Extract numeric values (entries that cannot be parsed come back as missing)
data_df['Power_bhp'] = data_df['Max Power'].apply(extract_power)
data_df['Torque_nm'] = data_df['Max Torque'].apply(extract_torque)

# Convert to other units
data_df['Power_kW'] = data_df['Power_bhp'].apply(convert_power_to_kw)
data_df['Torque_lbft'] = data_df['Torque_nm'].apply(convert_torque_to_lbft)

# Drop every row that has a missing value in any column, then renumber the
# index so it is contiguous again. One dropna() is enough; a second call on
# the already-cleaned frame is a no-op.
data_df = data_df.dropna().reset_index(drop=True)

data_df.head(2)


In [None]:
# Regression line

# From the above description of the dataset, I intend to find the relationships between the following features
# -> kilometers - price
# -> fuel type - price
# -> power_kW - price
# -> torque_lbft - price

# We now plot each independent variable against the dependent variable (Price)
# to check how linear the relationship is before using it in the regression model

# Fuel Type Vs Price (categorical feature, so a box plot per category)

sns.boxplot(x="Fuel Type", y="Price", data=data_df, color="pink")
plt.title("Fuel Type Vs Price")
plt.xlabel("Fuel Type")
plt.ylabel("Price")
plt.show()

# Kilometer / Power_kW / Torque_lbft Vs Price
#
# regplot already draws the scatter points together with the fitted line, so a
# separate plt.scatter call would draw every point a second time in another
# colour. The point colour is passed through scatter_kws instead.

for feature, point_color, title in (
    ("Kilometer", "teal", "Price Vs Kilometers"),
    ("Power_kW", "magenta", "Power_kW Vs Price"),
    ("Torque_lbft", "lime", "Torque_lbft Vs Price"),
):
    sns.regplot(data=data_df, x=feature, y="Price",
                # fresh dicts on every iteration: regplot may modify them
                scatter_kws={'alpha': 0.5, 'color': point_color},
                line_kws={'color': 'red'})
    plt.title(title)
    plt.xlabel(feature)
    plt.ylabel("Price")
    plt.show()

In [None]:
# prepare the data

# Independent variables: selecting with a list of columns gives a 2-D frame.
feature_columns = ['Kilometer', 'Power_kW', 'Torque_lbft']
x = data_df[feature_columns]

# Dependent variable: a single column, i.e. 1-D.
y = data_df['Price']

# Check the shapes of the data.
# The feature matrix handed to the model has to be 2-D.
print(f"X: {x.shape} \nY: {y.shape}")

In [None]:
# generate a random SEED value and report it, so that this exact split
# (and therefore the numbers below) can be reproduced later
SEED = r.randint(0,5000)
print(f"SEED: {SEED}")

# split the training and testing data in 70-30
xtrain, xtest, ytrain, ytest = train_test_split(x,y,train_size=0.7,random_state=SEED)

# build the model
model = LinearRegression()

# fit the model
model.fit(xtrain, ytrain)

# check the intercept (c) and the coefficients (m, one per feature) of the
# fitted model: y = m1*x1 + m2*x2 + m3*x3 + c
print(f"c: {model.intercept_} \nm: {model.coef_}")


In [None]:
# Predict prices for the held-out test rows
ypred = model.predict(xtest)

# Score the predictions against the true test prices and report each metric

# coefficient of determination
r2_result = r2_score(ytest,ypred)
print(f"R2-score: {r2_result}")

# mean squared error
mse = mean_squared_error(ytest, ypred)
print(f"MSE: {mse}")

# mean absolute error
mae = mean_absolute_error(ytest, ypred)
print(f"MAE: {mae}")

## Store the fine model


In [None]:
accuracy_list, models = [], []

# Fit one model per split seed (0..4999) and record its R2-score on that
# seed's own 30% test split. accuracy_list[k] and models[k] belong together.
# NOTE(review): the score is measured on the very split the seed produced, so
# picking the best seed afterwards gives an optimistic number - confirm this
# is intended.
for i in range(5000):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.7, random_state=i
    )

    fine_model = LinearRegression().fit(x_train, y_train)
    models.append(fine_model)

    y_pred = fine_model.predict(x_test)
    r2_result = r2_score(y_test, y_pred)
    accuracy_list.append(r2_result)

In [None]:
# View the collected R2-scores as an array to query it
scores = np.asarray(accuracy_list)

# position of the best score (first occurrence on ties) and the score itself
model_idx = scores.argmax()
max_accuracy = scores.max()

print(f"accuracy list: {accuracy_list}")
print(f"Max accuracy of the fitted model: {max_accuracy}")
print(f"Model Idx with max accuracy: {model_idx}")

In [None]:
# Persist the model that achieved the highest R2-score to disk

best_model = models[model_idx]
j.dump(best_model, "multiple-linear-fine-model.pkl")

In [None]:
# predict with the saved model

trained_model = j.load("./multiple-linear-fine-model.pkl")

# The model was fitted on a DataFrame, so give it a one-row DataFrame with the
# same column names in the same order. A bare nested list carries no feature
# names: scikit-learn warns about it and the values are matched by position only.
sample = pd.DataFrame(
    [[50, 89.25, 5.86]],
    columns=['Kilometer', 'Power_kW', 'Torque_lbft'],
)
pred_price = trained_model.predict(sample)
print(pred_price)