<img src="https://miro.medium.com/max/647/1*ZOcUPrSXLYucFxppoI-dYg.png">

# Problem definition


For this project we use a car dataset and want to predict the selling price of a car from its features.
Because the target is a continuous numeric value, this is a regression problem.
We will use linear regression to solve it.

General equation of Multiple Linear Regression:
$$y = \beta_0 + \beta_{1}x_1 + \beta_{2}x_2 + \beta_{3}x_3 + \beta_{4}x_4 + ... + \beta_{n}x_n$$

# Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Gathering

In [None]:
df  = pd.read_csv("car_dataset.csv")
df.head()

# Data Preparation

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.describe().T

In [None]:
# Grid shape for the histograms below: two plots per row, with the row count
# rounded up so an odd number of plotted columns (all columns after the first)
# still gets a slot.
c = 2
r = (len(df.columns[1:]) + 1) // 2
r, c

In [None]:
# Draw one histogram per column (skipping the first column) on an r x c grid.
# squeeze=False keeps `ax` two-dimensional even when the grid has a single
# row, so the [row, column] indexing below always works.
fig, ax = plt.subplots(nrows=r, ncols=c, figsize=(16, 12), squeeze=False)
fig.subplots_adjust(wspace=0.5, hspace=0.5)

for i, col in enumerate(df.columns[1:]):
    # Fill the grid row by row using the grid width `c` rather than a literal
    # 2, so the placement stays correct if the number of grid columns changes.
    row, column = divmod(i, c)
    ax[row, column].hist(df[col])
    ax[row, column].set_title(col)

In [None]:
for i,col in enumerate(df.columns[1:]):
    plt.figure(i)
    sns.histplot(df[col])

In [None]:
# Pairwise correlations of the numeric columns only. At this point `df` still
# holds string columns, and recent pandas versions raise on them instead of
# silently skipping them, so the numeric subset is selected explicitly.
df.select_dtypes(include="number").corr()

In [None]:
plt.rcParams["figure.figsize"] = (9,5)

In [None]:
# Annotated correlation heatmap of the numeric columns. The string columns
# still present in `df` are excluded explicitly because recent pandas versions
# raise on them instead of dropping them. The trailing semicolon suppresses
# the notebook's text output for the returned Axes.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True);

In [None]:
cols = list(df.iloc[:,1:].select_dtypes(include="object").columns)
# df.info()

In [None]:
# df[(df.Selling_Price < df.Present_Price)].shape

In [None]:
df = pd.get_dummies(df,columns=["Fuel_Type","Seller_Type","Transmission"], drop_first=True,)

In [None]:
df["car_age"] = 2018-df["Year"]

In [None]:
df.drop(["Car_Name","Year"],axis=1,inplace=True)

In [None]:
df.shape

In [None]:
sns.heatmap(df.corr(), annot=True);

In [None]:
# sns.pairplot(df);

# Feature Engineering

<p>Fuel_Type feature:</p>
<ul>
    <li>Fuel is Petrol if Fuel_Type_Diesel = 0, Fuel_Type_Petrol = 1</li>
    <li>Fuel is Diesel if Fuel_Type_Diesel = 1, Fuel_Type_Petrol = 0</li>
    <li>Fuel is CNG if Fuel_Type_Diesel = 0, Fuel_Type_Petrol = 0</li>
</ul>
<p>Transmission feature:</p>
<ul>
    <li>Transmission is Manual if Transmission_Manual = 1</li>
    <li>Transmission is Automatic if Transmission_Manual = 0</li>
</ul>
<p>Seller_Type feature:</p>
<ul>
    <li>Seller_Type is Individual if Seller_Type_Individual = 1</li>
    <li>Seller_Type is Dealer if Seller_Type_Individual = 0</li>
</ul>
    


### Features and target variable

In [None]:
X = df.drop(["Selling_Price"],axis=1)
y= df["Selling_Price"]


### Splitting data into training and testing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .2, random_state=42)


In [None]:
# X_train,X_test,y_train,y_test
X_train.shape

##  Model Building (Linear Regression)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least squares on the raw features. The `normalize` argument is no
# longer accepted by LinearRegression in current scikit-learn releases; for
# plain least squares, rescaling the inputs does not change the fitted
# predictions, so the argument is simply dropped.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# R^2 on the training split and on the held-out test split.
lr_model.score(X_train, y_train), lr_model.score(X_test, y_test)

# Interpret the model

In [None]:
pd.DataFrame(lr_model.coef_,X_train.columns, columns=["Coefs_"])

In [None]:
# Print each feature name (left-aligned, padded to 20 characters) next to its
# fitted coefficient rounded to three decimals. The pairs are iterated
# directly; building an intermediate dict is unnecessary and would silently
# collapse any repeated column names.
for k, v in zip(X_train.columns, lr_model.coef_):
    print(f"{k:<20}","\t",f"{v:.3f}")

# Model Evaluation

In [None]:
# R^2 on the training split and on the test split. The earlier bare
# `dir(lr_model)` expression was removed: its result was discarded, so it
# produced no output.
lr_model.score(X_train, y_train), lr_model.score(X_test, y_test)

In [None]:
y_pred = lr_model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
def eval_model(y_test,y_pred):
    """Print RMSE, MAE and R^2 for a set of predictions.

    Parameters
    ----------
    y_test
        True target values.
    y_pred
        Predicted values, compared against ``y_test``.

    Returns
    -------
    None
        The three metrics are printed, not returned.
    """
    # RMSE is the square root of the mean squared error.
    rmse = mean_squared_error(y_test, y_pred) ** .5
    mae = mean_absolute_error(y_test, y_pred)
    R2_score = r2_score(y_test, y_pred)
    report = (
        f"rmse:\t\t{rmse }",
        f"mae:\t\t{mae }",
        f"R2_score:\t{R2_score }",
    )
    for line in report:
        print(line)

In [None]:
eval_model(y_test,y_pred)

In [None]:
y_pred = lr_model.predict(X_test)
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred,"residuals":y_test-y_pred})
results

In [None]:
residuals = y_test-y_pred
sns.displot(residuals);

In [None]:
f"{len(residuals[residuals>0])} low + {len(residuals[residuals<0])} high = {len(X_test)}"

In [None]:
plt.scatter(y_test,y_pred)
plt.plot([0,30],[0,30],"r");

In [None]:
from yellowbrick.regressor import PredictionError

# Instantiate the linear model and visualizer. `normalize=True` is dropped:
# current scikit-learn releases no longer accept it, and for plain least
# squares rescaling the inputs does not change the fitted predictions.
lr_model = LinearRegression()
visualizer = PredictionError(lr_model)
visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show()

# Regularization

# Ridge

In [None]:
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the test features.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_test)
)


In [None]:
ridge_model = Ridge()
ridge_model.fit(X_train_scaled,y_train)

In [None]:
ridge_model.score(X_train_scaled,y_train),ridge_model.score(X_test_scaled,y_test)

In [None]:
y_pred = ridge_model.predict(X_test_scaled)

In [None]:
eval_model(y_test,y_pred)

In [None]:
ridge_model.alpha

In [None]:
ridgecv_model = RidgeCV(alphas=np.linspace(0.1, 20, 100),cv=10)

ridgecv_model.fit(X_train_scaled,y_train)
ridgecv_model.alpha_

In [None]:
ridgecv_model = RidgeCV(alphas=np.linspace(0.1, 1, 100),cv=10)

ridgecv_model.fit(X_train_scaled,y_train)
ridgecv_model.alpha_

In [None]:
ridgecv_model.best_score_

In [None]:
# y_pred_rcv = ridgecv_model.predict(X_test_scaled)
y_pred = ridgecv_model.predict(X_test_scaled)

In [None]:
eval_model(y_test,y_pred) #, ridgecv_model.score(X_train_scaled,y_train),ridgecv_model.score(X_test_scaled,y_pred_rcv)

In [None]:
from yellowbrick.regressor import ManualAlphaSelection

# Alphas to cross-validate against. The grid is defined once and handed to the
# visualizer, so the named grid and the values actually searched cannot drift
# apart (previously `alpha_space` was defined but never used).
alpha_space = np.linspace(0.01, 1, 100)

# Instantiate the visualizer with a Ridge model and 10-fold cross-validation.
visualizer = ManualAlphaSelection(
    Ridge(),
    alphas=alpha_space,
    cv=10,
)
visualizer.fit(X_train_scaled, y_train)
visualizer.show()

In [None]:
from yellowbrick.model_selection import FeatureImportances

model = Ridge(alpha=.18)
viz = FeatureImportances(model,labels=list(X.columns),relative=False)
viz.fit(X_train_scaled,y_train)
viz.show()

# Lasso

In [None]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV

In [None]:
# Baseline Lasso with default settings, fitted on the unscaled features.
lasso_model = Lasso().fit(X_train, y_train)
y_pred = lasso_model.predict(X_test)

# eval_model only prints and returns None, so it is called as a statement
# instead of being placed in the displayed tuple (which showed a stray None).
eval_model(y_test, y_pred)
lasso_model.score(X_train, y_train), lasso_model.score(X_test, y_test)

In [None]:
# Cross-validated Lasso (10 folds) over 100 alphas between 0.01 and 10,
# fitted on the scaled features.
lassocv_model = LassoCV(alphas=np.linspace(0.01, 10, 100), cv=10).fit(X_train_scaled, y_train)
y_pred = lassocv_model.predict(X_test_scaled)

# eval_model only prints and returns None, so it is called as a statement
# instead of being placed in the displayed tuple (which showed a stray None).
eval_model(y_test, y_pred)
lassocv_model.score(X_train_scaled, y_train), lassocv_model.score(X_test_scaled, y_test)

In [None]:
lassocv_model.alpha_

In [None]:
# Cross-validated Lasso (10 folds) over a finer grid of 100 alphas between
# 0.001 and 1, fitted on the scaled features.
lassocv_model = LassoCV(alphas=np.linspace(0.001, 1, 100), cv=10).fit(X_train_scaled, y_train)
y_pred = lassocv_model.predict(X_test_scaled)

# eval_model only prints and returns None, so it is called as a statement
# instead of being placed in the displayed tuple (which showed a stray None).
eval_model(y_test, y_pred)
lassocv_model.score(X_train_scaled, y_train), lassocv_model.score(X_test_scaled, y_test)

In [None]:
lassocv_model.alpha_

In [None]:
# Refit a plain Lasso with the alpha selected by the cross-validated search.
# Reading it from `lassocv_model.alpha_` replaces a hand-copied literal
# (0.03127..., a point of that search grid), so the refit stays in sync if the
# data or the grid changes.
lasso_model = Lasso(alpha=lassocv_model.alpha_).fit(X_train_scaled, y_train)
y_pred = lasso_model.predict(X_test_scaled)

# R^2 on the scaled training split and on the scaled test split.
lasso_model.score(X_train_scaled, y_train), lasso_model.score(X_test_scaled, y_test)

In [None]:
from yellowbrick.regressor import AlphaSelection

# Create a list of alphas to cross-validate against
alphas = np.linspace(0.001,.1,100)

# Instantiate the linear model and visualizer
model = LassoCV(alphas=alphas)
visualizer = AlphaSelection(model)
visualizer.fit(X_train_scaled, y_train)
visualizer.show()

In [None]:
# For each alpha in the grid, fit a Lasso on the scaled training data and
# record the mean squared error on the training split and on the test split.
errors = {"train": [], "test": []}
for alpha in np.linspace(0.001, .1, 100):
    model = Lasso(alpha=alpha).fit(X_train_scaled, y_train)
    # Train is evaluated first, then test, so `y_pred` and `error` end each
    # iteration holding the test-split values.
    for split, features, target in (
        ("train", X_train_scaled, y_train),
        ("test", X_test_scaled, y_test),
    ):
        y_pred = model.predict(features)
        error = mean_squared_error(target, y_pred)
        errors[split].append(error)
errors

In [None]:
plt.plot(np.linspace(0.001,.1,100), errors["train"],label="train")
plt.plot(np.linspace(0.001,.1,100), errors["test"],label="test")
plt.legend(fontsize=12)
plt.show()

In [None]:
 len(errors["train"]) , len(errors["test"])

## Elastic-Net 

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV

In [None]:
el_model = ElasticNet().fit(X_train,y_train)
y_pred = el_model.predict(X_test)
eval_model(y_test,y_pred)

In [None]:
el_model.score(X_train,y_train),el_model.score(X_test,y_test)

In [None]:
el_model.get_params()

In [None]:
# Cross-validated Elastic-Net over 100 alphas between 0.001 and 1 and a list
# of candidate L1/L2 mixing ratios, fitted on the unscaled features.
elcv_model = ElasticNetCV(
    alphas=np.linspace(.001, 1, 100),
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
).fit(X_train, y_train)

# Predict on the held-out split. This previously predicted on X_train, which
# left `y_pred` with the wrong length for any comparison against y_test.
y_pred = elcv_model.predict(X_test)

# Train R^2, test R^2 and the estimator's constructor parameters.
elcv_model.score(X_train, y_train), elcv_model.score(X_test, y_test), elcv_model.get_params()

In [None]:
# dir(elcv_model)
elcv_model.get_params()

In [None]:
# `l1_ratio` only echoes the candidate list passed to the constructor; the
# ratio selected by cross-validation is stored on the fitted estimator as
# `l1_ratio_`.
elcv_model.l1_ratio_

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive grid search over Elastic-Net hyper-parameters on the unscaled
# training data: 100 log-spaced alphas from 10**1 to 10**2 crossed with 50
# evenly spaced l1_ratio values from 0.01 to 1.
# NOTE(review): this alpha range (10-100) lies well above the 0.001-1 range
# searched with ElasticNetCV earlier in this notebook - confirm it is intended.
params = {
    "alpha": np.logspace(1, 2, 100),
    "l1_ratio": np.linspace(.01, 1, 50),
}
model = ElasticNet()
gr_model = GridSearchCV(estimator=model, param_grid=params)
gr_model.fit(X_train, y_train)


In [None]:
# Evaluate the best estimator found by the grid search on the test split.
y_pred = gr_model.predict(X_test)

# eval_model only prints and returns None, so it is called as a statement
# instead of being placed in the displayed tuple (which showed a stray None).
eval_model(y_test, y_pred)
gr_model.best_score_, gr_model.best_params_