In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make every top-level expression in a cell render its value, not only the
# last one; later cells list several bare expressions and rely on this.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Importing ML Libraries
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split,RandomizedSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor

from sklearn.metrics import mean_squared_error

In [None]:
# Load the raw car listings (path is relative to the working directory)
cars_data = pd.read_csv(filepath_or_buffer='car data.csv')

In [None]:
# Preview the first five rows
cars_data.head(n=5)

In [None]:
# Data housekeeping: list the distinct values found in each inspected column.
# Each call is kept as its own top-level expression so all four results are shown.

cars_data['Fuel_Type'].unique()
cars_data['Seller_Type'].unique()
cars_data['Transmission'].unique()
cars_data['Owner'].unique()

In [None]:
# Count the missing entries in every column
cars_data.isnull().sum()

In [None]:
# Feature engineering: replace the model year with the car's age, drop the
# car name, and one-hot encode the remaining categorical columns.

# Year the car ages are measured against; kept fixed so results do not drift.
REFERENCE_YEAR = 2020

# Keep the model names aside before the column is removed from the features.
cars_models = cars_data['Car_Name']

cars_data['#_of_Years'] = REFERENCE_YEAR - cars_data['Year']

# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# is no longer accepted by DataFrame.drop in pandas 2.0+.
cars_data = cars_data.drop(columns=['Year', 'Car_Name'])

# drop_first=True removes one dummy per category to avoid redundant columns.
cars_data = pd.get_dummies(cars_data, drop_first=True)

In [None]:
# Correlation heatmap of all columns of the encoded dataset
df = cars_data.copy()

features = df.columns

# Compute the correlation matrix once and reuse it for the mask and the plot.
corr = df[features].corr()

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(16, 12))
ax.set_title('Pearson Correlation Matrix', fontsize=25)

# Draw on the axes created above instead of relying on pyplot's current axes.
sns.heatmap(corr, linewidths=0.25, vmax=0.7, square=True, cmap="YlOrBr",
            linecolor='w', annot=True, annot_kws={"size": 8}, mask=mask,
            cbar_kws={"shrink": .9}, ax=ax);

In [None]:
# Separate the target (selling price) from the predictor columns
Y = cars_data.loc[:, 'Selling_Price']
X = cars_data.drop('Selling_Price', axis='columns')

In [None]:
# Feature importance: fit an extra-trees ensemble on the full dataset.
# The seed is fixed (0, like the other estimators in this notebook) so the
# importances are the same on every run.
extra_reg = ExtraTreesRegressor(random_state=0)
extra_reg.fit(X,Y)

In [None]:
# Horizontal bar chart of the fitted importances, sorted in ascending order
(
    pd.DataFrame(
        data=extra_reg.feature_importances_,
        index=X.columns,
        columns=['Importance'],
    )
    .sort_values('Importance')
    .plot.barh()
)

In [None]:
# Hold out 30% of the rows for testing; the seed keeps the split repeatable
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.3,
)

In [None]:
# Shape check: dimensions of the training and test feature sets after the split
xtrain.shape
xtest.shape

In [None]:
# Linear regression: fit on the training split, then report test RMSE
model = LinearRegression().fit(xtrain, ytrain)
pred = model.predict(xtest)

np.sqrt(mean_squared_error(y_true=ytest, y_pred=pred))

In [None]:
# Decision tree (seeded): fit on the training split, then report test RMSE
model = DecisionTreeRegressor(random_state=0).fit(xtrain, ytrain)
pred = model.predict(xtest)

np.sqrt(mean_squared_error(y_true=ytest, y_pred=pred))

In [None]:
# Random forest (seeded, default settings): fit on the training split, then report test RMSE
model = RandomForestRegressor(random_state=0).fit(xtrain, ytrain)
pred = model.predict(xtest)

np.sqrt(mean_squared_error(y_true=ytest, y_pred=pred))

In [None]:
# Tuned model - Random Forest with hyperparameters picked by RandomizedSearchCV

# Candidate values per hyperparameter. Integer ranges give exact ints directly
# instead of truncating floats produced by np.linspace.
n_estimators      =  list(range(100, 1001, 100))   # 100, 200, ..., 1000
max_depth         =  list(range(5, 31, 5))         # 5, 10, ..., 30
min_samples_split =  [2,5,10,15,20]
min_samples_leaf  =  [1,2,5,10]

# Random grid
parameters = {  'n_estimators':n_estimators,
                'max_depth': max_depth,
                'min_samples_split' : min_samples_split,
                'min_samples_leaf' : min_samples_leaf
            }

rf = RandomForestRegressor(random_state=0)

# random_state fixes which 10 candidates are sampled, so the search (and the
# resulting model) is reproducible across runs.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=parameters,
    n_iter=10,
    cv=10,
    verbose=1,
    scoring='neg_mean_squared_error',
    random_state=0,
)
random_search = random_search.fit(xtrain,ytrain)

# With the default refit=True, best_estimator_ has already been retrained on
# the whole training set with the best parameters, so no second fit is needed.
random_search_tuned = random_search.best_estimator_
model_tuned = random_search_tuned

pred = model_tuned.predict(xtest)

# Test RMSE of the tuned model
np.sqrt(mean_squared_error(ytest,pred))

In [None]:
# Pickle the tuned model
import pickle

# The context manager closes the file when the block exits (even if dump
# raises), so the serialized bytes are flushed to disk and no handle is leaked.
with open('rf_regressor_tuned_model','wb') as file:
    pickle.dump(model_tuned,file)