<h3 style="font-family:serif"><b> Importing the required libraries</b></h3>

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error

<h3 style="font-family:serif"><b> Reading the dataset </b></h3>

In [2]:
data = pd.read_csv('E:\Downloads\Cardekho_Extract.csv')
print(f'''number of rows - {data.shape[0]}
number of columns - {data.shape[1]}''')

number of rows - 20026
number of columns - 16


In [3]:
data.head()

Unnamed: 0,Source.Name,web-scraper-order,web-scraper-start-url,full_name,selling_price,new-price,year,seller_type,km_driven,owner_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,cardekho_extract(0-2000).csv,1611917819-1662,https://www.cardekho.com/used-car-details/used...,Maruti Alto Std,1.2 Lakh*,,2012.0,Individual,"1,20,000 kms",First Owner,Petrol,Manual,Mileage19.7 kmpl,Engine796 CC,Max Power46.3 bhp,Seats5
1,cardekho_extract(0-2000).csv,1611918361-1902,https://www.cardekho.com/used-car-details/used...,Hyundai Grand i10 Asta,5.5 Lakh*,New Car (On-Road Price) : Rs.7.11-7.48 Lakh*,2016.0,Individual,"20,000 kms",First Owner,Petrol,Manual,Mileage18.9 kmpl,Engine1197 CC,Max Power82 bhp,Seats5
2,cardekho_extract(0-2000).csv,1611917012-1306,https://www.cardekho.com/used-car-details/used...,Hyundai i20 Asta,2.15 Lakh*,,2010.0,Individual,"60,000 kms",First Owner,Petrol,Manual,Mileage17.0 kmpl,Engine1197 CC,Max Power80 bhp,Seats5
3,cardekho_extract(0-2000).csv,1611917695-1607,https://www.cardekho.com/used-car-details/used...,Maruti Alto K10 2010-2014 VXI,2.26 Lakh*,,2012.0,Individual,"37,000 kms",First Owner,Petrol,Manual,Mileage20.92 kmpl,Engine998 CC,Max Power67.1 bhp,Seats5
4,cardekho_extract(0-2000).csv,1611914861-367,https://www.cardekho.com/used-car-details/used...,Ford Ecosport 2015-2021 1.5 TDCi Titanium BSIV,5.7 Lakh*,New Car (On-Road Price) : Rs.10.14-13.79 Lakh*,2015.0,Dealer,"30,000 kms",First Owner,Diesel,Manual,Mileage22.77 kmpl,Engine1498 CC,Max Power98.59 bhp,Seats5


<h3 style="font-family:serif"><b>Checking NaN values</b></h3>

In [4]:
def null_count():
    return pd.DataFrame({'features': data.columns,
                'dtypes': data.dtypes.values,
                'NaN count': data.isnull().sum().values,
                'NaN percentage': data.isnull().sum().values/data.shape[0]}).style.background_gradient(cmap='Blues',low=0.1,high=0.01)
null_count()

Unnamed: 0,features,dtypes,NaN count,NaN percentage
0,Source.Name,object,0,0.0
1,web-scraper-order,object,0,0.0
2,web-scraper-start-url,object,0,0.0
3,full_name,object,46,0.002297
4,selling_price,object,46,0.002297
5,new-price,object,10460,0.522321
6,year,float64,46,0.002297
7,seller_type,object,46,0.002297
8,km_driven,object,46,0.002297
9,owner_type,object,46,0.002297


<h3 style="font-family:serif"><b>Cleaning selling price column</b></h3>
<p>selling price has different values such as "5.5 Lakh*", "2.3 Cr*" and "95,000*" which is of data type object, gettting only the numerical value and multipling by suffix value lakhs, cr... eg(5.5 Lakh*: 500000, 2.3Cr*: 23000000, 95,000*: 95000) </p>

In [5]:
for i in range(data.shape[0]):
    try:
        price = float(data['selling_price'][i].split(' ')[0])
        digit = data['selling_price'][i].split(' ')[1]
        if digit == 'Lakh*':
            price = price * 100000
            data['selling_price'][i] = price
        elif digit == 'Cr*':
            price = price * 10000000
            data['selling_price'][i] = price
    except:
        price = data['selling_price'][i][:-1]
        price = price.replace(',', '')
        data['selling_price'][i] = float(price)

TypeError: 'float' object is not subscriptable

<h3 style="font-family:serif"><b>Cleaning kilometer driven, Mileage, Engine, Maxpower and Seats columns</b></h3>
<p> km_driven, mileage, engine, max_power and seats have values like "120,000 kms", "Mileage19.7 kmpl", "Engine796CC", "Max Power46.3 bhp" and "seats5", getting only the numerical values from those strings. eg (km_driven: 120000, mileage: 19.7, engine:796, max_power: 46.3, seats: 5)</p>

In [None]:
# kilometer driven
data['km_driven'] = data['km_driven'].str.split(' ', n=1, expand=True)[0]
data['km_driven'] = data['km_driven'].str.replace(',','')
# Mileage
data['mileage'] = data['mileage'].str.split(' ', expand=True)[0].str.split('e', expand=True)[2]
# Engine
data['engine'] = data['engine'].str.split(' ', expand=True)[0].str.split('e',expand=True)[1]
# Max Power
data['max_power'] = data['max_power'].str.split(' ', expand=True)[1].str.split('r',expand=True)[1]
# Seats 
data['seats'] = data['seats'].str.split('s', expand=True)[1]

<h3 style="font-family:serif"><b>Changing the data types of the columns</b></h3>
<p>changing the data types of the feaures such as selling_price, km_driven, mileage, engine, max_power, seats to int/float</p>

In [None]:
cols = ['selling_price', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']

for col in cols:
    try:
        data[col] = data[col].astype(int)
    except:
        data[col] = data[col].astype(float)

<h3 style="font-family:serif"><b>Creating new feature Company</b></h3>
<p> creating a new feature "company" from full_name by getting the first word which is actually the cars company name, "Ford Ecosport 2015-2021 1.5 TDCi Titanium BSIV" this car model belongs to Ford company</p>

In [None]:
# creating new feature company instead of full name
data['company'] = data['full_name'].str.split(' ', expand=True)[0]

<h3 style="font-family:serif"><b>Droping unwanted columns</b></h3>
<p>Droping "new_price" column which has more the 50 percent missing values, "full_name" because I created company column for better analysis and "owner_type" for having only one value which doesn't provide any information.</p>


In [None]:
data.drop(columns=['new_price','full_name','owner_type'], axis=1, inplace=True)
data.head()

<h3 style="font-family:serif"><b>Basic statistic on numerical features</b></h3>

In [None]:
data.describe().T

<h3 style="font-family:serif"><b>Basic statistics on categorical features</b></h3>

In [None]:
data.describe(include='O')

In [None]:
x = data.nunique().sort_values(ascending=False).index
y = data.nunique().sort_values(ascending=False).values

plt.rcParams['figure.dpi'] = 200
fig = plt.figure(figsize=(7,3))
gs = fig.add_gridspec(1,1)
ax0 = fig.add_subplot(gs[0,0])
for i in ['top', 'right']:
    ax0.spines[i].set_visible(False)
    
colormap = ['#79c6e8' for i in range(data.shape[1])]
colormap[0] = '#9c9a9a'

ax0.bar(x=x, height=y, edgecolor='k', linewidth=0.7, color=colormap, alpha=0.9)
ax0.tick_params(axis='x',labelsize=5, rotation=90, width=1)
ax0.tick_params(axis='y', labelsize=5, left=False)
fig.suptitle(t='Number of unique values in each features', fontsize=6, weight='bold')

for p in ax0.patches:
    value = f'{p.get_height():,.0f}'
    x = p.get_x() + p.get_width() / 2
    y = p.get_y() + p.get_height()+290
    ax0.text(x, y, value, ha='center', va='center', fontsize=5, 
            bbox=dict(facecolor='white', edgecolor='black', boxstyle='round', linewidth=0.3))
plt.show()


In [None]:
fig = plt.figure(figsize=(7,3))
gs = fig.add_gridspec(1,1)

cars = data.company.value_counts().head(10)
pal = sns.light_palette('#79c6e8', n_colors=50, reverse=True)
pal[0] = '#9c9a9a'
ax = fig.add_subplot(gs[0,0])
ax=sns.barplot(x=cars.index, y=cars.values, edgecolor='k', linewidth=0.7, palette=pal)
ax.tick_params(axis='x', labelsize=5, rotation=90)
ax.tick_params(axis='y', labelsize=5, left=False)
ax.set_ylabel(ylabel='count', fontsize=5, weight='bold')

for p in ax.patches:
    value = f'{p.get_height():,.0f}'
    x = p.get_x() + p.get_width()/2
    y = p.get_y() + p.get_height()+320
    ax.text(x=x, y=y, s=value, fontsize=4.5, ha='center', va='center',
           bbox=dict(fc='white', edgecolor='k', boxstyle='round', linewidth=0.5))
    
fig.suptitle(t='Top 10 most present company names in the dataset', fontsize='6', weight='bold')
sns.despine()
plt.show()

In [None]:
fig = plt.figure(figsize=(7,3))
gs = fig.add_gridspec(1,1)

cars = data.company.value_counts().tail(15)
pal = sns.light_palette('#79c6e8', n_colors=50, reverse=True)
pal[0] = '#9c9a9a'
ax = fig.add_subplot(gs[0,0])
ax=sns.barplot(x=cars.index, y=cars.values, edgecolor='k', linewidth=0.7, palette=pal)
ax.tick_params(axis='x', labelsize=5, rotation=90)
ax.tick_params(axis='y', labelsize=5, left=False)
ax.set_ylabel(ylabel='count', fontsize=5, weight='bold')

for p in ax.patches:
    value = f'{p.get_height():,.0f}'
    x = p.get_x() + p.get_width()/2
    y = p.get_y() + p.get_height()+0.5
    ax.text(x=x, y=y, s=value, fontsize=4.5, ha='center', va='center',
           bbox=dict(fc='white', edgecolor='k', boxstyle='round', linewidth=0.5))
    
fig.suptitle(t='Top 10 most least company names in the dataset', fontsize='6', weight='bold')
sns.despine()
plt.show()

In [None]:
fig = plt.figure(figsize=(7,3))
gs = fig.add_gridspec(1,1)

cars = data.year.value_counts().head(10)
pal = sns.light_palette('#79c6e8', n_colors=50, reverse=True)
pal[7] = '#9c9a9a'
ax = fig.add_subplot(gs[0,0])
ax=sns.barplot(x=cars.index, y=cars.values, edgecolor='k', linewidth=0.7, palette=pal)
ax.tick_params(axis='x', labelsize=5, rotation=90)
ax.tick_params(axis='y', labelsize=5, left=False)
ax.set_ylabel(ylabel='count', fontsize=5, weight='bold')

for p in ax.patches:
    value = f'{p.get_height():,.0f}'
    x = p.get_x() + p.get_width()/2
    y = p.get_y() + p.get_height()+150
    ax.text(x=x, y=y, s=value, fontsize=4.5, ha='center', va='center',
           bbox=dict(fc='white', edgecolor='k', boxstyle='round', linewidth=0.5))
    
fig.suptitle(t='Top 10 years with highest sales', fontsize='6', weight='bold')
sns.despine()
plt.show()

<h3 style="font-family:serif"><b>Bivariate Analysis </b></h3>

In [None]:
fig = plt.figure(figsize=(7,3))
gs = fig.add_gridspec(1,1)

pal = sns.light_palette('#79c6e8', reverse=True, n_colors=50)
pal.insert(0,'#9c9a9a')

cars = data.groupby('company')['selling_price'].mean().sort_values(ascending=False).head(10)/1000000
ax = fig.add_subplot(gs[0,0])
ax=sns.barplot(x=cars.index, y=cars.values, edgecolor='k', linewidth=0.7, palette=pal)
ax.tick_params(axis='x', labelsize=5, rotation=90)
ax.tick_params(axis='y', labelsize=5, left=False)
ax.set_xlabel(xlabel=None)
ax.set_ylabel(ylabel='price in millions', fontsize=5, weight='bold')
for p in ax.patches:
    value = f'{p.get_height():,.1f}' + ' M'
    x = p.get_x() + p.get_width()/2
    y = p.get_y() + p.get_height()+3
    ax.text(x=x, y=y, s=value, fontsize=4.5, ha='center', va='center',
           bbox=dict(fc='white', edgecolor='k', boxstyle='round', linewidth=0.5))
    
fig.suptitle(t='Top 10 car company with highest selling price', fontsize=6, weight='bold')
sns.despine()
plt.show()

In [None]:
fig = plt.figure(figsize=(7,5))
gs = fig.add_gridspec(2,1)
color = ['#79c6e8' for i in range(data.shape[0])]
color[4967] = color[19972] = color[14257] = color[475] = '#9c9a9a'
ax=fig.add_subplot(gs[0,0])
ax.scatter(x=data.km_driven/1000000,y=data.selling_price/1000000,s=28,
                       color=color, edgecolor='white', linewidth=0.4)
ax.tick_params(labelsize=5, left=False)
ax.set_ylabel(ylabel='selling price in millions', fontsize=5, weight='bold')

ax1=fig.add_subplot(gs[1,0])
rmd_data = data[data['selling_price']<20000000]
rmd_data = rmd_data[rmd_data['km_driven']<2000000]
ax1.scatter(x=rmd_data.km_driven/1000000,y=rmd_data.selling_price/1000000,
            s=28, color='#79c6e8', edgecolor='white', linewidth=0.4)
ax1.tick_params(labelsize=5, left=False)
ax1.set_xlabel(xlabel='kilometer driven in millions', fontsize=5, weight='bold')
ax1.set_ylabel(ylabel='selling price in millions', fontsize=5, weight='bold')
ax1.text(x=1, y=12, s='After removing outliers', fontsize=6, weight='bold', alpha=0.6,
        bbox=dict(facecolor='white', lw=0.5, alpha=0.3))
sns.despine()
plt.show()

In [None]:
fig = plt.figure(figsize=(7,5))
gs = fig.add_gridspec(2,1)
color = ['#79c6e8' for i in range(data.shape[0])]
color[4967] = color[14257] = color[475] = color[11964] = color[18396]= '#9c9a9a'
ax=fig.add_subplot(gs[0,0])
ax.scatter(x=data.mileage,y=data.selling_price/1000000,s=28,
                       color=color, edgecolor='white', linewidth=0.4)
ax.tick_params(labelsize=5, left=False)
ax.set_ylabel(ylabel='selling price in millions', fontsize=5, weight='bold')

ax1=fig.add_subplot(gs[1,0])
rmd_data = data[data['selling_price']<20000000]
rmd_data = rmd_data[rmd_data['mileage']<100]
ax1.scatter(x=rmd_data.mileage,y=rmd_data.selling_price/1000000,s=28,
                       color='#79c6e8', edgecolor='white', linewidth=0.4)
ax1.tick_params(labelsize=5, left=False)
ax1.set_xlabel(xlabel='mileage in kmpl', fontsize=5, weight='bold')
ax1.set_ylabel(ylabel='selling price in millions', fontsize=5, weight='bold')
ax1.text(x=25, y=12, s='After removing outliers', fontsize=6, weight='bold', alpha=0.6,
        bbox=dict(facecolor='white', lw=0.5, alpha=0.3))
sns.despine()
plt.show()

In [None]:
fig = plt.figure(figsize=(7,5))
gs = fig.add_gridspec(2,1)
color = ['#79c6e8' for i in range(data.shape[0])]
color[4967]=color[475]=color[14257]=color[1536]=color[17020]=color[16856]=color[13130]=color[3980]='#9c9a9a'
ax=fig.add_subplot(gs[0,0])
ax.scatter(x=data.max_power,y=data.selling_price/1000000,s=28,
                       color=color, edgecolor='white', linewidth=0.4)
ax.tick_params(labelsize=5, left=False)
ax.set_ylabel(ylabel='selling price in millions', fontsize=5, weight='bold')

ax1=fig.add_subplot(gs[1,0])
rmd_data = data[data['selling_price']<20000000]
rmd_data = rmd_data[rmd_data['max_power']<530]
ax1.scatter(x=rmd_data.max_power,y=rmd_data.selling_price/1000000,s=28,
                       color='#79c6e8', edgecolor='white', linewidth=0.4)
ax1.tick_params(labelsize=5, left=False)
ax1.set_xlabel(xlabel='max_power in bhp', fontsize=5, weight='bold')
ax1.set_ylabel(ylabel='selling price in millions', fontsize=5, weight='bold')
ax1.text(x=380, y=11, s='After removing outliers', fontsize=6, weight='bold', alpha=0.6,
        bbox=dict(facecolor='white', lw=0.5, alpha=0.3))
sns.despine()
plt.show()

In [None]:
for i in ['mileage', 'engine', 'max_power', 'seats']:
    company_name = data[data[i].isnull()]['company'].value_counts().index[0]
    if data[i].nunique()>10:
        values = data[data['company']==company_name][i].mean()
    else:
        values = data[data['company']==company_name][i].median()
        
    data[i].fillna(values, inplace=True)

<h3 style="font-family:serif"><b> Removing outliers in the data </b></h3>

In [None]:
data = data[data['selling_price'] < 20000000]
data = data[data['km_driven'] < 1000000]
data = data[data['mileage'] < 100]
data = data[data['engine'] < 6100]
data = data[data['max_power'] < 530]
data = data.reset_index(drop=True)

<h3 style="font-family:serif"><b>Appling discretization on company</b></h3>

In [None]:
company_name = data.company.value_counts().index[:15]
for i in range(data.shape[0]):
    if data['company'][i] in company_name:
         continue
    else:
        data['company'][i] = 'others'

<h3 style="font-family:serif"><b>Encoding categorical features</b></h3>

In [None]:
data = pd.get_dummies(data=data, columns=['seller_type','fuel_type','transmission_type','company'], drop_first=True)
data.shape

<h3 style="font-family:serif"><b>Splitting the data into training and testing</b></h3>

In [None]:
x = data.iloc[:,1:]
y = data['selling_price']
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.20,random_state=25)

<h3 style="font-family:serif"><b>Feature Scaling</b></h3>

In [None]:
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

In [None]:
def do_prediction(classifier):
    
    # training the classifier on the dataset
    classifier.fit(xtrain, ytrain)
    
    #Do prediction and evaluting the prediction
    prediction = classifier.predict(xtest)
    cross_validation_score = cross_val(xtrain, ytrain, classifier)
    error = mean_absolute_error(ytest, prediction)
    
    return error, cross_validation_score

def cross_val(xtrain, ytrain, classifier):
    
    # Applying k-Fold Cross Validation
    accuracies = cross_val_score(estimator = classifier, X = xtrain, y = ytrain, cv = 5)
    return accuracies.mean()

<h3 style="font-family:serif"><b>1. Linear Regression</b></h3>

In [None]:
model_1 = LinearRegression()
error, score = do_prediction(model_1)

print('Linear Regression MAE: {}'.format(round(error,2)))
print('Cross validation score: {}'.format(round(score,2)))