In [57]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## 1. Load Data

In [58]:
df = pd.read_csv('cardekho.csv')
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


In [59]:
df.info() 

<class 'pandas.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                8128 non-null   str    
 1   year                8128 non-null   int64  
 2   selling_price       8128 non-null   int64  
 3   km_driven           8128 non-null   int64  
 4   fuel                8128 non-null   str    
 5   seller_type         8128 non-null   str    
 6   transmission        8128 non-null   str    
 7   owner               8128 non-null   str    
 8   mileage(km/ltr/kg)  7907 non-null   float64
 9   engine              7907 non-null   float64
 10  max_power           7913 non-null   str    
 11  seats               7907 non-null   float64
dtypes: float64(3), int64(3), str(6)
memory usage: 762.1 KB


In [60]:
df.describe().loc[['mean', 'min', 'max']]

Unnamed: 0,year,selling_price,km_driven,mileage(km/ltr/kg),engine,seats
mean,2013.804011,638271.8,69819.51,19.418783,1458.625016,5.416719
min,1983.0,29999.0,1.0,0.0,624.0,2.0
max,2020.0,10000000.0,2360457.0,42.0,3604.0,14.0


In [61]:
# Check for missing values
df.isna().sum()

name                    0
year                    0
selling_price           0
km_driven               0
fuel                    0
seller_type             0
transmission            0
owner                   0
mileage(km/ltr/kg)    221
engine                221
max_power             215
seats                 221
dtype: int64

In [62]:
df.describe(include='O') # Identify categorical features

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  df.describe(include='O') # Identify categorical features


Unnamed: 0,name,fuel,seller_type,transmission,owner,max_power
count,8128,8128,8128,8128,8128,7913
unique,2058,4,3,2,5,320
top,Maruti Swift Dzire VDI,Diesel,Individual,Manual,First Owner,74
freq,129,4402,6766,7078,5289,377


In [63]:
# Check unique values in categorical features
print(df['fuel'].unique())
print(df['seller_type'].unique())
print(df['transmission'].unique())
print(df['owner'].unique())

<StringArray>
['Diesel', 'Petrol', 'LPG', 'CNG']
Length: 4, dtype: str
<StringArray>
['Individual', 'Dealer', 'Trustmark Dealer']
Length: 3, dtype: str
<StringArray>
['Manual', 'Automatic']
Length: 2, dtype: str
<StringArray>
[         'First Owner',         'Second Owner',          'Third Owner',
 'Fourth & Above Owner',       'Test Drive Car']
Length: 5, dtype: str


In [64]:
# convert categorical features to numerical
df = pd.get_dummies(df, columns=['fuel', 'seller_type'], dtype=int)
df['owner'] = df['owner'].map({'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3, 'Fourth & Above Owner': 4, 'Test Drive Car': 0})
df['transmission'] = df['transmission'].map({'Manual': 0, 'Automatic': 1})
df.head()

Unnamed: 0,name,year,selling_price,km_driven,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer
0,Maruti Swift Dzire VDI,2014,450000,145500,0,1,23.4,1248.0,74.0,5.0,0,1,0,0,0,1,0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,0,2,21.14,1498.0,103.52,5.0,0,1,0,0,0,1,0
2,Honda City 2017-2020 EXi,2006,158000,140000,0,3,17.7,1497.0,78.0,5.0,0,0,0,1,0,1,0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,0,1,23.0,1396.0,90.0,5.0,0,1,0,0,0,1,0
4,Maruti Swift VXI BSIII,2007,130000,120000,0,1,16.1,1298.0,88.2,5.0,0,0,0,1,0,1,0
