In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import MinMaxScaler

In [195]:
#Loading the Dataset
df = pd.read_csv("used_car_dataset.csv")
df

Unnamed: 0.1,Unnamed: 0,brand,currency,description,fuel_type,item_condition,manufacturer,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission
0,0,MG,PKR,MG HS 2021 for sale in Lahore,Petrol,used,MG,"6,000 km",2021.0,6525000.0,1490cc,Automatic
1,1,Toyota,PKR,Toyota Aygo 2012 for sale in Islamabad,Petrol,used,Toyota,"68,000 km",2012.0,1625000.0,1000cc,Manual
2,2,Honda,PKR,Honda City 2015 for sale in Lahore,Petrol,used,Honda,"150,000 km",2015.0,2550000.0,1300cc,Automatic
3,3,KIA,PKR,KIA Sorento 2022 for sale in Sialkot,Petrol,used,KIA,"18,000 km",2022.0,9000000.0,3500cc,Automatic
4,4,Toyota,PKR,Toyota Corolla 2015 for sale in Rawalpindi,Petrol,used,Toyota,"110,000 km",2015.0,3350000.0,1300cc,Automatic
...,...,...,...,...,...,...,...,...,...,...,...,...
60104,60104,KIA,PKR,KIA Sportage 2022 for sale in Lahore,Petrol,used,KIA,10 km,2022.0,7900000.0,2000cc,Automatic
60105,60105,KIA,PKR,KIA Picanto 2022 for sale in Lahore,Petrol,used,KIA,10 km,2022.0,3350000.0,1000cc,Automatic
60106,60106,Suzuki,PKR,Suzuki Swift 2022 for sale in Lahore,Petrol,used,Suzuki,10 km,2022.0,4750000.0,1200cc,Automatic
60107,60107,Suzuki,PKR,Suzuki Wagon R 2018 for sale in Rawalpindi,Hybrid,used,Suzuki,"15,000 km",2018.0,3175000.0,660cc,Automatic


In [196]:
#Dropping Duplicate Columns
df = df.drop(columns=["Unnamed: 0","manufacturer","currency", 'item_condition'])

In [197]:
#Checking For Null Values
df.isnull().sum()

brand                    0
description              0
fuel_type                0
mileage_from_odometer    0
model_date               0
price                    0
vehicle_engine           0
vehicle_transmission     0
dtype: int64

In [198]:
#Finding Out Duplicates in Data and Dropping Them
print("Duplicate Data",df.duplicated().sum())
df.drop_duplicates(inplace=True)

Duplicate Data 5242


In [199]:
df.describe()

Unnamed: 0,model_date,price
count,54867.0,54867.0
mean,2010.936537,3341167.0
std,9.511491,5103791.0
min,1942.0,1780.0
25%,2006.0,1130000.0
50%,2013.0,2200000.0
75%,2018.0,3850000.0
max,2023.0,210000000.0


In [200]:
df.head()

Unnamed: 0,brand,description,fuel_type,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission
0,MG,MG HS 2021 for sale in Lahore,Petrol,"6,000 km",2021.0,6525000.0,1490cc,Automatic
1,Toyota,Toyota Aygo 2012 for sale in Islamabad,Petrol,"68,000 km",2012.0,1625000.0,1000cc,Manual
2,Honda,Honda City 2015 for sale in Lahore,Petrol,"150,000 km",2015.0,2550000.0,1300cc,Automatic
3,KIA,KIA Sorento 2022 for sale in Sialkot,Petrol,"18,000 km",2022.0,9000000.0,3500cc,Automatic
4,Toyota,Toyota Corolla 2015 for sale in Rawalpindi,Petrol,"110,000 km",2015.0,3350000.0,1300cc,Automatic


In [201]:
df.dtypes

brand                     object
description               object
fuel_type                 object
mileage_from_odometer     object
model_date               float64
price                    float64
vehicle_engine            object
vehicle_transmission      object
dtype: object

In [202]:
# convert vehicle_engine and mileage to numeric
df['mileage_from_odometer'] = df['mileage_from_odometer'].str.replace(' km','')
df['mileage_from_odometer'] = df['mileage_from_odometer'].str.replace(',','').apply(pd.to_numeric)
df['vehicle_engine'] = df['vehicle_engine'].str.replace('cc', '').apply(pd.to_numeric)
df.dtypes

brand                     object
description               object
fuel_type                 object
mileage_from_odometer      int64
model_date               float64
price                    float64
vehicle_engine           float64
vehicle_transmission      object
dtype: object

In [203]:
print(df.isnull().sum())
df['vehicle_engine'].interpolate(inplace=True)
print((df.isnull().sum()))

brand                     0
description               0
fuel_type                 0
mileage_from_odometer     0
model_date                0
price                     0
vehicle_engine           65
vehicle_transmission      0
dtype: int64
brand                    0
description              0
fuel_type                0
mileage_from_odometer    0
model_date               0
price                    0
vehicle_engine           0
vehicle_transmission     0
dtype: int64


In [204]:
# Checking unique values
unique_fuel_type = df['fuel_type'].unique()
unique_brand = df['brand'].unique()
unique_vehicle_transmission = df['vehicle_transmission'].unique()

print("Unique fuel types:", unique_fuel_type)
print("Unique brands:", unique_brand)
print("Unique vehicle transmissions:", unique_vehicle_transmission)

Unique fuel types: ['Petrol' 'Hybrid' 'Diesel' 'CNG' 'Electric' 'Lpg']
Unique brands: ['MG' 'Toyota' 'Honda' 'KIA' 'Nissan' 'Suzuki' 'Mercedes Benz'
 'Range Rover' 'Daihatsu' 'Changan' 'BMW' 'Hyundai' 'Mazda' 'Haval' 'DFSK'
 'Mitsubishi' 'Audi' 'Chevrolet' 'Porsche' 'United' 'Others' 'FAW' 'Lexus'
 'Prince' 'Chrysler' 'Subaru' 'Peugeot' 'Ford' 'Land Rover' 'Master'
 'Volkswagen' 'Chery' 'Willys' 'Proton' 'Jeep' 'MINI' 'SsangYong' 'Isuzu'
 'Daewoo' 'Daehan' 'BAIC' 'Tesla' 'Cadillac' 'Datsun' 'Adam' 'Hummer'
 'Alfa Romeo' 'Sogo' 'Golf' 'Jaguar' 'Dodge' 'Hino' 'Buick' 'Morris'
 'JW Forland' 'Bentley' 'JAC' 'Fiat' 'Roma' 'JMC' 'Ferrari' 'Geely'
 'ZOTYE' 'Opel' 'Vauxhall' 'Oldsmobile' 'GMC' 'Citroen' 'Classic Cars'
 'Mushtaq' 'Volvo']
Unique vehicle transmissions: ['Automatic' 'Manual']


In [205]:
# Dummy encode fuel type column
df = pd.get_dummies(df, columns=['fuel_type'], prefix='fuel', drop_first=True)
df.head()

Unnamed: 0,brand,description,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Lpg,fuel_Petrol
0,MG,MG HS 2021 for sale in Lahore,6000,2021.0,6525000.0,1490.0,Automatic,0,0,0,0,1
1,Toyota,Toyota Aygo 2012 for sale in Islamabad,68000,2012.0,1625000.0,1000.0,Manual,0,0,0,0,1
2,Honda,Honda City 2015 for sale in Lahore,150000,2015.0,2550000.0,1300.0,Automatic,0,0,0,0,1
3,KIA,KIA Sorento 2022 for sale in Sialkot,18000,2022.0,9000000.0,3500.0,Automatic,0,0,0,0,1
4,Toyota,Toyota Corolla 2015 for sale in Rawalpindi,110000,2015.0,3350000.0,1300.0,Automatic,0,0,0,0,1


In [206]:
#LABEL ENCODING ON TRANSMIssion
from sklearn.preprocessing import LabelEncoder


label_encoder = LabelEncoder()


df['vehicle_transmission'] = label_encoder.fit_transform(df['vehicle_transmission'])
df.head()

Unnamed: 0,brand,description,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Lpg,fuel_Petrol
0,MG,MG HS 2021 for sale in Lahore,6000,2021.0,6525000.0,1490.0,0,0,0,0,0,1
1,Toyota,Toyota Aygo 2012 for sale in Islamabad,68000,2012.0,1625000.0,1000.0,1,0,0,0,0,1
2,Honda,Honda City 2015 for sale in Lahore,150000,2015.0,2550000.0,1300.0,0,0,0,0,0,1
3,KIA,KIA Sorento 2022 for sale in Sialkot,18000,2022.0,9000000.0,3500.0,0,0,0,0,0,1
4,Toyota,Toyota Corolla 2015 for sale in Rawalpindi,110000,2015.0,3350000.0,1300.0,0,0,0,0,0,1


In [207]:
df2 = df.copy()
df2['brand'].nunique()

71

In [208]:
# Ordinal Encoding based on average selling price of brand
brand_means = df.groupby('brand')['price'].mean()
sorted_brands = brand_means.sort_values().index
dtype = CategoricalDtype(categories=sorted_brands, ordered=True)
df['brand'] = df['brand'].astype(dtype).cat.codes
df

Unnamed: 0,brand,description,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Lpg,fuel_Petrol
0,53,MG HS 2021 for sale in Lahore,6000,2021.0,6525000.0,1490.0,0,0,0,0,0,1
1,42,Toyota Aygo 2012 for sale in Islamabad,68000,2012.0,1625000.0,1000.0,1,0,0,0,0,1
2,35,Honda City 2015 for sale in Lahore,150000,2015.0,2550000.0,1300.0,0,0,0,0,0,1
3,43,KIA Sorento 2022 for sale in Sialkot,18000,2022.0,9000000.0,3500.0,0,0,0,0,0,1
4,42,Toyota Corolla 2015 for sale in Rawalpindi,110000,2015.0,3350000.0,1300.0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
60101,43,KIA Sportage 2023 for sale in Karachi,30,2023.0,7900000.0,2000.0,0,0,0,0,0,1
60102,43,KIA Sportage 2022 for sale in Karachi,40,2022.0,7900000.0,2000.0,0,0,0,0,0,1
60103,67,Lexus LX Series 2003 for sale in Lahore,80792,2003.0,7800000.0,4700.0,0,0,0,0,0,1
60107,15,Suzuki Wagon R 2018 for sale in Rawalpindi,15000,2018.0,3175000.0,660.0,0,0,0,1,0,0


In [209]:
df['place_of_sale'] = df.description.loc[:].str.split(' ').str[-1]
df

Unnamed: 0,brand,description,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Lpg,fuel_Petrol,place_of_sale
0,53,MG HS 2021 for sale in Lahore,6000,2021.0,6525000.0,1490.0,0,0,0,0,0,1,Lahore
1,42,Toyota Aygo 2012 for sale in Islamabad,68000,2012.0,1625000.0,1000.0,1,0,0,0,0,1,Islamabad
2,35,Honda City 2015 for sale in Lahore,150000,2015.0,2550000.0,1300.0,0,0,0,0,0,1,Lahore
3,43,KIA Sorento 2022 for sale in Sialkot,18000,2022.0,9000000.0,3500.0,0,0,0,0,0,1,Sialkot
4,42,Toyota Corolla 2015 for sale in Rawalpindi,110000,2015.0,3350000.0,1300.0,0,0,0,0,0,1,Rawalpindi
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60101,43,KIA Sportage 2023 for sale in Karachi,30,2023.0,7900000.0,2000.0,0,0,0,0,0,1,Karachi
60102,43,KIA Sportage 2022 for sale in Karachi,40,2022.0,7900000.0,2000.0,0,0,0,0,0,1,Karachi
60103,67,Lexus LX Series 2003 for sale in Lahore,80792,2003.0,7800000.0,4700.0,0,0,0,0,0,1,Lahore
60107,15,Suzuki Wagon R 2018 for sale in Rawalpindi,15000,2018.0,3175000.0,660.0,0,0,0,1,0,0,Rawalpindi


In [210]:
# Ordinal Encoding based on average selling price of brand
place_means = df.groupby('place_of_sale')['price'].mean()
sorted_places = place_means.sort_values().index
dtype = CategoricalDtype(categories=sorted_places, ordered=True)
df['place_of_sale'] = df['place_of_sale'].astype(dtype).cat.codes
df

Unnamed: 0,brand,description,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Lpg,fuel_Petrol,place_of_sale
0,53,MG HS 2021 for sale in Lahore,6000,2021.0,6525000.0,1490.0,0,0,0,0,0,1,296
1,42,Toyota Aygo 2012 for sale in Islamabad,68000,2012.0,1625000.0,1000.0,1,0,0,0,0,1,294
2,35,Honda City 2015 for sale in Lahore,150000,2015.0,2550000.0,1300.0,0,0,0,0,0,1,296
3,43,KIA Sorento 2022 for sale in Sialkot,18000,2022.0,9000000.0,3500.0,0,0,0,0,0,1,290
4,42,Toyota Corolla 2015 for sale in Rawalpindi,110000,2015.0,3350000.0,1300.0,0,0,0,0,0,1,190
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60101,43,KIA Sportage 2023 for sale in Karachi,30,2023.0,7900000.0,2000.0,0,0,0,0,0,1,279
60102,43,KIA Sportage 2022 for sale in Karachi,40,2022.0,7900000.0,2000.0,0,0,0,0,0,1,279
60103,67,Lexus LX Series 2003 for sale in Lahore,80792,2003.0,7800000.0,4700.0,0,0,0,0,0,1,296
60107,15,Suzuki Wagon R 2018 for sale in Rawalpindi,15000,2018.0,3175000.0,660.0,0,0,0,1,0,0,190


In [211]:
df.describe()

Unnamed: 0,brand,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Lpg,fuel_Petrol,place_of_sale
count,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0
mean,30.889733,92044.281244,2010.936537,3341167.0,1427.683954,0.511364,0.056045,0.001221,0.032005,0.000492,0.900833,251.610823
std,12.661694,93749.238451,9.511491,5103791.0,749.812448,0.499875,0.23001,0.034924,0.176014,0.022178,0.298889,59.891733
min,0.0,1.0,1942.0,1780.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15.0,29000.0,2006.0,1130000.0,1000.0,0.0,0.0,0.0,0.0,0.0,1.0,235.0
50%,35.0,80000.0,2013.0,2200000.0,1300.0,1.0,0.0,0.0,0.0,0.0,1.0,279.0
75%,42.0,125000.0,2018.0,3850000.0,1600.0,1.0,0.0,0.0,0.0,0.0,1.0,294.0
max,70.0,1000000.0,2023.0,210000000.0,15000.0,1.0,1.0,1.0,1.0,1.0,1.0,301.0


In [212]:
scaler = MinMaxScaler()
df[['mileage_from_odometer', 'vehicle_engine', 'price']] = scaler.fit_transform(df[['mileage_from_odometer', 'vehicle_engine', 'price']])
df.head()

Unnamed: 0,brand,description,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Lpg,fuel_Petrol,place_of_sale
0,53,MG HS 2021 for sale in Lahore,0.005999,2021.0,0.031063,0.099333,0,0,0,0,0,1,296
1,42,Toyota Aygo 2012 for sale in Islamabad,0.067999,2012.0,0.00773,0.066667,1,0,0,0,0,1,294
2,35,Honda City 2015 for sale in Lahore,0.149999,2015.0,0.012134,0.086667,0,0,0,0,0,1,296
3,43,KIA Sorento 2022 for sale in Sialkot,0.017999,2022.0,0.042849,0.233333,0,0,0,0,0,1,290
4,42,Toyota Corolla 2015 for sale in Rawalpindi,0.109999,2015.0,0.015944,0.086667,0,0,0,0,0,1,190


In [213]:
#Unqiue Car models name
models = df['description'].str.split().str[:2].str.join(' ')
models = models.str.replace('Other', '').str.strip()
df['description']=models
unique_models = df.description.unique()
print(unique_models)

['MG HS' 'Toyota Aygo' 'Honda City' 'KIA Sorento' 'Toyota Corolla'
 'Nissan March' 'Honda Civic' 'Suzuki Cultus' 'Suzuki Alto' 'Honda BR-V'
 'Suzuki Wagon' 'Suzuki Mehran' 'Toyota Aqua' 'Mercedes Benz'
 'Range Rover' 'Daihatsu Move' 'Daihatsu Mira' 'Toyota Vitz'
 'Toyota Prius' 'Toyota Hilux' 'Toyota Passo' 'Toyota Yaris'
 'Changan Alsvin' 'BMW 3' 'Toyota Prado' 'Honda HR-V' 'Honda Spike'
 'Daihatsu Charade' 'Hyundai Tucson' 'Mazda Flair' 'Toyota Mark'
 'Toyota Crown' 'Toyota Sienta' 'Nissan Sunny' 'Haval H6'
 'Hyundai Elantra' 'Toyota Surf' 'Suzuki Swift' 'Honda Vezel' 'Honda Jade'
 'Toyota Fortuner' 'Toyota Duet' 'Hyundai H-100' 'Suzuki FX' 'Honda Fit'
 'Toyota Harrier' 'DFSK Glory' 'KIA Sportage' 'Suzuki Bolan' 'Nissan Note'
 'Nissan Clipper' 'Toyota Land' 'Suzuki Khyber' 'Honda N' 'Hyundai Santro'
 'Daihatsu Cuore' 'Honda Cross' 'Mitsubishi Lancer' 'Audi A6'
 'Suzuki Cervo' 'Toyota Pickup' 'Suzuki Baleno' 'KIA Picanto'
 'Chevrolet Cruze' 'Honda CR-V' 'Porsche Panamera' 'Hyundai Son

In [214]:
# Ordinal Encoding for Car Description(MODEL) based on average selling price of brand
models = df.groupby('description')['price'].mean()
sorted_places = models.sort_values().index
dtype = CategoricalDtype(categories=sorted_places, ordered=True)
df['description'] = df['description'].astype(dtype).cat.codes
try:
    df=df.drop('model',axis=1)
    df
except:
    df
df

Unnamed: 0,brand,description,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Lpg,fuel_Petrol,place_of_sale
0,53,392,0.005999,2021.0,0.031063,0.099333,0,0,0,0,0,1,296
1,42,190,0.067999,2012.0,0.007730,0.066667,1,0,0,0,0,1,294
2,35,266,0.149999,2015.0,0.012134,0.086667,0,0,0,0,0,1,296
3,43,418,0.017999,2022.0,0.042849,0.233333,0,0,0,0,0,1,290
4,42,288,0.109999,2015.0,0.015944,0.086667,0,0,0,0,0,1,190
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60101,43,380,0.000029,2023.0,0.037611,0.133333,0,0,0,0,0,1,279
60102,43,380,0.000039,2022.0,0.037611,0.133333,0,0,0,0,0,1,279
60103,67,465,0.080791,2003.0,0.037135,0.313333,0,0,0,0,0,1,296
60107,15,226,0.014999,2018.0,0.015111,0.044000,0,0,0,1,0,0,190


In [215]:
#Storing to .csv
df.to_csv('Transformed Data.csv',index=False)

# DATA VALIDATION

In [216]:
df.corr()

Unnamed: 0,brand,description,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Lpg,fuel_Petrol,place_of_sale
brand,1.0,0.790873,0.02073,0.062566,0.378582,0.534773,-0.486507,0.178238,0.070333,0.111441,-0.008699,-0.193539,0.092382
description,0.790873,1.0,-0.062278,0.255698,0.484777,0.612339,-0.561632,0.270076,0.060152,0.165754,-0.018852,-0.280128,0.156949
mileage_from_odometer,0.02073,-0.062278,1.0,-0.388987,-0.187322,0.055657,0.13455,0.048468,-0.028575,-0.010648,0.001615,-0.035201,-0.096879
model_date,0.062566,0.255698,-0.388987,1.0,0.312329,-0.116231,-0.283912,-0.134779,0.037709,0.088721,-0.037437,0.091293,0.157807
price,0.378582,0.484777,-0.187322,0.312329,1.0,0.483061,-0.366272,0.146876,0.20572,0.09052,-0.011822,-0.173242,0.146474
vehicle_engine,0.534773,0.612339,0.055657,-0.116231,0.483061,1.0,-0.339559,0.430958,0.003934,0.027257,-0.00345,-0.337134,0.096833
vehicle_transmission,-0.486507,-0.561632,0.13455,-0.283912,-0.366272,-0.339559,1.0,-0.05048,-0.03577,-0.184355,0.02169,0.124714,-0.197901
fuel_Diesel,0.178238,0.270076,0.048468,-0.134779,0.146876,0.430958,-0.05048,1.0,-0.00852,-0.044306,-0.005407,-0.734395,0.007024
fuel_Electric,0.070333,0.060152,-0.028575,0.037709,0.20572,0.003934,-0.03577,-0.00852,1.0,-0.006358,-0.000776,-0.105387,0.022953
fuel_Hybrid,0.111441,0.165754,-0.010648,0.088721,0.09052,0.027257,-0.184355,-0.044306,-0.006358,1.0,-0.004035,-0.548035,0.045852


In [217]:
#MISSING VALUES
missing_values = df.isnull().sum()
print("Checking for Missing  Values")
print(missing_values)

Checking for Missing  Values
brand                    0
description              0
mileage_from_odometer    0
model_date               0
price                    0
vehicle_engine           0
vehicle_transmission     0
fuel_Diesel              0
fuel_Electric            0
fuel_Hybrid              0
fuel_Lpg                 0
fuel_Petrol              0
place_of_sale            0
dtype: int64


In [218]:
#Data Types - Checking
print("Data Validation - Regarding Data Types")
df.dtypes

Data Validation - Regarding Data Types


brand                       int8
description                int16
mileage_from_odometer    float64
model_date               float64
price                    float64
vehicle_engine           float64
vehicle_transmission       int32
fuel_Diesel                uint8
fuel_Electric              uint8
fuel_Hybrid                uint8
fuel_Lpg                   uint8
fuel_Petrol                uint8
place_of_sale              int16
dtype: object

In [219]:
df.describe()

Unnamed: 0,brand,description,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Lpg,fuel_Petrol,place_of_sale
count,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0,54867.0
mean,30.889733,233.944429,0.092043,2010.936537,0.015902,0.095179,0.511364,0.056045,0.001221,0.032005,0.000492,0.900833,251.610823
std,12.661694,116.067105,0.093749,9.511491,0.024304,0.049987,0.499875,0.23001,0.034924,0.176014,0.022178,0.298889,59.891733
min,0.0,0.0,0.0,1942.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15.0,128.0,0.028999,2006.0,0.005373,0.066667,0.0,0.0,0.0,0.0,0.0,1.0,235.0
50%,35.0,266.0,0.079999,2013.0,0.010468,0.086667,1.0,0.0,0.0,0.0,0.0,1.0,279.0
75%,42.0,288.0,0.124999,2018.0,0.018325,0.106667,1.0,0.0,0.0,0.0,0.0,1.0,294.0
max,70.0,465.0,1.0,2023.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,301.0


# PCA TO REDUCE DIMESIONS

In [220]:
df.corr()

Unnamed: 0,brand,description,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Lpg,fuel_Petrol,place_of_sale
brand,1.0,0.790873,0.02073,0.062566,0.378582,0.534773,-0.486507,0.178238,0.070333,0.111441,-0.008699,-0.193539,0.092382
description,0.790873,1.0,-0.062278,0.255698,0.484777,0.612339,-0.561632,0.270076,0.060152,0.165754,-0.018852,-0.280128,0.156949
mileage_from_odometer,0.02073,-0.062278,1.0,-0.388987,-0.187322,0.055657,0.13455,0.048468,-0.028575,-0.010648,0.001615,-0.035201,-0.096879
model_date,0.062566,0.255698,-0.388987,1.0,0.312329,-0.116231,-0.283912,-0.134779,0.037709,0.088721,-0.037437,0.091293,0.157807
price,0.378582,0.484777,-0.187322,0.312329,1.0,0.483061,-0.366272,0.146876,0.20572,0.09052,-0.011822,-0.173242,0.146474
vehicle_engine,0.534773,0.612339,0.055657,-0.116231,0.483061,1.0,-0.339559,0.430958,0.003934,0.027257,-0.00345,-0.337134,0.096833
vehicle_transmission,-0.486507,-0.561632,0.13455,-0.283912,-0.366272,-0.339559,1.0,-0.05048,-0.03577,-0.184355,0.02169,0.124714,-0.197901
fuel_Diesel,0.178238,0.270076,0.048468,-0.134779,0.146876,0.430958,-0.05048,1.0,-0.00852,-0.044306,-0.005407,-0.734395,0.007024
fuel_Electric,0.070333,0.060152,-0.028575,0.037709,0.20572,0.003934,-0.03577,-0.00852,1.0,-0.006358,-0.000776,-0.105387,0.022953
fuel_Hybrid,0.111441,0.165754,-0.010648,0.088721,0.09052,0.027257,-0.184355,-0.044306,-0.006358,1.0,-0.004035,-0.548035,0.045852


In [221]:
#ORIGINAL

df

Unnamed: 0,brand,description,mileage_from_odometer,model_date,price,vehicle_engine,vehicle_transmission,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Lpg,fuel_Petrol,place_of_sale
0,53,392,0.005999,2021.0,0.031063,0.099333,0,0,0,0,0,1,296
1,42,190,0.067999,2012.0,0.007730,0.066667,1,0,0,0,0,1,294
2,35,266,0.149999,2015.0,0.012134,0.086667,0,0,0,0,0,1,296
3,43,418,0.017999,2022.0,0.042849,0.233333,0,0,0,0,0,1,290
4,42,288,0.109999,2015.0,0.015944,0.086667,0,0,0,0,0,1,190
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60101,43,380,0.000029,2023.0,0.037611,0.133333,0,0,0,0,0,1,279
60102,43,380,0.000039,2022.0,0.037611,0.133333,0,0,0,0,0,1,279
60103,67,465,0.080791,2003.0,0.037135,0.313333,0,0,0,0,0,1,296
60107,15,226,0.014999,2018.0,0.015111,0.044000,0,0,0,1,0,0,190


In [223]:
# DIMENSIONALITY REDUCTION
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


X = df.drop('price', axis=1)  # Features
y = df['price']  # Target variable

# Standardize the features (important for PCA)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# retain 95% features
pca = PCA(0.95)  
# Fit and transform the standardized data
X_pca = pca.fit_transform(X_scaled)

#No of Principal Components
print("Number of Principal Components:", pca.n_components_)

#Making Data Frame
new_pca = pd.DataFrame(data=X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])

new_pca

Number of Principal Components: 9


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
0,1.832604,-2.074533,0.388102,-0.146510,0.026568,0.177296,0.102514,-0.027064,-0.366427
1,-0.507919,-0.384362,0.112642,-0.183881,0.066698,0.061770,-0.499776,-0.233035,-1.060809
2,0.464145,-0.850989,0.441394,0.427454,0.121278,0.107019,-0.687569,0.680439,0.513746
3,2.713890,-1.639095,0.989266,-0.511764,-0.161325,0.326560,0.088375,-0.477311,-0.423893
4,0.600976,-0.690742,0.615009,0.585134,0.105904,-0.199464,1.144224,0.470869,0.473480
...,...,...,...,...,...,...,...,...,...
54862,1.706338,-1.938060,0.272145,-0.382561,-0.110329,0.202785,0.306911,-0.023260,-0.181030
54863,1.697245,-1.883590,0.304239,-0.366268,-0.101548,0.201903,0.289718,-0.088984,-0.154614
54864,4.235548,-0.450000,2.903203,0.259792,0.203405,0.284891,-0.255054,-1.893890,-0.746265
54865,1.261789,0.734032,-5.697498,2.941239,-0.284989,-0.217117,0.932820,-0.469961,0.644103
