In [1]:
import pandas as pd
import numpy as np

import warnings

# Silence all warnings for the rest of the session.
# NOTE(review): with no category given this also hides pandas
# FutureWarning/DeprecationWarning messages — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Read the CSV once into df_org and hand a separate copy to df, so edits made
# through df never touch the frame as it was loaded.
CSV_PATH = "./data/scout_missings_filled_2.csv"
df_org = pd.read_csv(filepath_or_buffer=CSV_PATH)
df = df_org.copy(deep=True)

In [3]:
# Structural overview of the loaded frame: column names, non-null counts, dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15915 entries, 0 to 15914
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   make_model               15915 non-null  object 
 1   body_type                15915 non-null  object 
 2   price                    15915 non-null  int64  
 3   vat                      15915 non-null  object 
 4   km                       15915 non-null  float64
 5   prev_owner               15915 non-null  float64
 6   hp                       15915 non-null  float64
 7   inspection_new           15915 non-null  object 
 8   warranty                 15915 non-null  float64
 9   body_color               15915 non-null  object 
 10  paint_type               15915 non-null  object 
 11  nr_of_doors              15915 non-null  float64
 12  nr_of_seats              15915 non-null  float64
 13  gearing_type             15915 non-null  object 
 14  displacement          

## Additional Feature Engineering

In [4]:
def _normalize_labels(labels, separators=()):
    """Lowercase a string column and replace each separator with '_'.

    Parameters
    ----------
    labels : pd.Series
        String-valued column.
    separators : iterable of str
        Literal substrings (matched verbatim, not as regex) to turn into '_'.

    Returns
    -------
    pd.Series
        A new series; the input is left unmodified.
    """
    normalized = labels.str.lower()
    for sep in separators:
        normalized = normalized.str.replace(sep, '_', regex=False)
    return normalized


# Split "make_model" into make and model. n=1 splits on the first space only,
# so a multi-word model name stays in "model" instead of producing a third
# column and breaking the two-column assignment.
df[['make', 'model']] = df['make_model'].str.split(' ', n=1, expand=True)

# Drop the combined column and the "weight" column, then lowercase all headers.
# NOTE(review): per the original comment a capitalised "Weight" column remains
# and becomes "weight" here — confirm against the CSV header.
df = df.drop(columns=['make_model', 'weight'])
df.columns = df.columns.str.lower()

# Lowercase the label columns; the listed separators are replaced with '_'.
# NOTE(review): "vat", "body_color" and "drive_chain" are only lowercased, so
# any spaces in their values are kept — confirm this is intended.
label_separators = {
    'body_type': (' ', '-'),
    'vat': (),
    'inspection_new': (' ',),
    'body_color': (),
    'paint_type': (' ', '/'),
    'gearing_type': ('-',),
    'drive_chain': (),
    'emission_class': (' ',),
    'new_used': (' ', '-'),
    'upholstery_material': (' ',),
}
for col, seps in label_separators.items():
    df[col] = _normalize_labels(df[col], seps)

# Apostrophes in "new_used" are removed outright rather than replaced.
df['new_used'] = df['new_used'].str.replace("'", '', regex=False)

# Fill NaN. Results are assigned back instead of calling
# fillna(inplace=True) on a column selection: that form is deprecated and has
# no effect on df once pandas Copy-on-Write is active.
df['fuel_type'] = df['fuel_type'].fillna('others')
df['upholstery_material'] = df['upholstery_material'].fillna('other')

# Numeric consumption columns fall back to their own column median.
median_value_city = df['consumption_city'].median()
df['consumption_city'] = df['consumption_city'].fillna(median_value_city)
median_value_country = df['consumption_country'].median()
df['consumption_country'] = df['consumption_country'].fillna(median_value_country)

# Equipment-list columns get a quoted placeholder; the quotes are part of the
# fill value.
for col in ['comfort_convenience', 'entertainment_media', 'extras', 'safety_security']:
    df[col] = df[col].fillna("'none_specified'")

## Change object Dtypes to Categorical

In [5]:
# Equipment-list columns are cast straight to the pandas string dtype instead
# of going through an intermediate category conversion first.
text_cols = ['comfort_convenience', 'entertainment_media', 'extras', 'safety_security']

# Every other object column becomes categorical.
object_cols = [
    col for col in df.select_dtypes(include='object').columns
    if col not in text_cols
]

# Numeric columns that are converted to categories as well.
discrete_cols = [
    'prev_owner',
    'nr_of_doors',
    'nr_of_seats',
    'cylinders',
    'gears',
    'electricity_consumption',
]

# One astype call with a column -> dtype mapping; column order is unchanged.
df = df.astype({
    **dict.fromkeys(object_cols, 'category'),
    **dict.fromkeys(discrete_cols, 'category'),
    **dict.fromkeys(text_cols, 'string'),
})

In [6]:
# Per-column summary: number of distinct values next to the column's dtype.
col_info = pd.concat(
    [
        df.nunique().rename('unique_values'),
        df.dtypes.rename('dtype'),
    ],
    axis=1,
)

col_info

Unnamed: 0,unique_values,dtype
body_type,9,category
price,2952,int64
vat,3,category
km,6693,float64
prev_owner,5,category
hp,80,float64
inspection_new,2,category
warranty,42,float64
body_color,13,category
paint_type,3,category


## One Hot Encoding using pd.get_dummies()

In [7]:
# Names of all columns whose dtype in the summary table is categorical.
is_categorical = col_info['dtype'] == 'category'
cat_cols = col_info.loc[is_categorical].index.tolist()
cat_cols

['body_type',
 'vat',
 'prev_owner',
 'inspection_new',
 'body_color',
 'paint_type',
 'nr_of_doors',
 'nr_of_seats',
 'gearing_type',
 'cylinders',
 'drive_chain',
 'emission_class',
 'gears',
 'electricity_consumption',
 'new_used',
 'fuel_type',
 'upholstery_material',
 'make',
 'model']

In [8]:
# One-hot encode every categorical column; drop_first removes the first level
# of each column so the remaining indicators are not perfectly collinear.
df_dummies_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Fixed seed so the preview shows the same rows on every Restart & Run All.
df_dummies_encoded.sample(50, random_state=42)

Unnamed: 0,price,km,hp,warranty,displacement,co2_emission,comfort_convenience,entertainment_media,extras,safety_security,...,make_Opel,make_Renault,model_A2,model_A3,model_Astra,model_Clio,model_Corsa,model_Duster,model_Espace,model_Insignia
6885,12636,23176.0,81.0,12.0,1598.0,97.0,"'Cruise control', 'Multi-function steering whe...",'none_specified','Alloy wheels',"'Immobilizer', 'Lane departure warning system'...",...,True,False,False,False,True,False,False,False,False,False
7811,23940,12.0,110.0,0.0,1400.0,139.0,"'Air conditioning', 'Armrest', 'Automatic clim...","'On-board computer', 'Radio', 'USB'","'Alloy wheels', 'Catalytic Converter'","'ABS', 'Central door lock', 'Daytime running l...",...,True,False,False,False,True,False,False,False,False,False
15550,31900,24000.0,165.0,0.0,1798.0,153.0,"'Air conditioning', 'Automatic climate control...","'Bluetooth', 'Digital radio', 'Hands-free equi...","'Alloy wheels', 'Touch screen', 'Voice Control'","'ABS', 'Adaptive Cruise Control', 'Blind spot ...",...,False,True,False,False,False,False,False,False,True,False
8306,13480,10.0,66.0,0.0,1398.0,150.0,"'Air conditioning', 'Electrical side mirrors',...",'Radio','none_specified',"'ABS', 'Central door lock', 'Daytime running l...",...,True,False,False,False,False,False,True,False,False,False
15627,29385,21123.0,165.0,34.0,1798.0,153.0,"'Air conditioning', 'Automatic climate control...","'Hands-free equipment', 'Radio'","'Alloy wheels', 'Catalytic Converter', 'Touch ...","'ABS', 'Blind spot monitor', 'Central door loc...",...,False,True,False,False,False,False,False,False,True,False
1037,20500,12842.0,85.0,0.0,1598.0,97.0,'Air conditioning','none_specified','none_specified',"'Central door lock', 'Driver-side airbag', 'Is...",...,False,False,False,False,False,False,False,False,False,False
15256,18790,87445.0,118.0,6.0,1598.0,120.0,'Cruise control','none_specified','Alloy wheels',"'ABS', 'Driver-side airbag', 'Passenger-side a...",...,False,True,False,False,False,False,False,False,True,False
11854,24290,15992.0,125.0,0.0,1956.0,150.0,"'Air conditioning', 'Automatic climate control...","'Bluetooth', 'Hands-free equipment', 'MP3', 'O...","'Alloy wheels', 'Roof rack', 'Sport seats'","'ABS', 'Adaptive headlights', 'Central door lo...",...,True,False,False,False,False,False,False,False,False,True
8674,7479,38105.0,51.0,60.0,1229.0,128.0,"'Air conditioning', 'Electrical side mirrors',...","'On-board computer', 'Radio'",'none_specified',"'ABS', 'Central door lock', 'Daytime running l...",...,True,False,False,False,False,False,True,False,False,False
10527,17450,50908.0,100.0,0.0,1598.0,129.0,"'Air conditioning', 'Armrest', 'Automatic clim...","'CD player', 'Hands-free equipment', 'MP3', 'O...","'Alloy wheels', 'Roof rack'","'ABS', 'Adaptive headlights', 'Alarm system', ...",...,True,False,False,False,False,False,False,False,False,True


In [9]:
# Column names, non-null counts and dtypes of the get_dummies result.
df_dummies_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15915 entries, 0 to 15914
Data columns (total 98 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   price                             15915 non-null  int64  
 1   km                                15915 non-null  float64
 2   hp                                15915 non-null  float64
 3   warranty                          15915 non-null  float64
 4   displacement                      15915 non-null  float64
 5   co2_emission                      15915 non-null  float64
 6   comfort_convenience               15915 non-null  string 
 7   entertainment_media               15915 non-null  string 
 8   extras                            15915 non-null  string 
 9   safety_security                   15915 non-null  string 
 10  age                               15915 non-null  float64
 11  consumption_comb                  15915 non-null  float64
 12  cons

## Using sklearn.preprocessing.OneHotEncoder

In [10]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Dense output; drop='first' removes the first level of every feature.
encoder = OneHotEncoder(sparse_output=False, drop='first') # can put this in pipeline

# Fit once on all categorical columns. Re-fitting inside a per-column loop
# leaves the encoder knowing only the last column, so it could not be reused
# to transform new data; a single fit keeps every column's categories.
encoded = encoder.fit_transform(df[cat_cols])

# Reuse df's index so the encoded frame aligns row-for-row with df.
df_sk_encoded = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(cat_cols),
    index=df.index,
)

# Fixed seed so the preview shows the same rows on every Restart & Run All.
df_sk_encoded.sample(50, random_state=42)

Unnamed: 0,body_type_convertible,body_type_coupe,body_type_off_road,body_type_other,body_type_sedans,body_type_station_wagon,body_type_transporter,body_type_van,vat_vat deductible,vat_vat undeductible,...,make_Opel,make_Renault,model_A2,model_A3,model_Astra,model_Clio,model_Corsa,model_Duster,model_Espace,model_Insignia
9778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
828,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13189,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
118,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5161,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
11540,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12065,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4360,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
14378,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [11]:
# Column names, non-null counts and dtypes of the OneHotEncoder result.
df_sk_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15915 entries, 0 to 15914
Data columns (total 83 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   body_type_convertible             15915 non-null  float64
 1   body_type_coupe                   15915 non-null  float64
 2   body_type_off_road                15915 non-null  float64
 3   body_type_other                   15915 non-null  float64
 4   body_type_sedans                  15915 non-null  float64
 5   body_type_station_wagon           15915 non-null  float64
 6   body_type_transporter             15915 non-null  float64
 7   body_type_van                     15915 non-null  float64
 8   vat_vat deductible                15915 non-null  float64
 9   vat_vat undeductible              15915 non-null  float64
 10  prev_owner_1.0                    15915 non-null  float64
 11  prev_owner_2.0                    15915 non-null  float64
 12  prev