In [1]:
import os
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

Import and inspect data from file

In [2]:
adverts_df = pd.read_csv("../adverts.csv")
adverts_df.head(3)

Unnamed: 0,public_reference,mileage,reg_code,standard_colour,standard_make,standard_model,vehicle_condition,year_of_registration,price,body_type,crossover_car_and_van,fuel_type
0,202006039777689,0.0,,Grey,Volvo,XC90,NEW,,73970,SUV,False,Petrol Plug-in Hybrid
1,202007020778260,108230.0,61.0,Blue,Jaguar,XF,USED,2011.0,7000,Saloon,False,Diesel
2,202007020778474,7800.0,17.0,Grey,SKODA,Yeti,USED,2017.0,14000,SUV,False,Petrol


In [3]:
adverts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402005 entries, 0 to 402004
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   public_reference       402005 non-null  int64  
 1   mileage                401878 non-null  float64
 2   reg_code               370148 non-null  object 
 3   standard_colour        396627 non-null  object 
 4   standard_make          402005 non-null  object 
 5   standard_model         402005 non-null  object 
 6   vehicle_condition      402005 non-null  object 
 7   year_of_registration   368694 non-null  float64
 8   price                  402005 non-null  int64  
 9   body_type              401168 non-null  object 
 10  crossover_car_and_van  402005 non-null  bool   
 11  fuel_type              401404 non-null  object 
dtypes: bool(1), float64(2), int64(2), object(7)
memory usage: 34.1+ MB


In [4]:
adverts_df.shape

(402005, 12)

Ramdom sample from data for faster experimentation

In [5]:
# Random sample of 1% of the original data
sample_adverts_df = adverts_df.sample(frac=0.1, random_state=42)
sample_adverts_df.shape

(40200, 12)

Split data into features and targets

In [6]:
features = sample_adverts_df.drop('price', axis=1)
target = sample_adverts_df.price

In [7]:
print(features.shape)
print(target.shape)

(40200, 11)
(40200,)


## Numerical feature processing

1. Split data into numerical features

In [20]:
numerical_features = features.select_dtypes(include=['number'])
numerical_features.head(3)

Unnamed: 0,public_reference,mileage,year_of_registration
332044,202010074692259,2826.0,2019.0
173955,202009023198786,10601.0,2019.0
367464,202007221569681,23000.0,2008.0


2. Drop public reference from dataset as it doesnt seem to be such as useful feature

In [21]:
numerical_features_no_pr = numerical_features.drop('public_reference', axis=1)
numerical_features_no_pr.head(3)

Unnamed: 0,mileage,year_of_registration
332044,2826.0,2019.0
173955,10601.0,2019.0
367464,23000.0,2008.0


make a column transformation step as part of a pipeline

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html#sklearn.preprocessing.FunctionTransformer

In [27]:
def drop_public_ref(df):
    return df.drop('public_reference', axis=1)
    
no_public_ref = FunctionTransformer(drop_public_ref)
no_public_ref_df = no_public_ref.transform(numerical_features)
no_public_ref_df.head(3)

Unnamed: 0,mileage,year_of_registration
332044,2826.0,2019.0
173955,10601.0,2019.0
367464,23000.0,2008.0


Investigate missing values in numerical features

In [28]:
def check_missing_values(df):
    return df.isnull().sum()

check_missing_values(no_public_ref_df) # missing values observed

mileage                   12
year_of_registration    3271
dtype: int64

Handle missing values in numerical features (Using missing value imputation strategies)

https://scikit-learn.org/stable/modules/impute.html

In [15]:
from sklearn.impute import SimpleImputer
import numpy as np

In [35]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').set_output(transform='pandas')
imputed_missing_values = imputer.fit_transform(no_public_ref_df)

check_missing_values(imputed_missing_values) # no missing values observed

mileage                 0
year_of_registration    0
dtype: int64

Other missing value imputation strategy

https://scikit-learn.org/stable/auto_examples/impute/plot_missing_values.html#sphx-glr-auto-examples-impute-plot-missing-values-py

Scaling/transformation/Outlier removal input features

https://scikit-learn.org/stable/modules/preprocessing.html

In [37]:
from sklearn.preprocessing import StandardScaler

todo: make some plots and visualization of the numerical features at different stages of the pipeline

todo: consider non linear feature generation method like polonomial features and spline methods

https://scikit-learn.org/stable/modules/preprocessing.html#imputation-of-missing-values sections 6.3.7

Column transformation of individual steps

https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_transformer.html#sklearn.compose.make_column_transformer

https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer

## Categorical feature processing

Split and preprocess data into categorical features

In [12]:
categorical_features = adverts_df.select_dtypes(include=['object', 'category', 'bool'])
categorical_features

Unnamed: 0,reg_code,standard_colour,standard_make,standard_model,vehicle_condition,body_type,crossover_car_and_van,fuel_type
0,,Grey,Volvo,XC90,NEW,SUV,False,Petrol Plug-in Hybrid
1,61,Blue,Jaguar,XF,USED,Saloon,False,Diesel
2,17,Grey,SKODA,Yeti,USED,SUV,False,Petrol
3,16,Brown,Vauxhall,Mokka,USED,Hatchback,False,Diesel
4,64,Grey,Land Rover,Range Rover Sport,USED,SUV,False,Diesel
...,...,...,...,...,...,...,...,...
402000,69,Grey,Peugeot,208,USED,Hatchback,False,Petrol
402001,59,Red,Peugeot,107,USED,Hatchback,False,Petrol
402002,62,White,Nissan,Qashqai,USED,SUV,False,Petrol
402003,65,Red,Abarth,595,USED,Hatchback,False,Petrol
