In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from preprocess import PreprocessedDataFrame
from category_encoders import TargetEncoder

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

pd.set_option("display.max_rows", None, "display.max_columns", None)

In [3]:
FOLDER = ''
X = pd.read_csv(FOLDER + 'train.csv')
y= X.pop('price')
print('Total data size:', X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
print('Train:', X_train.shape)
print('Test:', X_test.shape)

X_train.head()

Total data size: (16784, 29)
Train: (13427, 29)
Test: (3357, 29)


Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,curb_weight,power,fuel_type,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price
15946,1000462,Porsche Cayenne Hybrid S 3.0A,porsche,cayenne,10,2012.0,,22-oct-2012,suv,"parf car, direct owner sale, hybrid cars",auto,2315.0,245.0,petrol-electric,2995.0,4.0,67590.0,82289.0,2380.0,43327.0,125000.0,102465.0,61479.0,,,uncategorized,3.0l v6 supercharged/hybrid engine with 328bhp...,low mileage! 2 keys! keyless entry/start. bi-x...,
483,982870,Mercedes-Benz 230CE,mercedes-benz,230,rare 230ce this was an iconic car of the 80's ...,1983.0,01-jan-1983,09-mar-2021,sports car,"parf car, direct owner sale, rare & exotic, al...",auto,1380.0,100.0,,2299.0,1.0,,4771.0,280.0,,,4882.0,4882.0,,,uncategorized,"rare 230ce comes with power sunroof, power win...",48,
971,975690,Citroen Dispatch 2.0M (New 5-yr COE),citroen,dispatch,comes with new 6 months road tax. bank loan an...,2009.0,,13-jul-2011,van,coe car,manual,1700.0,,diesel,1997.0,2.0,5600.0,,,,,27948.0,1398.0,,12-jul-2031,uncategorized,2.0l 4 cylinder inline engine type. fuel type:...,"2 airbags, traction control, multi-function st...",
15609,984574,Toyota Sienta Hybrid 1.5A X LED,toyota,sienta,100% loan available. just like new car. still ...,2019.0,,14-jan-2020,mpv,"parf car, premium ad car, low mileage car, hyb...",auto,1380.0,73.0,petrol-electric,1496.0,1.0,10030.0,26667.0,682.0,35287.0,5559.0,25151.0,17212.0,,,uncategorized,"powerful 1.5l dohc engine, responsive cvt-i au...","keyless push start, retractable side mirrors, ...",
5955,1025146,Mitsubishi Lancer 1.6M GLX (COE till 06/2028),mitsubishi,lancer,"rare pristine 3 owners cs3, accessories includ...",2008.0,,04-jul-2008,mid-sized sedan,"coe car, premium ad car",manual,1162.0,79.0,,1584.0,3.0,8460.0,38214.0,1028.0,26184.0,220000.0,11189.0,11189.0,,,uncategorized,responsive and powerful 1.6l 4 cylinders inlin...,"semi bucket seats, drift racing full system, r...",


## Preprocessing

### Check out what preprocessing is doing

In [4]:
preprocessed_df = PreprocessedDataFrame(X_train, y_train, target_encoding=True) # set target_encoding False to get one-hot encoding for make and model

Date: ['reg_date', 'manufactured', 'lifespan', 'original_reg_date']
Num: ['engine_cap', 'omv', 'dereg_value', 'mileage', 'power', 'curb_weight', 'no_of_owners', 'arf', 'road_tax', 'indicative_price', 'depreciation', 'coe']
OneHot Cat: ['title', 'make', 'model']
Label Cat: ['category', 'eco_category', 'description', 'accessories', 'opc_scheme', 'transmission', 'listing_id', 'features', 'fuel_type']



### Numerical transformation

In [5]:
test_df = preprocessed_df.check_feat_processing('num') # num, oh_cat, label_cat, date
test_df.head()

Unnamed: 0,engine_cap,omv,dereg_value,mileage,power,curb_weight,no_of_owners,arf,road_tax,depreciation,coe
15946,2995.0,102465.0,43327.0,125000.0,245.0,2315.0,4.0,61479.0,2380.0,67590.0,82289.0
483,2299.0,4882.0,,,100.0,1380.0,1.0,4882.0,280.0,,4771.0
971,1997.0,27948.0,,,,1700.0,2.0,1398.0,,5600.0,
15609,1496.0,25151.0,35287.0,5559.0,73.0,1380.0,1.0,17212.0,682.0,10030.0,26667.0
5955,1584.0,11189.0,26184.0,220000.0,79.0,1162.0,3.0,11189.0,1028.0,8460.0,38214.0


### Date transformation
Convert `year` to `years_since`.

In [6]:
test_df = preprocessed_df.check_feat_processing('date') 
test_df.head()

Unnamed: 0,years_since_reg_date,years_since_manufactured
15946,9.0,9.0
483,0.0,38.0
971,10.0,12.0
15609,1.0,2.0
5955,13.0,13.0


### Categorical transformation

Infer values for `make` and `model` from `title`.

In [8]:
test_df = preprocessed_df.check_feat_processing('oh_cat')
test_df.head()

Unnamed: 0,make,model
15946,porsche,cayenne
483,mercedes-benz,230
971,citroen,dispatch
15609,toyota,sienta
5955,mitsubishi,lancer


Label encoding for other other categorical features.

In [9]:
test_df = preprocessed_df.check_feat_processing('label_cat')
test_df.head()

Unnamed: 0,transmission,fuel_type,cat_parf,cat_premium_ad,cat_low_mileage,cat_imported_used,cat_coe,cat_almost_new,cat_rare_&_exotic,cat_hybrid,cat_direct_owner_sale,cat_sgcarmart_warranty,cat_vintage,cat_sta_evaluated,cat_opc,cat_consignment,cat_electric
15946,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0
483,0,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0
971,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
15609,0,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0
5955,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0


### Preprocess data

Full pipeline that includes the data transformation shown above and impute missing values.

In [11]:
X_train_prepared = preprocessed_df.build_dataframe()
preprocessed_df.full_pipeline

Input shape: (13427, 29)
Transformed shape: (13427, 43)


ColumnTransformer(transformers=[('date',
                                 Pipeline(steps=[('date_trans',
                                                  DateTransformer()),
                                                 ('imputer',
                                                  SimpleImputer(fill_value=0,
                                                                strategy='constant')),
                                                 ('std_scaler',
                                                  StandardScaler())]),
                                 ['reg_date', 'manufactured', 'lifespan',
                                  'original_reg_date']),
                                ('oh_cat',
                                 Pipeline(steps=[('cat_trans',
                                                  OneHotCategoricalTransformer())]),
                                 ['title', 'make', 'model']),
                                ('label_...
                                ('nu

In [6]:
model_target_encoder = TargetEncoder()
X_train_prepared.loc[:, 'model'] = model_target_encoder.fit_transform(X_train_prepared.pop('model'), y=y_train)

make_target_encoder = TargetEncoder()
X_train_prepared.loc[:, 'make'] = make_target_encoder.fit_transform(X_train_prepared.pop('make'), y=y_train)

X_train_prepared.head()

Unnamed: 0,years_since_reg_date,years_since_manufactured,transmission,fuel_type,cat_parf,cat_premium_ad,cat_low_mileage,cat_imported_used,cat_coe,cat_almost_new,cat_rare_&_exotic,cat_hybrid,cat_direct_owner_sale,cat_sgcarmart_warranty,cat_vintage,cat_sta_evaluated,cat_opc,cat_consignment,cat_electric,road_tax,engine_cap,curb_weight,mileage,depreciation,dereg_value,coe,omv,no_of_owners,power,arf,tov_bus/mini bus,tov_hatchback,tov_luxury sedan,tov_mid-sized sedan,tov_mpv,tov_others,tov_sports car,tov_stationwagon,tov_suv,tov_truck,tov_van,model,make
0,0.550286,0.424494,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0.958902,0.792832,0.927538,1.190932,2.753244,-0.00513,2.555128,1.471315,1.449651,1.651557,0.390579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,99021.428571,94824.015748
1,-1.26477,5.932717,0,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0,-0.921224,0.183714,-0.211488,-0.116606,-0.192373,-0.194052,-2.271259,-0.783893,-0.762807,-0.441377,-0.563227,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,124390.537634,111529.030007
2,0.751959,0.99431,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,-0.298097,-0.080588,0.178339,-0.116606,-0.451368,-0.194052,-0.059108,-0.250823,-0.025321,-0.297037,-0.621942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,88278.635342,97713.333333
3,-1.063097,-0.905077,0,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,-0.561314,-0.519048,-0.211488,-1.446601,-0.222356,-0.19994,-0.907981,-0.315463,-0.762807,-0.831096,-0.355435,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,99065.753425,109309.819967
4,1.356978,1.184249,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,-0.251541,-0.442033,-0.477058,3.28875,-0.303518,-0.420508,-0.189048,-0.638134,0.712165,-0.744492,-0.456938,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117315.789474,116086.078886


### Test data

In [21]:
X_test_prepared = preprocessed_df.transform_dataframe(X_test)
X_test_prepared.loc[:, 'model'] = model_target_encoder.transform(X_test_prepared.pop('model'))
X_test_prepared.loc[:, 'make'] = make_target_encoder.transform(X_test_prepared.pop('make'))
X_test_prepared.head()

Input shape: (3357, 29)
Transformed shape: (3357, 33)


Unnamed: 0,years_since_reg_date,years_since_manufactured,type_of_vehicle,transmission,fuel_type,cat_parf,cat_premium_ad,cat_low_mileage,cat_imported_used,cat_coe,cat_almost_new,cat_rare_&_exotic,cat_hybrid,cat_direct_owner_sale,cat_sgcarmart_warranty,cat_vintage,cat_sta_evaluated,cat_opc,cat_consignment,cat_electric,no_of_owners,road_tax,coe,mileage,arf,depreciation,power,curb_weight,dereg_value,engine_cap,omv,model,make
0,-1.063097,-0.905077,sports car,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,-0.762807,2.359148,-0.059108,-1.566706,7.3208,5.225328,3.527981,-0.10794,8.268114,1.668878,5.531951,78687.5,94824.015748
1,1.760324,1.564126,sports car,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,-0.025321,2.026993,0.646253,1.07363,0.929524,-0.093117,0.857686,0.269705,-0.438026,0.793707,1.066809,62411.783246,113232.86385
2,-0.256406,-0.335261,mid-sized sedan,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,-0.762807,-0.507597,0.868526,0.903862,-0.308618,-0.200127,-0.585717,-0.412492,-0.069873,-0.429781,-0.434737,107899.322034,109309.819967
3,-1.26477,-0.905077,mid-sized sedan,0,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,-0.762807,-0.561314,0.046674,-1.388281,-0.528005,-0.272501,-0.614585,-0.205397,0.030028,-0.519048,-0.401966,104748.979592,104023.684211
4,-0.054733,-0.145322,hatchback,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.025321,-0.095759,1.509882,0.528463,-0.008575,0.095055,0.352495,-0.10794,0.157745,-0.085839,-0.14077,65347.711218,111529.030007
