In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from category_encoders import MEstimateEncoder


In [2]:
# Load the cleaned car listings and one-hot encode `country` in a single chain.
# drop_first=True keeps k-1 indicator columns (the first level is the implicit
# baseline); int8 keeps the indicator columns compact.
df = pd.read_csv('data/cleaned_cars.csv').pipe(
    pd.get_dummies,
    columns=['country'],
    drop_first=True,
    dtype=np.int8,
)
df.head()

Unnamed: 0,engine_capacity,cylinder,horse_power,top_speed,seats,brand,price,year,country_egypt,country_ksa,country_kuwait,country_oman,country_qatar,country_uae
0,2.0,4,180,205.0,8,peugeot,37955.25,2021,0,1,0,0,0,0
1,1.5,4,102,145.0,4,suzuki,26671.95,2021,0,1,0,0,0,0
2,2.3,4,420,173.0,4,ford,53460.0,2021,0,1,0,0,0,0
3,1.8,4,140,190.0,5,honda,28179.975,2021,0,1,0,0,0,0
4,1.8,4,140,190.0,5,honda,25740.45,2021,0,1,0,0,0,0


## Target encoding

In [3]:
# Separate the features from the target. Working on a copy keeps `df` intact,
# so this cell can be re-run safely.
X = df.copy()
y = X.pop('price')

# Hold out 20% of the rows solely for fitting the target encoder. Rows used to
# learn the brand -> price encoding are removed from the modelling data so the
# encoded feature never contains a row's own target (avoids target leakage).
# The seed is fixed (same value as the train/validation/test splits) so the
# hold-out, and therefore every downstream result, is reproducible.
X_encode = X.sample(frac=0.2, random_state=42)
y_encode = y.loc[X_encode.index]
X = X.drop(index=X_encode.index)
y = y.loc[X.index]

# Create the encoder instance. `m` sets how strongly each brand's mean price is
# shrunk toward the overall mean; larger values smooth out noisy estimates for
# rare brands.
encoder = MEstimateEncoder(cols=["brand"], m=5.0)

# Fit the encoder on the encoding split only.
encoder.fit(X_encode, y_encode)

# Encode the `brand` column to create the final modelling data, then give X
# and y a fresh, aligned 0..n-1 index (the drop above left gaps in the labels).
X = encoder.transform(X).reset_index(drop=True)
y = y.reset_index(drop=True)

In [4]:
# Preview the feature frame after target encoding: `brand` should now be numeric.
X.head()

Unnamed: 0,engine_capacity,cylinder,horse_power,top_speed,seats,brand,year,country_egypt,country_ksa,country_kuwait,country_oman,country_qatar,country_uae
0,2.0,4,180,205.0,8,31415.147199,2021,0,1,0,0,0,0
1,1.5,4,102,145.0,4,29460.337242,2021,0,1,0,0,0,0
2,2.3,4,420,173.0,4,49881.417006,2021,0,1,0,0,0,0
3,1.8,4,140,190.0,5,37074.706745,2021,0,1,0,0,0,0
4,2.0,4,120,170.0,5,31415.147199,2021,0,1,0,0,0,0


In [5]:
# Preview the target (price) after its index was reset alongside X.
y.head()

0    37955.25
1    26671.95
2    53460.00
3    25740.45
4    22368.15
Name: price, dtype: float64

In [6]:
# Stage 1: carve off 40% of the rows as a temporary pool, leaving 60% for training.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Stage 2: halve the temporary pool into validation and test sets
# (each 50% of the pool, i.e. 20% of the full data), reusing the same seed.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)


In [7]:
# Report how many rows ended up in each split.
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

# Show the first few training rows as the cell's output.
print("\nTraining Set:")
X_train.head()

Training set size: 2935
Validation set size: 979
Test set size: 979

Training Set:


Unnamed: 0,engine_capacity,cylinder,horse_power,top_speed,seats,brand,year,country_egypt,country_ksa,country_kuwait,country_oman,country_qatar,country_uae
1904,2.7,4,164,170.0,3,36404.839512,2021,0,0,0,0,1,0
4445,3.7,6,275,155.0,2,49881.417006,2021,0,0,0,0,0,1
3231,2.0,4,225,185.0,7,37701.064433,2021,0,0,1,0,0,0
1869,1.5,4,119,190.0,5,37074.706745,2021,0,0,0,0,1,0
1565,5.0,8,407,240.0,5,79112.039197,2021,0,0,0,0,0,0


In [8]:
# Preview the validation split; the bare last expression is shown as the cell output.
print("\nValidation Set:")
X_val.head()


Validation Set:


Unnamed: 0,engine_capacity,cylinder,horse_power,top_speed,seats,brand,year,country_egypt,country_ksa,country_kuwait,country_oman,country_qatar,country_uae
4526,3.6,6,290,190.0,5,51971.484363,2021,0,0,0,0,0,1
835,1.5,3,570,250.0,4,88092.833764,2021,1,0,0,0,0,0
84,1.4,4,100,180.0,5,28333.409485,2021,0,1,0,0,0,0
1533,3.0,6,340,250.0,2,88092.833764,2021,0,0,0,0,0,0
2647,1.5,4,170,190.0,5,47358.40543,2021,0,0,0,1,0,0


In [9]:
# Preview the test split; the bare last expression is shown as the cell output.
print("\nTest Set:")
X_test.head()


Test Set:


Unnamed: 0,engine_capacity,cylinder,horse_power,top_speed,seats,brand,year,country_egypt,country_ksa,country_kuwait,country_oman,country_qatar,country_uae
73,1.4,4,140,180.0,4,29460.337242,2021,0,1,0,0,0,0
1149,1.6,4,107,162.0,5,42798.247305,2021,0,0,0,0,0,0
3721,3.5,6,311,230.0,5,70178.988196,2021,0,0,1,0,0,0
106,2.5,5,400,280.0,5,63515.891057,2021,0,1,0,0,0,0
4137,1.4,4,100,182.0,4,34865.954691,2021,0,0,0,0,0,1
