In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from category_encoders import MEstimateEncoder


In [488]:
# Read the cleaned car listings and one-hot encode the country column in a
# single chain; the first country level is dropped to serve as the baseline
# and the indicator columns are stored as int8.
df = pd.read_csv('data/cleaned_cars.csv').pipe(
    pd.get_dummies,
    columns=['country'],
    drop_first=True,
    dtype=np.int8,
)
df.head()

Unnamed: 0,engine_capacity,cylinder,horse_power,top_speed,seats,brand,price,year,country_egypt,country_ksa,country_kuwait,country_oman,country_qatar,country_uae
0,2.0,4,180,205.0,8,peugeot,37955.25,2021,0,1,0,0,0,0
1,1.5,4,102,145.0,4,suzuki,26671.95,2021,0,1,0,0,0,0
2,2.3,4,420,173.0,4,ford,53460.0,2021,0,1,0,0,0,0
3,1.8,4,140,190.0,5,honda,28179.975,2021,0,1,0,0,0,0
4,1.8,4,140,190.0,5,honda,25740.45,2021,0,1,0,0,0,0


## Target encoding

In [492]:
# Separate the features from the regression target (price). Working on a copy
# keeps `df` untouched, so this cell can be re-run safely.
X = df.copy()
y = X.pop('price')

# Hold out 20% of the rows solely for fitting the target encoder, so the brand
# statistics are not computed from rows that are later used for
# training/validation/testing. A fixed random_state makes this hold-out (and
# therefore every downstream split and output) reproducible across kernel
# restarts.
X_encode = X.sample(frac=0.2, random_state=42)
y_encode = y.loc[X_encode.index]
X = X.drop(index=X_encode.index)
y = y.loc[X.index]

# Create the encoder instance. Choose m to control noise.
encoder = MEstimateEncoder(cols=["brand"], m=5.0)

# Fit the encoder on the encoding split.
encoder.fit(X_encode, y_encode)

# Encode the brand column to create the final training data, then renumber
# both objects from 0 so X and y stay positionally aligned.
X = encoder.transform(X).reset_index(drop=True)
y = y.reset_index(drop=True)

In [493]:
# Preview the first rows of the feature matrix; `brand` now holds the encoder's numeric output.
X.head()

Unnamed: 0,engine_capacity,cylinder,horse_power,top_speed,seats,brand,year,country_egypt,country_ksa,country_kuwait,country_oman,country_qatar,country_uae
0,2.0,4,180,205.0,8,29933.3133,2021,0,1,0,0,0,0
1,2.3,4,420,173.0,4,48408.401739,2021,0,1,0,0,0,0
2,1.8,4,140,190.0,5,34519.670927,2021,0,1,0,0,0,0
3,1.8,4,140,190.0,5,34519.670927,2021,0,1,0,0,0,0
4,2.0,4,120,170.0,5,29933.3133,2021,0,1,0,0,0,0


In [494]:
# Preview the first target (price) values that pair with the rows of X.
y.head()

0    37955.250
1    53460.000
2    28179.975
3    25740.450
4    22368.150
Name: price, dtype: float64

In [500]:
# Stage 1: keep 60% of the rows for training and set the remaining 40% aside.
# The seed is fixed so the partition is the same on every run.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Stage 2: cut the 40% remainder in half, giving a validation set and a test
# set that are each about 20% of the rows passed to stage 1 (same seed).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)


In [502]:
# Report how many rows landed in each partition.
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

# Label and preview the training features (the leading newline separates the
# label from the size report above).
print("\nTraining Set:")
X_train.head()

Training set size: 2935
Validation set size: 979
Test set size: 979

Training Set:


Unnamed: 0,engine_capacity,cylinder,horse_power,top_speed,seats,brand,year,country_egypt,country_ksa,country_kuwait,country_oman,country_qatar,country_uae
1904,1.6,4,113,180.0,5,37402.913407,2021,0,0,0,0,1,0
4445,3.8,6,355,182.0,8,26704.474034,2021,0,0,0,0,0,1
3231,1.8,4,140,190.0,5,34519.670927,2021,0,0,1,0,0,0
1869,1.6,4,115,180.0,5,30916.937913,2021,0,0,0,0,1,0
1565,3.0,6,367,250.0,5,92684.142671,2021,0,0,0,0,0,0


In [503]:
# Label and preview the first rows of the validation features.
print("\nValidation Set:")
X_val.head()


Validation Set:


Unnamed: 0,engine_capacity,cylinder,horse_power,top_speed,seats,brand,year,country_egypt,country_ksa,country_kuwait,country_oman,country_qatar,country_uae
4526,2.0,4,255,209.0,5,92684.142671,2022,0,0,0,0,0,1
835,1.5,4,165,218.0,5,51020.103477,2021,1,0,0,0,0,0
84,1.6,4,123,190.0,5,30522.559811,2021,0,1,0,0,0,0
1533,2.0,4,246,217.0,5,113791.520009,2021,0,0,0,0,0,0
2647,2.5,4,164,170.0,5,48408.401739,2021,0,0,0,1,0,0


In [504]:
# Label and preview the first rows of the test features.
print("\nTest Set:")
X_test.head()


Test Set:


Unnamed: 0,engine_capacity,cylinder,horse_power,top_speed,seats,brand,year,country_egypt,country_ksa,country_kuwait,country_oman,country_qatar,country_uae
73,1.2,4,82,165.0,5,27715.819891,2021,0,1,0,0,0,0
1149,2.0,4,245,230.0,5,60761.687026,2021,0,0,0,0,0,0
3721,3.5,6,296,200.0,5,74941.936909,2021,0,0,1,0,0,0
106,2.9,6,510,283.0,5,61870.49071,2021,0,1,0,0,0,0
4137,1.5,4,152,180.0,5,52818.790977,2021,0,0,0,0,0,1
