In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv('../data/car_price_cleaned.csv')

In [3]:
df.head()

Unnamed: 0,Car ID,Brand,Year,Engine Size,Fuel Type,Transmission,Mileage,Condition,Price,Model
0,1.0,Tesla,2016.0,2.3,Petrol,Manual,114832.0,New,20000-30000,Model X
1,2.0,BMW,2018.0,4.4,Electric,Manual,143190.0,Used,10000-20000,5 Series
2,3.0,Audi,2013.0,4.5,Electric,Manual,181601.0,New,40000-50000,A4
3,4.0,Tesla,2011.0,4.1,Diesel,Automatic,68682.0,New,80000-90000,Model Y
4,5.0,Ford,2009.0,2.6,Diesel,Manual,223009.0,Like New,70000-80000,Mustang


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2250 entries, 0 to 2249
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car ID        2250 non-null   float64
 1   Brand         2250 non-null   object 
 2   Year          2250 non-null   float64
 3   Engine Size   2250 non-null   float64
 4   Fuel Type     2250 non-null   object 
 5   Transmission  2250 non-null   object 
 6   Mileage       2250 non-null   float64
 7   Condition     2250 non-null   object 
 8   Price         2250 non-null   object 
 9   Model         2250 non-null   object 
dtypes: float64(4), object(6)
memory usage: 175.9+ KB


### Changer les types des colonnes du type 'object' en type 'category'


In [5]:
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype('category')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2250 entries, 0 to 2249
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Car ID        2250 non-null   float64 
 1   Brand         2250 non-null   category
 2   Year          2250 non-null   float64 
 3   Engine Size   2250 non-null   float64 
 4   Fuel Type     2250 non-null   category
 5   Transmission  2250 non-null   category
 6   Mileage       2250 non-null   float64 
 7   Condition     2250 non-null   category
 8   Price         2250 non-null   category
 9   Model         2250 non-null   category
dtypes: category(6), float64(4)
memory usage: 86.1 KB


### Encodages des variables du dataframe


In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

numeric_features = ["Year", "Engine Size", "Mileage"]
categorical_features_ord = ["Condition"]
categorical_features_non_ord = ["Fuel Type"]
label_encoding_features = ["Model","Price"]

 
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
 
categorical_non_ord_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
 
categorical_ord_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinal", OrdinalEncoder())
])
 
label_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinal", OrdinalEncoder())
])
 
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat_ord", categorical_ord_transformer, categorical_features_ord),
        ("cat_non_ord", categorical_non_ord_transformer, categorical_features_non_ord),
        ("label_enc", label_transformer, label_encoding_features),
    ],
    remainder="drop"
)

In [8]:
df['Price'].isna().sum()

np.int64(0)

In [19]:
y = df['Price']                # cible
X = df.drop(columns=['Price'])                # features (supprime Price)

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# pipeline complète avec modèle
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=100, random_state=42))
])

# fit et prédictions
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# évaluation
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.2f}, R2: {r2:.3f}")

ValueError: A given column is not a column of the dataframe

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
pipeline_robust = Pipeline([
    ('scaler', RobustScaler()),
    ('model', LogisticRegression())
])
pipeline_robust.fit(X_train, y_train)
y_pred_robust = pipeline_robust.predict(X_test)

In [15]:
print(y_train)

478     90000-100000
1247     50000-60000
2100     20000-30000
629      40000-50000
1040     70000-80000
            ...     
1638     40000-50000
1095     30000-40000
1130     20000-30000
1294     40000-50000
860      80000-90000
Name: Price, Length: 1800, dtype: category
Categories (10, object): ['0-10000', '10000-20000', '20000-30000', '30000-40000', ..., '60000-70000', '70000-80000', '80000-90000', '90000-100000']


In [None]:

# fit et prédictions
y_pred = pipeline.predict(X_test)

# évaluation
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.2f}, R2: {r2:.3f}")

In [None]:
# 1) fit + transform séparés
preprocessor.fit(df)
X = preprocessor.transform(df)

# 2) ou fit_transform en une seule ligne
X = preprocessor.fit_transform(df)

# obtenir les noms des features et construire un DataFrame encodé
cols = preprocessor.get_feature_names_out()
df_encoded = pd.DataFrame(X, columns=cols)
df_encoded.head()

Unnamed: 0,num__Year,num__Engine Size,num__Mileage,cat_ord__Condition,cat_non_ord__Fuel Type_Diesel,cat_non_ord__Fuel Type_Electric,cat_non_ord__Fuel Type_Hybrid,cat_non_ord__Fuel Type_Petrol,label_enc__Model,label_enc__Price,label_enc__Price_Range
0,0.633654,-0.830523,-0.401721,1.0,0.0,1.0,0.0,0.0,19.0,2.0,2.0
1,0.920232,0.640711,-0.079951,2.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
2,0.203788,0.71077,0.355888,1.0,0.0,1.0,0.0,0.0,3.0,3.0,3.0
3,-0.082789,0.430535,-0.925373,1.0,0.0,1.0,0.0,0.0,20.0,7.0,7.0
4,-0.369366,-0.620347,0.825733,0.0,1.0,0.0,0.0,0.0,21.0,6.0,6.0
