In [2]:
import pickle
from pathlib import Path

import pandas as pd

# Preprocessing and training
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


In [3]:
# Load the cleaned car-price dataset from the project's data folder.
csv_path = '../data/car_price_cleaned.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

In [4]:
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)

Unnamed: 0,Year,Engine Size,Fuel Type,Transmission,Mileage,Condition,Price,Model
0,2016,2.3,Electric,Manual,114832,New,20000-25000,Model X
1,2018,4.4,Electric,Manual,143190,Used,10000-15000,5 Series
2,2013,4.5,Electric,Manual,181601,New,5000-10000,A4
3,2011,4.1,Electric,Automatic,68682,New,20000-25000,Model Y
4,2009,2.6,Diesel,Manual,223009,Like New,0-5000,Mustang


In [5]:
# Print column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2250 entries, 0 to 2249
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Year          2250 non-null   int64  
 1   Engine Size   2250 non-null   float64
 2   Fuel Type     2250 non-null   object 
 3   Transmission  2250 non-null   object 
 4   Mileage       2250 non-null   int64  
 5   Condition     2250 non-null   object 
 6   Price         2250 non-null   object 
 7   Model         2250 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 140.8+ KB


### Conversion des colonnes de type 'object' en type 'category'


In [6]:
# Cast every object-dtype column to the pandas 'category' dtype in one astype call.
object_columns = df.select_dtypes(include=['object']).columns
df = df.astype({name: 'category' for name in object_columns})

In [7]:
# Re-inspect the dtypes to confirm the object columns are now 'category'.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2250 entries, 0 to 2249
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Year          2250 non-null   int64   
 1   Engine Size   2250 non-null   float64 
 2   Fuel Type     2250 non-null   category
 3   Transmission  2250 non-null   category
 4   Mileage       2250 non-null   int64   
 5   Condition     2250 non-null   category
 6   Price         2250 non-null   category
 7   Model         2250 non-null   category
dtypes: category(5), float64(1), int64(2)
memory usage: 65.8 KB


### Séparation de la variable cible (y) et des variables explicatives (X)

In [8]:
# Split the frame into the target (price range) and the explanatory features.
target_column = "Price"
y = df[target_column]
X = df.drop(columns=[target_column])

### Encodage de la variable cible


In [9]:
# Encode the price-range labels as integer class codes. The fitted encoder stays
# available as `price_label_encoder` so the codes can be mapped back to labels.
price_label_encoder = LabelEncoder().fit(y)
y = price_label_encoder.transform(y)

print(y)

[3 1 5 ... 5 5 5]


### Préprocessing 

In [10]:
# Column groups; each group is routed to its own sub-pipeline by the
# ColumnTransformer defined at the end of this cell.
numeric_features = ["Year", "Engine Size", "Mileage"]
categorical_features_non_ord = ["Fuel Type", "Transmission"]
# NOTE(review): "Model" is grouped with the ordinal features although a car model
# name has no obvious order, and no explicit category order is passed to the
# OrdinalEncoder for "Condition" -- confirm both choices are intended.
categorical_features_ord = ["Condition", "Model"]

# Numeric columns: fill missing values with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Non-ordinal categoricals: fill with the most frequent value, then one-hot encode.
# Categories unseen during fit are encoded as all zeros instead of raising.
categorical_non_ord_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Ordinal categoricals: fill with the most frequent value, then integer-encode.
# Without handle_unknown the encoder raises at predict time on any category that
# was absent from the training split; unknown values are mapped to -1 instead,
# mirroring the tolerant behaviour of the one-hot branch above.
# The step name "ordinay" (sic) is kept unchanged because it is part of the
# pipeline's parameter paths.
categorical_ord_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinay", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

# Combine the three branches; columns not listed above are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat_non_ord", categorical_non_ord_transformer, categorical_features_non_ord),
        ("cat_ord", categorical_ord_transformer, categorical_features_ord),
    ],
    remainder="drop"
)

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y -- confirm this is acceptable
# given how few rows the rarest price ranges have.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Modèle v1 — Random Forest

In [12]:
# Model v1: preprocessing followed by a random forest classifier.
# The preprocessor is cloned because Pipeline.fit fits its steps in place; an
# unfitted copy gives this pipeline its own transformer instead of sharing one
# object with every other pipeline built from `preprocessor`.
model_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(n_estimators=1000, random_state=42)),
])

# Train the model; fit() is the last expression so the fitted pipeline is displayed.
model_pipeline.fit(X_train, y_train)

In [13]:
# Evaluate the fitted pipeline on the held-out test split.
y_pred = model_pipeline.predict(X_test)

# Pass the complete list of encoded labels explicitly. With target_names alone,
# classification_report infers the labels from y_test / y_pred, so a price range
# missing from this split would make the names mismatch the rows (or raise).
class_names = price_label_encoder.classes_
class_labels = list(range(len(class_names)))

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=class_names,
    zero_division=0,  # a never-predicted class scores 0 without an UndefinedMetricWarning
))

Accuracy: 0.61

Classification Report:
              precision    recall  f1-score   support

      0-5000       0.62      0.56      0.59        75
 10000-15000       0.60      0.56      0.58       119
 15000-20000       0.71      0.52      0.60        75
 20000-25000       0.71      0.25      0.37        20
 25000-30000       0.00      0.00      0.00         1
  5000-10000       0.60      0.77      0.67       160

    accuracy                           0.61       450
   macro avg       0.54      0.44      0.47       450
weighted avg       0.62      0.61      0.61       450



In [14]:
# Persist the fitted v1 pipeline, then reload it to confirm the pickle round-trips.
model_path = Path("../models/modelv1.pkl")
# open(..., "wb") does not create missing folders, so make sure the target exists.
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_path, "wb") as f:
    pickle.dump(model_pipeline, f)

print("Modèle sauvegardé dans modelv1.pkl")

# SECURITY: pickle.load can execute arbitrary code -- only load files produced
# by this notebook or another trusted source.
with open(model_path, "rb") as f:
    loaded_model = pickle.load(f)

print("Modèle chargé depuis modelv1.pkl")

# The reload is only useful if it is checked: the restored pipeline must predict
# exactly what the in-memory pipeline predicts.
assert (loaded_model.predict(X_test) == model_pipeline.predict(X_test)).all(), \
    "reloaded model does not reproduce the in-memory predictions"

Modèle sauvegardé dans modelv1.pkl
Modèle chargé depuis modelv1.pkl


### Modèle v2 — Gradient Boosting

In [15]:
# Model v2: preprocessing followed by a gradient boosting classifier.
# The preprocessor is cloned because Pipeline.fit fits its steps in place; an
# unfitted copy gives this pipeline its own transformer instead of sharing one
# object with every other pipeline built from `preprocessor`.
model_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=42)),
])

# Train the model; fit() is the last expression so the fitted pipeline is displayed.
model_pipeline.fit(X_train, y_train)

In [16]:
# Evaluate the fitted pipeline on the held-out test split.
y_pred = model_pipeline.predict(X_test)

# Pass the complete list of encoded labels explicitly. With target_names alone,
# classification_report infers the labels from y_test / y_pred, so a price range
# missing from this split would make the names mismatch the rows (or raise).
class_names = price_label_encoder.classes_
class_labels = list(range(len(class_names)))

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=class_names,
    zero_division=0,  # a never-predicted class scores 0 without an UndefinedMetricWarning
))

Accuracy: 0.69

Classification Report:
              precision    recall  f1-score   support

      0-5000       0.86      0.73      0.79        75
 10000-15000       0.61      0.59      0.60       119
 15000-20000       0.66      0.56      0.60        75
 20000-25000       0.42      0.25      0.31        20
 25000-30000       0.00      0.00      0.00         1
  5000-10000       0.72      0.88      0.79       160

    accuracy                           0.69       450
   macro avg       0.54      0.50      0.52       450
weighted avg       0.69      0.69      0.69       450



In [17]:
# Persist the fitted v2 pipeline, then reload it to confirm the pickle round-trips.
model_path = Path("../models/modelv2.pkl")
# open(..., "wb") does not create missing folders, so make sure the target exists.
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_path, "wb") as f:
    pickle.dump(model_pipeline, f)

print("Modèle sauvegardé dans modelv2.pkl")

# SECURITY: pickle.load can execute arbitrary code -- only load files produced
# by this notebook or another trusted source.
with open(model_path, "rb") as f:
    loaded_model = pickle.load(f)

print("Modèle chargé depuis modelv2.pkl")

# The reload is only useful if it is checked: the restored pipeline must predict
# exactly what the in-memory pipeline predicts.
assert (loaded_model.predict(X_test) == model_pipeline.predict(X_test)).all(), \
    "reloaded model does not reproduce the in-memory predictions"

Modèle sauvegardé dans modelv2.pkl
Modèle chargé depuis modelv2.pkl
