In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import joblib

# Charger les données depuis le fichier Excel
data = pd.read_excel('new.xlsx')

# Convertir le rating en numérique (supposons que c'est au format '4/5*')
data['rating'] = data['rating'].str.extract(r'(\d+)').astype(float)
data['rating'] = data['rating'] / 5.0
data['rating'] = data['rating'].fillna(0)  # Remplacer les valeurs manquantes par 0

# Encoder la variable catégorielle 'brand_name' avec LabelEncoder
label_encoder = LabelEncoder()
data['brand_name_encoded'] = label_encoder.fit_transform(data['brand_name'])

# Sélectionner uniquement les colonnes 'rating' et 'brand_name_encoded' pour l'entraînement du modèle
x = data[['rating', 'brand_name_encoded']]
y = data['product_price']

# Sauvegarder les noms des caractéristiques
feature_names = x.columns.tolist()
joblib.dump(feature_names, 'feature_names.pkl')

# Diviser les données en ensembles d'entraînement et de test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Définir les hyperparamètres à rechercher
param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialiser le modèle
rf = RandomForestRegressor(random_state=42)

# Initialiser RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions,
                                   n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1, error_score='raise')

# Entraîner le modèle
random_search.fit(x_train, y_train)

# Meilleurs paramètres
print(f'Best parameters found: {random_search.best_params_}')

# Utiliser le meilleur modèle trouvé
best_model = random_search.best_estimator_

# Prédictions
y_pred_best = best_model.predict(x_test)

# Évaluation du modèle
mae_best = mean_absolute_error(y_test, y_pred_best)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
print(f'Optimized MAE: {mae_best}')
print(f'Optimized RMSE: {rmse_best}')

# Sauvegarder le modèle
joblib.dump(best_model, 'best_model.pkl')

# Sauvegarder l'encodeur
joblib.dump(label_encoder, 'label_encoder.pkl')

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found: {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 10}
Optimized MAE: 102.18429454562617
Optimized RMSE: 160.18545715374233


['label_encoder.pkl']

[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.1s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   0.2s
[CV] END max_depth=40, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   0.2s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=500; total time=   0.3s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.2s
[CV] END max_depth=50, max_fe