In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score

class ScoreMatchedRandomForestPredictor:
    def __init__(self):
        self.model = RandomForestRegressor(
            n_estimators=400,
            max_depth=30,
            min_samples_split=6,
            min_samples_leaf=3,
            max_features=0.45844810252897433,
            bootstrap=True,
            oob_score=True,
            #random_state=42,
            n_jobs=-1
        )
        self.preprocessor = None
        
    def prepare_features(self, X):
        """Feature engineering calibré"""
        X = X.copy()
        
        # Features de base essentielles
        X['rooms_per_household'] = X['total_rooms'] / X['households']
        X['bedrooms_per_room'] = X['total_bedrooms'] / X['total_rooms']
        X['population_density'] = X['population'] / X['total_rooms']
        X['households_per_population'] = X['households'] / X['population']
        
        # Features financières simples mais efficaces
        X['income_per_household'] = X['median_income'] / X['households']
        X['income_per_population'] = X['median_income'] / X['population']
        
        # Features géographiques basiques
        X['location_cluster'] = (X['latitude'] // 0.5) * 100 + (X['longitude'] // 0.5)
        
        # Transformations log uniquement
        for col in ['median_income', 'total_rooms', 'total_bedrooms', 'population', 'households']:
            X[f'{col}_log'] = np.log1p(X[col])
        
        # Features d'interaction limitées
        X['income_density'] = X['median_income'] * X['population_density']
        
        # Gestion des valeurs aberrantes
        X = X.replace([np.inf, -np.inf], np.nan)
        
        return X
        
    def fit(self, X_train, y_train):
        X_train = self.prepare_features(X_train)
        
        categorical_features = [col for col in ['ocean_proximity'] if col in X_train.columns]
        numeric_features = [col for col in X_train.columns if col not in categorical_features]
        
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
        ])
        
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ])
        
        X_processed = self.preprocessor.fit_transform(X_train)
        self.model.fit(X_processed, y_train)
        
        print(f"Score Out-of-Bag: {self.model.oob_score_:.4f}")

    def predict(self, X):
        X = self.prepare_features(X)
        X_processed = self.preprocessor.transform(X)
        return self.model.predict(X_processed)
    
    def evaluate(self, X, y):
        predictions = self.predict(X)
        r2 = r2_score(y, predictions)
        rmse = np.sqrt(mean_squared_error(y, predictions))
        
        X = self.prepare_features(X)
        X_processed = self.preprocessor.transform(X)
        cv_scores = cross_val_score(self.model, X_processed, y, cv=5, scoring='r2')
        
        return {
            'r2_score': r2,
            'rmse': rmse,
            'cv_scores_mean': cv_scores.mean(),
            'cv_scores_std': cv_scores.std()
        }


def main():
    """Train, evaluate and export predictions for the housing data set.

    Reads the train / validation / test CSV files from ``../ynov-data``,
    fits a ``ScoreMatchedRandomForestPredictor``, prints R² and RMSE for the
    training and validation sets (plus the cross-validated R² for training)
    and writes the test predictions to ``../ynov-data/submit.csv``.

    Any exception is reported on stdout and then re-raised.
    """
    try:
        print("Chargement des données...")
        train_df = pd.read_csv('../ynov-data/train_housing_train.csv')
        valid_df = pd.read_csv('../ynov-data/train_housing_valid.csv')
        test_df = pd.read_csv('../ynov-data/test_housing.csv')

        # Separate features from the target in each frame
        target = 'median_house_value'
        X_train = train_df.drop(columns=[target, 'id'])
        y_train = train_df[target]
        X_valid = valid_df.drop(columns=[target, 'id', 'prediction'])
        y_valid = valid_df[target]
        X_test = test_df.drop(columns=['id'])

        # Build and train the model
        print("\nEntraînement du modèle Random Forest...")
        predictor = ScoreMatchedRandomForestPredictor()
        predictor.fit(X_train, y_train)

        # Score both labelled sets
        print("\nÉvaluation sur l'ensemble d'entraînement...")
        train_metrics = predictor.evaluate(X_train, y_train)

        print("\nÉvaluation sur l'ensemble de validation...")
        valid_metrics = predictor.evaluate(X_valid, y_valid)

        # Report the results
        cv_mean = train_metrics['cv_scores_mean']
        cv_std = train_metrics['cv_scores_std']
        print("\nPerformances sur l'ensemble d'entraînement:")
        print(f"R² score: {train_metrics['r2_score']:.4f}")
        print(f"RMSE: {train_metrics['rmse']:.2f}")
        print(f"CV R² moyen: {cv_mean:.4f} (±{cv_std:.4f})")

        print("\nPerformances sur l'ensemble de validation:")
        print(f"R² score: {valid_metrics['r2_score']:.4f}")
        print(f"RMSE: {valid_metrics['rmse']:.2f}")

        # Final predictions on the unlabelled test set
        print("\nGénération des prédictions de test...")
        test_predictions = predictor.predict(X_test)

        # Write the submission file: one row per test id
        print("Création du fichier de soumission...")
        submission = test_df[['id']].assign(median_house_value=test_predictions)
        submission.to_csv('../ynov-data/submit.csv', index=False)
        print("\nTerminé! Fichier de soumission créé avec succès.")

    except Exception as err:
        print(f"\nUne erreur s'est produite: {err}")
        raise

# Run the pipeline only when this code is executed as a script, not on import.
if __name__ == "__main__":
    main()




Chargement des données...

Entraînement du modèle Random Forest...
Score Out-of-Bag: 0.8071

Évaluation sur l'ensemble d'entraînement...

Évaluation sur l'ensemble de validation...

Performances sur l'ensemble d'entraînement:
R² score: 0.9381
RMSE: 28710.95
CV R² moyen: 0.8017 (±0.0059)

Performances sur l'ensemble de validation:
R² score: 0.8117
RMSE: 50801.95

Génération des prédictions de test...
Création du fichier de soumission...

Terminé! Fichier de soumission créé avec succès.


In [None]:

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
import optuna
import time
import logging
from datetime import datetime

# Logging setup: every record goes to the console and to a per-run log file
# whose name carries the start timestamp.
_run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(f'optuna_optimization_{_run_stamp}.log'),
    ],
)
logger = logging.getLogger(__name__)

class OptunaRandomForestPredictor:
    """Random-forest regressor whose hyperparameters are tuned with Optuna.

    ``fit`` engineers features, fits a ``ColumnTransformer`` (median
    imputation + ``RobustScaler`` for numeric columns, one-hot encoding for
    ``ocean_proximity``), runs an Optuna study that minimises the 5-fold
    cross-validated RMSE, and finally trains a forest with the best
    parameters found.

    Args:
        n_trials: Number of Optuna trials to run.
        random_state: Seed used for every forest and for the Optuna sampler,
            so that the whole search is repeatable.
    """

    def __init__(self, n_trials=30, random_state=42):
        self.n_trials = n_trials
        self.random_state = random_state
        # All three are populated by fit(); None means "not fitted yet".
        self.model = None
        self.preprocessor = None
        self.best_params = None

    def prepare_features(self, X):
        """Return a copy of ``X`` with the engineered columns appended.

        The input frame is not modified. ``X`` must provide the columns
        ``total_rooms``, ``total_bedrooms``, ``population``, ``households``,
        ``median_income``, ``latitude`` and ``longitude``. Every +/-inf in
        the result is replaced by NaN.
        """
        logger.info("Début de la préparation des features...")
        start_time = time.time()

        X = X.copy()

        # Basic ratio features
        logger.info("Création des features de base...")
        X['rooms_per_household'] = X['total_rooms'] / X['households']
        X['bedrooms_per_room'] = X['total_bedrooms'] / X['total_rooms']
        X['population_density'] = X['population'] / X['total_rooms']
        X['income_per_household'] = X['median_income'] / X['households']

        # Geographic features
        logger.info("Création des features géographiques...")
        # Absolute offset from longitude -122, used as a stand-in for coast distance
        X['distance_to_coast'] = np.abs(X['longitude'] + 122)
        # 0.5-degree grid cell encoded as a single number
        X['location_cluster'] = (X['latitude'] // 0.5) * 100 + (X['longitude'] // 0.5)

        # Log transforms
        logger.info("Application des transformations logarithmiques...")
        for col in ['median_income', 'total_rooms', 'total_bedrooms', 'population', 'households']:
            X[f'{col}_log'] = np.log1p(X[col])

        # Ratios with a zero denominator produce inf; turn them into NaN
        X = X.replace([np.inf, -np.inf], np.nan)

        duration = time.time() - start_time
        logger.info("Préparation des features terminée en %.2f secondes", duration)
        logger.info("Nombre total de features: %d", X.shape[1])

        return X

    def optimize_hyperparameters(self, X, y):
        """Search forest hyperparameters with Optuna and return the best model.

        Args:
            X: Already preprocessed feature matrix.
            y: Target values.

        Returns:
            An *unfitted* ``RandomForestRegressor`` configured with the best
            parameters; they are also stored in ``self.best_params``.
        """
        logger.info("Début de l'optimisation avec Optuna...")
        start_time = time.time()

        def objective(trial):
            params = {
                # `step` is passed by keyword: the current Optuna signature
                # declares it keyword-only and deprecates the positional form.
                'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
                'max_depth': trial.suggest_int('max_depth', 5, 30),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
                'max_features': trial.suggest_float('max_features', 0.1, 1.0)
            }

            model = RandomForestRegressor(
                **params,
                n_jobs=-1,
                random_state=self.random_state
            )

            # NOTE(review): both the forest and cross_val_score request all
            # cores; confirm this nesting does not oversubscribe the machine.
            scores = cross_val_score(
                model, X, y,
                cv=5,
                scoring='neg_root_mean_squared_error',
                n_jobs=-1
            )

            # The scorer returns negated RMSE; flip the sign back
            rmse = -scores.mean()

            logger.info("\nEssai %d:", trial.number)
            logger.info("Paramètres testés: %s", params)
            logger.info("RMSE: %.2f", rmse)

            return rmse

        # Seed the sampler so the sequence of trials is repeatable, matching
        # the seeded forests.
        study = optuna.create_study(
            direction='minimize',
            sampler=optuna.samplers.TPESampler(seed=self.random_state)
        )

        # Run the search
        logger.info("Démarrage des essais d'optimisation...")
        study.optimize(objective, n_trials=self.n_trials, show_progress_bar=True)

        # Log the outcome
        logger.info("\nMeilleurs résultats:")
        logger.info("Meilleurs paramètres: %s", study.best_params)
        logger.info("Meilleur score RMSE: %.2f", study.best_value)

        duration = time.time() - start_time
        logger.info("\nOptimisation terminée en %.2f minutes", duration / 60)

        self.best_params = study.best_params
        return RandomForestRegressor(
            **study.best_params, n_jobs=-1, random_state=self.random_state
        )

    def fit(self, X_train, y_train):
        """Fit the preprocessor, tune the forest and train the final model.

        Returns:
            ``self``, so calls can be chained.
        """
        logger.info("Début de l'entraînement...")
        total_start_time = time.time()

        logger.info("Préparation des features...")
        X_train = self.prepare_features(X_train)

        categorical_features = [col for col in ['ocean_proximity'] if col in X_train.columns]
        numeric_features = [col for col in X_train.columns if col not in categorical_features]

        logger.info("Configuration du préprocesseur...")
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ])

        # NOTE(review): the preprocessor is fitted on the full training set
        # before the cross-validated search, so each CV fold sees imputation
        # and scaling statistics computed from its own held-out rows.
        logger.info("Prétraitement des données...")
        X_processed = self.preprocessor.fit_transform(X_train)

        logger.info("Optimisation des hyperparamètres avec Optuna...")
        self.model = self.optimize_hyperparameters(X_processed, y_train)

        logger.info("Entraînement final avec les meilleurs paramètres...")
        self.model.fit(X_processed, y_train)

        total_duration = time.time() - total_start_time
        logger.info("Entraînement complet terminé en %.2f minutes", total_duration / 60)
        return self

    def predict(self, X):
        """Return predictions for the raw feature frame ``X``.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if self.preprocessor is None or self.model is None:
            raise RuntimeError(
                "Le modèle n'est pas entraîné : appelez fit() avant predict() ou evaluate()."
            )
        logger.info("Génération des prédictions...")
        start_time = time.time()
        X = self.prepare_features(X)
        X_processed = self.preprocessor.transform(X)
        predictions = self.model.predict(X_processed)
        logger.info("Prédictions générées en %.2f secondes", time.time() - start_time)
        return predictions

    def evaluate(self, X, y):
        """Return ``{'r2_score', 'rmse'}`` of the fitted model on ``(X, y)``.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        logger.info("Évaluation du modèle...")
        start_time = time.time()
        predictions = self.predict(X)
        r2 = r2_score(y, predictions)
        rmse = np.sqrt(mean_squared_error(y, predictions))
        logger.info("Évaluation terminée en %.2f secondes", time.time() - start_time)
        return {
            'r2_score': r2,
            'rmse': rmse
        }

def main():
    """Tune, train, evaluate and export predictions for the housing data set.

    Reads the train / validation / test CSV files from ``../ynov-data``,
    fits an ``OptunaRandomForestPredictor``, logs R² and RMSE for the
    training and validation sets and writes the test predictions to
    ``../ynov-data/submit.csv``.

    Any exception is logged with its traceback and then re-raised.
    """
    try:
        logger.info("Démarrage du programme...")
        logger.info("Chargement des données...")

        # Load the three files; identifier / helper columns are dropped at once
        train_df = pd.read_csv('../ynov-data/train_housing_train.csv').drop(columns=['id'])
        valid_df = pd.read_csv('../ynov-data/train_housing_valid.csv').drop(columns=['id', 'prediction'])
        test_df = pd.read_csv('../ynov-data/test_housing.csv')
        # Keep the ids for the submission and remove them from the features
        test_ids = test_df.pop('id')

        logger.info("Préparation des données...")
        target = 'median_house_value'
        X_train = train_df.drop(columns=[target])
        y_train = train_df[target]
        X_valid = valid_df.drop(columns=[target])
        y_valid = valid_df[target]
        X_test = test_df

        logger.info("Initialisation et entraînement du modèle...")
        predictor = OptunaRandomForestPredictor()
        predictor.fit(X_train, y_train)

        logger.info("Évaluation sur l'ensemble d'entraînement...")
        train_metrics = predictor.evaluate(X_train, y_train)

        logger.info("Évaluation sur l'ensemble de validation...")
        valid_metrics = predictor.evaluate(X_valid, y_valid)

        logger.info("\nRésultats des performances:")
        report = (
            ("Performances sur l'ensemble d'entraînement:", train_metrics),
            ("\nPerformances sur l'ensemble de validation:", valid_metrics),
        )
        for heading, metrics in report:
            logger.info(heading)
            logger.info(f"R² score: {metrics['r2_score']:.4f}")
            logger.info(f"RMSE: {metrics['rmse']:.2f}")

        logger.info("Génération des prédictions de test...")
        test_predictions = predictor.predict(X_test)

        logger.info("Création du fichier de soumission...")
        submission = pd.DataFrame({
            'id': test_ids,
            'median_house_value': test_predictions,
        })
        submission.to_csv('../ynov-data/submit.csv', index=False)
        logger.info("Fichier de soumission créé avec succès.")
        logger.info("Programme terminé avec succès.")

    except Exception as err:
        # logger.exception logs at ERROR level and attaches the traceback
        logger.exception(f"Une erreur s'est produite: {err}")
        raise

# Run the pipeline only when this code is executed as a script, not on import.
if __name__ == "__main__":
    main()
