In [None]:
import numpy as np
import optuna
import pandas as pd
from category_encoders import TargetEncoder
from joblib import parallel_backend
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

def create_features(df):
    """Return a copy of ``df`` with clipped base columns and engineered features.

    The five base columns listed in ``numeric_cols`` are clipped to
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``. The quartiles are computed from the
    frame passed in, so every call uses the bounds of its own data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``median_income``, ``total_rooms``, ``total_bedrooms``,
        ``population``, ``households`` and ``housing_median_age``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the clipped base
        columns plus the derived features. Ratio features whose denominator
        is zero are set to NaN instead of +/-inf.
    """
    df = df.copy()

    # Outlier handling: clip each base column to its IQR fences.
    numeric_cols = ['median_income', 'total_rooms', 'total_bedrooms', 'population', 'households']
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        df[col] = df[col].clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

    # Non-linear transformations
    df['income_log'] = np.log1p(df['median_income'])
    df['rooms_log'] = np.log1p(df['total_rooms'])
    df['income_squared'] = df['median_income'] ** 2

    # Ratios and densities
    df['rooms_per_household'] = df['total_rooms'] / df['households']
    df['bedrooms_ratio'] = df['total_bedrooms'] / df['total_rooms']
    df['population_density'] = df['population'] / df['households']

    # Economic interactions
    df['income_per_person'] = df['median_income'] / df['population']
    df['rooms_per_income'] = df['total_rooms'] / df['median_income']
    df['income_age_interaction'] = df['median_income'] * df['housing_median_age']

    # Composite metrics (the +1 keeps the denominator away from zero)
    df['crowding_index'] = (df['population'] * df['housing_median_age']) / (df['total_rooms'] + 1)
    df['economic_density'] = (df['median_income'] * df['households']) / (df['total_rooms'] + 1)

    # A zero denominator yields +/-inf, which downstream imputers and
    # estimators reject; turn it into NaN so it is treated as missing.
    ratio_cols = [
        'rooms_per_household',
        'bedrooms_ratio',
        'population_density',
        'income_per_person',
        'rooms_per_income',
    ]
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)

    return df

# Load the pre-split training and validation sets.
train_data = pd.read_csv('../ynov-data/train_housing_train.csv')
valid_data = pd.read_csv('../ynov-data/train_housing_valid.csv')

# Coordinates, the row identifier and the target are removed before
# feature engineering; the target is kept separately as y_*.
columns_to_drop = ['longitude', 'latitude', 'id', 'median_house_value']
X_train = create_features(train_data.drop(columns=columns_to_drop))
y_train = train_data['median_house_value']

X_valid = create_features(valid_data.drop(columns=columns_to_drop))
y_valid = valid_data['median_house_value']

# Preprocessing
# 'number' selects every numeric dtype, not only float64/int64, so a numeric
# column stored with another width is not silently left out.
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Numeric columns: median imputation, then RobustScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

# Categorical columns: fill gaps with the literal 'missing', then target-encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Columns are referenced by name, so only the features selected above are used.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of the forest pipeline.

    Samples random-forest hyperparameters from ``trial``, wraps the forest
    behind the shared ``preprocessor`` and scores it on ``X_train``/``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameter suggestions.

    Returns
    -------
    float
        Mean RMSE over the folds (sign flipped from sklearn's negated score),
        to be minimised.
    """
    # Suggestions are drawn in a fixed order, one statement each.
    n_estimators = trial.suggest_int('n_estimators', 200, 400, step=50)
    # Depth range narrowed to limit overfitting.
    max_depth = trial.suggest_int('max_depth', 10, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 5, 15)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 7)
    # Narrower range for the feature fraction.
    max_features = trial.suggest_float('max_features', 0.5, 0.7)
    max_samples = trial.suggest_float('max_samples', 0.6, 0.8)

    forest = RandomForestRegressor(
        random_state=42,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        max_samples=max_samples,
        bootstrap=True,
        n_jobs=-1,
    )
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', forest),
    ])
    folds = KFold(n_splits=5, shuffle=True, random_state=42)

    with parallel_backend('threading', n_jobs=-1):
        fold_scores = cross_val_score(
            pipeline,
            X_train,
            y_train,
            cv=folds,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
        )

    # The scorer returns negated RMSE; flip the sign to get a loss.
    return -np.mean(fold_scores)

# Fewer trials but broader exploration
n_trials = 20

# Create SQLite storage
storage_name = "sqlite:///./stats.sqlite3?timeout=10000&check_same_thread=False"
study_name = "random_forest_housing_optimization"

# Create the study, or resume it if one with this name is already stored.
# NOTE: with load_if_exists=True, trials persisted by earlier runs are reused,
# so study.best_params may come from a trial sampled under a different search
# space than the one objective() currently defines.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    direction='minimize',
    sampler=optuna.samplers.TPESampler(multivariate=True, seed=42),
    load_if_exists=True
)

# Run a limited number of trials in parallel to avoid DB locks; progress is
# reported through a manual tqdm bar advanced by a per-trial callback.
with tqdm(total=n_trials, desc='Optimization Progress') as pbar:
    study.optimize(
        objective,
        n_trials=n_trials,
        n_jobs=4,  # Reduced from -1 to avoid DB contention
        show_progress_bar=False,
        callbacks=[lambda study, trial: pbar.update(1)]
    )

print("\nMeilleurs paramètres:", study.best_params)
print("Meilleur score:", study.best_value)

# Final model with best parameters.
# best_params only holds the sampled values, so the settings that objective()
# fixes for every trial are restored here as defaults; anything present in
# best_params takes precedence.
final_params = {'bootstrap': True, 'n_jobs': -1, **study.best_params}
final_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42, **final_params))
])

final_model.fit(X_train, y_train)

# Metrics on the training set
train_predictions = final_model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
train_r2 = r2_score(y_train, train_predictions)

print(f'\nTraining RMSE: {train_rmse:.2f}')
print(f'Training R²: {train_r2:.2f}')

# Metrics on the held-out validation set
valid_predictions = final_model.predict(X_valid)
valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
valid_r2 = r2_score(y_valid, valid_predictions)

print(f'Validation RMSE: {valid_rmse:.2f}')
print(f'Validation R²: {valid_r2:.2f}')

# Generate submission file
test_data = pd.read_csv('../ynov-data/test_housing.csv')
# Drop the same non-feature columns as for train/validation (the target is
# not dropped here), so all three sets go through create_features with the
# same inputs.
test_columns_to_drop = [col for col in columns_to_drop if col != 'median_house_value']
X_test = create_features(test_data.drop(columns=test_columns_to_drop, errors='ignore'))
test_predictions = final_model.predict(X_test)

submission = pd.DataFrame({
    'id': test_data['id'],
    'median_house_value': test_predictions
})
submission.to_csv('../ynov-data/submission.csv', index=False)

[I 2024-12-12 13:55:18,029] Using an existing study with name 'random_forest_housing_optimization' instead of creating a new one.
[I 2024-12-12 13:55:49,534] Trial 33 finished with value: 59246.61740435292 and parameters: {'n_estimators': 200, 'max_depth': 19, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 0.5, 'max_samples': 0.7756501862687577}. Best is trial 25 with value: 58139.32794555723.
Optimization Progress:   5%|█                    | 1/20 [00:31<09:58, 31.51s/it][I 2024-12-12 13:55:50,080] Trial 35 finished with value: 59188.93800457435 and parameters: {'n_estimators': 250, 'max_depth': 12, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 0.7, 'max_samples': 0.7567481531154348}. Best is trial 25 with value: 58139.32794555723.
Optimization Progress:  10%|██                   | 2/20 [00:32<03:59, 13.29s/it][I 2024-12-12 13:55:50,084] Trial 34 finished with value: 59163.37725181684 and parameters: {'n_estimators': 400, 'max_depth': 19, 'min_samples_


Meilleurs paramètres: {'n_estimators': 300, 'max_depth': 30, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 0.6000000000000001, 'max_samples': 0.6, 'criterion': 'squared_error', 'ccp_alpha': 0.01}
Meilleur score: 58139.32794555723

Training RMSE: 42232.53
Training R²: 0.87
Validation RMSE: 60886.47
Validation R²: 0.74
