In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error, r2_score

def create_features(df):
    """Return a copy of *df* with four ratio features appended.

    Added columns:
        rooms_per_household      = total_rooms / households
        bedrooms_per_room        = total_bedrooms / total_rooms
        population_per_household = population / households
        income_per_household     = median_income / households

    The input frame is not modified. Rows whose denominator is zero get NaN
    in the affected columns instead of +/-inf, because infinite values make
    the downstream ``StandardScaler`` raise while NaN is tolerated as a
    missing value.

    Args:
        df: DataFrame containing the columns ``total_rooms``,
            ``total_bedrooms``, ``population``, ``households`` and
            ``median_income``.

    Returns:
        A new DataFrame with the original columns plus the four ratios.
    """
    df = df.copy()

    # Mask zero denominators so the divisions yield NaN rather than inf.
    households = df['households'].replace(0, np.nan)
    total_rooms = df['total_rooms'].replace(0, np.nan)

    df['rooms_per_household'] = df['total_rooms'] / households
    df['bedrooms_per_room'] = df['total_bedrooms'] / total_rooms
    df['population_per_household'] = df['population'] / households
    df['income_per_household'] = df['median_income'] / households
    return df

# Load the training and validation splits.
train_data = pd.read_csv('../ynov-data/train_housing_train.csv')
valid_data = pd.read_csv('../ynov-data/train_housing_valid.csv')

# Columns removed from the model inputs: longitude/latitude, the row id and
# the target itself.
columns_to_drop = ['longitude', 'latitude', 'id', 'median_house_value']

# Target vector first, then the engineered training feature matrix.
y_train = train_data['median_house_value']
train_inputs = train_data.drop(columns=columns_to_drop)
X_train = create_features(train_inputs)

# Preprocessing: split the columns by dtype and route each group to its own
# transformer.
categorical_features = X_train.select_dtypes(include=['object']).columns
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns

# Numeric columns are standardized.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Object-typed columns are target-encoded.
categorical_transformer = TargetEncoder()

column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# XGBoost on GPU.
# `tree_method='gpu_hist'` is deprecated since XGBoost 2.0 (it triggers the
# 'E.g. tree_method = "hist", device = "cuda"' warning); the GPU is now
# selected with the separate `device` parameter.
# NOTE: the pipeline still feeds CPU-resident data to the regressor, so
# XGBoost may warn about mismatched devices at predict time.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(
        n_estimators=200,
        max_depth=7,
        learning_rate=0.1,
        tree_method='hist',
        device='cuda',  # train on the GPU
        random_state=42
    ))
])

# Fit the whole pipeline (preprocessing + regressor) on the training set.
model.fit(X_train, y_train)

# Metrics on the training set itself (RMSE and R²).
train_predictions = model.predict(X_train)
train_mse = mean_squared_error(y_train, train_predictions)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, train_predictions)

print(f'Training RMSE: {train_rmse:.2f}')
print(f'Training R²: {train_r2:.2f}')

# Validation: build the same features for the hold-out split and score the
# already-fitted pipeline on it.
y_valid = valid_data['median_house_value']
valid_inputs = valid_data.drop(columns=columns_to_drop)
X_valid = create_features(valid_inputs)
valid_predictions = model.predict(X_valid)

valid_mse = mean_squared_error(y_valid, valid_predictions)
rmse = np.sqrt(valid_mse)
r2 = r2_score(y_valid, valid_predictions)

print(f'Validation RMSE: {rmse:.2f}')
print(f'Validation R²: {r2:.2f}')


    E.g. tree_method = "hist", device = "cuda"



Training RMSE: 27313.21
Training R²: 0.94
Validation RMSE: 60280.19
Validation R²: 0.74



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


