In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error, r2_score

def _safe_divide(numerator, denominator):
    """Divide two Series element-wise, giving NaN where the denominator is 0.

    A plain division would produce +/-inf for a non-zero numerator over a
    zero denominator, and scikit-learn estimators refuse infinite inputs.
    """
    return numerator / denominator.where(denominator != 0)


def create_features(df):
    """Return a copy of ``df`` extended with derived ratio features.

    The input frame is not modified. It must contain the columns
    ``total_rooms``, ``households``, ``total_bedrooms``, ``population``,
    ``median_income`` and ``housing_median_age``.

    Added columns: ``rooms_per_household``, ``bedrooms_per_room``,
    ``population_per_household``, ``income_per_person``, ``income_per_room``,
    ``age_income_ratio`` and ``density``. A ratio whose denominator is zero
    is set to NaN instead of +/-inf.
    """
    df = df.copy()

    # Household-level structure features
    df['rooms_per_household'] = _safe_divide(df['total_rooms'], df['households'])
    df['bedrooms_per_room'] = _safe_divide(df['total_bedrooms'], df['total_rooms'])
    df['population_per_household'] = _safe_divide(df['population'], df['households'])

    # Income-based features
    df['income_per_person'] = _safe_divide(df['median_income'], df['population'])
    df['income_per_room'] = _safe_divide(df['median_income'], df['total_rooms'])

    # Age and occupancy features
    df['age_income_ratio'] = _safe_divide(df['housing_median_age'], df['median_income'])
    df['density'] = _safe_divide(df['population'], df['total_rooms'])

    return df

# Read the training and validation splits from disk
train_data = pd.read_csv('../ynov-data/train_housing_train.csv')
valid_data = pd.read_csv('../ynov-data/train_housing_valid.csv')

# Columns kept out of the feature matrix: coordinates, the row id and the target
columns_to_drop = ['longitude', 'latitude', 'id', 'median_house_value']

# Target first, then the engineered feature matrix built from the remaining columns
y_train = train_data['median_house_value']
X_train = create_features(train_data.drop(columns=columns_to_drop))

# Split the feature columns by dtype so each group gets its own transformer
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Numeric columns go through StandardScaler, object columns through TargetEncoder
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = TargetEncoder()

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hyperparameters of the decision tree; random_state is fixed for reproducibility
tree_params = {
    'max_depth': 8,
    'min_samples_split': 5,
    'min_samples_leaf': 15,
    'random_state': 10,
}

# Full pipeline: preprocessing followed by the tree regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(**tree_params)),
])

# Fit the whole pipeline (preprocessing + tree) on the training split
model.fit(X_train, y_train)

# In-sample metrics
train_predictions = model.predict(X_train)
train_r2 = r2_score(y_train, train_predictions)
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))

print(
    f'Training RMSE: {train_rmse:.2f}',
    f'Training R²: {train_r2:.2f}',
    sep='\n',
)

# Validation metrics: same feature preparation as for the training split
y_valid = valid_data['median_house_value']
X_valid = create_features(valid_data.drop(columns=columns_to_drop))
valid_predictions = model.predict(X_valid)

r2 = r2_score(y_valid, valid_predictions)
rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))

print(
    f'Validation RMSE: {rmse:.2f}',
    f'Validation R²: {r2:.2f}',
    sep='\n',
)

# Predict on the test set and write the submission file
test_data = pd.read_csv('../ynov-data/test_housing.csv')

# Drop the same columns as for the training/validation splits so the model is
# fed the same feature set explicitly, instead of relying on the
# ColumnTransformer to discard the extra 'longitude'/'latitude' columns.
# errors='ignore' skips any listed column that the test file does not contain.
X_test = create_features(
    test_data.drop(columns=columns_to_drop, errors='ignore')
)
test_predictions = model.predict(X_test)

# One row per test id, in the original order of the test file
submission = pd.DataFrame({
    'id': test_data['id'],
    'median_house_value': test_predictions
})
submission.to_csv('../ynov-data/submission.csv', index=False)

Training RMSE: 56563.85
Training R²: 0.76
Validation RMSE: 65297.23
Validation R²: 0.70
