In [26]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error, r2_score

def remove_outliers(df, columns, n_std):
    """Drop rows lying more than ``n_std`` standard deviations from a column mean.

    Columns are handled one after another, and each column's mean and
    standard deviation are computed on the rows that survived the previous
    columns, so the result can depend on the order of ``columns``.  A row
    whose value is NaN in a processed column fails the bound check and is
    dropped as well.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Iterable of column names to filter on.
        n_std: Half-width of the accepted band, in standard deviations.

    Returns:
        A new DataFrame holding only the rows that fall inside the band
        (bounds inclusive) for every listed column.
    """
    kept = df.copy()
    for name in columns:
        values = kept[name]
        center = values.mean()
        half_width = n_std * values.std()
        # ``between`` is inclusive on both ends by default.
        inside = values.between(center - half_width, center + half_width)
        kept = kept[inside]
    return kept

def create_features(df):
    """Build the engineered feature matrix fed to the regression pipeline.

    Derives log transforms, interaction terms, polynomial income terms and
    per-household / per-person ratios from the raw housing columns, then
    returns only the selected feature columns.

    Args:
        df: DataFrame containing the columns ``median_income``,
            ``total_rooms``, ``population``, ``households``,
            ``housing_median_age`` and ``ocean_proximity``.  It is not
            modified.

    Returns:
        A new DataFrame with exactly the columns listed in ``keep_cols``,
        in that order, and the same index as ``df``.

    Raises:
        KeyError: If one of the required input columns is missing.
    """
    income = df['median_income']
    rooms = df['total_rooms']
    population = df['population']
    households = df['households']

    # NOTE(review): the ratios below are unguarded; a zero in ``households``
    # or ``population`` produces inf/NaN, which the downstream estimator
    # would have to cope with.
    # A log-population column used to be computed here but was never part of
    # the returned columns, so it has been dropped as dead work.
    engineered = df.assign(
        # Log transforms for the positive variables
        log_income=np.log1p(income),
        log_rooms=np.log1p(rooms),
        # Interaction terms
        income_age=income * df['housing_median_age'],
        rooms_income=rooms * income,
        density=population / households,
        rooms_per_person=rooms / population,
        # Polynomial terms on income
        income_squared=income ** 2,
        income_cubed=income ** 3,
        # Economic ratios
        income_per_household=income / households,
        rooms_per_household=rooms / households,
    )

    # Final column selection (also fixes the column order seen by the model)
    keep_cols = [
        'log_income', 'log_rooms', 'income_age', 'rooms_income',
        'density', 'rooms_per_person', 'income_squared', 'income_cubed',
        'income_per_household', 'rooms_per_household', 'housing_median_age',
        'ocean_proximity'
    ]

    return engineered[keep_cols]

# Load the pre-split training and validation sets
train_data = pd.read_csv('../ynov-data/train_housing_train.csv')
valid_data = pd.read_csv('../ynov-data/train_housing_valid.csv')

# Remove outliers from the training data only; the validation rows are left
# untouched so the reported validation metrics cover every validation sample
outlier_cols = ['median_income', 'total_rooms', 'population', 'households']
train_data = remove_outliers(train_data, outlier_cols, 3)

# Data preparation (the target is taken after outlier removal so that it
# stays aligned with the feature rows)
X_train = create_features(train_data)
y_train = train_data['median_house_value']

# Split the features by dtype.  Every column is either numeric or treated as
# categorical: a hard-coded dtype list (float64/int64/object) would leave any
# other dtype in neither group, and ColumnTransformer drops unlisted columns
# by default.
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(exclude='number').columns

# Numeric preprocessing: standardize, then apply a Yeo-Johnson power transform
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('power', PowerTransformer(method='yeo-johnson'))
])

# Categorical preprocessing: target encoding (fitted with y inside the pipeline)
categorical_transformer = TargetEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Final pipeline: preprocessing followed by ordinary least squares
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Training
model.fit(X_train, y_train)

# Training metrics
train_predictions = model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
train_r2 = r2_score(y_train, train_predictions)

print(f'Training RMSE: {train_rmse:.2f}')
print(f'Training R²: {train_r2:.2f}')

# Validation: same feature engineering, no outlier removal
X_valid = create_features(valid_data)
y_valid = valid_data['median_house_value']
valid_predictions = model.predict(X_valid)

rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
r2 = r2_score(y_valid, valid_predictions)

print(f'Validation RMSE: {rmse:.2f}')
print(f'Validation R²: {r2:.2f}')

Training RMSE: 67448.20
Training R²: 0.63
Validation RMSE: 72066.90
Validation R²: 0.63
