In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import os

In [2]:
# Load the cleaned dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must only
# ever be a pickle produced by this project — never one from an untrusted source.
clean_data_path = "../data/processed/df_clean.pkl"
df_clean = pd.read_pickle(clean_data_path)

# Sanity check: print the dimensions, then preview the first rows.
print(f"Shape du dataset chargé : {df_clean.shape}")
display(df_clean.head())

Shape du dataset chargé : (4598, 14)


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_basement,city_mean_price,house_age,renovated,age_since_renov
0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,0,420392.364047,63,1,13
1,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,280,579509.755748,97,0,97
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,0,439492.444648,52,0,52
3,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,847180.662995,55,0,55
4,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,800,664965.780656,42,1,26


In [3]:
# --- Configuration: everything a reader might want to tune lives here ---
TARGET = "price"                     # column to predict
TEST_SIZE = 0.2                      # fraction of rows held out for testing
RANDOM_STATE = 42                    # fixed seed so the split is reproducible
PROCESSED_DIR = "../data/processed"  # destination of every artefact written below

# Separate the target from the features.
X = df_clean.drop(columns=[TARGET])
y = df_clean[TARGET]

# NOTE(review): `city_mean_price` arrives pre-computed in df_clean. If it was
# derived from `price` over the full dataset, it leaks target information into
# the test rows — confirm how the upstream cleaning step builds it.

# Train / test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# The CSVs below are written without their index, so features and target are
# matched purely by row position downstream. Check the invariants now, while
# the index is still available.
assert len(X_train) + len(X_test) == len(X), "split lost or duplicated rows"
assert X_train.index.equals(y_train.index), "X_train / y_train rows are misaligned"
assert X_test.index.equals(y_test.index), "X_test / y_test rows are misaligned"

# Standardise every feature column (this includes the 0/1 flag columns, not
# only the continuous ones). The scaler is fitted on the training rows only
# and then applied unchanged to the test rows, so no test statistics leak
# into the transformation.
df_cols = X_train.columns

scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[df_cols]),
    columns=df_cols,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test[df_cols]),
    columns=df_cols,
    index=X_test.index,
)

# Persist the splits.
os.makedirs(PROCESSED_DIR, exist_ok=True)

X_train_scaled.to_csv(f"{PROCESSED_DIR}/X_train.csv", index=False)
X_test_scaled.to_csv(f"{PROCESSED_DIR}/X_test.csv", index=False)
y_train.to_csv(f"{PROCESSED_DIR}/y_train.csv", index=False)
y_test.to_csv(f"{PROCESSED_DIR}/y_test.csv", index=False)

# Persist the fitted scaler so the exact same transformation can be re-applied
# to new data later.
joblib.dump(scaler, f"{PROCESSED_DIR}/scaler.pkl")

print(f"✅ Données et scaler sauvegardés dans {PROCESSED_DIR}/")


✅ Données et scaler sauvegardés dans ../data/processed/
