In [12]:
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [14]:
# Load the cleaned housing dataset from the working directory.
data = pd.read_csv(filepath_or_buffer='Cleaned_House.csv')

# Preview the first five rows as the cell's displayed output.
data.head(n=5)

Unnamed: 0,property_id,location_id,property_type,price,location,city,province_name,baths,purpose,bedrooms,Area_in_Square_Meters
0,237062,3325,Flat,10000000,G-10,Islamabad,Islamabad Capital,2,For Sale,2,101.1716
1,346905,3236,Flat,6900000,E-11,Islamabad,Islamabad Capital,3,For Sale,3,141.64024
2,386513,764,House,16500000,G-15,Islamabad,Islamabad Capital,6,For Sale,5,202.3432
3,656161,340,House,43500000,Bani Gala,Islamabad,Islamabad Capital,4,For Sale,4,1011.714
4,841645,3226,House,7000000,DHA Defence,Islamabad,Islamabad Capital,3,For Sale,3,202.3432


In [15]:
# Separate the regression target (y) from the remaining columns (X).
y = data['price']
X = data.drop(columns='price')

In [16]:
# Column groups used by the preprocessing step: categorical columns are one-hot encoded, numeric columns are imputed.
# Features handled as categorical by the preprocessing step.
categorical_columns = [
    'property_type',
    'location',
    'city',
]
# Features handled as numeric by the preprocessing step.
# NOTE(review): location_id looks like an identifier rather than a quantity —
# confirm it is meant to be treated as a numeric feature.
numeric_columns = [
    'location_id',
    'baths',
    'bedrooms',
    'Area_in_Square_Meters',
]

In [30]:
# Preprocessing stage:
#   - 'num': fill missing numeric values with the column mean
#   - 'cat': one-hot encode the categorical columns, ignoring categories that
#            were not seen when the encoder was fitted
# Columns that appear in neither list are dropped (remainder='drop').
preprocessor = ColumnTransformer(
    [
        ('num', SimpleImputer(strategy='mean'), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
    remainder='drop',
)

In [32]:
# End-to-end model: the preprocessing stage followed by an XGBoost regressor
# that optimises squared error and reports RMSE as its evaluation metric.
model = Pipeline([
    ('preprocessor', preprocessor),
    (
        'regressor',
        xgb.XGBRegressor(eval_metric='rmse', objective='reg:squarederror'),
    ),
])


In [34]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [36]:
# Fit the preprocessing stage and the regressor on the training split.
model.fit(X=X_train, y=y_train)

In [37]:
# Predict prices for the held-out test rows.
y_pred = model.predict(X=X_test)

In [46]:
# Evaluate the held-out predictions: RMSE (same units as the target) and R².
# numpy and the sklearn metric functions are imported in the notebook's import
# cell, so every dependency is declared in one place at the top.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE du modèle réentraîné : {rmse}")
print(f"XGBoost R² : {r2}")



RMSE du modèle réentraîné : 21342820.913789634
XGBoost R² : 0.6070153705042064


In [51]:
# Save the trained model.
# 1) The bare XGBoost regressor as JSON, kept under the same file name for any
#    existing consumer. On its own it only accepts the already-transformed
#    feature matrix, because the fitted imputer and one-hot encoder are not
#    part of this file.
model.named_steps['regressor'].save_model("xgboost_retrained_model.json")

# 2) The complete fitted pipeline (preprocessing + regressor), so predictions
#    can be made directly on raw rows with the same columns as the training data.
#    Pickle files must only be loaded from trusted sources.
#    NOTE(review): load with the same scikit-learn / xgboost versions used here.
with open("xgboost_retrained_pipeline.pkl", "wb") as pipeline_file:
    pickle.dump(model, pipeline_file)