# L'IA au service des agents immobiliers - Silicon Valley 

## Imports

In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn import set_config
import joblib
from sklearn.preprocessing import RobustScaler

## Préparation des données

In [2]:
df = pd.read_csv("housing-train-data.csv")
df = df.drop(columns=['Unnamed: 0'])

In [3]:
display(df)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND
...,...,...,...,...,...,...,...,...,...,...
16507,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,INLAND
16508,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,NEAR BAY
16509,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,INLAND
16510,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,<1H OCEAN


## Transformation des données

In [4]:
# Séparation des caractéristiques et des cibles
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"]

num_features = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income'
]

cat_features = ['ocean_proximity']

Définition de la fonction de calcul de la distance de Haversine: La fonction haversine_distance calcule la distance entre deux points géographiques donnés.

In [5]:
# Création de la fonction pour calculer les distances de Haversine
def calculate_distances(df):
    # Coordonnées des points de référence
    la_lat, la_lon = 34.003342, -118.485832  # Los Angeles
    sf_lat, sf_lon = 37.787994, -122.407437  # San Francisco
    ontario_lat, ontario_lon = 34.068871, -117.651215  # Ontario

    # Fonction de calcul de la distance de Haversine
    def haversine_distance(lat1, lon1, lat2, lon2):
        R = 6371.0  # Rayon de la Terre en kilomètres
        lat1_rad = math.radians(lat1)
        lon1_rad = math.radians(lon1)
        lat2_rad = math.radians(lat2)
        lon2_rad = math.radians(lon2)

        dlat = lat2_rad - lat1_rad
        dlon = lon2_rad - lon1_rad

        a = (math.sin(dlat / 2) ** 2 +
             math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2) ** 2)
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

        return R * c

    # Calculer les distances et les ajouter au DataFrame
    df['distance_LA'] = df.apply(
        lambda row: haversine_distance(row['latitude'], row['longitude'], la_lat, la_lon), axis=1)
    df['distance_SF'] = df.apply(
        lambda row: haversine_distance(row['latitude'], row['longitude'], sf_lat, sf_lon), axis=1)
    df['distance_Ontario'] = df.apply(
        lambda row: haversine_distance(row['latitude'], row['longitude'], ontario_lat, ontario_lon), axis=1)

    # Retourner les nouvelles colonnes de distances
    return df[['distance_LA', 'distance_SF', 'distance_Ontario']]

# Création de la transformation personnalisée avec FunctionTransformer
distance_transformer = FunctionTransformer(calculate_distances, validate=False)

In [6]:
num_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

dist_transformer = Pipeline([
    ('distance', distance_transformer),
    ('scaler', StandardScaler())
])

In [7]:
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
    ('lat_lon', dist_transformer, ['latitude', 'longitude'])
])

# Création du modèle KNeighborsRegressor
model = KNeighborsRegressor() #(n_neighbors=5)

# Création du pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

In [8]:
set_config(display='diagram')
pipeline

## Séparation des données

In [9]:
# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=39)

In [10]:
# Définition des paramètres pour `GridSearchCV`
param_grid = {
    'model__n_neighbors': [8, 9, 10, 11, 12, 13, 14, 15],
    'model__weights': ['uniform', 'distance'],
    'model__metric': ['euclidean', 'manhattan'],
    'model__algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'model__leaf_size': [10, 30, 50]
}

In [11]:
# Initialisation de `GridSearchCV` avec le `pipeline` et les paramètres
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Effectuer la recherche d'hyperparamètres sur l'ensemble de données d'entraînement
grid_search.fit(X_train, y_train)

# Récupérer les meilleurs paramètres
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

# Afficher les meilleurs paramètres
print("Meilleurs paramètres: ", best_params)
print(f"Score R²: {r2:.2f}")
print(f"RMSE: {rmse:.2f}")

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


Meilleurs paramètres:  {'model__algorithm': 'ball_tree', 'model__leaf_size': 10, 'model__metric': 'manhattan', 'model__n_neighbors': 9, 'model__weights': 'distance'}
Score R²: 0.76
RMSE: 56616.02


## Choix du modèle

In [12]:
joblib.dump(best_model,"KNN_model.joblib")

['KNN_model.joblib']

## Entraînement des données

## Test du modèle

## Evaluation des résultats