## Test de différents encodages

#### Import du dataframe nettoyé

In [1]:
# Import the libraries
import pandas as pd

# Load the cleaned CSV file into a pandas DataFrame
chemin_fichier = "../data/data_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=chemin_fichier)

In [2]:
# TODO: move this step directly into the wrangling notebook.
# Rebind `df` instead of mutating it in place, and tolerate an already-dropped
# column, so that re-running this cell does not raise a KeyError.
df = df.drop(columns='date_mutation', errors='ignore')

#### One Hot Encoding

In [3]:
from sklearn.preprocessing import OneHotEncoder

# Define the encoder.
# handle_unknown='ignore' makes transform() produce an all-zero row for a street
# name that was not seen during fit instead of raising; this matters because the
# fitted encoder is pickled for the app further down in this notebook.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')


# Normalise the street names (cast to str, lower-case, strip surrounding whitespace)
df['adresse_nom_voie'] = df['adresse_nom_voie'].astype(str).str.lower().str.strip()

# Fit the encoder and wrap the dense result in a DataFrame aligned on df's index
encoded = encoder.fit_transform(df[['adresse_nom_voie']])
col_names = encoder.get_feature_names_out(['adresse_nom_voie'])
encoded_df = pd.DataFrame(encoded, columns=col_names, index=df.index)


# Replace the raw street columns with their one-hot representation
df_one_hot = pd.concat([df.drop(columns=['adresse_nom_voie','adresse_code_voie']), encoded_df], axis=1)

In [4]:
# Display the (rows, columns) shape of the one-hot encoded DataFrame
df_one_hot.shape

(57957, 3386)

#### Enregistrement de l'encoder pour l'app

In [None]:
import pickle

# Serialise the fitted encoder to disk so that the app can load it.
# NOTE(review): at this point `encoder` is the one fitted on 'adresse_nom_voie'
# only, whereas the model saved at the end of the notebook is trained on features
# built by an encoder re-fitted on ['adresse_nom_voie', 'arrondissement'].
# Confirm that the app loads the encoder matching the saved model.
with open("../models/encoder.pkl", mode="wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

#### Encodage entier 

In [5]:
# "Integer" variant: only the street name is removed, 'adresse_code_voie' is kept
# (presumably it serves as the integer code of the street; verify in wrangling).
df_encodage_entier = df.drop(columns=['adresse_nom_voie'])

#### Entraînement des deux dataframes sur un arbre de décision

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score, mean_squared_error
import numpy as np

# Separate the target column 'prix_au_m2' from the features
y = df_one_hot['prix_au_m2']
X = df_one_hot.drop(columns=['prix_au_m2'])

# Hold out 20 % of the rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a depth-limited decision tree; fit() returns the estimator itself
model = DecisionTreeRegressor(max_depth=10, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Score the predictions
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"MAE : {mae:.2f}",
    f"RMSE : {rmse:.2f}",
    f"R²   : {r2:.4f}",
    sep="\n",
)

MAE : 1765.90
RMSE : 3210.20
R²   : 0.2537


In [7]:
# Features / target taken from the integer-encoded DataFrame
y = df_encodage_entier['prix_au_m2']
X = df_encodage_entier.drop(columns=['prix_au_m2'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Same depth-limited decision tree as for the one-hot variant; fit() returns the estimator
model = DecisionTreeRegressor(max_depth=10, random_state=42).fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluation metrics
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"MAE : {mae:.2f}",
    f"RMSE : {rmse:.2f}",
    f"R²   : {r2:.4f}",
    sep="\n",
)

MAE : 1804.14
RMSE : 3352.25
R²   : 0.1862


#### Encodage arrondissement

In [8]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Columns that are one-hot encoded together
categorical_cols = ['adresse_nom_voie', 'arrondissement']

# Normalise 'adresse_nom_voie' (cast to str, lower-case, strip surrounding whitespace)
df['adresse_nom_voie'] = df['adresse_nom_voie'].astype(str).str.lower().str.strip()

# Encoder for the two columns; categories unseen at fit time are ignored at transform time
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on both columns and retrieve the generated indicator column names
encoded = encoder.fit_transform(df[categorical_cols])
col_names = encoder.get_feature_names_out(categorical_cols)

# Wrap the dense array in a DataFrame aligned on df's index
encoded_df = pd.DataFrame(encoded, columns=col_names, index=df.index)

# Swap the raw address / arrondissement columns for their encoded counterparts
remaining_df = df.drop(columns=['adresse_nom_voie', 'adresse_code_voie', 'arrondissement'])
df_one_hot = pd.concat([remaining_df, encoded_df], axis=1)

In [None]:
# Display the (rows, columns) shape of the re-built one-hot encoded DataFrame
df_one_hot.shape

In [9]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score, mean_squared_error
import numpy as np

# Target column and feature matrix from the current df_one_hot
y = df_one_hot['prix_au_m2']
X = df_one_hot.drop(columns=['prix_au_m2'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Depth-limited decision tree; fit() returns the fitted estimator
model = DecisionTreeRegressor(max_depth=10, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Compute the evaluation metrics
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"MAE : {mae:.2f}",
    f"RMSE : {rmse:.2f}",
    f"R²   : {r2:.4f}",
    sep="\n",
)

MAE : 1806.75
RMSE : 3263.39
R²   : 0.2288


#### L'encodage OneHot est ici meilleur sur chacune des métriques mais est 40 fois plus long : 4.0 s contre 0.1 s

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Target column and feature matrix from the current df_one_hot
y = df_one_hot['prix_au_m2']
X = df_one_hot.drop(columns=['prix_au_m2'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest of 100 trees using all available cores; fit() returns the fitted estimator
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1).fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Compute the evaluation metrics
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"MAE : {mae:.2f}",
    f"RMSE : {rmse:.2f}",
    f"R²   : {r2:.4f}",
    sep="\n",
)

MAE : 1720.49
RMSE : 2992.19
R²   : 0.3517


#### Saving best model

In [None]:
from sklearn.ensemble import RandomForestRegressor
import pickle

# Build the feature matrix and the target from the current df_one_hot
X = df_one_hot.drop(columns='prix_au_m2')
y = df_one_hot['prix_au_m2']

# Train the model on all the data (no hold-out) for the app
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X, y)

# Save the trained model. The previous version dumped `encoder` here, which
# wrote the encoder (not the model) into model.pkl.
# NOTE(review): the features used above come from the encoder fitted on
# ['adresse_nom_voie', 'arrondissement']; confirm that the encoder pickle shipped
# with the app is that same encoder.
with open("../models/model.pkl", "wb") as f:
    pickle.dump(model, f)

In [None]:
# Write the trained model to disk
with open("../models/model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)

## Test avec normalisation aussi (à mettre dans un autre notebook)

In [None]:
from sklearn.model_selection import train_test_split
# OneHotEncoder is imported here too so the cell does not rely on an earlier cell
# having imported it (this section is meant to move to another notebook).
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer

# Features / target from the full DataFrame (categorical columns still raw)
X = df.drop(columns="prix_au_m2")
y = df["prix_au_m2"]

# Fixed seed (same as the other experiments in this notebook) so the split, and
# therefore the metrics, are reproducible and comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the categorical columns and standardise the numeric ones.
# NOTE(review): 'arrondissement' is passed to both transformers, so it reaches the
# model twice (one-hot and scaled); confirm this is intended.
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['adresse_nom_voie', 'arrondissement']),
    ('num', StandardScaler(), ['adresse_numero', 'adresse_code_voie',
       'lot1_surface_carrez', 'surface_reelle_bati',
       'nombre_pieces_principales', 'longitude', 'latitude',
       'année', 'arrondissement'])  
])

# Preprocessing and model chained so that the transformers are fitted on the
# training rows only
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', DecisionTreeRegressor(max_depth=10, random_state=42))
])

pipeline.fit(X_train, y_train)

# Predictions

y_pred = pipeline.predict(X_test)

# Metrics (arguments given as y_true, y_pred, matching the scikit-learn signatures)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE : {mae}\n")
print(f"RMSE: {rmse}\n")
print(f"R2 : {r2}")

In [None]:
# List the column labels of the DataFrame
df.columns

#### A priori, la normalisation des données n'améliore pas du tout les prédictions sur un arbre de décision, ce qui semble aussi être le cas théoriquement. En revanche, on lit que cela devrait améliorer l'erreur sur un KNN ou un réseau de neurones. Regardons donc cela sur un KNN. On pourra comparer à l'erreur que l'on a sans normalisation et sans OneHot.

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Features / target from the full DataFrame (categorical columns still raw)
X = df.drop(columns="prix_au_m2")
y = df["prix_au_m2"]

# Fixed seed (same as the other experiments in this notebook) so the split, and
# therefore the metrics, are reproducible and comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the categorical columns and standardise the numeric ones.
# NOTE(review): 'arrondissement' is passed to both transformers, so it contributes
# twice to the KNN distance (one-hot and scaled); confirm this is intended.
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['adresse_nom_voie', 'arrondissement']),
    ('num', StandardScaler(), ['adresse_numero', 'adresse_code_voie',
       'lot1_surface_carrez', 'surface_reelle_bati',
       'nombre_pieces_principales', 'longitude', 'latitude',
       'année', 'arrondissement'])  
])

# Preprocessing chained with a distance-weighted 25-nearest-neighbours regressor
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', KNeighborsRegressor(n_neighbors=25, weights='distance'))
])

pipeline.fit(X_train, y_train)

# Predictions

y_pred = pipeline.predict(X_test)

# Metrics (arguments given as y_true, y_pred, matching the scikit-learn signatures)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"MAE : {mae}\n")
print(f"RMSE: {rmse}\n")
print(f"R2 : {r2}")

#### On a carrément une meilleure erreur avec la normalisation sur KNN