## Test de différents encodages

#### Import du dataframe nettoyé

In [1]:
# Load the required libraries
import pandas as pd

# Read the cleaned CSV file into a pandas DataFrame
chemin_fichier = "../data/data_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=chemin_fichier)

In [2]:
# TODO: move this step directly into the wrangling stage
df = df.drop(columns=['date_mutation'])

#### One Hot Encoding

In [3]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder for the street name, returning a dense array.
# handle_unknown="ignore" makes transform() emit an all-zero row for a street
# name that was not present at fit time instead of raising. The fitted encoder
# is pickled for the app further down, where unseen street names are
# presumably possible (to be confirmed against the app's inputs).
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Fit on the single categorical column, then wrap the result in a DataFrame
# that shares df's index so the concat below aligns row by row
encoded = encoder.fit_transform(df[['adresse_nom_voie']])
col_names = encoder.get_feature_names_out(['adresse_nom_voie'])
encoded_df = pd.DataFrame(encoded, columns=col_names, index=df.index)

# Replace the two raw street columns with the one-hot columns
df_one_hot = pd.concat([df.drop(columns=['adresse_nom_voie','adresse_code_voie']), encoded_df], axis=1)

In [19]:
# Display the (rows, columns) dimensions of the one-hot encoded frame
df_one_hot.shape

(57957, 3386)

#### Enregistrement de l'encoder pour l'app

In [4]:
import pickle

# Persist the fitted encoder to disk for the app
encoder_path = "../models/encoder.pkl"
with open(encoder_path, "wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

#### Encodage entier 

In [5]:
# Integer-encoding variant: only the street name column is removed from df,
# every other column is kept as is
df_encodage_entier = df.drop('adresse_nom_voie', axis=1)

#### Entraînement des deux dataframes sur un arbre de décision

In [15]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score, mean_squared_error
import numpy as np

# Target column and feature matrix, both taken from the one-hot encoded frame
y = df_one_hot['prix_au_m2']
X = df_one_hot.drop('prix_au_m2', axis=1)

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Depth-limited decision tree; fit() returns the estimator itself
model = DecisionTreeRegressor(max_depth=10, random_state=42).fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Error metrics on the held-out rows
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One metric per output line
print(
    f"MAE : {mae:.2f}",
    f"RMSE : {rmse:.2f}",
    f"R²   : {r2:.4f}",
    sep="\n",
)

MAE : 1765.90
RMSE : 3210.20
R²   : 0.2537


In [None]:
# Target column and feature matrix, both taken from the integer-encoding frame
y = df_encodage_entier['prix_au_m2']
X = df_encodage_entier.drop('prix_au_m2', axis=1)

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Same depth-limited decision tree as for the one-hot frame;
# fit() returns the estimator itself
model = DecisionTreeRegressor(max_depth=10, random_state=42).fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Error metrics on the held-out rows
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One metric per output line
print(
    f"MAE : {mae:.2f}",
    f"RMSE : {rmse:.2f}",
    f"R²   : {r2:.4f}",
    sep="\n",
)

MAE : 1804.14
RMSE : 3352.25
R²   : 0.1862


#### L'encodage OneHot est ici meilleur sur chacune des métriques mais est 40 fois plus long : 4.0 s contre 0.1 s

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Target column and feature matrix, both taken from the one-hot encoded frame
y = df_one_hot['prix_au_m2']
X = df_one_hot.drop('prix_au_m2', axis=1)

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest of 100 trees using all available cores;
# fit() returns the estimator itself
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Error metrics on the held-out rows
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One metric per output line
print(
    f"MAE : {mae:.2f}",
    f"RMSE : {rmse:.2f}",
    f"R²   : {r2:.4f}",
    sep="\n",
)

MAE : 1707.83
RMSE : 2959.46
R²   : 0.3658


#### Saving best model

In [17]:
from sklearn.ensemble import RandomForestRegressor
import pickle

# Features and target from the one-hot encoded frame
X = df_one_hot.drop(columns='prix_au_m2')
y = df_one_hot['prix_au_m2']

# Fit on all the data (no hold-out split): this is the model meant for the app
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X, y)

# Persist the fitted model. This file must contain `model`, not `encoder`:
# the encoder has its own file (../models/encoder.pkl).
with open("../models/model.pkl", "wb") as f:
    pickle.dump(model, f)

In [18]:
# Write the fitted model to disk ("wb" replaces any existing file at this path)
with open("../models/model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)