In [None]:
import pandas as pd 
import numpy as np 
import warnings
from src.quick_rf import train_test_model_rf
warnings.filterwarnings('ignore')

In [29]:
path_data_full = r"data/data_merged_20250922.parquet"


In [37]:
cols_explicatives = [
    "annee",
    "type",
    "codecommune",
    "pvotepreviouspvoteD",
    "pvotepvoteD",
    'lat',
    'long',


    "popcommunes/pop",
    "popcommunesvbbm/vbbm",
    "agesexcommunes/prop014",
    "agesexcommunes/prop60p",
    "agesexcommunes/perage",
    "diplomescommunes/pbac",
    "diplomescommunes/psup",
    "diplomescommunes/nodip",
    "cspcommunes/pouvr",
    "cspcommunes/pcadr",
    "cspcommunes/pchom",
    "revcommunes/revratio",
]

In [38]:
data = pd.read_parquet(path_data_full, columns=cols_explicatives)
data.dropna(inplace=True)
data = data[data["type"]==0]
data.shape 

(372867, 19)

## RF


### Model 1: base

In [41]:
y = data["pvotepvoteD"]
X = data.drop(columns=["pvotepvoteD", "annee", "codecommune", "pvotepreviouspvoteD"])

In [None]:
train_test_model_rf(
    X, y
)

R2: 0.14731448345467213
RMSE: 0.13921435071641383
Std de y: 0.151 --- Moyenne de y: 0.225
Permutation Importance:                     feature  importance_mean  importance_std
8    diplomescommunes/pbac     2.107051e-01    3.320465e-03
2                     long     6.784459e-02    6.213531e-04
12       cspcommunes/pcadr     4.376586e-02    6.727811e-04
1                      lat     3.306115e-02    9.686263e-04
6   agesexcommunes/prop60p     3.158149e-02    1.029943e-03
7    agesexcommunes/perage     1.186510e-02    4.206683e-04
14    revcommunes/revratio     1.169673e-02    2.458504e-04
3          popcommunes/pop     9.600474e-03    2.715285e-04
9    diplomescommunes/psup     7.585627e-03    3.189413e-04
11       cspcommunes/pouvr     7.100519e-03    6.566598e-04
13       cspcommunes/pchom     3.413652e-03    1.726785e-04
10  diplomescommunes/nodip     3.288744e-03    1.163594e-04
5   agesexcommunes/prop014     9.293672e-04    1.068954e-04
4     popcommunesvbbm/vbbm     2.304034e-04  

(RandomForestRegressor(max_depth=6, min_samples_leaf=2, min_samples_split=4,
                       n_estimators=600, n_jobs=-1, random_state=42),
                    feature  importance_mean  importance_std
 8    diplomescommunes/pbac     2.107051e-01    3.320465e-03
 2                     long     6.784459e-02    6.213531e-04
 12       cspcommunes/pcadr     4.376586e-02    6.727811e-04
 1                      lat     3.306115e-02    9.686263e-04
 6   agesexcommunes/prop60p     3.158149e-02    1.029943e-03
 7    agesexcommunes/perage     1.186510e-02    4.206683e-04
 14    revcommunes/revratio     1.169673e-02    2.458504e-04
 3          popcommunes/pop     9.600474e-03    2.715285e-04
 9    diplomescommunes/psup     7.585627e-03    3.189413e-04
 11       cspcommunes/pouvr     7.100519e-03    6.566598e-04
 13       cspcommunes/pchom     3.413652e-03    1.726785e-04
 10  diplomescommunes/nodip     3.288744e-03    1.163594e-04
 5   agesexcommunes/prop014     9.293672e-04    1.068954e-04

### Model 2: 3 steps stretch

In [44]:
np.unique(data.annee)

array([1965, 1969, 1974, 1981, 1988, 1995, 2002, 2007, 2012, 2017, 2022])

In [43]:
def augment_df(current_df: pd.DataFrame, 
               next_df: pd.DataFrame, 
               cols: list
)->pd.DataFrame:
    delta_df = pd.merge(
        current_df[["codecommune"] + cols],
        next_df[["codecommune"] + cols],
        on="codecommune",
        suffixes=("_prev", "_next"),
        how="inner",
    )
    new_cols = ["codecommune"]
    for col in cols:
        delta_df[f"delta_{col}"]= delta_df[f"{col}_next"] - delta_df[f"{col}_prev"]
        new_cols.append(f"delta_{col}")
    return delta_df[new_cols]


In [45]:
# Last 3 years
df_17, df_22, df_12 = data[data["annee"]==2017], data[data["annee"]==2022], data[data["annee"]==2012]
df_17.shape, df_22.shape, df_12.shape

((34695, 19), (34639, 19), (34770, 19))

In [47]:
cols_to_track = [
    "popcommunes/pop",
    "popcommunesvbbm/vbbm",
    "agesexcommunes/prop014",
    "agesexcommunes/prop60p",
    "agesexcommunes/perage",
    "diplomescommunes/pbac",
    "diplomescommunes/psup",
    "diplomescommunes/nodip",
    "cspcommunes/pouvr",
    "cspcommunes/pcadr",
    "cspcommunes/pchom",
    "revcommunes/revratio"
    ]
delta_12_17 = augment_df(df_12, df_17, cols_to_track)
delta_17_22 = augment_df(df_17, df_22, cols_to_track)

In [None]:
histories_delta = delta_12_17.merge(delta_17_22, on="codecommune", how="inner", suffixes=("17_22_", "12_17_"))
histories_delta.shape 

(34608, 25)

In [53]:
df = pd.merge(df_22, histories_delta, on="codecommune", how="inner")

In [54]:
y = df["pvotepvoteD"]
X = df.drop(columns=["pvotepvoteD", "annee", "codecommune", "pvotepreviouspvoteD"])

In [55]:
train_test_model_rf(
    X, y
)

R2: 0.4380396097714554
RMSE: 0.07510166459959247
Std de y: 0.100 --- Moyenne de y: 0.399


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Permutation Importance:                                 feature  importance_mean  importance_std
2                                 long     4.085682e-01    8.669645e-03
1                                  lat     2.649424e-01    9.104769e-03
9                diplomescommunes/psup     9.182759e-02    4.685276e-03
3                      popcommunes/pop     8.290815e-02    3.001528e-03
14                revcommunes/revratio     5.484896e-02    3.050968e-03
33   delta_diplomescommunes/psup12_17_     1.249792e-02    1.550603e-03
10              diplomescommunes/nodip     6.230503e-03    9.008119e-04
12                   cspcommunes/pcadr     3.067513e-03    3.982392e-04
21   delta_diplomescommunes/psup17_22_     2.648875e-03    5.813474e-04
4                 popcommunesvbbm/vbbm     2.356176e-03    3.254791e-04
8                diplomescommunes/pbac     8.816354e-04    1.900357e-04
32   delta_diplomescommunes/pbac12_17_     6.722303e-04    1.643283e-04
7                agesexcommunes/perage 

(RandomForestRegressor(max_depth=6, min_samples_leaf=2, min_samples_split=4,
                       n_estimators=600, n_jobs=-1, random_state=42),
                                feature  importance_mean  importance_std
 2                                 long     4.085682e-01    8.669645e-03
 1                                  lat     2.649424e-01    9.104769e-03
 9                diplomescommunes/psup     9.182759e-02    4.685276e-03
 3                      popcommunes/pop     8.290815e-02    3.001528e-03
 14                revcommunes/revratio     5.484896e-02    3.050968e-03
 33   delta_diplomescommunes/psup12_17_     1.249792e-02    1.550603e-03
 10              diplomescommunes/nodip     6.230503e-03    9.008119e-04
 12                   cspcommunes/pcadr     3.067513e-03    3.982392e-04
 21   delta_diplomescommunes/psup17_22_     2.648875e-03    5.813474e-04
 4                 popcommunesvbbm/vbbm     2.356176e-03    3.254791e-04
 8                diplomescommunes/pbac     8.8163