In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
df = pd.read_csv("../data/licenses_by_year_region_fed.csv")

In [14]:
df["annee"] = df['annee'].astype(int)
numeric_cols = ["total_license", "total_f", "total_h","f_1_9", "f_10_19", "f_20_29","f_30_59","f_60_74","f_75","h_1_9", "h_10_19", "h_20_29","h_30_59","h_60_74","h_75"]
df[numeric_cols] = df[numeric_cols].astype(int)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18784 entries, 0 to 18783
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   annee          18784 non-null  int64 
 1   region         18784 non-null  object
 2   nom_fed        18784 non-null  object
 3   total_license  18784 non-null  int64 
 4   total_f        18784 non-null  int64 
 5   total_h        18784 non-null  int64 
 6   h_1_9          18784 non-null  int64 
 7   h_10_19        18784 non-null  int64 
 8   h_20_29        18784 non-null  int64 
 9   h_30_59        18784 non-null  int64 
 10  h_60_74        18784 non-null  int64 
 11  h_75           18784 non-null  int64 
 12  f_1_9          18784 non-null  int64 
 13  f_10_19        18784 non-null  int64 
 14  f_20_29        18784 non-null  int64 
 15  f_30_59        18784 non-null  int64 
 16  f_60_74        18784 non-null  int64 
 17  f_75           18784 non-null  int64 
dtypes: int64(16), object(2)
me

In [8]:
print(df.dtypes)

annee            object
region           object
nom_fed          object
total_license     int64
total_f           int64
total_h           int64
h_1_9             int64
h_10_19           int64
h_20_29           int64
h_30_59           int64
h_60_74           int64
h_75              int64
f_1_9             int64
f_10_19           int64
f_20_29           int64
f_30_59           int64
f_60_74           int64
f_75              int64
dtype: object


In [17]:
# Agr√©gation annuelle
df_year = df.groupby("annee")["total_f"].sum().reset_index()

print(df_year)

    annee  total_f
0    2012  4120606
1    2013  4145472
2    2014  6554402
3    2015  5006476
4    2016  5938794
5    2017  5473101
6    2018  5465275
7    2019  6101623
8    2020  6144494
9    2021  4849765
10   2022  5827108
11   2023  6390363


In [18]:
# Variables
X = df_year[["annee"]]
y = df_year["total_f"]

# Train / test split (optionnel ici car s√©rie temporelle courte)
X_train = X[X["annee"] <= 2021]
X_test = X[X["annee"] > 2021]

y_train = y[X["annee"] <= 2021]
y_test = y[X["annee"] > 2021]

# Mod√®le
model = LinearRegression()
model.fit(X_train, y_train)

# Pr√©diction
y_pred = model.predict(X_test)

print("R¬≤ :", r2_score(y_test, y_pred))
print("RMSE :", np.sqrt(mean_squared_error(y_test, y_pred)))

R¬≤ : 0.35884123669187695
RMSE : 225505.87002008932


## üîπ R¬≤ (coefficient de d√©termination)

Le R¬≤ mesure la part de la variance expliqu√©e par le mod√®le.

R2 = 1 - (erreur_mod√®le / erreur_moyenne)

üëâ 0.3588 = 35,9 %

Cela signifie que :

Ton mod√®le explique environ 36 % de la variation du nombre de licences f√©minines d‚Äôune ann√©e √† l‚Äôautre.

Interpr√©tation :
	‚Ä¢	1.0 ‚Üí mod√®le parfait \
	‚Ä¢	0 ‚Üí aussi mauvais que pr√©dire la moyenne \
	‚Ä¢	< 0 ‚Üí pire que la moyenne \

üìå Ici ‚Üí pouvoir explicatif faible √† moyen
Ce n‚Äôest pas surprenant : tu n‚Äôutilises que l‚Äôann√©e comme variable.


## üîπ RMSE (Root Mean Squared Error)

Le RMSE est l‚Äôerreur moyenne en valeur absolue (racine carr√©e de l‚Äôerreur quadratique).

RMSE = 225 505
 
Cela signifie que :\

En moyenne, ton mod√®le se trompe d‚Äôenviron 225 000 licences f√©minines par an.

Pour interpr√©ter correctement, il faut comparer √† l‚Äôordre de grandeur de total_f.

Si par exemple :\
	‚Ä¢	total annuel ‚âà 3 000 000\
‚Üí erreur ‚âà 7 % ‚Üí raisonnable

Si total annuel ‚âà 800 000\
‚Üí erreur ‚âà 28 % ‚Üí tr√®s mauvais


In [19]:
future_years = pd.DataFrame({
    "annee": np.arange(2024, 2031)
})

future_predictions = model.predict(future_years)

future_years["prediction_total_f"] = future_predictions

print(future_years)

   annee  prediction_total_f
0   2024        6.252837e+06
1   2025        6.369215e+06
2   2026        6.485593e+06
3   2027        6.601971e+06
4   2028        6.718349e+06
5   2029        6.834728e+06
6   2030        6.951106e+06


# Mod√®le de r√©gression lin√©aire par r√©gion

In [20]:
df = pd.read_csv("../data/licenses_by_year_region_fed.csv")

In [21]:
df = df.copy()
df["annee"] = df["annee"].astype(int)

# Agr√©gation par r√©gion et ann√©e
df_region = df.groupby(["region", "annee"])["total_f"].sum().reset_index()

print(df_region.head())

                 region  annee   total_f
0  Auvergne-Rh√¥ne-Alpes   2012  501040.0
1  Auvergne-Rh√¥ne-Alpes   2013  491743.0
2  Auvergne-Rh√¥ne-Alpes   2014  568446.0
3  Auvergne-Rh√¥ne-Alpes   2015  704818.0
4  Auvergne-Rh√¥ne-Alpes   2016  769467.0


## Entrainer un mod√®le par r√©gion

In [22]:
models = {}
predictions = []

regions = df_region["region"].unique()

for region in regions:
    
    data_region = df_region[df_region["region"] == region]
    
    X = data_region[["annee"]]
    y = data_region["total_f"]
    
    model = LinearRegression()
    model.fit(X, y)
    
    models[region] = model
    
    # Pr√©diction 2024‚Äì2030
    future_years = pd.DataFrame({
        "annee": np.arange(2024, 2031)
    })
    
    future_pred = model.predict(future_years)
    
    temp = future_years.copy()
    temp["region"] = region
    temp["prediction_total_f"] = future_pred
    
    predictions.append(temp)

# Fusion
df_future = pd.concat(predictions, ignore_index=True)

print(df_future.head())

   annee                region  prediction_total_f
0   2024  Auvergne-Rh√¥ne-Alpes       873306.500000
1   2025  Auvergne-Rh√¥ne-Alpes       899809.192308
2   2026  Auvergne-Rh√¥ne-Alpes       926311.884615
3   2027  Auvergne-Rh√¥ne-Alpes       952814.576923
4   2028  Auvergne-Rh√¥ne-Alpes       979317.269231


In [26]:
import plotly.express as px
import pandas as pd

region_test = "√éle-de-France"  # adapte selon ton dataset

# Donn√©es historiques
data_region = df_region[df_region["region"] == region_test].copy()
data_region["type"] = "Historique"

# Donn√©es futures
future_region = df_future[df_future["region"] == region_test].copy()
future_region = future_region.rename(columns={"prediction_total_f": "total_f"})
future_region["type"] = "Projection"

# Fusion
df_plot = pd.concat([data_region, future_region], ignore_index=True)

# Graphique
fig = px.line(
    df_plot,
    x="annee",
    y="total_f",
    color="type",
    title=f"Projection des licences f√©minines - {region_test}",
    markers=True
)

fig.update_layout(
    xaxis_title="Ann√©e",
    yaxis_title="Nombre de licences f√©minines",
    template="plotly_white"
)

fig.show()

# Mod√®le plus robuste avec Ridge

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# ------------------------------
# 1Ô∏è‚É£ Pr√©paration des donn√©es
# ------------------------------
df = pd.read_csv("../data/licenses_by_year_region_fed.csv")
df["annee"] = df["annee"].astype(int)

In [2]:
# Variables explicatives
features = [
    "annee",
    "region",
    "nom_fed",
    "total_license",
    "f_1_9", "f_10_19", "f_20_29", "f_30_59", "f_60_74", "f_75"
]

X = df[features]
y = df["total_f"]

# Colonnes cat√©gorielles et num√©riques
categorical_cols = ["region", "nom_fed"]
numeric_cols = [col for col in features if col not in categorical_cols]

# Pr√©processing
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

# ------------------------------
# 2Ô∏è‚É£ Pipeline Ridge
# ------------------------------
ridge_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", Ridge(alpha=1.0))  # alpha = force de r√©gularisation
])

# Split temporel simple : ann√©es <=2021 pour train, >2021 pour test
X_train = X[df["annee"] <= 2021]
X_test  = X[df["annee"] > 2021]
y_train = y[df["annee"] <= 2021]
y_test  = y[df["annee"] > 2021]

# Entra√Ænement
ridge_pipeline.fit(X_train, y_train)

# ------------------------------
# 3Ô∏è‚É£ √âvaluation
# ------------------------------
y_pred = ridge_pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"R¬≤ : {r2:.3f}")
print(f"RMSE : {rmse:.0f}")

R¬≤ : 0.709
RMSE : 4544


Super ! Ces r√©sultats montrent une √©norme am√©lioration par rapport au premier mod√®le lin√©aire simple. Voyons ce que √ßa signifie.

‚∏ª

## 1Ô∏è‚É£ Interpr√©tation des scores

üîπ R¬≤ = 0.709 
- 0.709 ‚âà 71 % de la variance expliqu√©e par le mod√®le. 
- Compar√© √† 0.36 avant, √ßa veut dire que le mod√®le Ridge capture beaucoup mieux la tendance des licences f√©minines. 
- En clair : les variables utilis√©es (ann√©e + r√©gion + f√©d√©ration + tranches d‚Äô√¢ge + total_license) expliquent la majorit√© des variations.

‚∏ª

## üîπ RMSE = 4 544

- RMSE est l‚Äôerreur moyenne en valeur absolue (racine de l‚Äôerreur quadratique). 
- Donc, en moyenne, le mod√®le se trompe de ~4 500 licences par r√©gion/f√©d√©ration par an. 
- Si les effectifs par f√©d√©ration sont de l‚Äôordre de 50 000 √† 200 000 licences, l‚Äôerreur relative est tr√®s faible (<10%). 

üí° En r√©sum√© : c‚Äôest un mod√®le robuste et pr√©cis, bien meilleur que le mod√®le simple bas√© seulement sur l‚Äôann√©e.

‚∏ª

## 2Ô∏è‚É£ Ce que √ßa change pour tes projections

- Les pr√©dictions pour 2024‚Äì2030 seront plus fiables, surtout si tu regardes par r√©gion ou par f√©d√©ration.
- Tu peux maintenant int√©grer ce mod√®le dans ton dashboard Streamlit et laisser les utilisateurs s√©lectionner la r√©gion pour voir les projections robustes.

In [3]:
future_years = pd.DataFrame({
    "annee": np.arange(2024, 2031),
    "region": "√éle-de-France",  # exemple : tu peux cr√©er plusieurs r√©gions
    "nom_fed": "Toutes",        # valeur g√©n√©rique si tu veux une projection globale
    "total_license": df["total_license"].mean(),
    "f_1_9": df["f_1_9"].mean(),
    "f_10_19": df["f_10_19"].mean(),
    "f_20_29": df["f_20_29"].mean(),
    "f_30_59": df["f_30_59"].mean(),
    "f_60_74": df["f_60_74"].mean(),
    "f_75": df["f_75"].mean()
})

future_pred = ridge_pipeline.predict(future_years)
future_years["prediction_total_f"] = future_pred

print(future_years)

   annee         region nom_fed  total_license        f_1_9      f_10_19  \
0   2024  √éle-de-France  Toutes    9055.895336  1191.693409  9911.660828   
1   2025  √éle-de-France  Toutes    9055.895336  1191.693409  9911.660828   
2   2026  √éle-de-France  Toutes    9055.895336  1191.693409  9911.660828   
3   2027  √éle-de-France  Toutes    9055.895336  1191.693409  9911.660828   
4   2028  √éle-de-France  Toutes    9055.895336  1191.693409  9911.660828   
5   2029  √éle-de-France  Toutes    9055.895336  1191.693409  9911.660828   
6   2030  √éle-de-France  Toutes    9055.895336  1191.693409  9911.660828   

      f_20_29      f_30_59     f_60_74       f_75  prediction_total_f  
0  263.002502  3541.515226  391.518899  61.848648         3489.393384  
1  263.002502  3541.515226  391.518899  61.848648         3489.393518  
2  263.002502  3541.515226  391.518899  61.848648         3489.393651  
3  263.002502  3541.515226  391.518899  61.848648         3489.393784  
4  263.002502  3541.5152