# 1. Initializations

## 1.1 General imports

In [None]:
### Data management
import pandas as pd
import numpy as np
from functools import partial

### Machine Learning

# metrics and evaluation
from scipy.stats import probplot, anderson, chi2_contingency, pearsonr
import statsmodels.api as sm
import statsmodels.formula.api as smf

# pipelines
from sklearn.compose import ColumnTransformer

### Data Viz

# graphical basics
import matplotlib.pyplot as plt
%matplotlib inline

# graphical seaborn
import seaborn as sns

# graphical plotly
# import plotly.graph_objects as go
import plotly.express as px
# for jupyter notebook display management
import plotly.io as pio
pio.renderers.default = "notebook"

# graphical missingno
import missingno as msno


## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc
import smartcheck.dataframe_project_specific as dfps
import smartcheck.preprocessing_project_specific as pps
import smartcheck.paths as pth

# 2. Loading and Data Quality

## 2.1 Loading of Pre-processed velo comptage data 2024/2025

In [None]:
# Load the dataset referenced by the 'velo_comptage_refactored_data' config entry.
# NOTE(review): the loader lives in smartcheck.dataframe_common; the original guard
# suggests it may return None (or a non-DataFrame) on failure — confirm.
df_cpt_raw = dfc.load_dataset_from_config('velo_comptage_refactored_data', sep=',', index_col=0)

# Fail fast: without this, a failed load silently leaves `df_cpt` undefined and the
# notebook only breaks later with an unrelated NameError in the preprocessing cell.
if not isinstance(df_cpt_raw, pd.DataFrame):
    raise RuntimeError(
        "Loading 'velo_comptage_refactored_data' did not return a DataFrame "
        f"(got {type(df_cpt_raw).__name__}); check the dataset configuration."
    )

# Work on a copy so the freshly loaded frame stays available unchanged.
df_cpt = df_cpt_raw.copy()

### Column filtering and preprocessing

In [None]:
# Columns handed to the "filter" step of the ColumnTransformer below.
no_change_cols = [
    "identifiant_du_compteur",
    "comptage_horaire",
    "date_et_heure_de_comptage",
    "orientation_compteur",
    "latitude",
    "longitude",
    "arrondissement",
]

# Step 1: project-specific column filter applied to the columns listed above.
_filter_step = (
    "filter",
    pps.ColumnFilterTransformer(columns_to_keep=no_change_cols),
    no_change_cols,
)

# Step 2: project-specific datetime preprocessing applied to the timestamp column only.
_datetime_step = (
    "datetime",
    pps.DatetimePreprocessingTransformer(timestamp_col="date_et_heure_de_comptage"),
    ["date_et_heure_de_comptage"],
)

preproc_transf = ColumnTransformer(
    transformers=[_filter_step, _datetime_step],
    remainder="drop",  # every column not mapped to a step above is discarded
)

# Ask scikit-learn to return a pandas DataFrame from transform / fit_transform.
preproc_transf.set_output(transform="pandas")

df = pd.DataFrame(preproc_transf.fit_transform(df_cpt))  # type: ignore

# Output columns carry a "<step>__" prefix; keep only the text after the last "__".
# NOTE(review): if both steps emit a column with the same base name, this rename
# yields duplicate column labels — confirm against the transformers' outputs.
_prefixed_cols = [col for col in df.columns if "__" in col]
df = df.rename(columns={col: col.split("__")[-1] for col in _prefixed_cols})

## 2.2 Loading of additional data sets and general exploration

In [None]:
# Build the weather frame from `df`, passing the names of its latitude, longitude
# and UTC timestamp columns to the project helper.
# NOTE(review): presumably this calls an external weather source and may be slow —
# confirm in smartcheck.dataframe_project_specific and consider caching the result.
df_meteo = dfps.fetch_weather_data_from_dataframe(
    df,
    'latitude',
    'longitude',
    'date_et_heure_de_comptage_utc',
)

In [None]:
# Quick sanity check of the weather frame: rich display of the first rows,
# then the info() summary.
display(df_meteo.head())
df_meteo.info()

In [None]:
# Left join: every row of `df` is kept, weather columns are attached where the
# (latitude, longitude, UTC timestamp) key matches a row of `df_meteo`.
# NOTE(review): the join keys include float coordinates, and duplicate keys in
# `df_meteo` would multiply rows — verify row counts after the merge.
_merge_keys = ['latitude', 'longitude', 'date_et_heure_de_comptage_utc']
df_merged_raw = df.merge(df_meteo, how='left', on=_merge_keys)

#### Loading and column management (columns names normalization)

In [None]:
# Normalize the merged frame's column names with the project helper
# (the exact naming rules are defined in smartcheck.dataframe_common).
df_merged = dfc.normalize_column_names(df_merged_raw)

#### In-memory backup of the original data

In [None]:
# Snapshot of the merged frame taken before any missing-value handling,
# so that later cells can restore `df_merged` from it.
df_merged_bckp_orig = df_merged.copy()

In [None]:
# Recovery cell: resets `df_merged` to the snapshot `df_merged_bckp_orig`.
# On a linear top-to-bottom run this just re-copies the snapshot; it is meant to be
# re-executed manually after experiments that modified `df_merged`.
df_merged = df_merged_bckp_orig.copy()

#### General information, duplicates and missing-value statistics

In [None]:
# Preview the merged frame, then log general information about it via the project helper.
display(df_merged.head())
dfc.log_general_info(df_merged)
# The helper returns two counts.
# NOTE(review): presumably "first occurrences" vs. "all occurrences" of duplicated rows —
# confirm in smartcheck.dataframe_common.
nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_merged)
# Only when the two counts differ, print the duplicates index map for inspection.
if nb_first != nb_total:
    print(dfc.duplicates_index_map(df_merged))

In [None]:
# Graphical overview of the missing (NA) values, drawn from the backup snapshot
# of the merged frame.
msno.matrix(df_merged_bckp_orig)

#### Missing value correlation exploration

In [None]:
# Summary statistics of the numeric columns.
df_merged_desc = df_merged.select_dtypes(include=np.number).describe()
display(df_merged_desc)

# Summary statistics of the object-dtype columns (the same variable name is reused).
df_merged_desc = df_merged.select_dtypes(include='object').describe()
display(df_merged_desc)

# Pairwise correlation between the numeric columns.
df_merged_cr = df_merged.select_dtypes(include=np.number).corr()
# Show the correlation matrix itself: displaying `df_merged` here would leave the
# computed correlations unseen and only repeat the frame already previewed earlier.
display(df_merged_cr)

## 2.3 Data quality refinement

#### Dropping the derived date/time columns that must be recomputed later (lightens the dataset)

In [None]:
# Timestamp-derived columns removed before export.
# NOTE: this cell is not idempotent — re-running it without restoring `df_merged`
# raises KeyError because the columns are already gone.
_datetime_derived_cols = [
    'date_et_heure_de_comptage_utc',
    'date_et_heure_de_comptage_local',
    'date_et_heure_de_comptage_year',
    'date_et_heure_de_comptage_month',
    'date_et_heure_de_comptage_day',
    'date_et_heure_de_comptage_day_of_year',
    'date_et_heure_de_comptage_day_of_week',
    'date_et_heure_de_comptage_hour',
    'date_et_heure_de_comptage_week',
    'date_et_heure_de_comptage_dayname',
    'date_et_heure_de_comptage_monthname',
]
df_merged = df_merged.drop(columns=_datetime_derived_cols)

#### Saving the dataset to CSV

In [None]:
# Write the enriched dataset to a CSV file in the current working directory.
# The index is written as the first column (pandas default).
_output_csv = "comptage-velo-donnees-compteurs-2024-2025_Enriched_ML-ready_data.csv"
df_merged.to_csv(_output_csv)