# 1. Initializations

## 1.1 General imports

In [None]:
### general

### data management
import pandas as pd
import numpy as np

### time series & machine learning (statsmodels, scikit-learn)
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.compose import ColumnTransformer

### graphical
import matplotlib.pyplot as plt
# for jupyter notebook management
%matplotlib inline
import seaborn as sns


## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 Specific preprocessing classes

In [None]:
import smartcheck.preprocessing_project_specific as pps

# 2. Loading of pre-processed data

In [None]:
# Load the dataset identified by 'velo_comptage_pcd_data' through the project's loader.
df_cpt_raw = dfc.load_dataset_from_config('velo_comptage_pcd_data', sep=',', index_col=0)

# Fail fast when loading did not yield a DataFrame. Silently skipping the
# assignment would leave `df_cpt` undefined (or stale from a previous kernel
# run), and the following cells use it unconditionally.
if not isinstance(df_cpt_raw, pd.DataFrame):
    raise TypeError(
        "Dataset 'velo_comptage_pcd_data' could not be loaded as a DataFrame "
        f"(got {type(df_cpt_raw).__name__})."
    )

dfc.log_general_info(df_cpt_raw)

# Work on a copy so the raw frame stays available, unmodified, for reference.
df_cpt = df_cpt_raw.copy()
display(df_cpt.head())

In [None]:
# Numeric columns are selected once and reused for both the summary
# statistics and the correlation matrix.
_numeric_cols = df_cpt.select_dtypes(include=np.number)

# Summary statistics of the numeric columns.
df_cpt_desc = _numeric_cols.describe()
display(df_cpt_desc)

# Summary statistics of the non-numeric columns (this is the value
# `df_cpt_desc` holds once the cell has run).
df_cpt_desc = df_cpt.select_dtypes(exclude=np.number).describe()
display(df_cpt_desc)

# Pairwise correlation between the numeric columns.
df_cpt_cr = _numeric_cols.corr()
display(df_cpt_cr)

## 2.1 Column preprocessing pipelines

In [None]:
# Columns routed to the "filter" step of the pipeline below.
no_change_cols = [
    "identifiant_du_compteur",
    "nom_du_compteur",
    "identifiant_du_site_de_comptage",
    "nom_du_site_de_comptage",
    "comptage_horaire",
    "coordonnees_geographiques",
    "mois_annee_comptage",
    "orientation_compteur",
    "latitude",
    "longitude",
    "arrondissement",
]

# Two-step column-wise preprocessing:
#   - "filter":   project-specific column filter applied to `no_change_cols`
#   - "datetime": project-specific datetime preprocessing of the timestamp column
# Any column not listed in a step is dropped. `set_output` returns the
# estimator itself, so it is chained to request pandas DataFrame output.
preproc_pipeline = ColumnTransformer(
    [
        ("filter", pps.ColumnFilterTransformer(columns_to_keep=no_change_cols), no_change_cols),
        (
            "datetime",
            pps.DatetimePreprocessingTransformer(timestamp_col="date_et_heure_de_comptage"),
            ["date_et_heure_de_comptage"],
        ),
    ],
    remainder="drop",
).set_output(transform="pandas")

# Fit the preprocessing pipeline on the loaded data and transform it.
# The pipeline is configured for pandas output; the DataFrame wrapper is kept
# as a harmless safeguard.
df = pd.DataFrame(preproc_pipeline.fit_transform(df_cpt))

# ColumnTransformer names its output columns "<step>__<column>". Strip only
# that leading step prefix: splitting once on the first "__" keeps intact any
# column whose own name contains a double underscore (taking the last segment
# would truncate it).
df = df.rename(columns={
    col: col.split("__", 1)[1] for col in df.columns if "__" in col
})

df.head()