# 1. Initializations

## 1.1 General imports

In [1]:
### data management
import pandas as pd
import numpy as np
# import statsmodels.api as sm

### graphical plotly basics
import plotly.graph_objects as go
import plotly.express as px
# for jupyter notebook display management
import plotly.io as pio
pio.renderers.default = "notebook"

## 1.2 General dataframe functions

In [2]:
import smartcheck.dataframe_common as dfc

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

### 2.1.1 VELIB DISPO

In [3]:
# Load the Velib availability dataset through the project helper; the
# 'velib_dispo_data' key and the ';' separator are passed straight to it.
df_disp_velib_raw = dfc.load_dataset_from_config('velib_dispo_data', sep=';')

# Fail fast: the following cells read df_disp_velib unconditionally, so a
# failed load must stop here instead of leaving that name undefined (fresh
# kernel) or bound to a stale frame from a previous run.
if not isinstance(df_disp_velib_raw, pd.DataFrame):
    raise RuntimeError(
        "Loading 'velib_dispo_data' did not return a DataFrame "
        f"(got {type(df_disp_velib_raw).__name__}); check the dataset configuration."
    )

# Quick look at the raw data, then the helper's general logging.
display(df_disp_velib_raw.head())
dfc.log_general_info(df_disp_velib_raw)

# NOTE(review): the two counts presumably differ only when duplicated rows
# exist — confirm against the helper's implementation.
nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_disp_velib_raw)
if nb_first != nb_total:
    print(dfc.duplicates_index_map(df_disp_velib_raw))

# Working frame with normalised column names, used by the rest of the notebook.
df_disp_velib = dfc.normalize_column_names(df_disp_velib_raw)

[INFO]-[2025-06-01 11:12:57,732] File path resolved from configuration : https://drive.google.com/file/d/1kBkBYm55NrNZ55fjD2G7jvdJqIz517Qm/view?usp=drive_link.
[INFO]-[2025-06-01 11:12:57,732] File ID extracted from URL: 1kBkBYm55NrNZ55fjD2G7jvdJqIz517Qm


Unnamed: 0,Identifiant station,Nom station,Station en fonctionnement,Capacité de la station,Nombre bornettes libres,Nombre total vélos disponibles,Vélos mécaniques disponibles,Vélos électriques disponibles,Borne de paiement disponible,Retour vélib possible,Actualisation de la donnée,Coordonnées géographiques,Nom communes équipées,Code INSEE communes équipées,station_opening_hours
0,16107,Benjamin Godard - Victor Hugo,OUI,0,27,8,0,8,OUI,OUI,2025-04-17T15:39:50+00:00,"48.865983, 2.275725",Paris,75056,
1,44015,Rouget de L'isle - Watteau,OUI,0,12,8,4,4,OUI,OUI,2025-04-17T15:37:57+00:00,"48.778192750803, 2.3963020229163",Vitry-sur-Seine,94081,
2,40001,Hôpital Mondor,OUI,0,23,6,2,4,OUI,OUI,2025-04-17T15:37:02+00:00,"48.798922410229, 2.4537451531298",Créteil,94028,
3,9020,Toudouze - Clauzel,OUI,0,18,2,0,2,OUI,OUI,2025-04-17T15:41:24+00:00,"48.87929591733507, 2.3373600840568547",Paris,75056,
4,13007,Le Brun - Gobelins,OUI,0,37,10,6,4,OUI,OUI,2025-04-17T15:38:07+00:00,"48.835092787824, 2.3534681351338",Paris,75056,


[INFO]-[2025-06-01 11:12:59,883] Dataset shape: 1472 rows x 15 columns
[INFO]-[2025-06-01 11:12:59,884] For quantitative variable description use:
df.select_dtypes(include=np.number).describe()
[INFO]-[2025-06-01 11:12:59,884] For quantitative correlation matrix use:
df.select_dtypes(include=np.number).corr()
[INFO]-[2025-06-01 11:12:59,890] DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472 entries, 0 to 1471
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Identifiant station             1472 non-null   object 
 1   Nom station                     1472 non-null   object 
 2   Station en fonctionnement       1472 non-null   object 
 3   Capacité de la station          1472 non-null   int64  
 4   Nombre bornettes libres         1472 non-null   int64  
 5   Nombre total vélos disponibles  1472 non-null   int64  
 6   Vélos mécaniques disponibles    1472 non

In [4]:
# Structure and preview of the availability frame with normalised column names.
df_disp_velib.info()
display(df_disp_velib.head())

# Select the numeric columns once and reuse the selection for both summaries.
df_disp_velib_num = df_disp_velib.select_dtypes(include=np.number)

# Summary statistics, stored under a name that matches the dataset they
# describe (availability, "disp") rather than the counting dataset ("cpt").
df_disp_velib_desc = df_disp_velib_num.describe()
display(df_disp_velib_desc)

# Pairwise correlations between the numeric columns.
df_disp_velib_cr = df_disp_velib_num.corr()
display(df_disp_velib_cr)

# Backward-compatible aliases for the previous (misleading) variable names;
# drop them once nothing refers to the old names any more.
df_cpt_velib_desc = df_disp_velib_desc
df_cpt_velib_cr = df_disp_velib_cr

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472 entries, 0 to 1471
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   identifiant_station             1472 non-null   object 
 1   nom_station                     1472 non-null   object 
 2   station_en_fonctionnement       1472 non-null   object 
 3   capacite_de_la_station          1472 non-null   int64  
 4   nombre_bornettes_libres         1472 non-null   int64  
 5   nombre_total_velos_disponibles  1472 non-null   int64  
 6   velos_mecaniques_disponibles    1472 non-null   int64  
 7   velos_electriques_disponibles   1472 non-null   int64  
 8   borne_de_paiement_disponible    1472 non-null   object 
 9   retour_velib_possible           1472 non-null   object 
 10  actualisation_de_la_donnee      1472 non-null   object 
 11  coordonnees_geographiques       1472 non-null   object 
 12  nom_communes_equipees           14

Unnamed: 0,identifiant_station,nom_station,station_en_fonctionnement,capacite_de_la_station,nombre_bornettes_libres,nombre_total_velos_disponibles,velos_mecaniques_disponibles,velos_electriques_disponibles,borne_de_paiement_disponible,retour_velib_possible,actualisation_de_la_donnee,coordonnees_geographiques,nom_communes_equipees,code_insee_communes_equipees,station_opening_hours
0,16107,Benjamin Godard - Victor Hugo,OUI,0,27,8,0,8,OUI,OUI,2025-04-17T15:39:50+00:00,"48.865983, 2.275725",Paris,75056,
1,44015,Rouget de L'isle - Watteau,OUI,0,12,8,4,4,OUI,OUI,2025-04-17T15:37:57+00:00,"48.778192750803, 2.3963020229163",Vitry-sur-Seine,94081,
2,40001,Hôpital Mondor,OUI,0,23,6,2,4,OUI,OUI,2025-04-17T15:37:02+00:00,"48.798922410229, 2.4537451531298",Créteil,94028,
3,9020,Toudouze - Clauzel,OUI,0,18,2,0,2,OUI,OUI,2025-04-17T15:41:24+00:00,"48.87929591733507, 2.3373600840568547",Paris,75056,
4,13007,Le Brun - Gobelins,OUI,0,37,10,6,4,OUI,OUI,2025-04-17T15:38:07+00:00,"48.835092787824, 2.3534681351338",Paris,75056,


Unnamed: 0,capacite_de_la_station,nombre_bornettes_libres,nombre_total_velos_disponibles,velos_mecaniques_disponibles,velos_electriques_disponibles,code_insee_communes_equipees,station_opening_hours
count,1472.0,1472.0,1472.0,1472.0,1472.0,1472.0,0.0
mean,0.366848,20.063179,10.33356,6.832201,3.501359,80987.33356,
std,2.312426,11.796055,10.603836,9.2496,3.296786,8415.144634,
min,0.0,0.0,0.0,0.0,0.0,75056.0,
25%,0.0,11.75,3.0,1.0,1.0,75056.0,
50%,0.0,19.0,7.0,3.0,3.0,75056.0,
75%,0.0,27.0,14.0,10.0,5.0,92040.0,
max,50.0,65.0,68.0,63.0,23.0,95018.0,


Unnamed: 0,capacite_de_la_station,nombre_bornettes_libres,nombre_total_velos_disponibles,velos_mecaniques_disponibles,velos_electriques_disponibles,code_insee_communes_equipees,station_opening_hours
capacite_de_la_station,1.0,0.047723,0.005541,-0.011645,0.050495,-0.056901,
nombre_bornettes_libres,0.047723,1.0,-0.435707,-0.40712,-0.25918,-0.082124,
nombre_total_velos_disponibles,0.005541,-0.435707,1.0,0.953942,0.539997,-0.063042,
velos_mecaniques_disponibles,-0.011645,-0.40712,0.953942,1.0,0.262633,-0.043577,
velos_electriques_disponibles,0.050495,-0.25918,0.539997,0.262633,1.0,-0.080509,
code_insee_communes_equipees,-0.056901,-0.082124,-0.063042,-0.043577,-0.080509,1.0,
station_opening_hours,,,,,,,


In [5]:
# Reference variable shared by both analyses in this cell.
ref_column = 'station_en_fonctionnement'

# Analysis by reference variable on every column from the third one onwards
# (the first two columns of the frame are left out of the selection).
dfc.analyze_by_reference_variable(df_disp_velib[df_disp_velib.columns[2:]], ref_column)

# Cross distributions of the reference variable against the two other
# status columns listed here.
cross_columns = [ref_column, 'borne_de_paiement_disponible', 'retour_velib_possible']
dfc.log_cross_distributions(df_disp_velib[cross_columns], ref_column)

[INFO]-[2025-06-01 11:12:59,960] Distribution of station_en_fonctionnement:
station_en_fonctionnement
OUI    0.991168
NON    0.008832
[INFO]-[2025-06-01 11:12:59,968] Medians by station_en_fonctionnement:
                           capacite_de_la_station  nombre_bornettes_libres  nombre_total_velos_disponibles  velos_mecaniques_disponibles  velos_electriques_disponibles  code_insee_communes_equipees  station_opening_hours
station_en_fonctionnement                                                                                                                                                                                                   
NON                                           0.0                     27.0                             0.0                           0.0                            0.0                       75056.0                    NaN
OUI                                           0.0                     19.0                             7.0                          

### 2.1.2 VELIB COMPTAGE

In [6]:
# NOTE(review): this whole cell is commented out, so the 'velib_comptage_data'
# dataset is not loaded and df_cpt_velib is never created by this notebook.
# TODO: re-enable the cell or remove it.
# df_cpt_velib_raw = dfc.load_dataset_from_config('velib_comptage_data', sep=';')

# if df_cpt_velib_raw is not None and isinstance(df_cpt_velib_raw, pd.DataFrame):
#     display(df_cpt_velib_raw.head())
#     dfc.log_general_info(df_cpt_velib_raw)
#     nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_cpt_velib_raw)
#     if nb_first != nb_total:
#         print(dfc.duplicates_index_map(df_cpt_velib_raw))
#     df_cpt_velib = dfc.normalize_column_names(df_cpt_velib_raw)

In [7]:
# NOTE(review): disabled exploration of df_cpt_velib; it can only run once the
# cell that creates df_cpt_velib is enabled. TODO: re-enable or remove.
# df_cpt_velib.info()
# display(df_cpt_velib.head())
# df_cpt_velib_desc = df_cpt_velib.select_dtypes(include=np.number).describe()
# display(df_cpt_velib_desc)
# df_cpt_velib_cr = df_cpt_velib.select_dtypes(include=np.number).corr()
# display(df_cpt_velib_cr)

## 2.2 Data quality refinement

In [8]:
# Original backup and duplicates management
df_disp_velib_orig = df_disp_velib.copy()
df_disp_velib = df_disp_velib.drop_duplicates()
# df_cpt_velib_orig = df_cpt_velib.copy()
# df_cpt_velib = df_cpt_velib.drop_duplicates()

## 2.3 Data combination and rework

# 3. Data Viz' and Analysis

## 3.1 General Data Viz'

## 3.2 Quantitative mono variable distribution

## 3.3 Qualitative mono variable distribution

## 3.4 Qualitative multi variable distribution

## 3.5 Quantitative multi variable correlation