# Exploratory data analysis

Initial data loading and exploratory analysis of the Catalan traffic accident dataset.

In [1]:
import pandas as pd
import numpy as np
import os

# Notebook-wide pandas display configuration:
#   - max_columns = None -> never truncate columns when rendering a frame
#   - max_rows = 100     -> render at most 100 rows before truncating
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

## 1.1 Read the source CSV

In [2]:
# Path to the raw accident records, relative to this notebook's directory.
data_path = '../data/accidents_catalonia_2010_2023.csv'

# Read the whole CSV into memory; the following cells refer to this frame as `df`.
df = pd.read_csv(filepath_or_buffer=data_path)

# Confirmation message so the cell output records which file was read.
print(f"Dataset loaded successfully from {data_path}")

Dataset loaded successfully from ../data/accidents_catalonia_2010_2023.csv


## 1.2 Inspect dataset structure

In [3]:
# Report the frame's dimensions first ...
n_rows, n_cols = df.shape
print("Dataset Shape:")
print(f"Rows: {n_rows}, Columns: {n_cols}\n")

# ... then the per-column non-null counts and dtypes.
# `DataFrame.info()` prints its report directly, so it is not wrapped in print().
print("Column Information:")
df.info()

Dataset Shape:
Rows: 24478, Columns: 58

Column Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24478 entries, 0 to 24477
Data columns (total 58 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Any                        24478 non-null  int64  
 1   zona                       24478 non-null  object 
 2   dat                        24478 non-null  object 
 3   via                        24478 non-null  object 
 4   pk                         24477 non-null  object 
 5   nomMun                     24478 non-null  object 
 6   nomCom                     24478 non-null  object 
 7   nomDem                     24478 non-null  object 
 8   F_MORTS                    24478 non-null  int64  
 9   F_FERITS_GREUS             24478 non-null  int64  
 10  F_FERITS_LLEUS             24478 non-null  int64  
 11  F_VICTIMES                 24478 non-null  int64  
 12  F_UNITATS_IMPLICADES       24478 non-null

In [4]:
# Preview the first five records to sanity-check column contents after loading.
df.head(n=5)

Unnamed: 0,Any,zona,dat,via,pk,nomMun,nomCom,nomDem,F_MORTS,F_FERITS_GREUS,F_FERITS_LLEUS,F_VICTIMES,F_UNITATS_IMPLICADES,F_VIANANTS_IMPLICADES,F_BICICLETES_IMPLICADES,F_CICLOMOTORS_IMPLICADES,F_MOTOCICLETES_IMPLICADES,F_VEH_LLEUGERS_IMPLICADES,F_VEH_PESANTS_IMPLICADES,F_ALTRES_UNIT_IMPLICADES,F_UNIT_DESC_IMPLICADES,C_VELOCITAT_VIA,D_ACC_AMB_FUGA,D_BOIRA,D_CARACT_ENTORN,D_CARRIL_ESPECIAL,D_CIRCULACIO_MESURES_ESP,D_CLIMATOLOGIA,D_FUNC_ESP_VIA,D_GRAVETAT,D_INFLUIT_BOIRA,D_INFLUIT_CARACT_ENTORN,D_INFLUIT_CIRCULACIO,D_INFLUIT_ESTAT_CLIMA,D_INFLUIT_INTEN_VENT,D_INFLUIT_LLUMINOSITAT,D_INFLUIT_MESU_ESP,D_INFLUIT_OBJ_CALCADA,D_INFLUIT_SOLCS_RASES,D_INFLUIT_VISIBILITAT,D_INTER_SECCIO,D_LIMIT_VELOCITAT,D_LLUMINOSITAT,D_REGULACIO_PRIORITAT,D_SENTITS_VIA,D_SUBTIPUS_ACCIDENT,D_SUBTIPUS_TRAM,D_SUBZONA,D_SUPERFICIE,D_TIPUS_VIA,D_TITULARITAT_VIA,D_TRACAT_ALTIMETRIC,D_VENT,grupDiaLab,hor,grupHor,tipAcc,tipDia
0,2010,Zona urbana,25/01/2010,SE,999999,Cànoves i Samalús,Vallès Oriental,Barcelona,0,1,0,1,2,0,0,0,0,1,0,1,0,100.0,No,No n'hi ha,Desmunt,No n'hi ha,No n'hi ha,Bon temps,Sense funció especial,Accident greu,No,No,No,No,No,No,No,No,No,No,Arribant o eixint intersecció fins 50m,Genérica via,"De nit, il·luminació artificial suficient",Sols norma prioritat de pas,Un sol sentit,Encalç,Intersecció en T o Y,Zona urbana,Sec i net,Via urbana( inclou carrer i carrer residencial),,,"Calma, vent molt suau",Feiners,2333,Nit,Col.lisió de vehicles en marxa,dill-dij
1,2010,Carretera,31/10/2010,N-240,999,Lleida,Segrià,Lleida,0,1,3,4,1,0,0,0,0,1,0,0,0,40.0,No,No n'hi ha,A nivell,No n'hi ha,No n'hi ha,Bon temps,Sense funció especial,Accident greu,No,No,No,No,No,No,No,No,No,No,Dintre intersecció,Senyal velocitat,"De nit, il·luminació artificial suficient",Senyal Stop o cedeix pas,Doble sentit,Resta sortides de via,Giratòria,Carretera,Sec i net,Carretera convencional,Estatal,Pla,"Calma, vent molt suau",CapDeSetmana,1,Nit,Sortida de la calcada sense especificar,dg
2,2010,Carretera,17/05/2010,N-II,7087,Fornells de la Selva,Gironès,Girona,1,0,2,3,4,0,0,0,0,2,2,0,0,80.0,No,No n'hi ha,A nivell,No n'hi ha,No n'hi ha,Bon temps,Variant,Accident mortal,No,No,No,No,No,No,No,No,No,No,En secció,Senyal velocitat,"De dia, dia clar",,Doble sentit,Col·lisió frontal,,Carretera,Sec i net,Carretera convencional,Estatal,Rampa o pendent,"Calma, vent molt suau",Feiners,1527,Tarda,Col.lisió de vehicles en marxa,dill-dij
3,2010,Zona urbana,21/08/2010,SE,999999,Barcelona,Barcelonès,Barcelona,0,2,7,9,2,0,0,0,0,2,0,0,0,100.0,No,No n'hi ha,Sense Especificar,No n'hi ha,No n'hi ha,Bon temps,Sense funció especial,Accident greu,No,No,No,No,No,No,No,No,No,No,Dintre intersecció,Genérica via,"De nit, il·luminació artificial suficient",Semàfor,Un sol sentit,Envestida (frontal lateral),Encreuament o intersecció en X o +,Zona urbana,Sec i net,Via urbana( inclou carrer i carrer residencial),,,"Calma, vent molt suau",CapDeSetmana,223,Nit,Col.lisió de vehicles en marxa,dis
4,2010,Zona urbana,07/05/2010,SE,999999,Badalona,Barcelonès,Barcelona,0,1,0,1,1,0,0,0,1,0,0,0,0,100.0,No,No n'hi ha,Sense Especificar,No n'hi ha,No n'hi ha,Bon temps,Sense funció especial,Accident greu,No,No,No,No,No,No,No,No,No,No,Dintre intersecció,Genérica via,"De dia, dia clar",Sols norma prioritat de pas,Un sol sentit,Caiguda en la via,Encreuament o intersecció en X o +,Zona urbana,Sec i net,Via urbana( inclou carrer i carrer residencial),,,"Calma, vent molt suau",CapDeSetmana,1745,Tarda,Bolcada a la calcada,div


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows C_VELOCITAT_VIA with a maximum of 999.0,
# which looks like a placeholder for "unknown" rather than a real speed -- confirm
# against the dataset documentation before relying on its mean/std.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Any,F_MORTS,F_FERITS_GREUS,F_FERITS_LLEUS,F_VICTIMES,F_UNITATS_IMPLICADES,F_VIANANTS_IMPLICADES,F_BICICLETES_IMPLICADES,F_CICLOMOTORS_IMPLICADES,F_MOTOCICLETES_IMPLICADES,F_VEH_LLEUGERS_IMPLICADES,F_VEH_PESANTS_IMPLICADES,F_ALTRES_UNIT_IMPLICADES,F_UNIT_DESC_IMPLICADES,C_VELOCITAT_VIA
count,24478.0,24478.0,24478.0,24478.0,24478.0,24478.0,24478.0,24478.0,24478.0,24478.0,24478.0,24478.0,24478.0,24478.0,21416.0
mean,2016.148296,0.138696,0.997345,0.393537,1.529578,1.886061,0.245036,0.093472,0.075537,0.407182,0.910287,0.125174,0.029251,8.2e-05,158.879716
std,4.02813,0.396296,0.526723,1.010709,1.226088,0.793023,0.503299,0.335439,0.270526,0.540194,0.819269,0.39123,0.174234,0.012783,248.981334
min,2010.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2013.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0
50%,2016.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,100.0
75%,2019.0,0.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,100.0
max,2023.0,13.0,23.0,32.0,49.0,27.0,10.0,8.0,5.0,7.0,27.0,16.0,4.0,2.0,999.0


## 1.3 Check for missing values

In [6]:
# Count null cells per column, then the grand total across the whole frame.
missing_values = df.isna().sum()
total_missing = missing_values.sum()

print("Missing Values per Column:")
if total_missing > 0:
    # List only the columns that actually contain nulls.
    print(missing_values[missing_values > 0])
else:
    print("No missing values found!")
print(f"\nTotal missing values: {total_missing}")

Missing Values per Column:
pk                              1
C_VELOCITAT_VIA              3062
D_CARACT_ENTORN                41
D_CARRIL_ESPECIAL            1741
D_CIRCULACIO_MESURES_ESP       49
D_REGULACIO_PRIORITAT       17488
D_SENTITS_VIA                4301
D_SUBTIPUS_TRAM             16494
D_TITULARITAT_VIA           12509
D_TRACAT_ALTIMETRIC          8898
dtype: int64

Total missing values: 64584
