In [1]:
import glob
import pandas as pd
import numpy as np
import re

# Data import and primary data curation

**Author:** Tobias Schulze
**Last update:** 16 July 2023

All imported files were preprocessed to standardize file name schemes and separators.

In general, issues occurring during import and data type conversion were fixed on the fly.

Further curation will be performed after an in-depth evaluation of the data.


## `usagers` files
- import the files
- fix `dtype` issues on the fly
- fix an issue with the `id_vehicule` variable
- export the file to `users.csv` in `data`

### Data import and dtype issues fixing

In [2]:
# collect the paths of all `usagers` files
# NOTE: glob returns the paths in arbitrary (filesystem) order, which determines the row order below
files = glob.glob("../mar23cds_accidents_data_import/usagers*.csv")

# read every file once and concatenate them in a single pass
# (calling pd.concat inside the loop re-copies the accumulated frame on every iteration)
frames = [pd.read_csv(path, sep=',', index_col=0) for path in files]

# keep the original fallback: an empty frame when no file matched the pattern
users = pd.concat(frames, axis=0) if frames else pd.DataFrame()

# show info
users.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2509620 entries, 201900000001 to 201800057783
Data columns (total 15 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id_vehicule  object 
 1   num_veh      object 
 2   place        float64
 3   catu         int64  
 4   grav         int64  
 5   sexe         int64  
 6   an_nais      float64
 7   trajet       float64
 8   secu1        float64
 9   secu2        float64
 10  secu3        float64
 11  locp         float64
 12  actp         object 
 13  etatp        float64
 14  secu         float64
dtypes: float64(9), int64(3), object(3)
memory usage: 306.4+ MB


### Fixing the issue in `id_vehicule`
We have a curious encoding in `id_vehicule`.

The value `\xa0` encodes a non-breaking space, which separates the digit groups inside the identifier and therefore cannot be removed by simply stripping leading and trailing whitespace.
Hence, we need a lambda function to:

1. remove all non-numeric characters
2. replace empty values by nan
3. finally, convert the variable to float (the column contains missing values)

In [3]:
# show an example of the raw encoding: `id_vehicule` of the first row
users.iloc[0].id_vehicule

'138\xa0306\xa0524'

In [4]:
# pattern matching every character that is not a digit (this includes the '\xa0' separators)
non_digit = re.compile('[^0-9]')

# keep only the digits of each identifier; missing values pass through str() as 'nan'
# and therefore end up as empty strings
users['id_vehicule'] = users['id_vehicule'].map(lambda raw: non_digit.sub('', str(raw)))

# empty strings mark missing identifiers: turn them into nan and store the column as float
users['id_vehicule'] = users['id_vehicule'].replace('', np.nan).astype(float)


In [5]:
# check the result: `id_vehicule` of the first row after the clean-up
users.iloc[0].id_vehicule

138306524.0

In [6]:
# re-inspect the table: `id_vehicule` should now be reported as float64
users.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2509620 entries, 201900000001 to 201800057783
Data columns (total 15 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id_vehicule  float64
 1   num_veh      object 
 2   place        float64
 3   catu         int64  
 4   grav         int64  
 5   sexe         int64  
 6   an_nais      float64
 7   trajet       float64
 8   secu1        float64
 9   secu2        float64
 10  secu3        float64
 11  locp         float64
 12  actp         object 
 13  etatp        float64
 14  secu         float64
dtypes: float64(10), int64(3), object(2)
memory usage: 306.4+ MB


### Export the data

In [7]:
# write the curated user table; missing values are written as 'n/a' and the index is kept
users.to_csv(
    "./data/users.csv",
    sep=',',
    header=True,
    na_rep='n/a',
    index=True,
)

## `caracteristique` files
- import the files
- fix `dtype` issues on the fly
- fix issues with the variables `lat`, `long`, and `dep`
- export the file to `features.csv` in `data`

### Data import and dtype issues fixing

- encoding is `latin-1` to map French characters


In [8]:
# collect the paths of all characteristics (= features) files
# NOTE: glob returns the paths in arbitrary (filesystem) order, which determines the row order below
files = glob.glob("../mar23cds_accidents_data_import/caracter*.csv")

# read every features file (latin-1 encoded) once and concatenate them in a single pass
# (calling pd.concat inside the loop re-copies the accumulated frame on every iteration)
frames = [
    pd.read_csv(path, sep=',', encoding='latin-1', na_values=['N/A', 'n/a', ''], index_col=0)
    for path in files
]

# keep the original fallback: an empty frame when no file matched the pattern
features = pd.concat(frames, axis=0) if frames else pd.DataFrame()


In [9]:
# list dtypes and non-null counts of the combined characteristics table
features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1121571 entries, 200900000001 to 200500087954
Data columns (total 15 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   an      1121571 non-null  int64  
 1   mois    1121571 non-null  int64  
 2   jour    1121571 non-null  int64  
 3   hrmn    1121571 non-null  object 
 4   lum     1121571 non-null  int64  
 5   agg     1121571 non-null  int64  
 6   int     1121571 non-null  int64  
 7   atm     1121498 non-null  float64
 8   col     1121552 non-null  float64
 9   com     1121569 non-null  object 
 10  adr     978295 non-null   object 
 11  gps     480052 non-null   object 
 12  lat     634503 non-null   object 
 13  long    634499 non-null   object 
 14  dep     1121571 non-null  object 
dtypes: float64(2), int64(6), object(7)
memory usage: 136.9+ MB


In [10]:
# preview the first rows of the raw characteristics table
features.head()

Unnamed: 0_level_0,an,mois,jour,hrmn,lum,agg,int,atm,col,com,adr,gps,lat,long,dep
Num_Acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
200900000001,9,1,30,2030,3,1,1,1.0,6.0,67,RTE DE GUï¿½MENï¿½,M,0.0,0,440
200900000002,9,1,17,300,3,1,1,1.0,3.0,17,LE BOIS JOLI,M,0.0,0,440
200900000003,9,1,29,645,3,1,1,1.0,3.0,52,,M,0.0,0,440
200900000004,9,1,4,615,3,1,1,9.0,6.0,44,LE BECO,M,0.0,0,440
200900000005,9,1,16,1500,1,1,1,1.0,7.0,36,MOULIN DE LA GARENNE,M,0.0,0,440


In [11]:
def _four_digit_year(num):
    """Return the year as a four-character string.

    Values below 10000 become '2' followed by their last three digits
    (zero-padded); larger values are cut down to their last four digits.
    """
    digits = str(num)
    if num < 10000:
        return '2' + digits.zfill(3)[-3:]
    return digits[-4:]


# expand the years to four digits and store them as int32
features['an'] = features['an'].map(_four_digit_year).astype('int32')

In [12]:
# preview again: `an` should now hold four-digit years
features.head()


Unnamed: 0_level_0,an,mois,jour,hrmn,lum,agg,int,atm,col,com,adr,gps,lat,long,dep
Num_Acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
200900000001,2009,1,30,2030,3,1,1,1.0,6.0,67,RTE DE GUï¿½MENï¿½,M,0.0,0,440
200900000002,2009,1,17,300,3,1,1,1.0,3.0,17,LE BOIS JOLI,M,0.0,0,440
200900000003,2009,1,29,645,3,1,1,1.0,3.0,52,,M,0.0,0,440
200900000004,2009,1,4,615,3,1,1,9.0,6.0,44,LE BECO,M,0.0,0,440
200900000005,2009,1,16,1500,1,1,1,1.0,7.0,36,MOULIN DE LA GARENNE,M,0.0,0,440


### The latitude and longitude data comes in three forms:

1. decimal format with decimal separator
2. string format without decimal separator
3. NAs

The string format encodes the decimal format without decimals.
Hence, we have 3 data formats:

1. `float64`
2. `int64`
3. `NA`


In [13]:
def _clean_coordinate(raw):
    """Convert a raw `lat`/`long` column to decimal degrees.

    The previous implementation removed every non-digit character and then
    forced the decimal point after the second digit. That dropped the minus
    sign of negative coordinates and misplaced the decimal point of values
    with a one-digit integer part (e.g. '2.35' became 23.5).

    Parameters
    ----------
    raw : pd.Series
        Raw coordinate values (strings, numbers or nan).

    Returns
    -------
    pd.Series
        float64 values in decimal degrees; entries that cannot be parsed as a
        number become nan.
    """
    # normalise the text: drop any whitespace and accept ',' as decimal separator
    text = (
        raw.astype(str)
        .str.replace(r'\s+', '', regex=True)
        .str.replace(',', '.', regex=False)
    )
    degrees = pd.to_numeric(text, errors='coerce')

    # values written without a decimal separator are integer encoded; values beyond
    # +/-180 cannot be degrees and are treated as integer encoded as well
    # (covers integer-encoded values that were parsed as floats, e.g. 4886000.0)
    # NOTE(review): assumes the integer encoding is degrees * 100000, which matches the
    # former two-digit split of seven-digit values - confirm against the raw files
    is_scaled = ~text.str.contains('.', regex=False, na=False) | (degrees.abs() > 180)
    return degrees.where(~is_scaled, degrees / 100_000).astype(float)


# convert both coordinate columns to signed decimal degrees (nan where no number is given)
features['lat'] = _clean_coordinate(features['lat'])
features['long'] = _clean_coordinate(features['long'])

In [14]:
# re-inspect the dtypes: `lat` and `long` should now be float64
features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1121571 entries, 200900000001 to 200500087954
Data columns (total 15 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   an      1121571 non-null  int32  
 1   mois    1121571 non-null  int64  
 2   jour    1121571 non-null  int64  
 3   hrmn    1121571 non-null  object 
 4   lum     1121571 non-null  int64  
 5   agg     1121571 non-null  int64  
 6   int     1121571 non-null  int64  
 7   atm     1121498 non-null  float64
 8   col     1121552 non-null  float64
 9   com     1121569 non-null  object 
 10  adr     978295 non-null   object 
 11  gps     480052 non-null   object 
 12  lat     634503 non-null   float64
 13  long    631290 non-null   float64
 14  dep     1121571 non-null  object 
dtypes: float64(4), int32(1), int64(5), object(5)
memory usage: 132.6+ MB


In [15]:
# preview the first rows after the coordinate conversion
features.head()

Unnamed: 0_level_0,an,mois,jour,hrmn,lum,agg,int,atm,col,com,adr,gps,lat,long,dep
Num_Acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
200900000001,2009,1,30,2030,3,1,1,1.0,6.0,67,RTE DE GUï¿½MENï¿½,M,0.0,0.0,440
200900000002,2009,1,17,300,3,1,1,1.0,3.0,17,LE BOIS JOLI,M,0.0,0.0,440
200900000003,2009,1,29,645,3,1,1,1.0,3.0,52,,M,0.0,0.0,440
200900000004,2009,1,4,615,3,1,1,9.0,6.0,44,LE BECO,M,0.0,0.0,440
200900000005,2009,1,16,1500,1,1,1,1.0,7.0,36,MOULIN DE LA GARENNE,M,0.0,0.0,440


### Export the data
The data is exported to `features.csv` in the `data` folder.

In [16]:
# write the curated features table; missing values are written as 'n/a' and the index is kept
features.to_csv(
    "./data/features.csv",
    sep=',',
    header=True,
    na_rep='n/a',
    index=True,
)

## `lieux` files
- import the files
- fix `dtype` issues on the fly
- no additional fixes are applied to these files at this stage
- export the file to `places.csv` in `data`

### Data import and dtype issues fixing

- encoding is `latin-1` to map French characters


In [17]:
# collect the paths of all `lieux` (= places) files
# NOTE: glob returns the paths in arbitrary (filesystem) order, which determines the row order below
files = glob.glob("../mar23cds_accidents_data_import/lieux*.csv")

# read every places file (latin-1 encoded) once and concatenate them in a single pass
# (calling pd.concat inside the loop re-copies the accumulated frame on every iteration)
frames = [
    pd.read_csv(path, sep=',', encoding='latin-1', na_values=['N/A', 'n/a', ''],
                low_memory=False, index_col=0)
    for path in files
]

# keep the original fallback: an empty frame when no file matched the pattern
places = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [18]:
# list dtypes and non-null counts of the combined places table
places.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1121571 entries, 202100000001 to 200700084726
Data columns (total 18 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   catr     1121570 non-null  float64
 1   voie     1014391 non-null  object 
 2   v1       485747 non-null   float64
 3   v2       51369 non-null    object 
 4   circ     1119997 non-null  float64
 5   nbv      1118840 non-null  float64
 6   vosp     1118810 non-null  float64
 7   prof     1119622 non-null  float64
 8   pr       646087 non-null   object 
 9   pr1      644268 non-null   object 
 10  plan     1119290 non-null  float64
 11  lartpc   902739 non-null   object 
 12  larrout  1008730 non-null  object 
 13  surf     1119647 non-null  float64
 14  infra    1116163 non-null  float64
 15  situ     1116601 non-null  float64
 16  vma      163102 non-null   float64
 17  env1     953029 non-null   float64
dtypes: float64(12), object(6)
memory usage: 162.6+ MB


In [19]:
# preview the first rows of the places table
places.head()

Unnamed: 0_level_0,catr,voie,v1,v2,circ,nbv,vosp,prof,pr,pr1,plan,lartpc,larrout,surf,infra,situ,vma,env1
Num_Acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
202100000001,3.0,981,-1.0,,2.0,2.0,0.0,1.0,(1),(1),1.0,,-1,1.0,0.0,1.0,80.0,
202100000002,3.0,20,0.0,E,2.0,2.0,0.0,1.0,0,10,1.0,,-1,1.0,0.0,1.0,80.0,
202100000003,4.0,GENERAL GALLIENI (BOULEVARD DU),0.0,,2.0,2.0,0.0,1.0,(1),(1),1.0,,-1,1.0,0.0,1.0,50.0,
202100000004,3.0,GENERAL GALLIENI (BOULEVARD DU),0.0,,2.0,4.0,0.0,1.0,0,1,1.0,,-1,1.0,0.0,1.0,50.0,
202100000005,7.0,PARIS. ROUTE DE,-1.0,,2.0,2.0,-1.0,1.0,11,150,1.0,,-1,1.0,0.0,1.0,50.0,


### Export the data
The data is exported to `places.csv` in the `data` folder.

In [20]:
# write the places table; missing values are written as 'n/a' and the index is kept
places.to_csv(
    "./data/places.csv",
    sep=',',
    header=True,
    na_rep='n/a',
    index=True,
)

## `vehicules` files
- import the files
- fix `dtype` issues on the fly
- fix an issue with the `id_vehicule` variable
- export the file to `vehicles.csv` in `data`

### Data import

In [21]:
# collect the paths of all `vehicules` files
# NOTE: glob returns the paths in arbitrary (filesystem) order, which determines the row order below
files = glob.glob("../mar23cds_accidents_data_import/vehicules*.csv")

# read every vehicle file (utf-8 encoded) once and concatenate them in a single pass
# (calling pd.concat inside the loop re-copies the accumulated frame on every iteration)
frames = [
    pd.read_csv(path, sep=',', encoding='utf-8', na_values=['N/A', 'n/a', ''],
                low_memory=False, index_col=0)
    for path in files
]

# keep the original fallback: an empty frame when no file matched the pattern
vehicles = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [22]:
# list the dtypes of the combined vehicles table
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1914902 entries, 200600000001 to 200900074898
Data columns (total 10 columns):
 #   Column       Dtype  
---  ------       -----  
 0   senc         float64
 1   catv         int64  
 2   occutc       float64
 3   obs          float64
 4   obsm         float64
 5   choc         float64
 6   manv         float64
 7   num_veh      object 
 8   id_vehicule  object 
 9   motor        float64
dtypes: float64(7), int64(1), object(2)
memory usage: 160.7+ MB


In [23]:
# preview the first rows of the vehicles table
vehicles.head()

Unnamed: 0_level_0,senc,catv,occutc,obs,obsm,choc,manv,num_veh,id_vehicule,motor
Num_Acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
200600000001,0.0,7,0.0,11.0,2.0,6.0,1.0,B01,,
200600000001,0.0,13,0.0,0.0,2.0,1.0,15.0,A01,,
200600000002,0.0,7,0.0,0.0,2.0,3.0,15.0,A01,,
200600000002,0.0,7,0.0,0.0,2.0,3.0,1.0,B01,,
200600000003,0.0,10,0.0,0.0,1.0,4.0,4.0,A01,,


### Fixing the issue in `id_vehicule`

In [24]:
# render every identifier as text (missing values become 'nan') and drop all non-digit characters
vehicle_id_digits = vehicles['id_vehicule'].astype(str).str.replace('[^0-9]', '', regex=True)

# identifiers without any digit are now empty strings: mark them as nan and store the column as float
vehicles['id_vehicule'] = vehicle_id_digits.replace('', np.nan).astype(float)

In [25]:
# re-inspect the dtypes: `id_vehicule` should now be float64
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1914902 entries, 200600000001 to 200900074898
Data columns (total 10 columns):
 #   Column       Dtype  
---  ------       -----  
 0   senc         float64
 1   catv         int64  
 2   occutc       float64
 3   obs          float64
 4   obsm         float64
 5   choc         float64
 6   manv         float64
 7   num_veh      object 
 8   id_vehicule  float64
 9   motor        float64
dtypes: float64(8), int64(1), object(1)
memory usage: 160.7+ MB


### Export the data
The data is exported to `vehicles.csv` in the `data` folder.

In [26]:
# write the curated vehicles table; missing values are written as 'n/a' and the index is kept
vehicles.to_csv(
    "./data/vehicles.csv",
    sep=',',
    header=True,
    na_rep='n/a',
    index=True,
)

## `registered vehicules` files
- import the files
- fix `dtype` issues on the fly
- fix issues with the variables `Age véhicule` and `Id_accident`
- export the file to `registered_vehicles.csv` in `data`

### Data import

In [27]:
# collect the paths of all registered-vehicle files
# NOTE: glob returns the paths in arbitrary (filesystem) order, which determines the row order below
files = glob.glob("../mar23cds_accidents_data_import/registered_vehicles*.csv")

# read every file (';'-separated, UTF-8 encoded) once and concatenate them in a single pass
# (calling pd.concat inside the loop re-copies the accumulated frame on every iteration)
frames = [
    pd.read_csv(path, sep=';', na_values=['N/A', 'n/a', ''],
                low_memory=False, encoding="UTF-8")
    for path in files
]

# keep the original fallback: an empty frame when no file matched the pattern
registered_vehicles = pd.concat(frames, axis=0) if frames else pd.DataFrame()

# preview the first rows
registered_vehicles.head()

Unnamed: 0,Id_accident,Lettre Conventionnelle Véhicule,Année,Lieu Admin Actuel - Territoire Nom,Type Accident - Libellé,CNIT,Catégorie véhicule,Age véhicule,Type Accident - Libellé (old)
0,63 387 461,A,2019,Métropole,Accident Léger,N10MCDCTPV62189,VU,3.0,
1,63 387 461,B,2019,Métropole,Accident Léger,M10JT0VP015D744,VU,2.0,
2,63 387 462,A,2019,Métropole,Accident grave non mortel,JS1CJ2251001004,Moto lourde,10.0,
3,63 387 463,A,2019,Métropole,Accident Léger,VF7FC8HZB273438,VT,14.0,
4,63 387 464,A,2019,Métropole,Accident Léger,M10NSSVP035L180,VT,1.0,


In [28]:
# '#VALEURMULTI' is a non-numeric placeholder in the age column: map it to nan first,
# then store the ages as float
vehicle_age = registered_vehicles['Age véhicule'].replace('#VALEURMULTI', np.nan)
registered_vehicles['Age véhicule'] = vehicle_age.astype(float)

In [29]:
# list dtypes and non-null counts; `Age véhicule` should now be float64
registered_vehicles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1290207 entries, 0 to 98024
Data columns (total 9 columns):
 #   Column                              Non-Null Count    Dtype  
---  ------                              --------------    -----  
 0   Id_accident                         1290207 non-null  object 
 1   Lettre Conventionnelle Véhicule     1290206 non-null  object 
 2   Année                               1290207 non-null  int64  
 3   Lieu Admin Actuel - Territoire Nom  1290207 non-null  object 
 4   Type Accident - Libellé             1078193 non-null  object 
 5   CNIT                                845998 non-null   object 
 6   Catégorie véhicule                  1290207 non-null  object 
 7   Age véhicule                        1181107 non-null  float64
 8   Type Accident - Libellé (old)       212014 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 98.4+ MB


### Fixing the issue in `Id_accident`

In [30]:
# rebuild the column from the digits of each identifier only (non-digit characters are removed)
registered_vehicles['Id_accident'] = [
    re.sub('[^0-9]', '', str(raw_id)) for raw_id in registered_vehicles['Id_accident']
]

# identifiers without any digit are now empty strings: mark them as nan and store the column as float
accident_ids = registered_vehicles['Id_accident'].replace('', np.nan)
registered_vehicles['Id_accident'] = accident_ids.astype(float)

In [31]:
# re-inspect the dtypes: `Id_accident` should now be float64
registered_vehicles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1290207 entries, 0 to 98024
Data columns (total 9 columns):
 #   Column                              Non-Null Count    Dtype  
---  ------                              --------------    -----  
 0   Id_accident                         1290207 non-null  float64
 1   Lettre Conventionnelle Véhicule     1290206 non-null  object 
 2   Année                               1290207 non-null  int64  
 3   Lieu Admin Actuel - Territoire Nom  1290207 non-null  object 
 4   Type Accident - Libellé             1078193 non-null  object 
 5   CNIT                                845998 non-null   object 
 6   Catégorie véhicule                  1290207 non-null  object 
 7   Age véhicule                        1181107 non-null  float64
 8   Type Accident - Libellé (old)       212014 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 98.4+ MB


In [32]:
# preview the first rows with the cleaned `Id_accident`
registered_vehicles.head()

Unnamed: 0,Id_accident,Lettre Conventionnelle Véhicule,Année,Lieu Admin Actuel - Territoire Nom,Type Accident - Libellé,CNIT,Catégorie véhicule,Age véhicule,Type Accident - Libellé (old)
0,63387461.0,A,2019,Métropole,Accident Léger,N10MCDCTPV62189,VU,3.0,
1,63387461.0,B,2019,Métropole,Accident Léger,M10JT0VP015D744,VU,2.0,
2,63387462.0,A,2019,Métropole,Accident grave non mortel,JS1CJ2251001004,Moto lourde,10.0,
3,63387463.0,A,2019,Métropole,Accident Léger,VF7FC8HZB273438,VT,14.0,
4,63387464.0,A,2019,Métropole,Accident Léger,M10NSSVP035L180,VT,1.0,


### Export the data
The data is exported to `registered_vehicles.csv` in the `data` folder.

In [33]:
# write the curated registered-vehicles table; missing values are written as 'n/a',
# the index is not exported
registered_vehicles.to_csv(
    "./data/registered_vehicles.csv",
    sep=',',
    header=True,
    na_rep='n/a',
    index=False,
)

In [34]:
# list the distinct years (`Année`) present in the registered-vehicles table
registered_vehicles['Année'].unique()

array([2019, 2020, 2018, 2016, 2010, 2012, 2011, 2013, 2015, 2009, 2021,
       2017, 2014])

### Merge all data
- Merge all files except `registered_vehicles`.
- The variables `id_vehicule` and `num_veh` exist in both `users` and `vehicles`.
- Thus, we also need to merge on these variables to avoid creating cross-merged records.

In [35]:
# join the tables step by step on the accident number; the vehicles table is additionally
# matched on `id_vehicule` and `num_veh`
# (no `how` is given, so pandas' default inner join applies: only matching keys are kept)
merged_df = (
    users
    .merge(features, on='Num_Acc', suffixes=('_users', '_features'))
    .merge(places, on='Num_Acc', suffixes=('_merged_df', '_places'))
    .merge(vehicles, on=['Num_Acc', 'id_vehicule', 'num_veh'])
)

In [36]:
# inspect the columns and dtypes of the merged table
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2509598 entries, 201900000001 to 201800057783
Data columns (total 56 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id_vehicule  float64
 1   num_veh      object 
 2   place        float64
 3   catu         int64  
 4   grav         int64  
 5   sexe         int64  
 6   an_nais      float64
 7   trajet       float64
 8   secu1        float64
 9   secu2        float64
 10  secu3        float64
 11  locp         float64
 12  actp         object 
 13  etatp        float64
 14  secu         float64
 15  an           int32  
 16  mois         int64  
 17  jour         int64  
 18  hrmn         object 
 19  lum          int64  
 20  agg          int64  
 21  int          int64  
 22  atm          float64
 23  col          float64
 24  com          object 
 25  adr          object 
 26  gps          object 
 27  lat          float64
 28  long         float64
 29  dep          object 
 30  catr         float64
 31  voie         

In [37]:
# preview the first rows of the merged table
merged_df.head()

Unnamed: 0_level_0,id_vehicule,num_veh,place,catu,grav,sexe,an_nais,trajet,secu1,secu2,...,vma,env1,senc,catv,occutc,obs,obsm,choc,manv,motor
Num_Acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201900000001,138306524.0,B01,2.0,2,4,2,2002.0,0.0,1.0,0.0,...,70.0,,2.0,7,,0.0,2.0,5.0,23.0,1.0
201900000001,138306524.0,B01,1.0,1,4,2,1993.0,5.0,1.0,0.0,...,70.0,,2.0,7,,0.0,2.0,5.0,23.0,1.0
201900000001,138306525.0,A01,1.0,1,1,1,1959.0,0.0,1.0,0.0,...,70.0,,2.0,17,,1.0,0.0,3.0,11.0,1.0
201900000002,138306523.0,A01,1.0,1,4,2,1994.0,0.0,1.0,0.0,...,70.0,,1.0,7,,4.0,0.0,1.0,0.0,1.0
201900000003,138306520.0,A01,1.0,1,1,1,1996.0,0.0,1.0,0.0,...,90.0,,1.0,7,,0.0,2.0,1.0,2.0,1.0


This looks good now.

In [38]:
# write the merged table; missing values are written as 'n/a' and the index is kept
merged_df.to_csv(
    "./data/merged_tables.csv",
    sep=',',
    header=True,
    na_rep='n/a',
    index=True,
)