# Production d'un CSV utilisable à partir de la base FINESS

En l'état, l'export CSV de la [base FINESS][finess] n'est pas vraiment satisfaisant et utilisable.

- Le fichier n'est pas réellement un CSV.
    - Il est découpé en deux sections successives (les établissements, puis leurs géolocalisations), qui reprennent le découpage de l'export XML.
    - Les colonnes n'ont pas de nom.
- Le fichier est encodé en Windows-1252 et non en UTF-8.

[finess]: https://www.data.gouv.fr/en/datasets/finess-extraction-du-fichier-des-etablissements/

In [1]:
import pandas as pd
import numpy as np
import requests

In [2]:
# Endpoint of the data.gouv.fr API that describes the FINESS dataset.
dataset_api = (
    "https://www.data.gouv.fr/api/1/datasets/"
    "finess-extraction-du-fichier-des-etablissements/"
)

In [3]:
# Fetch the dataset description. A timeout keeps the notebook from hanging on
# an unresponsive server, and raise_for_status() surfaces HTTP errors here
# instead of as a confusing JSON/KeyError further down.
response = requests.get(dataset_api, timeout=30)
response.raise_for_status()
resources = response.json()['resources']

# Keep the "main" resources whose title mentions the geolocated extract.
geoloc_candidates = [
    r for r in resources
    if r.get('type') == 'main' and 'géolocalisés' in (r.get('title') or '')
]

# Same exception type as the former `[...][0]`, but with an actionable message.
if not geoloc_candidates:
    raise IndexError(
        "No 'main' resource with 'géolocalisés' in its title was found at "
        + dataset_api
    )

resource_geoloc = geoloc_candidates[0]

In [4]:
# Names given, in file order, to the 32 fields of each line of the raw export.
# The first one, 'section', is dropped right after the file is loaded.
headers = [
    'section',
    'nofinesset', 'nofinessej',
    'rs', 'rslongue', 'complrs', 'compldistrib',
    'numvoie', 'typvoie', 'voie', 'compvoie', 'lieuditbp',
    'commune', 'departement', 'libdepartement', 'ligneacheminement',
    'telephone', 'telecopie',
    'categetab', 'libcategetab', 'categagretab', 'libcategagretab',
    'siret', 'codeape',
    'codemft', 'libmft', 'codesph', 'libsph',
    'dateouv', 'dateautor', 'maj',
    'numuai',
]

In [5]:
# Names of the five leading columns once a row is read as a geolocation row.
geoloc_names = ['nofinesset', 'coordxet', 'coordyet', 'sourcecoordet', 'datemaj']

In [6]:
# Load the whole export in one pass:
#   - semicolon-separated, Windows-1252 encoded;
#   - the first line of the file is skipped and our own column names are used;
#   - every field is read as text, so identifiers such as 010000024 keep their
#     leading zeros.
raw_with_section = pd.read_csv(
    resource_geoloc['url'],
    names=headers,
    header=None,
    skiprows=1,
    sep=";",
    encoding="Windows-1252",
    dtype='str',
)

# The leading 'section' field is not kept in the working table.
raw_df = raw_with_section.drop(columns=['section'])

raw_df

Unnamed: 0,nofinesset,nofinessej,rs,rslongue,complrs,compldistrib,numvoie,typvoie,voie,compvoie,...,siret,codeape,codemft,libmft,codesph,libsph,dateouv,dateautor,maj,numuai
0,010000024,010780054,CH DE FLEYRIAT,CENTRE HOSPITALIER DE BOURG-EN-BRESSE FLEYRIAT,,,900,RTE,DE PARIS,,...,26010004500012,8610Z,03,ARS établissements Publics de santé dotation g...,1,Etablissement public de santé,1979-02-13,1979-02-13,2020-02-04,
1,010000032,010780062,CH BUGEY SUD,CENTRE HOSPITALIER BUGEY SUD,,,700,AV,DE NARVIK,,...,26010003700068,8610Z,03,ARS établissements Publics de santé dotation g...,1,Etablissement public de santé,1901-01-01,1901-01-01,2021-07-07,
2,010000065,010780096,CH DE TREVOUX - MONTPENSIER,CENTRE HOSPITALIER DE TREVOUX - MONTPENSIER,,,14,R,DE L'HOPITAL,,...,26010028400017,8610Z,03,ARS établissements Publics de santé dotation g...,1,Etablissement public de santé,1901-01-01,1901-01-01,2018-01-12,
3,010000081,010780112,CH DU PAYS DE GEX,CENTRE HOSPITALIER DU PAYS DE GEX,,,160,R,MARC PANISSOD,,...,26010010200011,8610Z,03,ARS établissements Publics de santé dotation g...,1,Etablissement public de santé,1901-01-01,1901-01-01,2020-02-04,
4,010000099,010780120,CH DE MEXIMIEUX,CENTRE HOSPITALIER DE MEXIMIEUX,,,13,AV,DU DOCTEUR BOYER,,...,26010013600019,8610Z,03,ARS établissements Publics de santé dotation g...,1,Etablissement public de santé,1945-01-01,1945-01-01,2020-06-30,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189551,970412276,342298.0,7688116.7,"1,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,UTM_S40",2022-03-10,,,,,,...,,,,,,,,,,
189552,980501738,521486.0,8583077.0,"1,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,UTM_S38",2022-03-10,,,,,,...,,,,,,,,,,
189553,980501779,524168.6,8588089.7,"2,ATLASANTE,84,IGN,BD_ADRESSE,V2.2,UTM_S38",2022-03-10,,,,,,...,,,,,,,,,,
189554,980501878,514247.9,8582244.9,"1,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,UTM_S38",2022-03-10,,,,,,...,,,,,,,,,,


In [7]:
# Establishment rows are selected by their shape rather than by cutting the
# table in half: halving silently misassigns rows as soon as the two sections
# of the file do not have exactly the same length.
# A geolocation row only fills the first len(geoloc_names) columns, so any row
# with data beyond them is an establishment row.
# NOTE(review): assumes every establishment row fills at least one field past
# the fifth column — confirm against the FINESS file specification.
has_structure_fields = (raw_df
    .iloc[:, len(geoloc_names):]
    .notna()
    .any(axis=1)
)

structures = raw_df.loc[has_structure_fields]

structures

Unnamed: 0,nofinesset,nofinessej,rs,rslongue,complrs,compldistrib,numvoie,typvoie,voie,compvoie,...,siret,codeape,codemft,libmft,codesph,libsph,dateouv,dateautor,maj,numuai
0,010000024,010780054,CH DE FLEYRIAT,CENTRE HOSPITALIER DE BOURG-EN-BRESSE FLEYRIAT,,,900,RTE,DE PARIS,,...,26010004500012,8610Z,03,ARS établissements Publics de santé dotation g...,1,Etablissement public de santé,1979-02-13,1979-02-13,2020-02-04,
1,010000032,010780062,CH BUGEY SUD,CENTRE HOSPITALIER BUGEY SUD,,,700,AV,DE NARVIK,,...,26010003700068,8610Z,03,ARS établissements Publics de santé dotation g...,1,Etablissement public de santé,1901-01-01,1901-01-01,2021-07-07,
2,010000065,010780096,CH DE TREVOUX - MONTPENSIER,CENTRE HOSPITALIER DE TREVOUX - MONTPENSIER,,,14,R,DE L'HOPITAL,,...,26010028400017,8610Z,03,ARS établissements Publics de santé dotation g...,1,Etablissement public de santé,1901-01-01,1901-01-01,2018-01-12,
3,010000081,010780112,CH DU PAYS DE GEX,CENTRE HOSPITALIER DU PAYS DE GEX,,,160,R,MARC PANISSOD,,...,26010010200011,8610Z,03,ARS établissements Publics de santé dotation g...,1,Etablissement public de santé,1901-01-01,1901-01-01,2020-02-04,
4,010000099,010780120,CH DE MEXIMIEUX,CENTRE HOSPITALIER DE MEXIMIEUX,,,13,AV,DU DOCTEUR BOYER,,...,26010013600019,8610Z,03,ARS établissements Publics de santé dotation g...,1,Etablissement public de santé,1945-01-01,1945-01-01,2020-06-30,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94773,980502116,750054157,CAARUD POPAM,,,,6,IMM,MANGA PAPAYE,,...,,8899B,99,Indéterminé,,,2021-07-01,2021-05-29,2021-08-26,
94774,980502124,750054157,CSAPA POPAM,,,,6,IMM,MANGA PAPAYA,,...,,8899B,34,ARS / DG dotation globale,,,2021-07-01,2021-05-29,2021-08-26,
94775,980502173,980502165,PHARMACIE DE ILONI,PHARMACIE DE ILONI,,,559,,Route nationale 2,,...,,,01,Etablissement Tarif Libre,,,2022-01-03,2021-04-07,2022-02-26,
94776,980502199,980502181,UBIPHARM-MAYOTTE,,,,,ZI,VALLÉE 3 BP 208,,...,,4646Z,99,Indéterminé,0,Non concerné,2020-09-01,2018-10-01,2021-12-09,


In [8]:
# Geolocation rows are the ones that leave every column past the first
# len(geoloc_names) empty. Selecting them by shape (instead of taking the
# second half of the table) stays correct when the two sections of the file
# differ in length.
# NOTE(review): assumes no establishment row is empty in all of those trailing
# columns — confirm against the FINESS file specification.
n_geoloc_cols = len(geoloc_names)
is_geoloc_row = raw_df.iloc[:, n_geoloc_cols:].isna().all(axis=1)

geolocalisations = (raw_df
    .loc[is_geoloc_row]
    # Only the leading columns carry geolocation data.
    .iloc[:, :n_geoloc_cols]
    # Positional renaming: zip() pairs the i-th raw column with the i-th name.
    .rename(columns=dict(zip(raw_df.columns, geoloc_names)))
)

geolocalisations

Unnamed: 0,nofinesset,coordxet,coordyet,sourcecoordet,datemaj
94778,020002978,714412.4,6946033.2,"1,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,LAMBERT_93",2022-03-10
94779,020012779,736596.2,6900631.2,"1,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,LAMBERT_93",2022-03-10
94780,030002208,705086.4,6591177.1,"3,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,LAMBERT_93",2022-03-10
94781,040000069,923147.4,6321804.0,"3,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,LAMBERT_93",2022-03-10
94782,040002313,950414.1,6331695.3,"3,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,LAMBERT_93",2022-03-10
...,...,...,...,...,...
189551,970412276,342298.0,7688116.7,"1,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,UTM_S40",2022-03-10
189552,980501738,521486.0,8583077.0,"1,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,UTM_S38",2022-03-10
189553,980501779,524168.6,8588089.7,"2,ATLASANTE,84,IGN,BD_ADRESSE,V2.2,UTM_S38",2022-03-10
189554,980501878,514247.9,8582244.9,"1,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,UTM_S38",2022-03-10


In [9]:
# Left join on nofinesset: every row of `structures` is kept and receives the
# geolocation columns of the matching row, when there is one.
clean_df = pd.merge(
    structures,
    geolocalisations,
    how="left",
    on="nofinesset",
)

clean_df

Unnamed: 0,nofinesset,nofinessej,rs,rslongue,complrs,compldistrib,numvoie,typvoie,voie,compvoie,...,codesph,libsph,dateouv,dateautor,maj,numuai,coordxet,coordyet,sourcecoordet,datemaj
0,010000024,010780054,CH DE FLEYRIAT,CENTRE HOSPITALIER DE BOURG-EN-BRESSE FLEYRIAT,,,900,RTE,DE PARIS,,...,1,Etablissement public de santé,1979-02-13,1979-02-13,2020-02-04,,870215.7,6571590.5,"1,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,LAMBERT_93",2022-03-10
1,010000032,010780062,CH BUGEY SUD,CENTRE HOSPITALIER BUGEY SUD,,,700,AV,DE NARVIK,,...,1,Etablissement public de santé,1901-01-01,1901-01-01,2021-07-07,,908351.7,6520414.1,"2,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,LAMBERT_93",2022-03-10
2,010000065,010780096,CH DE TREVOUX - MONTPENSIER,CENTRE HOSPITALIER DE TREVOUX - MONTPENSIER,,,14,R,DE L'HOPITAL,,...,1,Etablissement public de santé,1901-01-01,1901-01-01,2018-01-12,,837272.3,6539470.4,"2,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,LAMBERT_93",2022-03-10
3,010000081,010780112,CH DU PAYS DE GEX,CENTRE HOSPITALIER DU PAYS DE GEX,,,160,R,MARC PANISSOD,,...,1,Etablissement public de santé,1901-01-01,1901-01-01,2020-02-04,,935201.9,6584824.4,"1,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,LAMBERT_93",2022-03-10
4,010000099,010780120,CH DE MEXIMIEUX,CENTRE HOSPITALIER DE MEXIMIEUX,,,13,AV,DU DOCTEUR BOYER,,...,1,Etablissement public de santé,1945-01-01,1945-01-01,2020-06-30,,870112.6,6536420.5,"2,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,LAMBERT_93",2022-03-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94773,980502116,750054157,CAARUD POPAM,,,,6,IMM,MANGA PAPAYE,,...,,,2021-07-01,2021-05-29,2021-08-26,,524649.0,8586741.0,"1,ATLASANTE,87,IGN,BD_ADRESSE,V2.2,UTM_S38",2022-03-10
94774,980502124,750054157,CSAPA POPAM,,,,6,IMM,MANGA PAPAYA,,...,,,2021-07-01,2021-05-29,2021-08-26,,525148.3,8587106.6,"3,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,UTM_S38",2022-03-10
94775,980502173,980502165,PHARMACIE DE ILONI,PHARMACIE DE ILONI,,,559,,Route nationale 2,,...,,,2022-01-03,2021-04-07,2022-02-26,,518105.1,8580466.8,"2,ATLASANTE,84,IGN,BD_ADRESSE,V2.2,UTM_S38",2022-03-10
94776,980502199,980502181,UBIPHARM-MAYOTTE,,,,,ZI,VALLÉE 3 BP 208,,...,0,Non concerné,2020-09-01,2018-10-01,2021-12-09,,522250.7,8592077.7,"3,ATLASANTE,100,IGN,BD_ADRESSE,V2.2,UTM_S38",2022-03-10


In [10]:
# Show one randomly drawn row vertically (one line per column).
clean_df.sample(n=1).transpose()

Unnamed: 0,15332
nofinesset,220011787
nofinessej,220012363
rs,PHARMACIE BOUVRAIS - RAULT
rslongue,PHARMACIE BOUVRAIS - RAULT
complrs,
compldistrib,
numvoie,8
typvoie,R
voie,DE MONTIFAULT
compvoie,B


In [11]:
# Peek at the siret column of the merged table.
clean_df.loc[:, "siret"]

0        26010004500012
1        26010003700068
2        26010028400017
3        26010010200011
4        26010013600019
              ...      
94773               NaN
94774               NaN
94775               NaN
94776               NaN
94777    89938889600016
Name: siret, Length: 94778, dtype: object

## Vérification de la qualité des données

In [12]:
# nofinesset values present in both tables; np.intersect1d returns them sorted
# and without duplicates.
common_ids = np.intersect1d(
    structures["nofinesset"].to_numpy(),
    geolocalisations["nofinesset"].to_numpy(),
)
intersection = pd.Series(common_ids)

intersection.shape

(94778,)

In [13]:
# Establishment rows whose nofinesset has no counterpart in `intersection`.
structure_is_matched = structures["nofinesset"].isin(intersection)
only_structures = structures.loc[~structure_is_matched]

only_structures

Unnamed: 0,nofinesset,nofinessej,rs,rslongue,complrs,compldistrib,numvoie,typvoie,voie,compvoie,...,siret,codeape,codemft,libmft,codesph,libsph,dateouv,dateautor,maj,numuai


In [14]:
# Geolocation rows whose nofinesset has no counterpart in `intersection`.
geoloc_is_matched = geolocalisations["nofinesset"].isin(intersection)
only_geolocalisations = geolocalisations.loc[~geoloc_is_matched]

only_geolocalisations

Unnamed: 0,nofinesset,coordxet,coordyet,sourcecoordet,datemaj


In [15]:
# NOTE(review): empty placeholder that nothing in the visible notebook reads —
# confirm whether it is still needed.
geolocalisations_missing = list()

## Export final

In [16]:
# Write the merged table as UTF-8. index=False keeps the positional row index
# out of the file: it carries no information and would otherwise be written as
# a first column without a name.
clean_df.to_csv('finess-clean.csv', index=False, encoding='utf-8')