# Check one example SIREN through the full process (INSEE, INPI and matched data)

In [125]:
import json
import dask.dataframe as dd
import pandas as pd

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [126]:
# Example SIRENs and the INPI source files in which each one appears:
# '349599092' is in initial, partiel, new, evt
# '344981501' is in initial, partiel, new
# '6041099' is in initial, partiel

In [127]:
# --- Run parameters -------------------------------------------------------
# INPI data "nature" to inspect (the original note lists ACTES/COMPTES/ETS/etc
# as the kind of values this stands for).
inpi_nature = 'PP'
# SIREN that is followed through every dataset of the notebook.
siren_test = '969401835'
# Where the files are read from: 'local' or 'aws'.
load_type = 'local'

In [128]:
# File names are identical whichever storage back-end is used.
source = 'insee'
insee_filename = 'StockEtablissement_utf8.csv'
siretisation_filename = 'insee_PP_all_matche.gz'

# Resolve the import locations according to `load_type`.
if load_type != 'local':
    # ---- Remote mode: every file is addressed through the S3 bucket ----
    import boto3
    import Match_inpi_insee.aws_connectors as aws

    instance_aws = 'https://calfdata.s3.eu-west-3.amazonaws.com'
    bucket = 'calfdata'

    # INSEE source file
    insee_filepath = "INSEE/Stock/ETS"
    insee_csv_relative_filepath = "/".join((insee_filepath, insee_filename))

    # Open the AWS connection and turn bucket-relative paths into URLs.
    AWS_connection = aws.aws_instantiate(instance_aws, bucket)
    insee = AWS_connection.url_instance_bucket(path_file=insee_csv_relative_filepath)

    # INPI processed files
    inpi_processed_import_full_path = "INPI/TC_1/Stock_processed"

    # Matched (siretisation) data
    siretisation_import_path = 'SIRETISATION/matche'
    siretisation_fullpath = "/".join((siretisation_import_path, siretisation_filename))
    matche = AWS_connection.url_instance_bucket(path_file=siretisation_fullpath)
else:
    # ---- Local mode: every file sits in the relative `data` folder ----
    insee_filepath = 'data'
    inpi_processed_import_full_path = 'data'
    siretisation_import_path = 'data'

    # INSEE source file
    insee_csv_relative_filepath = "/".join((insee_filepath, insee_filename))
    insee = insee_csv_relative_filepath

    # Matched (siretisation) data
    siretisation_fullpath = "/".join((siretisation_import_path, siretisation_filename))
    matche = siretisation_fullpath

# INSEE : source data

In [129]:
# INSEE columns to load: the two identifiers plus the address-related fields.
usecols_ = [
    'siren',
    'siret',
    "numeroVoieEtablissement",
    "indiceRepetitionEtablissement",
    "typeVoieEtablissement",
    "libelleVoieEtablissement",
    "complementAdresseEtablissement",
    "codeCommuneEtablissement",
    "libelleCommuneEtablissement",
    "codePostalEtablissement",
    "codeCedexEtablissement",
    "libelleCedexEtablissement",
    "distributionSpecialeEtablissement",
    "libelleCommuneEtrangerEtablissement",
    "codePaysEtrangerEtablissement",
    "libellePaysEtrangerEtablissement",
]

# Every selected column is read as 'object' (text), so no numeric type
# inference is applied to identifiers or codes.
dtype_ = dict.fromkeys(usecols_, 'object')

In [130]:
# Open the full INSEE stock file lazily with Dask, restricted to the selected
# columns and read with the dtypes declared for them.
data_insee_ = dd.read_csv(insee, usecols=usecols_, dtype=dtype_)

In [131]:
# correction of missing zeros in siren
def fill0(text, num):
    """Left-pad ``text`` with zeros until it is at least ``num`` characters long.

    Used to restore the leading zeros of a SIREN that was stored or typed
    without them.

    Parameters
    ----------
    text : str or int
        Value to pad. Non-string values (e.g. a SIREN held as an int) are
        converted with ``str()`` first instead of raising ``TypeError``.
    num : int
        Minimum length of the returned string.

    Returns
    -------
    str
        ``text`` prefixed with as many ``'0'`` as needed; returned unchanged
        when it is already ``num`` characters or longer.
    """
    # rjust pads on the left only, which is exactly what the manual loop did
    # (unlike zfill, it does not treat a leading sign specially).
    return str(text).rjust(num, '0')

# Normalise the example SIREN to 9 characters by restoring leading zeros.
siren_test = fill0(text=siren_test, num=9)

In [132]:
# Filter on the example SIREN *before* materialising the data: the mask is
# applied lazily, partition by partition, so only the matching rows are
# brought into memory instead of the whole INSEE stock file.
# NOTE: this rebinds `data_insee_` from the lazy Dask frame to the filtered
# pandas result, so re-running this cell requires re-running the load cell first.
data_insee_ = data_insee_[data_insee_['siren'].isin([siren_test])].compute()
data_insee_

Unnamed: 0,siren,siret,complementAdresseEtablissement,numeroVoieEtablissement,indiceRepetitionEtablissement,typeVoieEtablissement,libelleVoieEtablissement,codePostalEtablissement,libelleCommuneEtablissement,libelleCommuneEtrangerEtablissement,distributionSpecialeEtablissement,codeCommuneEtablissement,codeCedexEtablissement,libelleCedexEtablissement,codePaysEtrangerEtablissement,libellePaysEtrangerEtablissement
117969,969401835,96940183500026,,16,,CHE,DU STADE,1700,BEYNOST,,,1043,,,,
117970,969401835,96940183500034,,416,,MTE,MONTEE DE BELLEGARDE,1160,PRIAY,,,1314,,,,


# INPI : source data

In [133]:
# Year of the INPI flow files (NEW / EVT) to inspect.
year = '2017'

# File names: "<kind>_<nature>.gz" for the stock files and
# "<year>_<KIND>_<NATURE>.gz" for the yearly flow files.
initial_file_name = "_".join(('initial', inpi_nature)) + '.gz'
partiel_file_name = "_".join(('partiel', inpi_nature)) + '.gz'
new_file_name = "_".join((year, 'NEW', inpi_nature.upper())) + '.gz'
evt_file_name = "_".join((year, 'EVT', inpi_nature.upper())) + '.gz'

# Paths relative to the INPI processed-data folder.
initial_filepath = "/".join((inpi_processed_import_full_path, initial_file_name))
new_filepath = "/".join((inpi_processed_import_full_path, new_file_name))
evt_filepath = "/".join((inpi_processed_import_full_path, evt_file_name))
partiel_filepath = "/".join((inpi_processed_import_full_path, partiel_file_name))

In [134]:
# Resolve the location actually handed to the reader for each INPI file:
# the plain relative path in local mode, a bucket URL otherwise.
if load_type == 'local':
    pp_initial, pp_new, pp_evt, pp_partiel = (
        initial_filepath,
        new_filepath,
        evt_filepath,
        partiel_filepath,
    )
else:
    pp_initial, pp_new, pp_evt, pp_partiel = (
        AWS_connection.url_instance_bucket(path_file=relative_path)
        for relative_path in (
            initial_filepath,
            new_filepath,
            evt_filepath,
            partiel_filepath,
        )
    )

In [135]:
# INPI columns that are explicitly read as 'object' (text); any column not
# listed here is left to the reader's type inference.
dtypes__ = dict.fromkeys(
    [
        'Siren',
        'Adresse_Ligne1',
        'Code_Commune',
        'Code_Postal',
        'Conjoint_Collab_Pseudo',
        'DAP_Adresse_Ligne1',
        'DAP_Code_Commune',
        'Date_Immatriculation',
        'Date_1re_Immatriculation',
        'Date_Radiation',
        'Date_Greffe',
        'Sans_Activité',
        'Auto-entrepreneur',
        'DAP_Adresse_Ligne3',
        'Pseudonyme',
    ],
    'object',
)

In [136]:
# Read the gzip-compressed "initial" INPI file, materialise it as a pandas
# DataFrame and keep only the rows of the example SIREN.
data_pp_initial = (
    dd.read_csv(
        pp_initial,
        compression='gzip',
        dtype=dtypes__,
        blocksize=None,
        low_memory=False,
    )
    .compute()
    .loc[lambda rows: rows['Siren'].isin([siren_test])]
)
data_pp_initial

Unnamed: 0,Code Greffe,Nom_Greffe,Numero_Gestion,Siren,Type_Inscription,Date_Immatriculation,Date_1re_Immatriculation,Date_Radiation,Date_Transfert,Sans_Activité,...,DAP_Ville,DAP_Code_Commune,DAP_Pays,Conjoint_Collab_Nom_Patronym,Conjoint_Collab_Nom_Usage,Conjoint_Collab_Pseudo,Conjoint_Collab_Prénoms,Conjoint_Collab_Date_Fin,Date_Greffe,Libelle_Evt
1561,101,Bourg-en-Bresse,1977A00266,969401835,P,1977-05-11,,,,,...,,,,,,,,,1996-09-02,Création


In [137]:
# Read the gzip-compressed "NEW" INPI file, materialise it as a pandas
# DataFrame and keep only the rows of the example SIREN.
data_pp_new = (
    dd.read_csv(
        pp_new,
        compression='gzip',
        dtype=dtypes__,
        blocksize=None,
        low_memory=False,
    )
    .compute()
    .loc[lambda rows: rows['Siren'].isin([siren_test])]
)
data_pp_new

Unnamed: 0,Code Greffe,Nom_Greffe,Numero_Gestion,Siren,Type_Inscription,Date_Immatriculation,Date_1re_Immatriculation,Date_Radiation,Date_Transfert,Sans_Activité,...,DAP_Ville,DAP_Code_Commune,DAP_Pays,Conjoint_Collab_Nom_Patronym,Conjoint_Collab_Nom_Usage,Conjoint_Collab_Pseudo,Conjoint_Collab_Prénoms,Conjoint_Collab_Date_Fin,Date_Greffe,Libelle_Evt


In [138]:
# Read the gzip-compressed "EVT" INPI file, materialise it as a pandas
# DataFrame and keep only the rows of the example SIREN.
data_pp_evt = (
    dd.read_csv(
        pp_evt,
        compression='gzip',
        dtype=dtypes__,
        blocksize=None,
        low_memory=False,
    )
    .compute()
    .loc[lambda rows: rows['Siren'].isin([siren_test])]
)
data_pp_evt

Unnamed: 0,Code Greffe,Nom_Greffe,Numero_Gestion,Siren,Type_Inscription,Date_Immatriculation,Date_1re_Immatriculation,Date_Radiation,Date_Transfert,Sans_Activité,...,DAP_Ville,DAP_Code_Commune,DAP_Pays,Conjoint_Collab_Nom_Patronym,Conjoint_Collab_Nom_Usage,Conjoint_Collab_Pseudo,Conjoint_Collab_Prénoms,Conjoint_Collab_Date_Fin,Date_Greffe,Libelle_Evt


In [139]:
# Read the gzip-compressed "partiel" INPI file, materialise it as a pandas
# DataFrame and keep only the rows of the example SIREN.
data_pp_partiel = (
    dd.read_csv(
        pp_partiel,
        compression='gzip',
        dtype=dtypes__,
        blocksize=None,
        low_memory=False,
    )
    .compute()
    .loc[lambda rows: rows['Siren'].isin([siren_test])]
)
data_pp_partiel

Unnamed: 0,Code Greffe,Nom_Greffe,Numero_Gestion,Siren,Type_Inscription,Date_Immatriculation,Date_1re_Immatriculation,Date_Radiation,Date_Transfert,Sans_Activité,...,DAP_Ville,DAP_Code_Commune,DAP_Pays,Conjoint_Collab_Nom_Patronym,Conjoint_Collab_Nom_Usage,Conjoint_Collab_Pseudo,Conjoint_Collab_Prénoms,Conjoint_Collab_Date_Fin,Date_Greffe,Libelle_Evt
81,101,Bourg-en-Bresse,1977A00266,969401835,P,1977-05-11,,,,,...,,,,,,,,,2017-08-16,Création


# SIRETISATION : matched data

In [140]:
# Load the matched (siretisation) data.
# `dtypes__` only declares the INPI spelling 'Siren'; the matched file uses the
# lowercase 'siren' / 'siret' columns, so they are forced to text here as well.
# Left to type inference they are parsed as integers, which drops leading zeros
# and makes the string comparison below depend on implicit int/str coercion.
data_matche = dd.read_csv(matche,
                          compression='gzip',
                          dtype=dict(dtypes__, siren='object', siret='object'),
                          blocksize=None,
                          low_memory=False
                          )
# Filter on the example SIREN. Stored values are zero-padded to 9 characters
# before the comparison so that a SIREN written without its leading zeros in
# the file still matches the padded `siren_test`.
data_matche = data_matche.compute().loc[
    lambda x: x['siren'].str.zfill(9).isin([siren_test])
]
data_matche

Unnamed: 0,siren,siret,statutDiffusionEtablissement,Date_Immatriculation,Date_1re_Immatriculation,Date_Radiation,Sans_Activité,Adresse_Ligne1,Code_Postal,Code_Commune,DAP_Adresse_Ligne1,DAP_Code_Commune,Conjoint_Collab_Pseudo,Date_Greffe,_merge,url,source
1254718,969401835,96940183500026,O,1977-05-11,,,,,1160,,,,,1996-09-02,both,https://data.inpi.fr/entreprises/969401835,initial
1254719,969401835,96940183500034,O,1977-05-11,,,,,1160,,,,,1996-09-02,both,https://data.inpi.fr/entreprises/969401835,initial
1254720,969401835,96940183500026,O,1977-05-11,,,,,1160,,,,,2017-08-16,both,https://data.inpi.fr/entreprises/969401835,partiel
1254721,969401835,96940183500034,O,1977-05-11,,,,,1160,,,,,2017-08-16,both,https://data.inpi.fr/entreprises/969401835,partiel


# Recap of all data

In [141]:
# Recap: the example SIREN that every table below is filtered on.
siren_test

'969401835'

In [142]:
# Recap: INSEE rows kept for the example SIREN.
data_insee_

Unnamed: 0,siren,siret,complementAdresseEtablissement,numeroVoieEtablissement,indiceRepetitionEtablissement,typeVoieEtablissement,libelleVoieEtablissement,codePostalEtablissement,libelleCommuneEtablissement,libelleCommuneEtrangerEtablissement,distributionSpecialeEtablissement,codeCommuneEtablissement,codeCedexEtablissement,libelleCedexEtablissement,codePaysEtrangerEtablissement,libellePaysEtrangerEtablissement
117969,969401835,96940183500026,,16,,CHE,DU STADE,1700,BEYNOST,,,1043,,,,
117970,969401835,96940183500034,,416,,MTE,MONTEE DE BELLEGARDE,1160,PRIAY,,,1314,,,,


In [143]:
# Recap: rows of the INPI "initial" file kept for the example SIREN.
data_pp_initial

Unnamed: 0,Code Greffe,Nom_Greffe,Numero_Gestion,Siren,Type_Inscription,Date_Immatriculation,Date_1re_Immatriculation,Date_Radiation,Date_Transfert,Sans_Activité,...,DAP_Ville,DAP_Code_Commune,DAP_Pays,Conjoint_Collab_Nom_Patronym,Conjoint_Collab_Nom_Usage,Conjoint_Collab_Pseudo,Conjoint_Collab_Prénoms,Conjoint_Collab_Date_Fin,Date_Greffe,Libelle_Evt
1561,101,Bourg-en-Bresse,1977A00266,969401835,P,1977-05-11,,,,,...,,,,,,,,,1996-09-02,Création


In [144]:
# Recap: rows of the INPI "NEW" file kept for the example SIREN.
data_pp_new

Unnamed: 0,Code Greffe,Nom_Greffe,Numero_Gestion,Siren,Type_Inscription,Date_Immatriculation,Date_1re_Immatriculation,Date_Radiation,Date_Transfert,Sans_Activité,...,DAP_Ville,DAP_Code_Commune,DAP_Pays,Conjoint_Collab_Nom_Patronym,Conjoint_Collab_Nom_Usage,Conjoint_Collab_Pseudo,Conjoint_Collab_Prénoms,Conjoint_Collab_Date_Fin,Date_Greffe,Libelle_Evt


In [145]:
# Recap: rows of the INPI "EVT" file kept for the example SIREN.
data_pp_evt

Unnamed: 0,Code Greffe,Nom_Greffe,Numero_Gestion,Siren,Type_Inscription,Date_Immatriculation,Date_1re_Immatriculation,Date_Radiation,Date_Transfert,Sans_Activité,...,DAP_Ville,DAP_Code_Commune,DAP_Pays,Conjoint_Collab_Nom_Patronym,Conjoint_Collab_Nom_Usage,Conjoint_Collab_Pseudo,Conjoint_Collab_Prénoms,Conjoint_Collab_Date_Fin,Date_Greffe,Libelle_Evt


In [146]:
# Recap: rows of the INPI "partiel" file kept for the example SIREN.
data_pp_partiel

Unnamed: 0,Code Greffe,Nom_Greffe,Numero_Gestion,Siren,Type_Inscription,Date_Immatriculation,Date_1re_Immatriculation,Date_Radiation,Date_Transfert,Sans_Activité,...,DAP_Ville,DAP_Code_Commune,DAP_Pays,Conjoint_Collab_Nom_Patronym,Conjoint_Collab_Nom_Usage,Conjoint_Collab_Pseudo,Conjoint_Collab_Prénoms,Conjoint_Collab_Date_Fin,Date_Greffe,Libelle_Evt
81,101,Bourg-en-Bresse,1977A00266,969401835,P,1977-05-11,,,,,...,,,,,,,,,2017-08-16,Création


In [147]:
# Recap: rows of the matched (siretisation) data kept for the example SIREN.
data_matche

Unnamed: 0,siren,siret,statutDiffusionEtablissement,Date_Immatriculation,Date_1re_Immatriculation,Date_Radiation,Sans_Activité,Adresse_Ligne1,Code_Postal,Code_Commune,DAP_Adresse_Ligne1,DAP_Code_Commune,Conjoint_Collab_Pseudo,Date_Greffe,_merge,url,source
1254718,969401835,96940183500026,O,1977-05-11,,,,,1160,,,,,1996-09-02,both,https://data.inpi.fr/entreprises/969401835,initial
1254719,969401835,96940183500034,O,1977-05-11,,,,,1160,,,,,1996-09-02,both,https://data.inpi.fr/entreprises/969401835,initial
1254720,969401835,96940183500026,O,1977-05-11,,,,,1160,,,,,2017-08-16,both,https://data.inpi.fr/entreprises/969401835,partiel
1254721,969401835,96940183500034,O,1977-05-11,,,,,1160,,,,,2017-08-16,both,https://data.inpi.fr/entreprises/969401835,partiel


# search SIREN examples

In [124]:
# (Re)load the full matched data lazily; this rebinds `data_matche`.
data_matche = dd.read_csv(
    matche,
    compression='gzip',
    dtype=dtypes__,
    blocksize=None,
    low_memory=False,
)

# Number of distinct sources a SIREN must come from to be listed.
x_ = 2

# Count the distinct `source` values per SIREN and keep the SIRENs whose
# count is exactly `x_` (compare with `> 1` instead to get "several sources").
(
    data_matche
    .compute()
    .groupby(['siren'])['source']
    .nunique()
    .rename('count')
    .pipe(lambda counts: counts[counts == x_])
)

siren
6041099      2
6840540      2
37110095     2
45611332     2
45712411     2
            ..
969401835    2
972709885    2
976713081    2
977111020    2
997140207    2
Name: count, Length: 11147, dtype: int64

In [62]:
# Check that a candidate SIREN exists in the INSEE file.
# NOTE: this rebinds the notebook-wide `siren_test`.
siren_test = fill0('46412375', 9)  # restore the leading zero(s) up to 9 characters

# Load only the 'siren' column, as text, lazily.
data_insee_listsiren = dd.read_csv(insee,
                                   usecols=['siren'],
                                   dtype={'siren': 'object'}
                                   )
# Apply the filter lazily and materialise only the matching rows, rather than
# computing the whole SIREN column of the INSEE stock file first.
data_insee_listsiren[data_insee_listsiren['siren'].isin([siren_test])].compute()

Unnamed: 0,siren
111550,46412375
