# Test Matching Insee/PM

## INSEE

- https://s3.console.aws.amazon.com/s3/object/calfdata/INSEE/Stock/ETS/
        - INSEE/Stock/ETS/StockEtablissement_utf8.csv
        
```
['siren', 'siret']
```

## INPI

- https://s3.console.aws.amazon.com/s3/buckets/calfdata/INPI/TC_1/Stock_processed/
    - INPI/TC_1/Stock_processed/initial_PP.gz
    - INPI/TC_1/Stock_processed/initial_PP.json
    
Colonnes test:

```
["Siren","Date_Immatriculation", "Date_Clôture", "Date_Greffe"]
```

## Sauvegarde

* La liste des SIREN matchés sera sauvegardée selon leur nature et origine
  * nature → ACTES/COMPTES/ETS/etc
  * origine → initial/partiel/new/evt

Les matchés seront sauvegardés dans calfdata/SIRETISATION/matche/ au format suivant :

* insee_nature_origine_matche.gz
    * ex: insee_pm_initial_matche.gz
    
    

## Moteur de recherche TEST

* Insee
  * http://avis-situation-sirene.insee.fr/IdentificationListeSiret.action
* INPI/TC
  * https://data.inpi.fr/


In [28]:
import boto3, json
import dask.dataframe as dd
import pandas as pd
import Match_inpi_insee.aws_connectors as aws
#from tqdm.notebook import tqdm
#import tqdm
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [169]:
# --- S3 location ------------------------------------------------------------
instance_aws = 'https://calfdata.s3.eu-west-3.amazonaws.com'
bucket = 'calfdata'

# --- INSEE input: establishment stock file ----------------------------------
insee_csv_relative_filepath = "INSEE/Stock/ETS/StockEtablissement_utf8.csv"

# --- INPI input --------------------------------------------------------------
inpi_relative_path = "INPI/TC_1/Stock_processed"
inpi_nature = 'PP'       # nature: ACTES/COMPTES/ETS/etc
inpi_origin = 'initial'  # origin: initial/partiel/new/evt

# Both INPI files are named "<origin>_<nature>" and differ only by extension.
inpi_json_file_name, inpi_gz_file_name = (
    "{}_{}.{}".format(inpi_origin, inpi_nature, extension)
    for extension in ('json', 'gz')
)

# Same files, prefixed with the INPI folder.
inpi_json_relative_filepath, inpi_gz_relative_filepath = (
    "{}/{}".format(inpi_relative_path, file_name)
    for file_name in (inpi_json_file_name, inpi_gz_file_name)
)

In [257]:
# Instantiate the project's AWS connector for the configured endpoint and bucket.
AWS_connection = aws.aws_instantiate(instance_aws, bucket)
# NOTE(review): called with the path of the INPI JSON recap; presumably this is
# what makes that file available locally before it is opened — TODO confirm
# against Match_inpi_insee.aws_connectors, since the method name says "upload".
AWS_connection.uploadFileBucket(pathfile =inpi_json_relative_filepath)


In [None]:
# Get the input files.

# JSON recap of the INPI stock file.
# The file is opened through `inpi_json_file_name`, the name built in the
# configuration cell (no `json_file_name` variable exists in this notebook).
# NOTE(review): assumes the file sits in the working directory — TODO confirm.
with open(inpi_json_file_name, 'r') as f:
    pp_json = json.load(f)

# Locations of the two data files, as resolved by the AWS connector
# (presumably URLs built from the endpoint and bucket — TODO confirm).
insee = AWS_connection.url_instance_bucket(path_file=insee_csv_relative_filepath)
pp = AWS_connection.url_instance_bucket(path_file=inpi_gz_relative_filepath)

In [234]:
# Load both sources as dask dataframes.

# INSEE establishments: only the two identifiers are read, as 'object' dtype
# so that they are kept as text rather than parsed as numbers.
insee_columns = ['siren', 'siret']
data_ets_ = dd.read_csv(
    insee,
    usecols=insee_columns,
    dtype={column: 'object' for column in insee_columns},
)

# 'object' dtypes for a set of INPI columns.
# NOTE: this mapping is not passed to the read below, which keeps only "Siren".
dtypes__ = dict.fromkeys(
    [
        'Siren',
        'Adresse_Ligne1',
        'Code_Commune',
        'Code_Postal',
        'Conjoint_Collab_Pseudo',
        'DAP_Adresse_Ligne1',
        'DAP_Code_Commune',
        'Date_1re_Immatriculation',
        'Sans_Activité',
    ],
    'object',
)

# INPI PP stock: gzip-compressed CSV, restricted to the SIREN column.
data_pp_ = dd.read_csv(
    pp,
    usecols=["Siren"],
    dtype={'Siren': 'object'},
    compression='gzip',
    blocksize=None,  # read the archive as a single block (see dask read_csv docs)
    low_memory=False,
)


In [None]:
# Disabled experiment, kept as a bare string literal so that it does not run:
# reading every INPI column with explicit dtypes. The note inside the string
# reports a problem at compute time when all the PP columns are loaded.
'''
J'ai un problème pour le compute avec toutes les colonnes de pp


dtypes__ = {'Siren':'object',
            'Adresse_Ligne1': 'object',
       'Code_Commune': 'object',
       'Code_Postal': 'object',
       'Conjoint_Collab_Pseudo': 'object',
       'DAP_Adresse_Ligne1': 'object',
       'DAP_Code_Commune': 'object',
       'Date_1re_Immatriculation': 'object',
       'Sans_Activité': 'object'}

data_pp_= dd.read_csv(pp, 
                      # usecols =["Siren",
                      #          ],
                      dtype=dtypes__,
                      compression='gzip',
                      blocksize=None,
                      low_memory=False
                      )'''

In [232]:
# Preview the first rows of the INPI dataframe.
# NOTE(review): the saved output lists every INPI column although the load cell
# keeps only "Siren"; it presumably comes from an earlier run — re-execute to refresh.
data_pp_.head()

Unnamed: 0,Code Greffe,Nom_Greffe,Numero_Gestion,Siren,Type_Inscription,Date_Immatriculation,Date_1re_Immatriculation,Date_Radiation,Date_Transfert,Sans_Activité,...,DAP_Ville,DAP_Code_Commune,DAP_Pays,Conjoint_Collab_Nom_Patronym,Conjoint_Collab_Nom_Usage,Conjoint_Collab_Pseudo,Conjoint_Collab_Prénoms,Conjoint_Collab_Date_Fin,Date_Greffe,Libelle_Evt
0,101,Bourg-en-Bresse,2002A00253,442377040,P,2002-06-18,,,,,...,,,,,,,,,2015-08-04,Création
1,101,Bourg-en-Bresse,2015A00093,809861297,P,2015-02-27,,,,,...,,,,,,,,,2015-02-27,Création
2,101,Bourg-en-Bresse,2005A00087,480952712,P,2005-02-23,,,,,...,,,,,,,,,2017-05-03,Création
3,101,Bourg-en-Bresse,1992A00503,388239667,P,1992-10-14,,,,,...,,,,,,,,,2017-05-03,Création
4,101,Bourg-en-Bresse,2017A00279,828427245,P,2017-04-18,,,,,...,,,,,,,,,2017-04-18,Création


In [235]:
# Join the INSEE establishments onto the INPI records on the SIREN.
# - how='right' keeps every INPI row, matched or not;
# - indicator=True adds the `_merge` column (both / right_only / left_only).
merge_options = {
    'how': 'right',
    'left_on': "siren",
    'right_on': 'Siren',
    'indicator': True,
}

# .compute() evaluates the dask merge and returns the in-memory result.
data_merged = data_ets_.merge(data_pp_, **merge_options).compute()

In [236]:
# Tag each identifier column with the source it comes from (INSEE or INPI).
source_suffixed_names = {
    "Siren": "Siren_inpi",
    "siren": "siren_insee",
    "siret": "siret_insee",
}
data_merged = data_merged.rename(columns=source_suffixed_names)
data_merged.head()

Unnamed: 0,siren_insee,siret_insee,Siren_inpi,_merge
0,300004066,30000406600019,300004066,both
1,300036688,30003668800012,300036688,both
2,300044542,30004454200029,300044542,both
3,300047388,30004738800024,300047388,both
4,300054871,30005487100011,300054871,both


In [237]:
# Check the INPI SIREN column after the rename (the saved output shows dtype object).
data_merged['Siren_inpi'].head()

0    300004066
1    300036688
2    300044542
3    300047388
4    300054871
Name: Siren_inpi, dtype: object

In [238]:
# Add a `check_url` column pointing to the INPI page of each SIREN, for manual
# online verification.
# The prefix is concatenated to the column directly: the earlier attempt with
# str.format on the whole column was marked KO.
inpi_company_page = 'https://data.inpi.fr/entreprises/'
data_merged = data_merged.assign(
    check_url=inpi_company_page + data_merged['Siren_inpi']
)

# TODO:
# - check on a few examples that they are indeed absent from INSEE
# - compute statistics on the distribution of the number of siret per siren
# - cf /InseeInpi_matching/Test_notebook/INSEE

In [239]:
# Preview the merged frame, now including the `check_url` column.
data_merged.head()

Unnamed: 0,siren_insee,siret_insee,Siren_inpi,_merge,check_url
0,300004066,30000406600019,300004066,both,https://data.inpi.fr/entreprises/300004066
1,300036688,30003668800012,300036688,both,https://data.inpi.fr/entreprises/300036688
2,300044542,30004454200029,300044542,both,https://data.inpi.fr/entreprises/300044542
3,300047388,30004738800024,300047388,both,https://data.inpi.fr/entreprises/300047388
4,300054871,30005487100011,300054871,both,https://data.inpi.fr/entreprises/300054871


In [240]:
# Count the rows per merge status (left_only / right_only / both).
data_merged.groupby('_merge')['_merge'].count()
# TODO: check the number of characters of the unmatched SIREN to understand why they fail to match

_merge
left_only           0
right_only      51308
both          1182979
Name: _merge, dtype: int64

In [241]:
# Number of INPI rows with no INSEE counterpart.
merge_counts = data_merged.groupby('_merge')['_merge'].count()
merge_counts.loc['right_only']

51308

In [242]:
# TODO: write a function returning a dictionary with: matched data,
# unmatched data, json
# Preview the INPI rows that found no INSEE counterpart.
is_unmatched = data_merged['_merge'] == 'right_only'
data_merged[is_unmatched].head()

Unnamed: 0,siren_insee,siret_insee,Siren_inpi,_merge,check_url
14181,,,504630310,right_only,https://data.inpi.fr/entreprises/504630310
14182,,,823218854,right_only,https://data.inpi.fr/entreprises/823218854
14183,,,546110156,right_only,https://data.inpi.fr/entreprises/546110156
14184,,,490068327,right_only,https://data.inpi.fr/entreprises/490068327
14185,,,793005000,right_only,https://data.inpi.fr/entreprises/793005000


In [243]:
# Total number of rows in the merged frame (matched + unmatched).
len(data_merged)

1234287

In [244]:
# Disabled check, kept as a bare string literal so that it does not run: it
# collects the index of unmatched rows whose SIREN is a single character long.
# NOTE(review): it reads the 'Siren' column, which the rename cell turned into
# 'Siren_inpi' — update the name before re-enabling.
'''test_ = (data_merged
 .loc[lambda x: x['_merge'].isin(['right_only'])]['Siren']
 .apply(lambda x : len(x))
 .loc[lambda x: x ==1]
 .index
)
test_'''

"test_ = (data_merged\n .loc[lambda x: x['_merge'].isin(['right_only'])]['Siren']\n .apply(lambda x : len(x))\n .loc[lambda x: x ==1]\n .index\n)\ntest_"

In [245]:
# Disabled follow-up to the SIREN length check (string literal, not executed):
# selects the unmatched rows whose index is listed in `test_`.
'''data_merged.loc[lambda x :
                (x['_merge'].isin(['right_only']))
                & (x.index.isin(test_))
               ]'''

"data_merged.loc[lambda x :\n                (x['_merge'].isin(['right_only']))\n                & (x.index.isin(test_))\n               ]"

In [267]:
# Destinations of the outputs.
# NOTE(review): the "*_absolute_filepath" names hold paths built from the
# relative folders below, and the unmatched file name uses 'unmatch' while its
# folder is 'non_matche' — names kept as-is because other cells use them.
inpi_match_relative_path = 'SIRETISATION/matche'
inpi_unmatch_relative_path = 'SIRETISATION/non_matche'

# Common "insee_<nature>_<origin>" stem shared by every output file name.
output_stem = "{}_{}_{}".format('insee', inpi_nature, inpi_origin)

# Local file names.
inpi_match_gz_filename = "{}_{}.{}".format(output_stem, 'matche', 'gz')
inpi_unmatch_gz_filename = "{}_{}.{}".format(output_stem, 'unmatch', 'gz')
inpi_match_json_filename = "{}.{}".format(output_stem, 'json')

# Target paths: folder + file name.
inpi_match_gz_absolute_filepath = "/".join(
    (inpi_match_relative_path, inpi_match_gz_filename)
)
inpi_unmatch_gz_absolute_filepath = "/".join(
    (inpi_unmatch_relative_path, inpi_unmatch_gz_filename)
)
inpi_match_json_absolute_filepath = "/".join(
    (inpi_match_relative_path, inpi_match_json_filename)
)
inpi_match_json_absolute_filepath

'SIRETISATION/matche/insee_PP_initial.json'

In [247]:
# Build the JSON recap of the merge and write it to a local file.

# Row count for each merge status (left_only / right_only / both).
merge_result = data_merged.groupby('_merge')['_merge'].count()

# Counts are stored as strings.
# NOTE(review): 'total_rows_origin' is the row count of the merged frame, not
# of an input file — TODO confirm this is the intended figure.
match_details = {
    'total_rows_origin': str(len(data_merged)),
    'total_match': str(merge_result.both),
    'total_unmatched_left': str(merge_result.left_only),
    'total_unmatched_right': str(merge_result.right_only),
}

json_ = {
    'nature': inpi_nature,
    'origin': inpi_origin,
    # TODO confirm: is this indeed the path of the INPI PP file that was used?
    'path': inpi_gz_relative_filepath,
    "details": match_details,
}

# TODO: add a save=yes/no argument
with open(inpi_match_json_filename, 'w') as outfile:
    json.dump(json_, outfile)

{'nature': 'PP',
 'origin': 'initial',
 'path': 'INPI/TC_1/Stock_processed/initial_PP.gz',
 'details': {'total_rows_origin': '1234287',
  'total_match': '1182979',
  'total_unmatched_left': '0',
  'total_unmatched_right': '51308'}}

In [258]:
# Send the local JSON recap to S3 through the project connector.
# file_name is the local file written by the previous cell; file_path is the
# destination under the SIRETISATION/matche folder.
AWS_connection.save_to_s3(file_name=inpi_match_json_filename,file_path=inpi_match_json_absolute_filepath)

In [264]:
# Write the matched rows to a local gzip-compressed CSV.
# Only rows found in both sources (`_merge == 'both'`) are kept.
data_matched = data_merged[data_merged['_merge'] == 'both']

# `to_csv` returns None when it is given a path, so its result is not stored.
data_matched.to_csv(
    inpi_match_gz_filename,
    index=False,
    compression='gzip',
)

In [265]:
# Send the local gzip file of matched rows to S3 through the project connector
# (destination under the SIRETISATION/matche folder).
AWS_connection.save_to_s3(file_name=inpi_match_gz_filename,
                          file_path=inpi_match_gz_absolute_filepath)

In [268]:
# Write the unmatched rows to a local gzip-compressed CSV.
# Only INPI rows with no INSEE counterpart (`_merge == 'right_only'`) are kept.
data_unmatched = data_merged[data_merged['_merge'] == 'right_only']

# `to_csv` returns None when it is given a path, so its result is not stored.
data_unmatched.to_csv(
    inpi_unmatch_gz_filename,
    index=False,
    compression='gzip',
)

In [270]:
# Send the local gzip file of unmatched rows to S3 through the project connector
# (destination under the SIRETISATION/non_matche folder).
AWS_connection.save_to_s3(file_name=inpi_unmatch_gz_filename,file_path=inpi_unmatch_gz_absolute_filepath)

In [None]:
# TODO: keep lines to 88 characters max
# TODO: add docstrings