# US Name

Data preparation: combine the paper-level tables with the journal information


# Description

None

## Merge

**Main table** 

papers_meta_analysis_new

Merged with:

- papers_meta_analysis
- journals_scimago

# Target

- The file is saved in S3:
- bucket: datalake-london
- path: DATA/FINANCE/ESG/ESG_CFP
- Glue data catalog should be updated
- database: esg
- Table prefix: meta_analysis_
- table name: meta_analysis_esg_cfp
- Analytics
- HTML: ANALYTICS/HTML_OUTPUT/meta_analysis_esg_cfp
- Notebook: ANALYTICS/OUTPUT/meta_analysis_esg_cfp

# Metadata

- Key: 234_esg_metadata
- Epic: Dataset transformation
- US: Prepare meta-analysis table
- Task tag: #journal-information, #papers-information
- Analytics reports: https://htmlpreview.github.io/?https://github.com/thomaspernet/esg_metadata/blob/master/00_data_catalog/HTML_ANALYSIS/META_ANALYSIS_ESG_CFP.html

# Input

## Table/file

**Name**

- papers_meta_analysis_new
- papers_meta_analysis
- journals_scimago

**Github**

- https://github.com/thomaspernet/esg_metadata/blob/master/01_data_preprocessing/01_transform_tables/00_meta_analysis.md

In [1]:
from awsPy.aws_authorization import aws_connector
from awsPy.aws_s3 import service_s3
from awsPy.aws_glue import service_glue
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import os, shutil, json, re

# Working directory of the notebook and the directory two levels above it
# (used below to locate the `creds` and `utils` folders).
path = os.getcwd()
parent_path = str(Path(path).parent.parent)

# AWS settings shared by every cell below.
name_credential = 'financial_dep_SO2_accessKeys.csv'
region = 'eu-west-2'
bucket = 'datalake-london'

# The credential CSV is looked up under <parent_path>/creds/.
path_cred = f"{parent_path}/creds/{name_credential}"

In [2]:
# Authenticate with the credential file, then derive the boto client that the
# S3 and Glue helpers share.
con = aws_connector.aws_instantiate(credential=path_cred, region=region)
client = con.client_boto()

# S3 helper bound to `bucket`; Glue helper for the data-catalog operations.
s3 = service_s3.connect_S3(client=client, bucket=bucket, verbose=True)
glue = service_glue.connect_glue(client=client)

In [3]:
# Toggle for the notebook-wide display tweaks.
pandas_setting = True
if pandas_setting:
    # Green colormap kept available for styled tables.
    cm = sns.light_palette("green", as_cmap=True)
    # Show every column and never truncate cell content.
    for display_option in ('display.max_columns', 'display.max_colwidth'):
        pd.set_option(display_option, None)

# Prepare query 

Write the query and save the CSV back in the S3 bucket `datalake-london` (the `bucket` configured above)

# Steps

## Merge journal and papers table

In [4]:
# Athena database the queries run against.
DatabaseName = "esg"
# S3 prefix receiving the results of exploratory (non-CTAS) queries.
s3_output_example = "SQL_OUTPUT_ATHENA"

In [18]:
# Exploratory version of the merge (the CTAS further down reuses the same SQL).
#
# - CTE `merge`: coefficient-level rows of `papers_meta_analysis_new`, LEFT JOINed
#   on `id = nr` with (a) the distinct paper-level attributes of
#   `papers_meta_analysis` and (b) the min/max observation years per paper.
# - Final SELECT: LEFT JOIN with the Scimago journal table on the normalised
#   journal name (lower case, "&" -> "and", "-" -> " "), excluding
#   sourceid 16400154787.
#
# The string is a *raw* literal because the regex patterns '\&' and '\-' are not
# valid Python escape sequences: in a normal literal they trigger an
# invalid-escape warning. The text sent to Athena is unchanged.
#
# NOTE(review): `DISTINCT(title)` de-duplicates on the whole selected row, not on
# `title` alone; the parentheses have no effect.
query = r"""
WITH merge AS (
  SELECT 
    id, 
    image,
    table_refer,
    incremental_id,
    paper_name, 
    publication_year, 
    publication_type, 
    regexp_replace(
      regexp_replace(
        lower(publication_name), 
        '\&', 
        'and'
      ), 
      '\-', 
      ' '
    ) as publication_name, 
    cnrs_ranking, 
    -- ranking, 
    peer_reviewed, 
    study_focused_on_social_environmental_behaviour, 
    type_of_data, 
    study_focusing_on_developing_or_developed_countries, 
    first_date_of_observations,
    last_date_of_observations,
    dependent, 
    independent, 
    lag, 
    interaction_term, 
    quadratic_term, 
    n, 
    r2, 
    beta, 
    to_remove, 
    critical_value, 
    true_standard_error, 
    true_t_value, 
    true_stars, 
    adjusted_dependent, 
    adjusted_independent, 
    adjusted_model, 
    significant,
    to_check_final
  FROM 
    esg.papers_meta_analysis_new 
    LEFT JOIN (
      SELECT 
        DISTINCT(title), 
        nr, 
        publication_year, 
        publication_type, 
        publication_name, 
        cnrs_ranking, 
        peer_reviewed, 
        study_focused_on_social_environmental_behaviour, 
        type_of_data, 
        study_focusing_on_developing_or_developed_countries
      FROM 
        esg.papers_meta_analysis
    ) as old on papers_meta_analysis_new.id = old.nr
    -- WHERE to_remove = 'TO_KEEP'
LEFT JOIN (
SELECT 
        nr,
        CAST(MIN(first_date_of_observations) as int) as first_date_of_observations,
        CAST(MAX(last_date_of_observations)as int) as last_date_of_observations
      FROM 
        esg.papers_meta_analysis
        GROUP BY nr
) as date_pub on papers_meta_analysis_new.id = date_pub.nr
) 
SELECT 
  * 
FROM 
  merge 
  LEFT JOIN (
    SELECT 
      rank, 
      regexp_replace(
        regexp_replace(
          lower(title), 
          '\&', 
          'and'
        ), 
        '\-', 
        ' '
      ) as title, 
      sjr, 
      sjr_best_quartile, 
      h_index, 
      total_docs_2020, 
      total_docs_3years, 
      total_refs, 
      total_cites_3years, 
      citable_docs_3years, 
      cites_doc_2years, 
      country 
    FROM 
      "scimago"."journals_scimago"
    WHERE sourceid not in (16400154787)
  ) as journal on merge.publication_name = journal.title
"""
# Run the query on Athena; `publication_year` is read back as a string.
athena_result = s3.run_query(
    query=query,
    database=DatabaseName,
    s3_output=s3_output_example,
    filename='example_1',
    dtype={'publication_year': 'string'},
)

# Order the rows, drop exact duplicate rows, then attach `weight`: the number of
# remaining rows that share the same paper `id`.
output = (
    athena_result
    .sort_values(by=['id', 'first_date_of_observations'])
    .drop_duplicates()
    .assign(weight=lambda frame: frame.groupby(['id'])['id'].transform('size'))
)
output.head()

Unnamed: 0,id,image,table_refer,incremental_id,paper_name,publication_year,publication_type,publication_name,cnrs_ranking,peer_reviewed,study_focused_on_social_environmental_behaviour,type_of_data,study_focusing_on_developing_or_developed_countries,first_date_of_observations,last_date_of_observations,dependent,independent,lag,interaction_term,quadratic_term,n,r2,beta,to_remove,critical_value,true_standard_error,true_t_value,true_stars,adjusted_dependent,adjusted_independent,adjusted_model,significant,to_check_final,rank,title,sjr,sjr_best_quartile,h_index,total_docs_2020,total_docs_3years,total_refs,total_cites_3years,citable_docs_3years,cites_doc_2years,country,weight
0,1,https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=479654123&range=B2,table 4,1,Corporate social responsibility financial instability and corporate,2020.0,Journal Article,finance research letters,3.0,Yes,Environmental Social and Governance,Panel data,Developed,2002.0,2017.0,ROA,CSR,no,no,no,469.0,0.108,0.0122,TO_KEEP,0.904,0.0135,,,ROA,,DIFF IN DIFF,NO,NO,2728.0,finance research letters,1.339,Q1,39.0,442.0,567.0,11373.0,2909.0,566.0,5.47,Netherlands,12
1,1,https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=479654123&range=B2,table 4,2,Corporate social responsibility financial instability and corporate,2020.0,Journal Article,finance research letters,3.0,Yes,Environmental Social and Governance,Panel data,Developed,2002.0,2017.0,ROA,CSR,no,no,no,469.0,0.198,1.2115,TO_KEEP,7.663,0.1581,,1_PERCENT,ROA,,DIFF IN DIFF,YES,NO,2728.0,finance research letters,1.339,Q1,39.0,442.0,567.0,11373.0,2909.0,566.0,5.47,Netherlands,12
2,1,https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=479654123&range=B2,table 4,3,Corporate social responsibility financial instability and corporate,2020.0,Journal Article,finance research letters,3.0,Yes,Environmental Social and Governance,Panel data,Developed,2002.0,2017.0,ROE,CSR,no,no,no,469.0,0.857,0.3959,TO_KEEP,4.949,0.08,,1_PERCENT,ROE,,DIFF IN DIFF,YES,NO,2728.0,finance research letters,1.339,Q1,39.0,442.0,567.0,11373.0,2909.0,566.0,5.47,Netherlands,12
3,1,https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=479654123&range=B2,table 4,4,Corporate social responsibility financial instability and corporate,2020.0,Journal Article,finance research letters,3.0,Yes,Environmental Social and Governance,Panel data,Developed,2002.0,2017.0,ROE,CSR,no,no,no,469.0,0.375,6.8766,TO_KEEP,12.546,0.5481,,1_PERCENT,ROE,,DIFF IN DIFF,YES,NO,2728.0,finance research letters,1.339,Q1,39.0,442.0,567.0,11373.0,2909.0,566.0,5.47,Netherlands,12
4,1,https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=479654123&range=B2,table 4,5,Corporate social responsibility financial instability and corporate,2020.0,Journal Article,finance research letters,3.0,Yes,Environmental Social and Governance,Panel data,Developed,2002.0,2017.0,TobinQ,CSR,no,no,no,467.0,0.539,-0.5037,TO_KEEP,-2.017,,,,TOBIN_Q,,DIFF IN DIFF,NO,YES,2728.0,finance research letters,1.339,Q1,39.0,442.0,567.0,11373.0,2909.0,566.0,5.47,Netherlands,12


In [6]:
# (rows, columns) of the de-duplicated merge result.
output.shape

(701, 45)

In [7]:
# Summary statistics of the merge result.
output.describe()

Unnamed: 0,id,incremental_id,cnrs_ranking,first_date_of_observations,last_date_of_observations,n,r2,beta,critical_value,true_standard_error,true_t_value,adjusted_independent,rank,sjr,h_index,total_docs_2020,total_docs_3years,total_refs,total_cites_3years,citable_docs_3years,cites_doc_2years,weight
count,701.0,701.0,695.0,689.0,689.0,692.0,590.0,698.0,469.0,292.0,98.0,0.0,597.0,597.0,597.0,597.0,597.0,597.0,597.0,597.0,597.0,701.0
mean,49.25107,351.0,1.109353,2002.12627,2009.74746,1507.518786,1.330399,2.518595,33.744486,1.202663,0.988107,,3128.40201,2.651288,132.236181,793.857621,1427.798995,50777.407035,9698.144054,1415.222781,6.619229,19.610556
std,30.071862,202.505555,1.2275,6.877036,6.084442,3134.446149,7.279848,57.714197,519.895914,5.263863,2.157559,,3570.007939,2.792219,79.331114,2035.11191,3104.124126,125558.776862,21323.034999,3085.18317,3.01953,19.62843
min,1.0,1.0,0.0,1972.0,1986.0,20.0,-0.0145,-355.0,-28.897,-41.459694,-4.13,,59.0,0.314,16.0,23.0,52.0,1741.0,69.0,50.0,1.39,1.0
25%,28.0,176.0,0.0,1999.0,2006.0,183.25,0.13,-0.006,-0.128,0.005225,-0.01475,,652.0,1.115,73.0,152.0,282.0,9601.0,2331.0,280.0,3.75,6.0
50%,45.0,351.0,1.0,2004.0,2011.0,564.0,0.259,0.008,0.471,0.0795,0.8315,,1743.0,1.734,103.0,230.0,509.0,18613.0,3418.0,490.0,7.38,12.0
75%,79.0,526.0,2.0,2007.0,2014.0,937.0,0.4448,0.15275,2.16,1.013672,2.06525,,3714.0,2.983,199.0,439.0,891.0,20680.0,8098.0,881.0,8.98,24.0
max,110.0,701.0,4.0,2018.0,2018.0,26482.0,57.31,1451.91,10307.0,59.036,9.436,,14377.0,11.193,318.0,10654.0,14468.0,664559.0,103295.0,14339.0,16.3,70.0


In [9]:
# Distribution of `weight`, i.e. how many rows each paper `id` contributes.
output['weight'].describe()

count    701.000000
mean      19.610556
std       19.628430
min        1.000000
25%        6.000000
50%       12.000000
75%       24.000000
max       70.000000
Name: weight, dtype: float64

In [None]:
#output[output.duplicated(subset = ['id', 'beta',
#                                   'true_standard_error', 'critical_value', 'lag', 'independent',
#                                  'true_t_value', 'true_stars', 'adjusted_model'
#                                  ])].head()

Missing journals

In [None]:
# Journal names from the papers table that found no match in Scimago: `title`
# comes from the right-hand side of the LEFT JOIN, so it is missing when the
# normalised names differ. `.isna()` is the idiomatic missing-value test and
# also covers `None`/`pd.NA`, unlike `isin([np.nan])`.
output.loc[output['title'].isna(), 'publication_name'].unique()

Currently, the missing values come from the rows to check in [METADATA_TABLES_COLLECTION](https://docs.google.com/spreadsheets/d/1d66_CVtWni7wmKlIMcpaoanvT2ghmjbXARiHgnLWvUw/edit#gid=899172650)

In [None]:
#output.loc[lambda x: x['true_standard_error'].isin([np.nan])].head(5)

In [8]:
# Number of missing values per column, keeping only columns that have at least
# one, in ascending order.
missing_per_column = output.isna().sum()
missing_per_column[missing_per_column > 0].sort_values()

independent                                              2
dependent                                                2
adjusted_dependent                                       2
beta                                                     3
interaction_term                                         5
quadratic_term                                           5
lag                                                      5
cnrs_ranking                                             6
type_of_data                                             8
publication_type                                         8
n                                                        9
study_focusing_on_developing_or_developed_countries     10
last_date_of_observations                               12
first_date_of_observations                              12
peer_reviewed                                           26
sjr                                                    104
title                                                  104

Journals without critical information

### Save data to Google Spreadsheet for sharing

In [None]:
#!pip install --upgrade git+https://github.com/thomaspernet/GoogleDrive-python

In [10]:
from GoogleDrivePy.google_drive import connect_drive
from GoogleDrivePy.google_authorization import authorization_service

In [19]:
# Create the local folder that receives the Google credentials. An existing
# folder is fine; any other failure (e.g. permissions) is raised instead of
# being silently swallowed by a bare `except`.
os.makedirs("creds", exist_ok=True)

In [20]:
# Copy the token file from S3 into the local `creds` folder; the Google Drive
# authorization below reads its credentials from that folder.
s3.download_file(
    key="CREDS/Financial_dependency_pollution/creds/token.pickle",
    path_local="creds",
)

In [21]:
# Local folder holding the Google credential files.
drive_creds_dir = os.path.join(path, "creds")

# Authorization object for the Sheets (read-only) and Drive scopes.
# (GCP service-account alternative, currently unused:
#  path_credential_gcp=os.path.join(parent_path, "creds", "service.json"))
auth = authorization_service.get_authorization(
    path_credential_drive=drive_creds_dir,
    verbose=False,
    scope=[
        'https://www.googleapis.com/auth/spreadsheets.readonly',
        "https://www.googleapis.com/auth/drive",
    ],
)

# NOTE(review): only `token.pickle` is downloaded by this notebook; confirm
# that `credentials.json` is not required when a token is already present.
gd_auth = auth.authorization_drive(
    path_secret=os.path.join(drive_creds_dir, "credentials.json")
)
drive = connect_drive.drive_operations(gd_auth)

In [22]:
# Delete the local copy of the credentials once the Drive connection exists.
# `shutil` is already imported in the first cell, so no re-import is needed.
shutil.rmtree(os.path.join(path, "creds"))


In [23]:
# Name of the Google Spreadsheet that receives the merged data.
FILENAME_SPREADSHEET = "METADATA_MODEL"

# Look up its Drive file ID without printing the lookup result.
spreadsheet_id = drive.find_file_id(
    FILENAME_SPREADSHEET,
    to_print=False,
)

In [25]:
# Missing values are replaced by empty strings before the upload.
sheet_payload = output.fillna("")

# Write the frame to the MODEL_DATA tab, letting the helper detect the range.
drive.add_data_to_spreadsheet(
    data=sheet_payload,
    sheetID=spreadsheet_id,
    sheetName="MODEL_DATA",
    detectRange=True,
    rangeData=None,
)

Range MODEL_DATA!A1:AU705 has been updated.


# Table `meta_analysis_esg_cfp`

Since the table to create has missing values, please put the following statement at the top of the query

```
CREATE TABLE database.table_name WITH (format = 'PARQUET') AS
```

Choose a location in S3 to save the table files. It is recommended to save them in the project bucket (here `datalake-london`, the `bucket` configured above). Locate an appropriate folder in the bucket, and make sure all outputs have the same format

In [26]:
# S3 prefix where the table files are written.
s3_output = "DATA/FINANCE/ESG/ESG_CFP"
# Name of the Glue/Athena table created below.
table_name = "meta_analysis_esg_cfp"

First, we need to delete the table (if it exists)

In [38]:
# Drop the Glue table so it can be recreated below. A failure is printed
# rather than raised, so the notebook keeps running.
try:
    response = glue.delete_table(database=DatabaseName, table=table_name)
except Exception as err:
    print(err)
else:
    print(response)

{'ResponseMetadata': {'RequestId': '58671acb-d909-41e0-91c2-084ea3d3aa54', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 01 Aug 2021 08:11:17 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '2', 'connection': 'keep-alive', 'x-amzn-requestid': '58671acb-d909-41e0-91c2-084ea3d3aa54'}, 'RetryAttempts': 0}}


Clean up the folder with the previous csv file. Be careful, it will erase all files inside the folder

In [39]:
# Destructive: erases every file under the `s3_output` prefix before the table is recreated.
s3.remove_all_bucket(path_remove = s3_output)

True

In [40]:
%%time
# CTAS version of the merge: same SQL as the exploratory query, materialised as
# a PARQUET table `<DatabaseName>.<table_name>`.
#
# The literal is *raw* because the regex patterns '\&' and '\-' are not valid
# Python escape sequences: in a normal literal they trigger an invalid-escape
# warning. `str.format` still fills `{0}`/`{1}`, and the text sent to Athena is
# unchanged.
#
# NOTE(review): `DISTINCT(title)` de-duplicates on the whole selected row, not on
# `title` alone; the parentheses have no effect.
query = r"""
CREATE TABLE {0}.{1} WITH (format = 'PARQUET') AS
WITH merge AS (
  SELECT 
    id, 
    image,
    table_refer,
    incremental_id,
    paper_name, 
    publication_year, 
    publication_type, 
    regexp_replace(
      regexp_replace(
        lower(publication_name), 
        '\&', 
        'and'
      ), 
      '\-', 
      ' '
    ) as publication_name, 
    cnrs_ranking, 
    -- ranking, 
    peer_reviewed, 
    study_focused_on_social_environmental_behaviour, 
    type_of_data, 
    study_focusing_on_developing_or_developed_countries, 
    first_date_of_observations,
    last_date_of_observations,
    dependent, 
    independent, 
    lag, 
    interaction_term, 
    quadratic_term, 
    n, 
    r2, 
    beta, 
    to_remove, 
    critical_value, 
    true_standard_error, 
    true_t_value, 
    true_stars, 
    adjusted_dependent, 
    adjusted_independent, 
    adjusted_model, 
    significant,
    to_check_final
  FROM 
    esg.papers_meta_analysis_new 
    LEFT JOIN (
      SELECT 
        DISTINCT(title), 
        nr, 
        publication_year, 
        publication_type, 
        publication_name, 
        cnrs_ranking, 
        peer_reviewed, 
        study_focused_on_social_environmental_behaviour, 
        type_of_data, 
        study_focusing_on_developing_or_developed_countries
      FROM 
        esg.papers_meta_analysis
    ) as old on papers_meta_analysis_new.id = old.nr
    -- WHERE to_remove = 'TO_KEEP'
LEFT JOIN (
SELECT 
        nr,
        CAST(MIN(first_date_of_observations) as int) as first_date_of_observations,
        CAST(MAX(last_date_of_observations)as int) as last_date_of_observations
      FROM 
        esg.papers_meta_analysis
        GROUP BY nr
) as date_pub on papers_meta_analysis_new.id = date_pub.nr
) 
SELECT 
  * 
FROM 
  merge 
  LEFT JOIN (
    SELECT 
      rank, 
      regexp_replace(
        regexp_replace(
          lower(title), 
          '\&', 
          'and'
        ), 
        '\-', 
        ' '
      ) as title, 
      sjr, 
      sjr_best_quartile, 
      h_index, 
      total_docs_2020, 
      total_docs_3years, 
      total_refs, 
      total_cites_3years, 
      citable_docs_3years, 
      cites_doc_2years, 
      country 
    FROM 
      "scimago"."journals_scimago"
    WHERE sourceid not in (16400154787)
  ) as journal on merge.publication_name = journal.title
""".format(DatabaseName, table_name)
# Execute the CTAS. The helper's return value is kept under its own name so the
# merged DataFrame `output` built earlier is not overwritten (the spreadsheet
# export cell can then be re-run safely).
create_table_response = s3.run_query(
    query=query,
    database=DatabaseName,
    s3_output=s3_output,
)
create_table_response

CPU times: user 150 ms, sys: 0 ns, total: 150 ms
Wall time: 3.17 s


{'Results': {'State': 'SUCCEEDED',
  'SubmissionDateTime': datetime.datetime(2021, 8, 1, 8, 11, 20, 5000, tzinfo=tzlocal()),
  'CompletionDateTime': datetime.datetime(2021, 8, 1, 8, 11, 22, 506000, tzinfo=tzlocal())},
 'QueryID': 'a5dc98c6-77fe-4d6e-b1b0-44458b419798'}

In [41]:
# Sanity check: number of rows in the freshly created table.
query_count = """
SELECT COUNT(*) AS CNT
FROM {}.{} 
""".format(DatabaseName, table_name)

# Stored under its own name so the merged DataFrame `output` is not overwritten.
row_count = s3.run_query(
    query=query_count,
    database=DatabaseName,
    s3_output=s3_output_example,
    filename='count_{}'.format(table_name),
)
row_count

Unnamed: 0,CNT
0,701


# Update Glue catalogue and Github

This step is mandatory to validate the query in the ETL.

## Create or update the data catalog

The query is saved in the S3 (bucket `datalake-london`), but the comments are not available. Use the functions below to update the catalogue and Github


Update the dictionary

- DatabaseName:
- TableName:
- ~TablePrefix:~
- input: 
- filename: Name of the notebook or Python script: to indicate
- Task ID: from Coda
- index_final_table: a list to indicate if the current table is used to prepare the final table(s). If more than one, pass the index. Start at 0
- if_final: A boolean. Indicates if the current table is the final table -> the one the model will be used to be trained
- schema: glue schema with comment
- description: details query objective

**Update schema**

If `automatic = False` in `automatic_update`, then the function returns only the variables to update the comments. Manually add the comment, **then**, pass the new schema (only the missing comment) to the argument `new_schema`. 

To update the schema, please use the following structure

```
schema = [
    {
        "Name": "VAR1",
        "Type": "",
        "Comment": ""
    },
    {
        "Name": "VAR2",
        "Type": "",
        "Comment": ""
    }
]
```

In [31]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append(os.path.join(parent_path, 'utils'))
import make_toc
import create_schema
import create_report
import update_glue_github

The function below manages everything automatically. If the final table comes from more than one query, then pass a list of tables in `list_tables` instead of `automatic`

In [42]:
# Derive the input-table list and the commented Glue schema from the CTAS query.
# `new_schema` stays None because no manual schema override is supplied.
list_input, schema = update_glue_github.automatic_update(
    list_tables='automatic',
    automatic=True,
    new_schema=None,
    client=client,
    TableName=table_name,
    query=query,
)

In [43]:
# Free-text description stored with the table in the catalog.
description = """
Create table with journal information, papers and coefficients for the meta analysis
"""

# ETL parameter file, grouping keys for the duplicate check, and this notebook's name.
name_json = 'parameters_ETL_esg_metadata.json'
partition_keys = ["id", 'incremental_id']
notebookname = "00_meta_analysis.ipynb"

# Everything `update_glue_github.update_glue_github` receives about this table.
# NOTE(review): `if_final` is the string 'True' although the notebook text
# describes it as a boolean; confirm which type the helper expects.
dic_information = {
    "client": client,
    'bucket': bucket,
    's3_output': s3_output,
    'DatabaseName': DatabaseName,
    'TableName': table_name,
    'name_json': name_json,
    'partition_keys': partition_keys,
    'notebookname': notebookname,
    'index_final_table': [0],
    'if_final': 'True',
    'schema': schema,
    'description': description,
    'query': query,
    "list_input": list_input,
    'list_input_automatic': True,
}

In [44]:
# Push the table metadata to the Glue catalog and the ETL parameter file.
update_glue_github.update_glue_github(
    client=client,
    dic_information=dic_information,
)

Currently, the ETL has 1 tables


## Check Duplicates

One of the most important steps when creating a table is to check whether it contains duplicates. The cell below checks that the table generated above is free of duplicates. The code uses the JSON file to create the query run in Athena. 

You are required to define the group(s) that Athena will use to compute the duplicate. For instance, your table can be grouped by COL1 and COL2 (need to be string or varchar), then pass the list ['COL1', 'COL2'] 

In [35]:
# Count duplicated rows, grouping by `partition_keys`.
update_glue_github.find_duplicates(
    client=client,
    bucket=bucket,
    name_json=name_json,
    partition_keys=partition_keys,
    TableName=table_name,
)

Unnamed: 0,CNT,CNT_DUPLICATE
0,1,701


## Count missing values

In [36]:
# Missing values per column of the created table.
update_glue_github.count_missing(
    client=client,
    name_json=name_json,
    bucket=bucket,
    TableName=table_name,
)

Unnamed: 0,total_missing,total_missing_pct
nb_obs,701,100.00%
adjusted_independent,701,100.00%
true_t_value,603,86.02%
true_standard_error,409,58.35%
critical_value,232,33.10%
r2,111,15.83%
total_refs,104,14.84%
total_docs_3years,104,14.84%
total_docs_2020,104,14.84%
sjr_best_quartile,104,14.84%


# Update Github Data catalog

The data catalog is available in Glue. However, we may also want quick access to the tables on Github. In this part, we generate a `README.md` in the folder `00_data_catalog`. All tables used in the project will be added to the catalog. We use the ETL parameter file and the schema in Glue to create the README. 

Bear in mind the code will erase the previous README. 

In [45]:
# Regenerate the Github data-catalog README from the ETL parameter file (the previous README is erased).
create_schema.make_data_schema_github(name_json = name_json)

# Analytics

In this part, we provide basic summary statistics. Since we have created the tables, we can parse the schema in Glue and use our JSON file to automatically generate the analysis.

The cells below execute the job in the key `ANALYSIS`. You need to change the `primary_key` and `secondary_key` 

For a full analysis of the table, please use the following Lambda function. Be patient, it can take between 5 and 30 minutes. The time varies according to the number of columns in your dataset.

Use the function as follows:

- `output_prefix`:  s3://datalake-datascience/ANALYTICS/OUTPUT/TABLE_NAME/
- `region`: region where the table is stored
- `bucket`: Name of the bucket
- `DatabaseName`: Name of the database
- `table_name`: Name of the table
- `group`: variables name to group to count the duplicates
- `primary_key`: Variable name to perform the grouping -> Only one variable for now
- `secondary_key`: Variable name to perform the secondary grouping -> Only one variable for now
- `proba`: Chi-square analysis probability
- `y_var`: Continuous target variables

Check the job processing in Sagemaker: https://eu-west-3.console.aws.amazon.com/sagemaker/home?region=eu-west-3#/processing-jobs

The notebook is available: https://s3.console.aws.amazon.com/s3/buckets/datalake-datascience?region=eu-west-3&prefix=ANALYTICS/OUTPUT/&showversions=false

Please, download the notebook on your local machine, and convert it to HTML:

```
cd "/Users/thomas/Downloads/Notebook"
aws s3 cp s3://datalake-datascience/ANALYTICS/OUTPUT/asif_unzip_data_csv/Template_analysis_from_lambda-2020-11-22-08-12-20.ipynb .

## convert HTML no code
jupyter nbconvert --no-input --to html Template_analysis_from_lambda-2020-11-22-08-12-20.ipynb
jupyter nbconvert --to html Template_analysis_from_lambda-2020-11-22-08-12-20.ipynb
```

Then upload the HTML to: https://s3.console.aws.amazon.com/s3/buckets/datalake-datascience?region=eu-west-3&prefix=ANALYTICS/HTML_OUTPUT/

Add a new folder with the table name in upper case

In [None]:
import boto3

# Reuse the access key pair loaded by the AWS connector for a Lambda client
# in the same region.
key, secret_ = con.load_credential()
lambda_settings = {
    'aws_access_key_id': key,
    'aws_secret_access_key': secret_,
    'region_name': region,
}
client_lambda = boto3.client('lambda', **lambda_settings)

In [None]:
# Placeholders to fill in before launching the analysis job: grouping keys and
# the continuous target variable.
primary_key = secondary_key = y_var = ''

In [None]:
# Parameters injected into the template notebook by the Lambda function.
analysis_parameters = {
    "region": f"{region}",
    "bucket": f"{bucket}",
    "DatabaseName": f"{DatabaseName}",
    "table_name": f"{table_name}",
    "group": f"{','.join(partition_keys)}",
    "keys": f"{primary_key},{secondary_key}",
    "y_var": f"{y_var}",
    "threshold": 0,
}

# Full request: template location, output prefix (upper-cased table name) and
# the parameters above.
payload = {
    "input_path": "s3://datalake-datascience/ANALYTICS/TEMPLATE_NOTEBOOKS/template_analysis_from_lambda.ipynb",
    "output_prefix": f"s3://datalake-datascience/ANALYTICS/OUTPUT/{table_name.upper()}/",
    "parameters": analysis_parameters,
}
payload

In [None]:
# Synchronous call to the `RunNotebook` Lambda, requesting the tail of its log.
invoke_arguments = dict(
    FunctionName='RunNotebook',
    InvocationType='RequestResponse',
    LogType='Tail',
    Payload=json.dumps(payload),
)
response = client_lambda.invoke(**invoke_arguments)
response

# Report generation

In [46]:
import os, time, shutil, urllib, ipykernel, json
from pathlib import Path
from notebook import notebookapp

In [47]:
# Export this notebook as an HTML report, keeping the code cells.
create_report.create_report(
    extension="html",
    keep_code=True,
    notebookname=notebookname,
)

Report Available at this adress:
 /home/ec2-user/esg_metadata/01_data_preprocessing/01_transform_tables/Reports/00_meta_analysis.html


In [48]:
# Build the schema from the ETL parameter file; the image is saved in `utils`.
schema_image_dir = os.path.join(parent_path, 'utils')
create_schema.create_schema(name_json, path_save_image=schema_image_dir)

In [49]:
### Update TOC in Github
for p in [parent_path,
          str(Path(path).parent),
          os.path.join(str(Path(path).parent), "00_download_data"),
          #os.path.join(str(Path(path).parent.parent), "02_data_analysis"),
          #os.path.join(str(Path(path).parent.parent), "02_data_analysis", "00_statistical_exploration"),
          #os.path.join(str(Path(path).parent.parent), "02_data_analysis", "01_model_estimation"),
         ]:
    try:
        os.remove(os.path.join(p, 'README.md'))
    except:
        pass
    path_parameter = os.path.join(parent_path,'utils', name_json)
    md_lines =  make_toc.create_index(cwd = p, path_parameter = path_parameter)
    md_out_fn = os.path.join(p,'README.md')
    
    if p == parent_path:
    
        make_toc.replace_index(md_out_fn, md_lines, Header = os.path.basename(p).replace('_', ' '), add_description = True, path_parameter = path_parameter)
    else:
        make_toc.replace_index(md_out_fn, md_lines, Header = os.path.basename(p).replace('_', ' '), add_description = False)