# US Name
Estimate kandhelwal_quality as a function of ln_lag_tax_rebate and other variables


# Description

- Compute fixed effect
- Estimate table 1
- Estimate table 2
- Estimate table 4
- Estimate table 5

## Variables
### Target

- kandhelwal_quality

### Features

- ln_lag_tax_rebate
- regime

## Complementary information



# Metadata

- Key: 183_VAT_rebate_quality
- Epic: Models
- US: 
- Task tag: #data-analysis
- Analytics reports: 

# Input Cloud Storage

## Table/file

**Name**

- https://github.com/thomaspernet/VAT_rebate_quality_china/blob/master/01_data_preprocessing/02_transform_tables/04_baseline_vat_quantity_covariates.md

**Github**

- china_vat_quality



# Connection to the server

In [None]:
from awsPy.aws_authorization import aws_connector
from awsPy.aws_s3 import service_s3
from awsPy.aws_glue import service_glue
from pathlib import Path
import pandas as pd
import numpy as np
#import seaborn as sns
import os, shutil, json
import sys

# Notebook working directory and the folder three levels above it.
path = os.getcwd()
parent_path = str(Path(path).parent.parent.parent)

# AWS settings: credential file name, region and S3 bucket.
name_credential = 'financial_dep_SO2_accessKeys.csv'
region = 'eu-west-3'
bucket = 'datalake-datascience'

# The credential file is read from <parent_path>/creds/<name_credential>.
path_cred = "/".join([parent_path, "creds", name_credential])

In [None]:
# Build the AWS connector from the credential file and region, then derive
# the boto client shared by the service wrappers.
con = aws_connector.aws_instantiate(credential=path_cred, region=region)
client = con.client_boto()

# Service handles: S3 (bound to `bucket`) and Glue.
s3 = service_s3.connect_S3(client=client, bucket=bucket, verbose=False)
glue = service_glue.connect_glue(client=client)

In [None]:
# Switch for the pandas display tweaks below.
pandas_setting = True
if pandas_setting:
    # Lift the display limits on the number of columns and on cell width.
    for option_name in ('display.max_columns', 'display.max_colwidth'):
        pd.set_option(option_name, None)

In [None]:
# Set KMP_DUPLICATE_LIB_OK for this process.
# NOTE(review): presumably works around a duplicate OpenMP runtime error
# raised by some numeric libraries -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'


# Load tables

Because the data is loaded into a pandas DataFrame, we pass explicit `dtypes` to the reader. The column types are derived from the table schema stored in Glue.

In [None]:
# Database and table whose Glue schema is read and which is queried below.
db = 'chinese_trade'
table = 'china_vat_quality'

In [None]:
def glue_type_to_pandas(glue_type):
    """
    Translate a Glue column type into the dtype name given to pandas.

    - any ``varchar(...)`` width becomes ``'string'``
    - any ``decimal(...)`` precision, and ``double``/``bigint``/``int``/``float``,
      become ``'float'``
    - every other type is returned unchanged

    Matching on the type family rather than on a fixed list of widths keeps a
    new ``varchar(n)`` or ``decimal(p,s)`` column from being passed through
    verbatim as a dtype name.

    Args:
    glue_type: string. The ``Type`` entry of a Glue column description

    Returns:
    string. The dtype name to use for that column
    """
    if glue_type.startswith('varchar'):
        return 'string'
    if glue_type.startswith('decimal') or glue_type in ('double', 'bigint', 'int', 'float'):
        return 'float'
    return glue_type


# Column descriptions (dicts with at least 'Name' and 'Type') of the table.
schema = (glue.get_table_information(database = db,
                           table = table)
          ['Table']['StorageDescriptor']['Columns']
         )

# Column name -> dtype name, later passed as `dtype` when loading the data.
dtypes = {column['Name']: glue_type_to_pandas(column['Type']) for column in schema}

In [None]:
# Set to True to (re)run the Athena query and refresh the local CSV copy.
download_data = False
filename = 'df_{}'.format(table)
# Key of the query result in the bucket, and local destination of the CSV.
full_path_filename = 'SQL_OUTPUT_ATHENA/CSV/{}.csv'.format(filename)
path_local = os.path.join(str(Path(path).parent.parent.parent),
                          "00_data_catalog/temporary_local_data")
df_path = os.path.join(path_local, filename + '.csv')
if download_data:

    s3 = service_s3.connect_S3(client = client,
                          bucket = bucket, verbose = False)
    query = """
    SELECT * 
    FROM {}.{}
    """.format(db, table)
    try:
        df = s3.run_query(
            query=query,
            database=db,
            s3_output='SQL_OUTPUT_ATHENA',
            filename=filename,  # Add filename to print dataframe
            destination_key='SQL_OUTPUT_ATHENA/CSV',  # Use it temporarily
            dtype = dtypes
        )
    except Exception as error:
        # Still best-effort, but report the failure instead of hiding it and
        # let KeyboardInterrupt/SystemExit propagate.
        # NOTE(review): presumably the CSV is written to the bucket even when
        # loading the result into pandas fails -- confirm.
        print('run_query failed, continuing with the download: {!r}'.format(error))
    # Fetch the CSV into the working directory, move it to its local
    # destination, then delete the temporary object from the bucket.
    s3.download_file(
        key = full_path_filename
    )
    shutil.move(filename + '.csv', df_path)
    s3.remove_file(full_path_filename)
    #df.head()

In [None]:
# Display the Glue column descriptions as a table.
pd.DataFrame(schema)

# compute fixed effect

Create the following fixed effects for the baseline regression:

**index**

* city: `c`
* product: `k`
* sector: `s`
* year: `t`
* Destination: `j`
* regime: `r`

**FE**

* city-product: `FE_ck`
* City-sector-year: `FE_cst`
* City-product-regime: `FE_ckr`
* City-sector-regime-year: `FE_csrt`
* Product-year: `FE_kt`
* Product-destination: `FE_kj`
* Destination-year: `FE_jt`
* City-product-destination: `FE_ckj`

In [None]:
# Set to True to add the fixed-effect identifiers to the local CSV.
create_fe = False
if create_fe:
    csv_file = os.path.join(path_local, filename + '.csv')
    df = pd.read_csv(csv_file, dtype = dtypes)

    # Fixed-effect column -> columns whose string values are concatenated to
    # form the group key. Each distinct key gets one integer code.
    # NOTE(review): keys are joined without a separator, which is unambiguous
    # only if the codes are fixed-width -- confirm.
    fe_definitions = [
        ("fe_ck", ["geocode4_corr", "hs6"]),                      # city-product
        ("fe_cst", ["geocode4_corr", "hs2", "year"]),             # city-sector-year
        ("fe_ckr", ["geocode4_corr", "hs6", "regime"]),           # city-product-regime
        ("fe_csrt", ["geocode4_corr", "hs2", "regime", "year"]),  # city-sector-regime-year
        ("fe_kt", ["hs6", "year"]),                               # product-year
        ("fe_kj", ["hs6", "country_en"]),                         # product-destination
        ("fe_jt", ["country_en", "year"]),                        # destination-year
        ("fe_ckj", ["geocode4_corr", "hs6", "country_en"]),       # city-product-destination
    ]
    for fe_name, key_columns in fe_definitions:
        group_key = df[key_columns[0]].astype('str')
        for column in key_columns[1:]:
            group_key = group_key + df[column].astype('str')
        df[fe_name] = pd.factorize(group_key)[0]

    # Overwrite the local CSV with the augmented table.
    df.to_csv(csv_file, index = False)

## Schema Latex table

To rename a variable, please use the following template:

```
{
    'old':'XX',
    'new':'XX_1'
    }
```

If you need to pass a LaTeX command containing `\`, you need to double the backslash; for instance, `\text` becomes `\\text`:

```
{
    'old':'working\_capital\_i',
    'new':'\\text{working capital}_i'
    }
```

Then add it to the key `to_rename`

In [None]:
def write_schema_table(renames, json_path="schema_table.json"):
    """
    Write a fresh rename schema to ``json_path`` and return it.

    Any existing file is overwritten. The schema holds two keys:
    ``to_rename`` (a copy of ``renames``) and ``to_remove`` (an empty list).
    The schema is always built here, so the first run (when no file exists
    yet) no longer fails on an undefined ``data``.

    Args:
    renames: list of ``{'old': ..., 'new': ...}`` dicts
    json_path: string. Destination JSON file

    Returns:
    dict. The schema that was written
    """
    schema_table = {'to_rename': list(renames), 'to_remove': []}
    # Mode 'w' truncates an existing file, so no explicit removal is needed.
    with open(json_path, 'w') as outfile:
        json.dump(schema_table, outfile)
    return schema_table


# Set to True to regenerate schema_table.json with the renames below.
add_to_dic = False
if add_to_dic:
    # Backslashes are written doubled; the resulting strings are the same as
    # with the former '\_' spelling, without the invalid-escape warning.
    dic_rename = [
        {
        'old':'working\\_capital\\_i',
        'new':'\\text{working capital}_i'
        },
        {
        'old':'periodTRUE',
        'new':'\\text{period}'
        },
        {
        'old':'tso2\\_mandate\\_c',
        'new':'\\text{policy mandate}_'
        },
    ]

    data = write_schema_table(dic_rename)

In [None]:
# Make <parent_path>/utils importable, then load the LaTeX helper from it.
sys.path.append(os.path.join(parent_path, 'utils'))
import latex.latex_beautify as lb
#%load_ext autoreload
#%autoreload 2

In [None]:
# Turn off R warnings for the session.
options(warn=-1)
# tidyverse for data handling, lfe for felm().
library(tidyverse)
library(lfe)
#library(lazyeval)
library('progress')
# Source the project's R helper script.
# NOTE(review): presumably this is where go_latex() is defined -- confirm.
path = "../../../utils/latex/table_golatex.R"
source(path)

In [None]:
%get df_path
# NOTE(review): the %get line above presumably imports the Python variable
# df_path into the R kernel -- confirm.
# Read the CSV, turn character columns and every column whose name starts
# with "fe" into factors, and make NOT_ELIGIBLE the reference level of regime.
df_final <- read_csv(df_path) %>%
mutate_if(is.character, as.factor) %>%
    mutate_at(vars(starts_with("fe")), as.factor) %>%
mutate(regime = relevel(as.factor(regime), ref='NOT_ELIGIBLE'),)

In [None]:
# Preview the first rows of the estimation data.
head(df_final)

## Table 1:XXX

$$
\begin{aligned}
\operatorname{Quality}_{c,k,j, t}^{R} &=\alpha \ln \operatorname{VAT} \operatorname{Export} \operatorname{tax}_{k, t-1} \times \text { Eligibility }^{R} \\
&+F E_{c,k}^{R}+F E_{c,s,t}^{R}+ F E_{k, t}+\epsilon_{c,k,j,t}^{R}
\end{aligned}
$$


* Column 1: Estimate for eligible regime only
    * FE: 
        - city-product: `fe_ck`
        - city-sector-year: `fe_cst`
        - product-destination: `fe_kj`
* Column 2: Estimate for non-eligible regime only
    * FE: 
        - city-product: `fe_ck`
        - city-sector-year: `fe_cst`
        - product-destination: `fe_kj`
* Column 3: Full estimate without product-year FE -> Get two coefficients
    * FE: 
        - city-product-regime: `fe_ckr`
        - city-sector-regime-year: `fe_csrt`
        - product-destination: `fe_kj`
* Column 4: Baseline estimate -> Focus on the coef of interest only
    * FE: 
        - city-product-regime: `fe_ckr`
        - city-sector-regime-year: `fe_csrt`
        - product-year: `fe_kt`

Sectors are defined following the Chinese 4-digit GB/T industry classification.

In [None]:
# Output folder and file stem for this table.
# NOTE: this rebinds the notebook-level names `table` and `path`.
folder = 'Tables_0'
table_nb = 1
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')

# Create the folder if needed (no separate existence check, so no race).
os.makedirs(folder, exist_ok=True)

# Remove outputs left over from a previous run.
stale_extensions = ('.txt', '.tex', '.pdf')
for leftover in os.listdir(folder):
    if leftover.endswith(stale_extensions):
        os.remove(os.path.join(folder, leftover))

In [None]:
%get path table
# NOTE(review): the %get line above presumably imports the Python variables
# `path` and `table` into the R kernel -- confirm.
#
# Eight felm() models of kandhelwal_quality. Each formula has four parts:
# regressors | fixed effects | 0 | hs6. The table note describes the standard
# errors as clustered at the product level; the 0 in the third part
# presumably means "no instruments" -- confirm against the lfe documentation.
# The print() messages are numbered after the model objects (t_0, t_1, ...).

# t_0: ELIGIBLE regime only; FE fe_ck, fe_cst, fe_kj.
t_0 <- felm(kandhelwal_quality ~ln_lag_tax_rebate+ ln_lag_import_tax  
            | fe_ck + fe_cst+fe_kj|0 | hs6, df_final %>% filter(regime == 'ELIGIBLE'),
            exactDOF = TRUE)

print('table 0 done')
# t_1: same specification on every regime other than ELIGIBLE.
t_1 <- felm(kandhelwal_quality ~ln_lag_tax_rebate + ln_lag_import_tax 
            | fe_ck + fe_cst+fe_kj|0 | hs6, df_final %>% filter(regime != 'ELIGIBLE'),
            exactDOF = TRUE)

print('table 1 done')
# t_2: full sample, regressors interacted with regime; FE fe_ckr, fe_csrt, fe_kj.
t_2 <- felm(kandhelwal_quality ~ln_lag_tax_rebate* regime + ln_lag_import_tax * regime+ ln_lag_import_tax
            | fe_ckr + fe_csrt + fe_kj|0 | hs6, df_final,
            exactDOF = TRUE)

print('table 2 done')
# t_3: as t_2 but with fe_kt in place of fe_kj.
t_3 <- felm(kandhelwal_quality ~ln_lag_tax_rebate* regime + ln_lag_import_tax * regime+ ln_lag_import_tax 
            | fe_ckr + fe_csrt+fe_kt|0 | hs6, df_final,
            exactDOF = TRUE)

print('table 3 done')
# t_4 to t_7 repeat t_0 to t_3 with fe_ckj added to the fixed effects.
t_4 <- felm(kandhelwal_quality ~ln_lag_tax_rebate+ ln_lag_import_tax  
            | fe_ck + fe_cst+fe_kj+ fe_ckj|0 | hs6, df_final %>% filter(regime == 'ELIGIBLE'),
            exactDOF = TRUE)

print('table 4 done')
t_5 <- felm(kandhelwal_quality ~ln_lag_tax_rebate + ln_lag_import_tax 
            | fe_ck + fe_cst+fe_kj+ fe_ckj|0 | hs6, df_final %>% filter(regime != 'ELIGIBLE'),
            exactDOF = TRUE)

print('table 5 done')
t_6 <- felm(kandhelwal_quality ~ln_lag_tax_rebate* regime + ln_lag_import_tax * regime+ ln_lag_import_tax
            | fe_ckr + fe_csrt + fe_kj+ fe_ckj|0 | hs6, df_final,
            exactDOF = TRUE)

print('table 6 done')
t_7 <- felm(kandhelwal_quality ~ln_lag_tax_rebate* regime + ln_lag_import_tax * regime+ ln_lag_import_tax 
            | fe_ckr + fe_csrt+fe_kt+ fe_ckj|0 | hs6, df_final,
            exactDOF = TRUE)
            
# Table header, and one Yes/No row per fixed effect: each vector is the row
# label followed by eight entries, one per model in the order t_0 ... t_7.
dep <- "Dependent variable: Product quality"
fe1 <- list(
    c("City-product fixed effects", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No"),
    
    c("City-sector-year fixed effects", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No"),
    
    c("Product-destination fixed effect","Yes", "Yes", "Yes", "No","Yes", "Yes", "Yes", "No"),
    
    c("City-product-regime fixed effects","No", "No", "Yes", "Yes","No", "No", "Yes", "Yes"),
    
    c("City-sector-regime-year fixed effects","No", "No", "Yes", "Yes","No", "No", "Yes", "Yes"),
    
    c("Product-year fixed effects", "No", "No", "No", "Yes", "No", "No", "No", "Yes"),
    
    c("City-product-destination fixed effects", "No", "No", "No", "No", "Yes", "Yes", "Yes", "Yes")
    
             )

# Build the LaTeX table for the eight models with go_latex(); save=TRUE and
# name=path presumably write it to the file named by `path` -- confirm.
table_1 <- go_latex(list(
    t_0,t_1, t_2, t_3, t_4, t_5, t_6, t_7
),
    title="VAT export tax and product's quality upgrading, baseline regression",
    dep_var = dep,
    addFE=fe1,
    save=TRUE,
    note = FALSE,
    name=path
) 

In [None]:
# Note printed under the table. Every fragment ends with a space so that the
# joined sentences do not run into each other; the LaTeX part is a raw string
# so that `\sym` and `\%` keep their single backslash.
tbe1 = (
    "This table estimates eq(3). "
    "Note that 'Eligible' refers to the regime entitled to VAT refund, our treatment group. "
    "Our control group is processing trade with supplied input, 'Non-Eligible' to VAT refund. "
    "Sectors are defined following the Chinese 4-digit GB/T industry "
    "classification and regroup several products. "
    "Heteroskedasticity-robust standard errors "
    "clustered at the product level appear in parentheses. "
    r"\sym{*} Significance at the 10\%, \sym{**} Significance at the 5\%, \sym{***} Significance at the 1\%."
)

# Column-group label -> number of columns it spans.
# NOTE(review): the former literal listed these four labels twice; duplicate
# keys in a dict literal collapse, so this is the mapping that was actually
# passed. Presumably only four of the eight columns end up labelled -- confirm
# how lb.beautify expects repeated headers.
multicolumn = {
    'Eligible': 1,
    'Non-Eligible': 1,
    'All': 1,
    'All benchmark': 1,
}

multi_lines_dep = '(city/product/trade regime/year)'
#new_r = ['& test1', 'test2']
lb.beautify(table_number = table_nb,
            #reorder_var = reorder,
            multi_lines_dep = multi_lines_dep,
            #new_row= new_r,
            multicolumn = multicolumn,
            table_nte = tbe1,
            jupyter_preview = True,
            resolution = 150,
            folder = folder)

# Generate reports

In [None]:
import os, time, shutil, urllib, ipykernel, json
from pathlib import Path
from notebook import notebookapp
import make_toc

In [None]:
def create_report(extension = "html", keep_code = False, notebookname = None):
    """
    Create a report from the current notebook and save it in the
    ``Reports`` folder of the working directory.

    1. Look up the current notebook name from the running Jupyter servers
    2. Convert the notebook with ``jupyter nbconvert``
    3. Move the newly created report into ``Reports``

    Args:
    extension: string. File extension of the report, e.g. "html", "pdf",
        "md". It is also the ``nbconvert --to`` format, except that "md" is
        sent as "markdown"
    keep_code: bool. When False, ``--no-input`` is passed to nbconvert
    notebookname: string or None. Notebook file name used when the name
        cannot be detected from a running server

    Raises:
    ValueError: the notebook name could not be detected and none was given
    subprocess.CalledProcessError: nbconvert exited with an error
    """
    # Local imports: `import urllib` alone does not load urllib.request.
    import subprocess
    import urllib.request

    ### Get notebook name
    connection_file = os.path.basename(ipykernel.get_connection_file())
    # Presumably named "kernel-<id>.json": keep what follows the first dash.
    kernel_id = connection_file.partition('-')[2].split('.')[0]

    matched_name = None
    first_session_name = None
    for srv in notebookapp.list_running_servers():
        try:
            if srv['token']=='' and not srv['password']:
                url = srv['url'] + 'api/sessions'
            else:
                url = srv['url'] + 'api/sessions?token=' + srv['token']
            with urllib.request.urlopen(url) as req:
                sessions = json.load(req)
            # Remembered as a fallback when no session matches this kernel.
            first_session_name = sessions[0]['name']
            for session in sessions:
                if session['kernel']['id'] == kernel_id:
                    matched_name = session['name']
                    break
        except Exception:
            # Best effort: a server that cannot be queried is skipped.
            continue
        if matched_name is not None:
            break

    # Preference order: session of this kernel, first listed session, argument.
    notebookname = matched_name or first_session_name or notebookname
    if not notebookname:
        raise ValueError(
            "Could not detect the notebook name; pass it with `notebookname`")

    path = os.getcwd()

    ### Path destination
    # splitext keeps dots that are part of the notebook name itself.
    name_no_extension = os.path.splitext(notebookname)[0]
    source_to_move = name_no_extension +'.{}'.format(extension)
    dest = os.path.join(path,'Reports', source_to_move)

    ### Generate notebook
    nbconvert_format = 'markdown' if extension == 'md' else extension
    command = ['jupyter', 'nbconvert']
    if not keep_code:
        command.append('--no-input')
    command.extend(['--to', nbconvert_format, notebookname])
    # Argument list instead of a shell string: names with spaces stay intact.
    subprocess.run(command, check=True)

    ### Move notebook to report folder
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    shutil.move(source_to_move, dest)
    print("Report Available at this adress:\n {}".format(dest))

In [None]:
# Export this notebook as an HTML report without the code cells.
create_report(extension = "html", keep_code = False, notebookname = None)

In [None]:
### Update TOC in Github
# Use the live working directory: the notebook-level `path` was rebound to a
# table file path by the table cell, so it no longer names a folder.
cwd = os.getcwd()
# The same parameter file is used for every folder.
path_parameter = os.path.join(parent_path,'utils', 'parameters_ETL_Financial_dependency_pollution.json')
toc_folders = [
    parent_path,
    cwd,
    str(Path(cwd).parent),
    os.path.join(str(Path(cwd).parent), "00_download_data_from"),
]
for p in toc_folders:
    md_out_fn = os.path.join(p,'README.md')
    # Drop the previous README if there is one; a missing file is fine.
    try:
        os.remove(md_out_fn)
    except OSError:
        pass
    md_lines =  make_toc.create_index(cwd = p, path_parameter = path_parameter)

    # Only the top-level README gets the description block.
    if p == parent_path:
        make_toc.replace_index(md_out_fn, md_lines, Header = os.path.basename(p).replace('_', ' '), add_description = True, path_parameter = path_parameter)
    else:
        make_toc.replace_index(md_out_fn, md_lines, Header = os.path.basename(p).replace('_', ' '), add_description = False)