# US Name
Model estimate: pollution abatement equipment and internal finance


# Description

None

# Metadata

- Key: 317_Financial_dependency_pollution
- Epic: Models
- US: Evaluate econometrics model
- Task tag: #econometrics-strategy, #pollution-abatement-equipment, #training-Financial-dependency-pollution
- Analytics reports: 

# Input

## Table/file

**Name**

None

**Github**

- https://github.com/thomaspernet/Financial_dependency_pollution/blob/master/02_data_analysis/01_model_train_evaluate/00_estimate_fin_ratio/07_pollution_abatement_equation.md



# Connexion server

In [1]:
from awsPy.aws_authorization import aws_connector
from awsPy.aws_s3 import service_s3
from awsPy.aws_glue import service_glue
from pathlib import Path
import pandas as pd
import numpy as np
#import seaborn as sns
import os, shutil, json
import sys

# Working directory of the notebook and the directory three levels above it.
path = os.getcwd()
parent_path = os.path.dirname(os.path.dirname(os.path.dirname(path)))

# AWS settings: credential file name, region and S3 bucket.
name_credential = 'financial_dep_SO2_accessKeys.csv'
region = 'eu-west-2'
bucket = 'datalake-london'
# The credential file is looked up in <parent_path>/creds/.
path_cred = f"{parent_path}/creds/{name_credential}"

In [2]:
# Authenticate with the credential file, then build one boto client that is
# shared by the S3 and Glue helpers.
con = aws_connector.aws_instantiate(credential=path_cred, region=region)
client = con.client_boto()
s3 = service_s3.connect_S3(client=client, bucket=bucket, verbose=False)
glue = service_glue.connect_glue(client=client)

In [3]:
# Toggle: when True, lift pandas' display limits on column count and cell width.
pandas_setting = True
if pandas_setting:
    for option_name in ('display.max_columns', 'display.max_colwidth'):
        pd.set_option(option_name, None)

In [4]:
# Set KMP_DUPLICATE_LIB_OK for this process.
# NOTE(review): presumably a workaround for a duplicate OpenMP runtime error — confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'


# Load tables

Since we load the data as a Pandas DataFrame, we want to pass the `dtypes`. We load the schema from Glue to guess the types

In [5]:
# Glue database and table whose schema is read and which the Athena query below selects from.
db = 'environment'
table = 'fin_dep_pollution_baseline_city'

In [6]:
# Build the column -> dtype mapping from the Glue table definition.
dtypes = {}
schema = glue.get_table_information(database=db, table=table)[
    'Table']['StorageDescriptor']['Columns']

# Glue types read as text, and Glue types read as floating point numbers.
text_types = ('varchar(12)', 'varchar(3)', 'varchar(14)', 'varchar(11)')
numeric_types = ('decimal(21,5)', 'double', 'bigint', 'int', 'float')

for column in schema:
    glue_type = column['Type']
    if glue_type in text_types:
        dtypes[column['Name']] = 'string'
    elif glue_type in numeric_types:
        dtypes[column['Name']] = 'float'
    else:
        # Every other Glue type is passed through unchanged.
        dtypes[column['Name']] = glue_type

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# Query the baseline table through Athena, derive the equipment ratios and
# dummies, and keep a local copy of the query's CSV output.
# NOTE(review): when `download_data` is False, `df` is never defined by this
# cell, so the following cells fail on a fresh kernel.
download_data = True
filename = 'df_{}'.format(table)
# S3 key of the CSV produced by the query.
full_path_filename = 'SQL_OUTPUT_ATHENA/CSV/{}.csv'.format(filename)
# Local folder (three levels above the notebook) where the CSV is stored.
path_local = os.path.join(str(Path(path).parent.parent.parent), 
                              "00_data_catalogue/temporary_local_data")
df_path = os.path.join(path_local, filename + '.csv')
if download_data:
    
    s3 = service_s3.connect_S3(client = client,
                          bucket = bucket, verbose = False)
    # Keep 2001-2007 and rows with strictly positive lagged current ratio,
    # lagged cashflow-to-tangible ratio and TFP.
    query = """
    SELECT * 
    FROM {}.{}
    WHERE 
      year in (
        '2001', '2002', '2003', '2004', '2005', 
        '2006', '2007'
      ) 
      AND 
      lag_current_ratio > 0 
      AND
      lag_cashflow_to_tangible > 0 
      AND 
      tfp_cit > 0
    """.format(db, table)
    df = (s3.run_query(
        query=query,
        database=db,
        s3_output='SQL_OUTPUT_ATHENA',
        filename=filename,  # Add filename to print dataframe
        destination_key='SQL_OUTPUT_ATHENA/CSV',  #Use it temporarily
        dtype = dtypes
    )
    .sort_values(by = ['geocode4_corr','ind2', 'year'])
    .assign(
        # Equipment count per 1,000 units of output / total asset; the `_1`
        # variants add one to the count before dividing.
        tso2_eq_output = lambda x: (x['tdso2_equip'])/(x['output']/1000),
        tso2_eq_output_1 = lambda x: (x['tdso2_equip']+1)/(x['output']/1000),
        tso2_eq_asset = lambda x: (x['tdso2_equip'])/(x['total_asset']/1000),
        tso2_eq_asset_1 = lambda x: (x['tdso2_equip']+1)/(x['total_asset']/1000),
        # Dummies for credit_constraint above -0.44 and above -0.26 (the 50%
        # and 75% values reported by the describe() output in this notebook).
        constraint = lambda x: x['credit_constraint'] > -0.44,
        constraint_1 = lambda x: x['credit_constraint'] > -0.26,
        # 1 when at least one piece of SO2 equipment is reported, else 0.
        target = lambda x: np.where(x['tdso2_equip'] > 0, 1,0)
    )
         )
    # Fetch the query's CSV, move it into `path_local`, then delete the S3 copy.
    s3.download_file(
        key = full_path_filename
    )
    shutil.move(
        filename + '.csv',
        os.path.join(path_local, filename + '.csv')
    )
    s3.remove_file(full_path_filename)
    # NOTE(review): nested inside the `if`, this expression is not displayed.
    df.head()

In [9]:
# Preview a subset of columns.
# NOTE(review): `reindex` creates an all-NaN column for every requested name
# that is absent from `df`; the displayed output shows only NaN for the
# pct_change_* and std_eq* columns — confirm they are still expected to exist.
df.reindex(columns = ['geocode4_corr','ind2', 'year',
                      'tso2_eq_output_1', 
                      'pct_change_eq', 
                      "std_eq_ind",
                      "std_eq_c",
                      "std_eq_year",
                      'std_eq',
                      "lag_cashflow_to_tangible",
                      "lag_current_ratio",
                      "pct_change_eq_raw",
                     "pct_change_cash", 
                     'pct_change_curr'])

Unnamed: 0,geocode4_corr,ind2,year,tso2_eq_output_1,pct_change_eq,std_eq_ind,std_eq_c,std_eq_year,std_eq,lag_cashflow_to_tangible,lag_current_ratio,pct_change_eq_raw,pct_change_cash,pct_change_curr
1798,1101,13,2001,0.008636,,,,,,0.24285,0.72158,,,
16543,1101,13,2002,0.061991,,,,,,0.03419,0.76847,,,
13175,1101,13,2004,0.010008,,,,,,0.13697,0.64482,,,
1178,1101,13,2005,0.009307,,,,,,0.12685,0.73212,,,
21783,1101,13,2006,0.010239,,,,,,0.08379,0.69509,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18589,6502,25,2001,0.000076,,,,,,0.14876,0.38135,,,
3336,6502,25,2002,0.000087,,,,,,0.07818,0.30314,,,
4077,6502,25,2003,0.000162,,,,,,0.03580,0.28023,,,
3121,6502,25,2004,0.031656,,,,,,0.05205,0.77773,,,


In [10]:
# Overwrite the local CSV with the enriched frame (derived columns included);
# `df_path` is the same <path_local>/<filename>.csv location built earlier.
df.to_csv(df_path)

In [11]:
# Summary statistics over the distinct (credit_constraint, constraint) pairs.
(
    df[['credit_constraint', 'constraint']]
    .drop_duplicates()
    .sort_values(by=['credit_constraint'])
    .describe()
)

Unnamed: 0,credit_constraint
count,25.0
mean,-0.57
std,0.658882
min,-2.59
25%,-0.8
50%,-0.44
75%,-0.26
max,0.62


In [12]:
# Display the Glue column definitions (Name, Type, Comment) as a table.
pd.DataFrame(schema)

Unnamed: 0,Name,Type,Comment
0,year,string,year from 2001 to 2007
1,period,varchar(5),"False if year before 2005 included, True if year 2006 and 2007"
2,province_en,string,
3,geocode4_corr,string,
4,tcz,string,Two control zone policy city
...,...,...,...
81,dominated_sales_for_i,"map<double,boolean>","map with information on foreign dominated industry knowing percentile .5, .75, .9, .95 of sales"
82,dominated_capital_for_i,"map<double,boolean>","map with information on foreign dominated industry knowing percentile .5, .75, .9, .95 of capital"
83,fe_c_i,bigint,City industry fixed effect
84,fe_t_i,bigint,year industry fixed effect


## Schema Latex table

To rename a variable, please use the following template:

```
{
    'old':'XX',
    'new':'XX_1'
    }
```

If you need to pass a LaTeX command containing `\`, double the backslash; for instance, `\text` becomes `\\text`:

```
{
    'old':'working\_capital\_i',
    'new':'\\text{working capital}_i'
    }
```

Then add it to the key `to_rename`

In [13]:
# Write `schema_table.json`, the rename/remove instructions for variable names
# in the LaTeX tables. Set `add_to_dic` to False to leave any existing file
# untouched.
add_to_dic = True
if add_to_dic:
    # Always start from an empty schema. Previously `data` was only created
    # when the JSON file already existed, so the first run in a clean folder
    # raised a NameError further down.
    data = {'to_rename': [], 'to_remove': []}
    if os.path.exists("schema_table.json"):
        os.remove("schema_table.json")

    # Raw strings keep the LaTeX backslashes literal; `\_` is not a valid
    # Python escape sequence and triggers a warning in a normal string.
    dic_rename = [
        {'old': 'periodTRUE', 'new': r'\text{period}'},
        ### depd
        {'old': r'total\_asset', 'new': r'\text{total asset}'},
        {'old': 'tangible', 'new': r'\text{tangible asset}'},
        {'old': r'investment\_tot\_asset', 'new': r'\text{investment to asset}'},
        {'old': r'rd\_tot\_asset', 'new': r'\text{rd to asset}'},
        {'old': r'asset\_tangibility\_tot\_asset', 'new': r'\text{asset tangibility}'},
        ### ind
        {'old': r'current\_ratio', 'new': r'\text{current ratio}'},
        {'old': r'lag\_current\_ratio', 'new': r'\text{current ratio}'},
        {'old': r'quick\_ratio', 'new': r'\text{quick ratio}'},
        {'old': r'lag\_liabilities\_tot\_asset', 'new': r'\text{liabilities to asset}'},
        {'old': r'liabilities\_tot\_asset', 'new': r'\text{liabilities to asset}'},
        {'old': r'sales\_tot\_asset', 'new': r'\text{sales to asset}'},
        {'old': r'lag\_sales\_tot\_asset', 'new': r'\text{sales to asset}'},
        {'old': r'cash\_tot\_asset', 'new': r'\text{cash to asset}'},
        {'old': r'cashflow\_tot\_asset', 'new': r'\text{cashflow to asset}'},
        {'old': r'cashflow\_to\_tangible', 'new': r'\text{cashflow}'},
        {'old': r'lag\_cashflow\_to\_tangible', 'new': r'\text{cashflow}'},
        {'old': r'd\_credit\_constraintBELOW', 'new': r'\text{Fin dep}_{i}'},
        ## control
        {'old': 'age + 1', 'new': r'\text{age}'},
        {'old': r'export\_to\_sale', 'new': r'\text{export to sale}'},
        {'old': r'labor\_capital', 'new': r'\text{labor to capital}'},
        ### Supply demand external finance
        {'old': r'supply\_all\_credit', 'new': r'\text{all credit}'},
        {'old': r'supply\_long\_term\_credit', 'new': r'\text{long term credit}'},
        {'old': r'credit\_constraint', 'new': r'\text{credit demand}'},
        {'old': r'soe\_vs\_priPRIVATE', 'new': r'\text{private}'},
        ## TFP
        {'old': r'tfp\_cit', 'new': r'\text{TFP}'},
        {'old': r'industry\_size', 'new': r'\text{industry size}'},
        {'old': 'constraintTRUE', 'new': r'\text{constraint}'},
    ]

    data['to_rename'].extend(dic_rename)
    with open('schema_table.json', 'w') as outfile:
        json.dump(data, outfile)

In [14]:
# Add <parent_path>/utils to the import path so that the `latex` package
# imported on the next line can be resolved (presumably it lives there).
sys.path.append(os.path.join(parent_path, 'utils'))
import latex.latex_beautify as lb
#%load_ext autoreload
#%autoreload 2

In [15]:
# Preview the first rows, including the derived ratio, dummy and target columns.
df.head()

Unnamed: 0,year,period,province_en,geocode4_corr,tcz,spz,ind2,short,polluted_d50i,polluted_d75i,polluted_d80i,polluted_d85i,polluted_d90i,polluted_d95i,polluted_mi,polluted_d50_cit,polluted_d75_cit,polluted_d80_cit,polluted_d85_cit,polluted_d90_cit,polluted_d95_cit,polluted_m_cit,tso2,tlssnl,tdwastegas_equip,tdso2_equip,tfqzlssnl,ttlssnl,ttoutput,firmdum,tfirm,so2_intensity,tso2_mandate_c,above_threshold_mandate,above_average_mandate,in_10_000_tonnes,tfp_cit,credit_constraint,supply_all_credit,supply_long_term_credit,output,sales,employment,capital,current_asset,tofixed,total_liabilities,total_asset,tangible,cashflow,current_ratio,lag_current_ratio,liabilities_tot_asset,sales_tot_asset,lag_sales_tot_asset,asset_tangibility_tot_asset,lag_liabilities_tot_asset,cashflow_to_tangible,lag_cashflow_to_tangible,cashflow_tot_asset,lag_cashflow_tot_asset,return_to_sale,lag_return_to_sale,dominated_output_soe_c,dominated_employment_soe_c,dominated_sales_soe_c,dominated_capital_soe_c,dominated_output_for_c,dominated_employment_for_c,dominated_sales_for_c,dominated_capital_for_c,dominated_output_i,dominated_employ_i,dominated_sales_i,dominated_capital_i,dominated_output_soe_i,dominated_employment_soe_i,dominated_sales_soe_i,dominated_capital_soe_i,dominated_output_for_i,dominated_employment_for_i,dominated_sales_for_i,dominated_capital_for_i,fe_c_i,fe_t_i,fe_c_t,tso2_eq_output,tso2_eq_output_1,tso2_eq_asset,tso2_eq_asset_1,constraint,constraint_1,target
1798,2001,False,Beijing,1101,1,1,13,Processing foods,ABOVE,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,203034,3.0,53,9,193279,9,290277.22,2,5,0.17534,0.381358,"{0.5=false, 0.9=false, 0.75=false, 0.95=true}",ABOVE,3.813583,8.759098,-0.47,0.436681,0.787402,1157918.0,1176993.0,5446.0,366903.0,386373,630840,709093,1259546,607168,20758,0.76847,0.72158,0.56562,0.93446,0.88777,0.48205,0.58289,0.03419,0.24285,0.01648,0.0958,0.00225,0.06211,False,False,False,True,False,False,False,False,"{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=true, 0.95=true}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=true}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=true}",1,30,2,0.007773,0.008636,0.007145,0.007939,False,False,1
16543,2002,False,Beijing,1101,1,1,13,Processing foods,ABOVE,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,233729,0.0,46,8,142007,7,339881.5,0,4,1.6099,0.381358,"{0.5=false, 0.9=false, 0.75=false, 0.95=true}",ABOVE,3.813583,8.819793,-0.47,0.436681,0.787402,145182.0,133640.0,1457.0,89240.0,54147,93865,86354,179435,90049,6946,0.68479,0.76847,0.4926,0.74478,0.93446,0.50185,0.56562,0.07714,0.03419,0.03871,0.01648,0.02897,0.00225,False,False,False,True,False,False,False,False,"{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=true, 0.95=true}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=true}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=true}",1,58,3,0.055103,0.061991,0.044584,0.050157,False,False,1
13175,2004,False,Beijing,1101,1,1,13,Processing foods,ABOVE,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,ABOVE,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,140338,160.0,44,33,208630,320,466079.1,5,11,0.04131,0.381358,"{0.5=false, 0.9=false, 0.75=false, 0.95=true}",ABOVE,3.813583,8.954105,-0.47,0.436681,0.787402,3397307.0,3415148.0,7614.0,807431.0,843864,1098916,1612878,2765865,1049098,133076,0.73212,0.64482,0.58314,1.23475,1.19364,0.3793,0.56745,0.12685,0.13697,0.04811,0.0554,0.02856,0.03654,False,False,False,True,False,False,False,False,"{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=true, 0.95=true}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=true}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=true}",1,116,5,0.009714,0.010008,0.011931,0.012293,False,False,1
1178,2005,False,Beijing,1101,1,1,13,Processing foods,ABOVE,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,ABOVE,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,248247,28.0,47,36,296084,194,988785.0,6,15,0.06244,0.381358,"{0.5=false, 0.9=false, 0.75=false, 0.95=true}",ABOVE,3.813583,9.021108,-0.47,0.436681,0.787402,3975622.0,4347958.0,9218.0,693645.0,831309,1142232,1770725,2842121,1088843,91236,0.69509,0.73212,0.62303,1.52983,1.23475,0.38311,0.58314,0.08379,0.12685,0.0321,0.04811,0.01194,0.02856,False,False,False,True,False,False,False,False,"{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=true, 0.95=true}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=true}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=true}",1,145,6,0.009055,0.009307,0.012667,0.013018,False,False,1
21783,2006,True,Beijing,1101,1,1,13,Processing foods,ABOVE,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,ABOVE,BELOW,BELOW,BELOW,BELOW,BELOW,BELOW,224128,20.0,50,39,406005,224,712255.6,6,16,0.05737,0.381358,"{0.5=false, 0.9=false, 0.75=false, 0.95=true}",ABOVE,3.813583,9.095758,-0.47,0.436681,0.787402,3906549.0,4329112.0,9558.0,682314.0,835734,1278560,1931780,3195765,1230639,126545,0.57933,0.69509,0.60448,1.35464,1.52983,0.38508,0.62303,0.10283,0.08379,0.0396,0.0321,0.02314,0.01194,False,False,False,True,False,False,False,False,"{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=true, 0.95=true}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=false, 0.9=false, 0.75=false, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=true}","{0.5=true, 0.9=true, 0.75=true, 0.95=false}","{0.5=true, 0.9=true, 0.75=true, 0.95=true}",1,174,7,0.009983,0.010239,0.012204,0.012517,False,False,1


In [16]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [17]:
# Silence R warnings for the rest of the session.
options(warn=-1)
library(tidyverse)
library(lfe)
#library(lazyeval)
library('progress')
#library('emmeans')
# Source the project's R helper script (presumably LaTeX table helpers, given
# its location); the path is relative to the notebook's folder.
path = "../../../utils/latex/table_golatex.R"
source(path)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1       ✔ purrr   0.3.4  
✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.3       ✔ stringr 1.4.0  
✔ readr   1.3.1       ✔ forcats 0.4.0  
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: Matrix

Attaching package: ‘Matrix’

The following object is masked from ‘package:tidyr’:

    expand


Please cite as: 

 Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.2. https://CRAN.R-project.org/package=stargazer 



In [18]:
%get df_path
# Read the CSV located at `df_path` and prepare it for estimation:
# character columns and every column starting with "fe" become factors.
df_final <- read_csv(df_path) %>%
mutate_if(is.character, as.factor) %>%
    mutate_at(vars(starts_with("fe")), as.factor) %>%
mutate(
    # `constraint` with FALSE as reference level; `constraint_test` is the
    # same dummy with TRUE as reference level.
    constraint = relevel(as.factor(constraint), ref='FALSE'),
    constraint_test = relevel(as.factor(constraint), ref='TRUE'),
    constraint_1 = relevel(as.factor(constraint_1), ref='FALSE'),
# NOTE(review): rows with 52 or more pieces of equipment are dropped; the
# reason for this cutoff is not documented here — confirm.
)%>% filter(tdso2_equip <52)

Parsed with column specification:
cols(
  .default = col_double(),
  period = col_logical(),
  province_en = col_character(),
  short = col_character(),
  polluted_d50i = col_character(),
  polluted_d75i = col_character(),
  polluted_d80i = col_character(),
  polluted_d85i = col_character(),
  polluted_d90i = col_character(),
  polluted_d95i = col_character(),
  polluted_mi = col_character(),
  polluted_d50_cit = col_character(),
  polluted_d75_cit = col_character(),
  polluted_d80_cit = col_character(),
  polluted_d85_cit = col_character(),
  polluted_d90_cit = col_character(),
  polluted_d95_cit = col_character(),
  polluted_m_cit = col_character(),
  above_threshold_mandate = col_character(),
  above_average_mandate = col_character(),
  dominated_output_soe_c = col_logical()
  # ... with 21 more columns
)
See spec(...) for full column specifications.


Is the data correct?

## Table 1: Pollution abatement channel

$$\begin{aligned} \text{Equipment}_{cit} &=  \alpha_2 \text{Internal finance}_{cit-1}+\beta \text{X}_{cit} + \gamma_{it} +\gamma_{ct} + \epsilon_{cit} \end{aligned}$$

The following variables are lagged:

- cashflow
- current ratio
- sales over asset

- Internal finance is a driver of pollution abatement systems
- The acquisition of pollution abatement systems is realized in a dissimilar way: 
    - Small firms use their cashflow to invest, while large firms can use credit because they have collateral and are less constrained than small firms

Follow methodology: [Greening Through Finance?](https://drive.google.com/file/d/1E6hPTzv6CPgR-uJydgzhcfN2G-CjFUWu/view?usp=sharing)

In [None]:
# Output location of regression table 1.
folder = 'Tables_0'
table_nb = 1
table = f'table_{table_nb}'
path = os.path.join(folder, f'{table}.txt')

# Create the folder on first use, then clear the .txt/.pdf tables left by a
# previous run.
if not os.path.exists(folder):
    os.mkdir(folder)
for stale_name in os.listdir(folder):
    if stale_name.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, stale_name))

## Count & Probits models

More about count data:  [Poisson Regression Models](https://www.dataquest.io/blog/tutorial-poisson-regression-in-r/#:~:text=A%20Poisson%20Regression%20model%20is,form%20by%20some%20unknown%20parameters.)

In [None]:
# Output location of the count/probit table (same folder and number as the
# previous table cell).
folder = 'Tables_0'
table_nb = 1
table = 'table_' + str(table_nb)
path = os.path.join(folder, table + '.txt')

# Ensure the folder exists and remove previously generated .txt/.pdf files.
if not os.path.exists(folder):
    os.mkdir(folder)
stale_files = filter(
    lambda name: name.endswith(('.txt', '.pdf')),
    os.listdir(folder),
)
for stale_file in stale_files:
    os.remove(os.path.join(folder, stale_file))

In [67]:
#library(alpaca)
library(fixest)
#library(texreg)

### Poisson

In [71]:
# Poisson fixed-effects models of the SO2 equipment count (`tdso2_equip`) with
# `fe_t_i` and `fe_c_t` fixed effects.
# NOTE(review): the notebook text lists cashflow and current ratio as lagged,
# but these formulas use `cashflow_to_tangible` and `current_ratio` rather than
# their `lag_` counterparts — confirm which is intended.
# NOTE(review): t_0 and t_2 have identical specifications.
t_0 = fepois(
    tdso2_equip ~ 
            log(cashflow_to_tangible) +
            log(current_ratio) +
            log(lag_liabilities_tot_asset) +
            log(lag_sales_tot_asset)+
            log(total_asset)
            | fe_t_i + fe_c_t,df_final
)
# Same specification without the cashflow term.
t_1 <- fepois(tdso2_equip ~ 
            #log(cashflow_to_tangible)+
            log(current_ratio) +
            log(lag_liabilities_tot_asset) +
            log(lag_sales_tot_asset)+
            log(total_asset)
            | fe_t_i + fe_c_t,df_final
    )

t_2 <- fepois(tdso2_equip ~ 
            log(cashflow_to_tangible)+
            log(current_ratio) +
            log(lag_liabilities_tot_asset) +
            log(lag_sales_tot_asset)+
            log(total_asset)
            | fe_t_i + fe_c_t,df_final
    )

# Adds interactions of cashflow and current ratio with log(total_asset).
t_3 <- fepois(tdso2_equip ~ 
            log(cashflow_to_tangible) * log(total_asset)+
            log(current_ratio) * log(total_asset)+
            log(lag_liabilities_tot_asset) +
            log(lag_sales_tot_asset)+
            log(total_asset)
            | fe_t_i + fe_c_t,df_final
    )

# NOTE(review): `library(texreg)` is commented out in the preceding cell;
# confirm texreg() is made available elsewhere before relying on this call.
table_1 <- texreg(
    list(
    t_0,t_1, t_2, t_3
    )
    )
# Report t_3 with IID standard errors and no small-sample adjustment.
summary(t_3,
        ssc = ssc(adj = FALSE, cluster.adj = FALSE),
        vcov = "iid"
        #cluster = "geocode4_corr"
       )

NOTE: 0/259 fixed-effects (2,404 observations) removed because of only 0 outcomes.
NOTE: 0/259 fixed-effects (2,404 observations) removed because of only 0 outcomes.
NOTE: 0/259 fixed-effects (2,404 observations) removed because of only 0 outcomes.
NOTE: 0/259 fixed-effects (2,404 observations) removed because of only 0 outcomes.


Poisson estimation, Dep. Var.: tdso2_equip
Observations: 21,729 
Fixed-effects: fe_t_i: 202,  fe_c_t: 1,645
Standard-errors: IID 
                                            Estimate Std. Error  t value
log(cashflow_to_tangible)                   0.073713   0.040234  1.83209
log(total_asset)                            0.377105   0.007044 53.53354
log(current_ratio)                          0.197018   0.062653  3.14458
log(lag_liabilities_tot_asset)              0.187064   0.018618 10.04729
log(lag_sales_tot_asset)                    0.156811   0.010124 15.48972
log(cashflow_to_tangible):log(total_asset) -0.003659   0.002898 -1.26249
log(total_asset):log(current_ratio)        -0.012381   0.004593 -2.69575
                                            Pr(>|t|)    
log(cashflow_to_tangible)                  0.0669383 .  
log(total_asset)                           < 2.2e-16 ***
log(current_ratio)                         0.0016633 ** 
log(lag_liabilities_tot_asset)             < 2.2e-16 ***
l

In [75]:
#etable( t_0,t_1, t_2, t_3,
#         vcov = "iid",
#       headers = c("1", "2", "3", "4"),
#       tex = TRUE       
#      )

# SOE vs Private

City ownership is available for the following variables:

- output
- capital
- employment
- sales

**How is it constructed** 

* city ownership public vs private in 2002
  * Aggregate output by ownership and city
    * A given city will have SOE asset tangibility and PRIVATE asset tangibility [output, employment, capital and sales]
  * If SOE asset tangibility is above Private asset tangibility, then the city is dominated by SOE
  
Notebook reference: https://github.com/thomaspernet/Financial_dependency_pollution/blob/master/01_data_preprocessing/02_transform_tables/07_dominated_city_ownership.md

In [None]:
# Aggregate firm-level output, employment and capital by year, city,
# 2-digit industry and ownership group (SOE vs PRIVATE).
# The CTE derives the 2-digit industry code from `cic` and attaches
# `geocode4_corr` through a de-duplicated city-code lookup.
# NOTE: this rebinds `df`, replacing the baseline frame loaded earlier.
query = """
WITH test AS (
  SELECT 
    *,
    CASE WHEN LENGTH(cic) = 4 THEN substr(cic, 1, 2) ELSE concat(
      '0', 
      substr(cic, 1, 1)
    ) END AS indu_2,
    CASE WHEN ownership = 'SOE' THEN 'SOE' ELSE 'PRIVATE' END AS soe_vs_pri,
    CASE WHEN ownership in ('HTM', 'FOREIGN') THEN 'FOREIGN' ELSE 'DOMESTIC' END AS for_vs_dom 
  FROM 
    firms_survey.asif_firms_prepared 
    INNER JOIN (
      SELECT 
        extra_code, 
        geocode4_corr 
      FROM 
        chinese_lookup.china_city_code_normalised 
      GROUP BY 
        extra_code, 
        geocode4_corr
    ) as no_dup_citycode ON asif_firms_prepared.citycode = no_dup_citycode.extra_code
  
) 
SELECT year, soe, geocode4_corr, indu_2,SUM(output) as output, SUM(employ) as employ, SUM(captal) as capital
FROM (
SELECT *,
CASE WHEN ownership in ('SOE') THEN 'SOE' ELSE 'PRIVATE' END AS soe
FROM test 
  )
  GROUP BY soe, geocode4_corr, year, indu_2
"""
# NOTE(review): `dtypes` was built from the baseline table's schema, not from
# this query's columns — confirm the helper handles columns missing from it.
df = (s3.run_query(
        query=query,
        database=db,
        s3_output='SQL_OUTPUT_ATHENA',
        filename="test",  # Add filename to print dataframe
        destination_key='SQL_OUTPUT_ATHENA/CSV',  #Use it temporarily
        dtype = dtypes
    )
     )

Dirty code

In [None]:
import janitor

In [None]:
# Export, for each variable and each share threshold, the 2002 city-industry
# pairs whose SOE share exceeds the threshold
# (files: list_city_soe_<variable>_<threshold>.csv).

# The reshape does not depend on the loop variables, so do it once: one row
# per (year, industry, city) with SOE and PRIVATE values side by side.
soe_wide = (
    df
    .set_index(['year','indu_2', 'soe', 'geocode4_corr'])
    .unstack(-2)
)

for variable in ['output','employ', 'capital']:
    # SOE share of `variable`, restricted to 2002.
    soe_share = (
        soe_wide
        .assign(
            soe_dominated = lambda x: x[(variable, 'SOE')] > x[(variable, 'PRIVATE')],
            share_soe = lambda x: x[(variable, 'SOE')] / (
                x[(variable, 'SOE')] + x[(variable, 'PRIVATE')]
            )
        )
        .collapse_levels("_")
        .reset_index()
        [['year','geocode4_corr', 'indu_2', "soe_dominated", 'share_soe']]
        .loc[lambda x: x['year'].isin(["2002"])]
        .drop(columns = ['year'])
        .rename(columns = {'indu_2':'ind2'})
    )
    for threshold in [.5, .4, .3, .2, .1]:
        # `to_csv` returns None when given a path, so its result is not kept
        # (the original bound it to a misleading `df_` variable).
        (
            soe_share
            .loc[lambda x: x['share_soe'] > threshold]
            .to_csv('list_city_soe_{}_{}.csv'.format(variable, threshold), index = False)
        )

In [None]:
# Aggregate firm-level output, employment and capital by year, city,
# 2-digit industry and ownership group (FOREIGN vs DOMESTIC).
# Same CTE as the SOE query; only the grouping dummy differs.
# NOTE: this rebinds `df` again, replacing the SOE/PRIVATE aggregate.
query = """
WITH test AS (
  SELECT 
    *,
    CASE WHEN LENGTH(cic) = 4 THEN substr(cic, 1, 2) ELSE concat(
      '0', 
      substr(cic, 1, 1)
    ) END AS indu_2,
    CASE WHEN ownership = 'SOE' THEN 'SOE' ELSE 'PRIVATE' END AS soe_vs_pri,
    CASE WHEN ownership in ('HTM', 'FOREIGN') THEN 'FOREIGN' ELSE 'DOMESTIC' END AS for_vs_dom 
  FROM 
    firms_survey.asif_firms_prepared 
    INNER JOIN (
      SELECT 
        extra_code, 
        geocode4_corr 
      FROM 
        chinese_lookup.china_city_code_normalised 
      GROUP BY 
        extra_code, 
        geocode4_corr
    ) as no_dup_citycode ON asif_firms_prepared.citycode = no_dup_citycode.extra_code
  
) 
SELECT year, foreign, geocode4_corr, indu_2,SUM(output) as output, SUM(employ) as employ, SUM(captal) as capital
FROM (
SELECT *,
CASE WHEN ownership in ('HTM', 'FOREIGN') THEN 'FOREIGN' ELSE 'DOMESTIC' END AS foreign
FROM test 
  )
  GROUP BY foreign, geocode4_corr, year, indu_2

"""
# NOTE(review): `dtypes` was built from the baseline table's schema, not from
# this query's columns — confirm the helper handles columns missing from it.
df = (s3.run_query(
        query=query,
        database=db,
        s3_output='SQL_OUTPUT_ATHENA',
        filename="test",  # Add filename to print dataframe
        destination_key='SQL_OUTPUT_ATHENA/CSV',  #Use it temporarily
        dtype = dtypes
    )
     )

In [None]:
# Export, for each variable and each share threshold, the 2002 city-industry
# pairs whose FOREIGN share exceeds the threshold
# (files: list_city_for_<variable>_<threshold>.csv).

# One row per (year, industry, city) with FOREIGN and DOMESTIC side by side.
foreign_wide = (
    df
    .set_index(['year','indu_2', 'foreign', 'geocode4_corr'])
    .unstack(-2)
)

for variable in ['output','employ', 'capital']:
    # FOREIGN share of `variable`, restricted to 2002.
    foreign_share = (
        foreign_wide
        .assign(
            for_dominated = lambda x: x[(variable, 'FOREIGN')] > x[(variable, 'DOMESTIC')],
            share_for = lambda x: x[(variable, 'FOREIGN')] / (
                x[(variable, 'FOREIGN')] + x[(variable, 'DOMESTIC')]
            )
        )
        .collapse_levels("_")
        .reset_index()
        [['year','geocode4_corr', 'indu_2', "for_dominated", 'share_for']]
        .loc[lambda x: x['year'].isin(["2002"])]
        .drop(columns = ['year'])
        .rename(columns = {'indu_2':'ind2'})
    )
    for threshold in [.5, .4, .3, .2, .1]:
        (
            foreign_share
            .loc[lambda x: x['share_for'] > threshold]
            .to_csv('list_city_for_{}_{}.csv'.format(variable, threshold), index = False)
        )

In the paper, only output is reported:

- 50%, 40% and 30% 

In [None]:
%get path table
# Split the estimation sample with the city-industry lists exported above
# (output-based, 50% threshold). Rows matching a list form the SOE / FOREIGN
# samples; rows without a match form the PRIVATE / DOMESTIC samples.
soe_list <- read_csv('list_city_soe_output_0.5.csv')
for_list <- read_csv('list_city_for_output_0.5.csv')

df_soe <- df_final %>% inner_join(soe_list)
df_priv <- df_final %>% left_join(soe_list) %>% filter(is.na(share_soe))
df_for <- df_final %>% inner_join(for_list)
df_dom <- df_final %>% left_join(for_list) %>% filter(is.na(share_for))

# Poisson model of the equipment count with `fe_t_i` and `fe_c_t` fixed
# effects and standard errors clustered on `geocode4_corr`.
# The original calls used the lfe-style four-part formula
# (`| fe | 0 | cluster`), which fixest::feglm does not accept; in fixest the
# fixed effects go in the second formula part and clustering is requested
# through the `cluster` argument.
estimate_equipment_poisson <- function(data) {
    feglm(
        tdso2_equip ~
            log(cashflow_to_tangible) +
            log(current_ratio) +
            log(lag_liabilities_tot_asset) +
            log(lag_sales_tot_asset) +
            log(total_asset)
            | fe_t_i + fe_c_t,
        data = data,
        family = poisson(link = "log"),
        cluster = ~geocode4_corr
    )
}

#### SOE
t_0 <- estimate_equipment_poisson(df_soe)

#### PRIVATE
t_1 <- estimate_equipment_poisson(df_priv)

#### FOREIGN
t_2 <- estimate_equipment_poisson(df_for)

#### DOMESTIC
t_3 <- estimate_equipment_poisson(df_dom)

In [None]:
# Results for the SOE sample (model fitted on df_soe).
summary(t_0)

In [None]:
# Results for the PRIVATE sample (model fitted on df_priv).
summary(t_1)

In [None]:
# Results for the FOREIGN sample (model fitted on df_for).
summary(t_2)

In [None]:
# Results for the DOMESTIC sample (model fitted on df_dom).
summary(t_3)

# Generate reports

In [None]:
import os, time, shutil, urllib, ipykernel, json
from pathlib import Path
from notebook import notebookapp
import sys
# Recompute the working directory: `path` was rebound to a table file path by
# the table-setup cells earlier in the notebook.
path = os.getcwd()
parent_path = str(Path(path).parent.parent.parent)
# Make <parent_path>/utils importable; `make_toc` and `create_report` are
# presumably resolved from there.
sys.path.append(os.path.join(parent_path, 'utils'))
import make_toc
import create_report

In [None]:
# Location of the ETL parameter file.
# NOTE(review): this resolves `utils` two levels above the working directory,
# whereas `parent_path` (used for sys.path) is three levels above — confirm
# which depth is intended. `path_json` is not used by the visible cells.
name_json = 'parameters_ETL_pollution_credit_constraint.json'
path_json = os.path.join(str(Path(path).parent.parent), 'utils',name_json)

In [None]:
# Build the HTML report for this notebook (keep_code=False presumably hides the code cells).
create_report.create_report(extension = "html", keep_code = False, notebookname = "07_pollution_abatement_equation.ipynb")