# US Name
Model estimate: internal finance and pollution emissions at the firm level

# Description
None
# Metadata
- Key: 488_Financial_dependency_pollution
- Epic: Models
- US: Evaluate econometrics model
- Task tag: #internal-finance, #training-Financial-dependency-pollution
- Analytics reports: 
# Input
## Table/file
**Name**
- asif_financial_ratio_baseline_firm
- china_firm_pollution_data
**Github**
- https://github.com/thomaspernet/Financial_dependency_pollution/blob/master/02_data_analysis/01_model_train_evaluate/00_estimate_fin_ratio/08_firm_level_estimation_pollution.md



# Connection to the server

In [None]:
from awsPy.aws_authorization import aws_connector
from awsPy.aws_s3 import service_s3
from awsPy.aws_glue import service_glue
from pathlib import Path
import pandas as pd
import numpy as np
#import seaborn as sns
import os, shutil, json
import sys

from sklearn import preprocessing

le = preprocessing.LabelEncoder()

# Working directory of the notebook and the directory three levels above it.
path = os.getcwd()
parent_path = str(Path(path).parent.parent.parent)

# Credential file name, AWS region and S3 bucket used by the connectors below.
name_credential = 'financial_dep_SO2_accessKeys.csv'
region = 'eu-west-2'
bucket = 'datalake-london'
# The credential file is looked up in "<parent_path>/creds/".
path_cred = f"{parent_path}/creds/{name_credential}"

In [None]:
# Build the AWS session from the credential file, then derive the boto
# client and the S3 / Glue helpers that every later cell relies on.
con = aws_connector.aws_instantiate(credential=path_cred, region=region)
client = con.client_boto()
s3 = service_s3.connect_S3(client=client, bucket=bucket, verbose=False)
glue = service_glue.connect_glue(client=client)

In [None]:
pandas_setting = True
if pandas_setting:
    # Display every column and never truncate cell contents.
    pd.options.display.max_columns = None
    pd.options.display.max_colwidth = None

In [None]:
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort raised when several libraries each load their own OpenMP copy —
# confirm it is still required.
os.environ['KMP_DUPLICATE_LIB_OK']='True'


# Load tables

Since we load the data as a Pandas DataFrame, we want to pass the `dtypes` explicitly. We load the table schema from Glue and use it to infer the type of each column.

- 1 = state -> 110, 141, 143, 151
- 2 = collective -> 120, 130, 142, 149
- 3 = private -> 171, 172, 173, 174, 190
- 4 = foreign -> 210, 220, 230, 240
- 5 = Hong Kong, Macau and Taiwan -> 310, 320, 330, 340 (4 and 5 can be combined into a single "foreign" category)

In [None]:
# Database and table whose Glue schema is read to build `dtypes`; `db` is also
# passed as the default database to most `s3.run_query` calls further down.
db = 'environment'
table = 'firm_financial_ratio_from_pollution1'

In [None]:
def _glue_type_to_pandas(glue_type):
    """Translate a Glue column type into a dtype string pandas can parse.

    The width/precision suffix is ignored, so every ``varchar(n)`` /
    ``char(n)`` maps to ``'string'`` and every ``decimal(p,s)`` maps to
    ``'float'``. Integer types are read as ``'float'`` as well so that
    columns containing missing values can still be loaded. Any other type
    is returned unchanged.
    """
    base_type = glue_type.split('(')[0].strip().lower()
    if base_type in ('varchar', 'char'):
        return 'string'
    if base_type in ('decimal', 'double', 'float', 'bigint', 'int',
                     'integer', 'smallint', 'tinyint'):
        return 'float'
    return glue_type


# Column descriptions ({'Name': ..., 'Type': ...}) of the reference table.
schema = (
    glue.get_table_information(database=db, table=table)
    ['Table']['StorageDescriptor']['Columns']
)
# Column name -> pandas dtype, reused as `dtype=` by the queries below.
dtypes = {
    column['Name']: _glue_type_to_pandas(column['Type'])
    for column in schema
}

In [None]:
# Naming of the query result written under SQL_OUTPUT_ATHENA/CSV.
download_data = True
filename = f'df_{table}'
full_path_filename = f'SQL_OUTPUT_ATHENA/CSV/{filename}.csv'
# Local scratch folder, three directories above the working directory.
path_local = os.path.join(
    str(Path(path).parent.parent.parent),
    "00_data_catalog/temporary_local_data",
)
df_path = 'df_asif.csv'
if download_data:
    # Re-open the S3 helper with the same client and bucket.
    s3 = service_s3.connect_S3(client=client, bucket=bucket, verbose=False)

In [None]:
def construct_table(table="firm_financial_ratio_from_pollution1"):
    """Build the firm-level estimation table and return it as a DataFrame.

    Steps, in order:

    1. Run a SQL query that joins ``environment.china_firm_pollution_data``
       (with city codes normalised through
       ``chinese_lookup.china_city_code_normalised``) to
       ``environment.<table>``, ``policy.china_city_tcz_spz`` (LEFT JOIN) and
       ``firms_survey.firm_tfp_china``.
    2. Cast ``tcz``/``spz`` to strings and label-encode ``firm``,
       ``fe_indu2_year`` and ``fe_city_year``.
    3. Left-merge ``industry.china_credit_constraint`` (``cic`` renamed to
       ``indu_2``) and derive ``financial_dep_china1``, ``constraint`` and
       ``constraint_1``.
    4. Add a ``lag_<column>`` for a fixed list of columns (previous row of the
       same firm) and sort by ``firm``, ``year``.

    Parameters
    ----------
    table : str
        Name of the table in the ``environment`` database to join on; it is
        interpolated directly into the SQL text.

    Returns
    -------
    pandas.DataFrame
        The merged table with the lagged columns, sorted by firm and year.

    Notes
    -----
    Relies on the module-level names ``s3``, ``db``, ``filename``, ``dtypes``
    and ``le``.
    """
    query = f"""
WITH temp as (
  SELECT 
    "firm", 
    "year", 
    geocode4_corr, 
    province_en, 
    "cic_adj", 
    "cic03", 
    "ownership_new", 
    "total_industrialwater_used", 
    "total_freshwater_used", 
    "gyqs", 
    "total_repeatedwater_used", 
    "total_coal_used", 
    "rlmxf", 
    "ylmxf", 
    "rlmpjlf", 
    "rlyxf", 
    "zyxf", 
    "cyxf", 
    "rlypjlf", 
    "zypjlf", 
    "clean_gas_used", 
    "waste_water", 
    "cod", 
    "ad", 
    "waste_gas", 
    "so2", 
    "nox", 
    "smoke_dust", 
    "soot", 
    "yfc", 
    "gyfscll", 
    "hxxyqcl", 
    "xzssqcl", 
    "adqcl", 
    "eyhlqcl", 
    "dyhwqcl", 
    "ycqcl", 
    "gyfcqcl", 
    "dwastewater_equip", 
    "fszlssnl", 
    "fszlssfee", 
    "dwastegas_equip", 
    "dso2_equip", 
    "fqzlssnl", 
    "tlssnl", 
    "hxxycsl", 
    "adcsl", 
    "eyhlcsl", 
    "dyhwcsl", 
    "yfccsl", 
    "age", 
    "bdat", 
    "export",
    c125,
    c133,
    c98,
    CASE 
    WHEN c98 <> 0 THEN CAST(c125 AS DOUBLE) / c98
    ELSE NULL
END AS interest_expense,
CASE
    WHEN c98 <> 0 THEN CAST(c133 AS DOUBLE) / c98
    ELSE NULL
END AS interest_expense1

    
  FROM 
    "environment"."china_firm_pollution_data" 
    INNER JOIN (
      SELECT 
        extra_code, 
        geocode4_corr, 
        province_en 
      FROM 
        chinese_lookup.china_city_code_normalised 
      GROUP BY 
        extra_code, 
        province_en, 
        geocode4_corr
    ) as no_dup_citycode ON china_firm_pollution_data.citycode = no_dup_citycode.extra_code
) 
SELECT 
  temp.firm, 
  temp.year, 
  temp.geocode4_corr, 
  tcz, 
  spz, 
  temp.province_en, 
  temp.cic_adj, 
  temp.cic03, 
  indu_2,
  temp.ownership_new, 
  "output", 
  "outputdefl", 
  "sales", 
  "employment", 
  "capital", 
  "total_industrialwater_used", 
  "total_freshwater_used", 
  "gyqs", 
  "total_repeatedwater_used", 
  "total_coal_used", 
  "rlmxf", 
  "ylmxf", 
  "rlmpjlf", 
  "rlyxf", 
  "zyxf", 
  "cyxf", 
  "rlypjlf", 
  "zypjlf", 
  "clean_gas_used", 
  "waste_water", 
  "cod", 
  "ad", 
  "waste_gas", 
  "so2", 
  "nox", 
  "smoke_dust", 
  "soot", 
  "yfc", 
  "gyfscll", 
  "hxxyqcl", 
  "xzssqcl", 
  "adqcl", 
  "eyhlqcl", 
  "dyhwqcl", 
  "ycqcl", 
  "gyfcqcl", 
  "dwastewater_equip", 
  "fszlssnl", 
  "fszlssfee", 
  "dwastegas_equip", 
  "dso2_equip", 
  "fqzlssnl", 
  "tlssnl", 
  "hxxycsl", 
  "adcsl", 
  "eyhlcsl", 
  "dyhwcsl", 
  "yfccsl", 
  "age", 
  "bdat", 
  "export", 
  "tfp_op", 
  "tfp_lp",
  CASE WHEN rd_tot_asset IS NULL THEN -1000 WHEN rd_tot_asset < 0 THEN 0 ELSE rd_tot_asset END AS rd_tot_asset_trick, 
  CASE WHEN temp.ownership_new in (1.0) THEN 'SOE' ELSE 'NOT_SOE' END AS SOE, 
  CASE WHEN temp.ownership_new in (4.0, 5.0) THEN 'FOREIGN' ELSE 'NOT_FOREIGN' END AS FOREIGN, 
  "current_asset", 
  "tofixed", 
  "total_liabilities", 
  "tangible", 
  "net_non_current", 
  "cashflow", 
  "working_capital", 
  "current_ratio", 
  "quick_ratio", 
  "liabilities_tot_asset", 
  "sales_tot_asset", 
  "total_asset", 
  "investment_tot_asset", 
  "rd_tot_asset", 
  "asset_tangibility_tot_asset", 
  "cashflow_tot_asset", 
  "cashflow_to_tangible", 
  "return_to_sale", 
  "coverage_ratio", 
  "working_capital_ratio", 
  "roa", 
  "roe", 
  "ros", 
  "liquidity", 
  "current_asset_tot_asset", 
  "tangible_tot_asset", 
  "net_non_current_tot_asset", 
  "working_capital_tot_asset", 
  "current_ratio_tot_asset", 
  "quick_ratio_tot_asset", 
  concat(indu_2, '-', temp.year) as fe_indu2_year, 
  concat(
    temp.geocode4_corr, '-', temp.year
  ) as fe_city_year ,
  interest_expense,
  interest_expense1,
  c125,
  c133,
  c98
FROM 
  temp 
  INNER JOIN "environment"."{table}" ON temp.firm = {table}.firm 
  AND temp.year = {table}.year 
  AND temp.geocode4_corr = {table}.geocode4_corr 
  AND temp.province_en = {table}.province_en 
  AND temp.cic_adj = {table}.cic_adj 
  AND temp.cic03 = {table}.cic03 
  AND temp.ownership_new = {table}.ownership_new 
  LEFT JOIN policy.china_city_tcz_spz ON temp.geocode4_corr = china_city_tcz_spz.geocode4_corr 
  INNER JOIN (
    SELECT 
      "firm", 
      "year", 
      "citycode_asifad" as geocode4_corr, 
      "tfp_op", 
      "tfp_lp" 
    FROM 
      firms_survey.firm_tfp_china
  ) as tfp_table on temp.firm = tfp_table.firm 
  and temp.geocode4_corr = tfp_table.geocode4_corr 
  and temp.year = tfp_table.year 
ORDER BY 
  firm, 
  year
    """
    # Run the firm-level query, then normalise the policy dummies and encode
    # the fixed-effect identifiers as integers.
    df = s3.run_query(
        query=query,
        database=db,
        s3_output="SQL_OUTPUT_ATHENA",
        filename=filename,  # Add filename to print dataframe
        destination_key="SQL_OUTPUT_ATHENA/CSV",  # Use it temporarily
        dtype=dtypes,
    ).assign(
        # tcz/spz come from a LEFT JOIN: missing values become 0 before the
        # int -> str cast.
        tcz=lambda x: x["tcz"].fillna(0).astype("int").astype("str"),
        spz=lambda x: x["spz"].fillna(0).astype("int").astype("str"),
        # The shared encoder `le` is refitted for each column.
        fe_fo=lambda x: le.fit_transform(x["firm"].astype("str")),
        fe_indu2_year=lambda x: le.fit_transform(x["fe_indu2_year"].astype("str")),
        fe_city_year=lambda x: le.fit_transform(x["fe_city_year"].astype("str")),
    )
    query = """
SELECT * FROM "industry"."china_credit_constraint"
"""
    # Same output filename as the first query, so the CSV is only temporary.
    df_credit = s3.run_query(
        query=query,
        database=db,
        s3_output="SQL_OUTPUT_ATHENA",
        filename=filename,  # Add filename to print dataframe
        destination_key="SQL_OUTPUT_ATHENA/CSV",  # Use it temporarily
        dtype=dtypes,
    )
    # No `on=` is given, so pandas joins on every column name the two frames
    # share after `cic` is renamed to `indu_2`.
    # NOTE(review): the -0.44 / -0.26 cut-offs are hard-coded; their origin is
    # not visible here.
    df = df.merge(df_credit.rename(columns={"cic": "indu_2"}), how="left").assign(
        financial_dep_china1 = lambda x: -x['financial_dep_china'],
        constraint=lambda x: x["financial_dep_china"] < -0.44,
        constraint_1=lambda x: x["financial_dep_china"] < -0.26,
    )
    # lag_<c> is the previous row's value within the same firm, taken in the
    # frame's current row order (the SQL orders by firm, year); the sort below
    # is applied after the lags are computed.
    df_final = df.assign(
        **{
            f"lag_{c}": df.groupby(["firm"])[c].transform("shift")
            for c in [
                "sales",
                "tfp_op",
                "tfp_lp",
                "cashflow", 
                  "working_capital", 
                  "current_ratio", 
                  "quick_ratio", 
                  "liabilities_tot_asset", 
                  "sales_tot_asset", 
                  "total_asset", 
                  "investment_tot_asset", 
                  "rd_tot_asset", 
                  "asset_tangibility_tot_asset", 
                  "cashflow_tot_asset", 
                  "cashflow_to_tangible", 
                  "return_to_sale", 
                  "coverage_ratio", 
                  "working_capital_ratio", 
                  "roa", 
                  "roe", 
                  "ros", 
                  "liquidity", 
                  "current_asset_tot_asset", 
                  "tangible_tot_asset", 
                  "net_non_current_tot_asset", 
                  "working_capital_tot_asset", 
                  "current_ratio_tot_asset", 
                  "quick_ratio_tot_asset"
            ]
        }
    ).sort_values(by = ['firm','year'])
    #.dropna(
     #   subset=[
     #       "lag_cashflow_to_tangible",
     #       "lag_sales_tot_asset",
     #       "lag_liabilities_tot_asset",
     #   ]
    #)
    return df_final


In [None]:
# City-level SO2 reduction mandate columns; the mandate table is matched to
# the normalised city-code lookup on the English city name (`cityen`).
query_mandate = """
SELECT geocode4_corr,"tso2_mandate_c", "so2_perc_reduction_c", "target_reduction_so2_p"
FROM "policy"."china_city_reduction_mandate"
INNER JOIN (
      SELECT 
        extra_code, 
        geocode4_corr, 
        cityen
 
      FROM 
        chinese_lookup.china_city_code_normalised 
      GROUP BY 
        extra_code, 
        cityen, 
        geocode4_corr
    ) as no_dup_citycode ON china_city_reduction_mandate.cityen = no_dup_citycode.cityen
"""
# The lookup keeps one row per (extra_code, cityen, geocode4_corr), so a city
# can appear several times in the join; exact duplicate rows are dropped here.
df_mandate = (
    s3.run_query(
        query=query_mandate,
        database=db,
        s3_output="SQL_OUTPUT_ATHENA",
        filename=filename,  # Add filename to print dataframe
        destination_key="SQL_OUTPUT_ATHENA/CSV",  # Use it temporarily
    )
    .drop_duplicates()
)

In [None]:
# Innovation index per (year, city, industry): mean, maximum and approximate
# median of `innovation_index`, after mapping the raw city code to
# `geocode4_corr` through the normalised city-code lookup.
query = """
WITH temp AS (
SELECT year, no_dup_citycode.geocode4_corr,indus_code as cic_adj, 
substr(indus_code, 1,2) as ind2,
innovation_index
FROM innovation_city_industry
INNER JOIN (
    SELECT 
      extra_code, 
      geocode4_corr 
    FROM 
      chinese_lookup.china_city_code_normalised 
    GROUP BY 
      extra_code, 
      geocode4_corr
  ) as no_dup_citycode ON innovation_city_industry.geocode4_corr = no_dup_citycode.extra_code
  )
  SELECT year, geocode4_corr, 
  cic_adj,
  ind2,
  AVG(innovation_index) as innovation_index,
  MAX(innovation_index) as innovation_index_1,
  approx_percentile(innovation_index, ARRAY[0.50])[1] as innovation_index_2
  FROM temp
  GROUP BY year, geocode4_corr,
  cic_adj,
  ind2
"""
# Unlike most queries in this notebook, this one runs against the "china"
# database (the FROM clause does not qualify `innovation_city_industry`).
df_innovation = (s3.run_query(
            query=query,
            database="china",
            s3_output='SQL_OUTPUT_ATHENA',
            filename=filename,  # Add filename to print dataframe
            destination_key='SQL_OUTPUT_ATHENA/CSV',  #Use it temporarily
            dtype = dtypes
        )
                )
df_innovation.head()

In [None]:
df_innovation.describe()

In [None]:
# Province-year credit supply: long-term and short-term loans, each divided
# by provincial GDP.
query = """
SELECT 
      province_loan_and_credit.year, 
      province_loan_and_credit.province_en, 
      CAST(
        total_long_term_loan AS DECIMAL(16, 5)
      )/ CAST(
        total_gdp AS DECIMAL(16, 5)
      ) AS credit_supply_long_term, 
      CAST(
        total_short_term AS DECIMAL(16, 5)
      )/ CAST(
        total_gdp AS DECIMAL(16, 5)
      ) AS credit_supply_short_term 
    FROM 
      almanac_bank_china.province_loan_and_credit
"""
df_credit_supply = (s3.run_query(
            query=query,
            database=db,
            s3_output='SQL_OUTPUT_ATHENA',
            filename=filename,  # Add filename to print dataframe
            destination_key='SQL_OUTPUT_ATHENA/CSV',  #Use it temporarily
            dtype = dtypes
        )
                )
df_credit_supply.head()

In [None]:
import re
# Registration-type keywords searched in a branch's full name. Alternatives
# are tried in the order written at each position, and the leftmost match in
# the string wins.
_REGISTRATION_TYPE_PATTERN = re.compile(
    r'有限公司|信用合作联社|有限责任公司|旧市支行|族自治州分行|县支行|市支行|村支行|市分行|农村合作银行|自治区分行|支行|合作社联合社'
    r'|分行|资金互助社|信用社联合社|股份公司|住宅金融事业部|国家开发银行|总行营业部|合作金融结算服务中心|信用合作社|信托投资公司'
)


def get_company_registration_type(x):
    """Return the first registration-type keyword found in ``x``.

    Parameters
    ----------
    x : str
        Full name of a bank branch.

    Returns
    -------
    str or float
        The leftmost keyword matched in ``x``, or ``np.nan`` when no keyword
        is present or when ``x`` is not a string (for example a missing
        value), so a row without a name no longer raises ``TypeError``.
    """
    if not isinstance(x, str):
        return np.nan
    found = _REGISTRATION_TYPE_PATTERN.search(x)
    return found.group() if found else np.nan
def get_type(x):
    """Classify a bank name into a broad bank category.

    ``x`` is converted with ``str`` and tested against the rules below in
    order; the first rule whose gate pattern is found decides the result.

    Returns
    -------
    tuple or float
        ``(matched_text, category)`` for the first applicable rule, otherwise
        ``np.nan``. There is no rule for "城市商业银行" (that branch is
        disabled), so such names fall through to ``np.nan``.
    """
    text = str(x)
    # Each rule: (pattern that must be present, pattern whose match is
    # reported, category label). Only the joint-stock rule reports from a
    # wider pattern than the one it gates on.
    rules = (
        (
            r"中国工商银行|中国建设银行|中国银行|中国农业银行|交通银行|中国邮政储蓄银行",
            r"中国工商银行|中国建设银行|中国银行|中国农业银行|交通银行|中国邮政储蓄银行",
            "SOB",
        ),
        (
            r"中国农业发展银行|国家开发银行|中国进出口银行",
            r"中国农业发展银行|国家开发银行|中国进出口银行",
            "policy bank",
        ),
        (
            r"股份制商业银行",
            r"股份制商业银行|银行股份",
            "joint-stock commercial bank",
        ),
        (r"农村商业银行", r"农村商业银行", "rural commercial bank"),
        (r"外资银行", r"外资银行", "foreign bank"),
    )
    for gate, reported, category in rules:
        if re.search(gate, text):
            return (re.search(reported, text).group(), category)
    return np.nan

In [None]:
# Raw bank-branch registry. The pipeline below parses dates, extracts the
# registration-type keyword from each branch name, derives the bank's own
# name and its category, and slices the certificate code into its parts.
query = """
SELECT *
FROM china.branches_raw_csv
"""
df_bank = (
    s3.run_query(
        query=query,
        database=db,
        s3_output="SQL_OUTPUT_ATHENA",
        filename="bank",  # Add filename to print dataframe
        destination_key="SQL_OUTPUT_ATHENA/CSV",  # Use it temporarily
        dtype = {'id':'str','geocode4_corr':'str', 'lostReason':'str',
                       'location':'str', 'city_temp':'str', 'points':'str'}
    )
    .assign(
        # Dates are turned into text before parsing; unparsable values
        # become NaT.
        setdate=lambda x: pd.to_datetime(x["setdate"].astype("Int64").astype(str), errors ='coerce'),
        printdate=lambda x: pd.to_datetime(x["printdate"].astype("Int64").astype(str), errors ='coerce'),
        year_setdate=lambda x: x["setdate"].dt.year.astype("Int64").astype(str),
        bank_temp=lambda x: x.apply(
            lambda x: get_company_registration_type(x["fullname"]), axis=1
        ),
        geocode4_corr = lambda x: x['geocode4_corr'].astype("Int64").astype(str)
    )
     .assign(
         registration_type=lambda x: x.apply(
            lambda x: np.nan if pd.isna(x["bank_temp"]) else x["bank_temp"], axis=1
        ),
        # Bank name = text before the registration-type keyword. The split
        # uses the whole keyword: indexing it with [0] would split on its
        # first character only and truncate names that contain that
        # character earlier on.
        bank_full_name=lambda x: x.apply(
            lambda x: x["fullname"]
            if pd.isna(x["bank_temp"])
            else x["fullname"].split(x["registration_type"])[0],
            axis=1,
        ),
        # get_type returns (matched name, category) or NaN.
        list_bank_type=lambda x: x.apply(lambda x: get_type(x["bank_full_name"]), axis=1),
        bank_type=lambda x: x.apply(
            lambda x: np.nan if pd.isna(x["list_bank_type"]) else x["list_bank_type"][0], axis=1
        ),
        bank_type_adj=lambda x: x.apply(
            lambda x: np.nan if pd.isna(x["list_bank_type"]) else x["list_bank_type"][1], axis=1
        )
    )
    .assign(
        # Fixed-position slices of the certificate code.
        bank_type_1 = lambda x: x['certcode'].str.slice(stop = 1),
        bank_code = lambda x: x['certcode'].str.slice(stop = 7),
        bank_type_details = lambda x: x['certcode'].str.slice(start = 5, stop = 6),
        citycode = lambda x: x['certcode'].str.slice(start = 7, stop = 11),
        unknown_code = lambda x: x['certcode'].str.slice(start = 11),
    )
    .drop(columns=["bank_temp"])
)

In [None]:
# Reference list of banks: one row per short name (`shortbnm`), with the
# numeric `bnature` code replaced by its Chinese label.
query = """
SELECT *
FROM china.china_bank_information
"""
df_bank_info = (
    s3.run_query(
        query=query,
        database=db,
        s3_output="SQL_OUTPUT_ATHENA",
        filename="bank",  # Add filename to print dataframe
        destination_key="SQL_OUTPUT_ATHENA/CSV",  # Use it temporarily
        dtype=dtypes,
    )
    .drop_duplicates(subset=["shortbnm"])
    .replace(
        {
            "bnature": {
                1: "政策性银行",  # policy bank
                2: "国有控股大型商业银行",  # large state-controlled commercial bank
                3: "股份制商业银行",  # joint-stock commercial bank
                4: "城市商业银行",  # city commercial bank
                5: "农村商业银行",  # rural commercial bank
                6: "外资银行",  # foreign bank
                7: "其他",  # other
                8: "农合行",  # rural cooperative bank
                9: "农信社",  # rural credit cooperative
                10: "三类新型农村金融机构",  # three new types of rural financial institutions
            }
        }
    )
)
df_bank_info.shape

In [None]:
### CCB
temp_city_branch = (
    df_bank.loc[lambda x: x["bank_type_adj"].isin([np.nan])]
    .loc[lambda x: x["bank_type_details"].isin(["S"])]
    .loc[lambda x: x["bank_type_1"].isin(["B", "L"])]
    .loc[lambda x: ~x["fullname"].str.contains("村镇")]
    .assign(bank_type="城市商业银行", status="CCB")
    .reindex(
        columns=[
            "id",
            "certcode",
            "bank_type_adj",
            "bank_type_1",
            "bank_type",
            "bank_type_details",
            "unknown_code",
            "fullname",
            "registration_type",
            "bank_full_name",
            "citycode",
            'geocode4_corr',
            "bank_code",
            "year_setdate",
            "status",
        ]
    )
    .rename(columns={
        #"citycode": "geocode4_corr",
        "year_setdate": "year"})
)

In [None]:
#### no city bank
df_bank_concat = (
    pd.concat(
    [
        (
            df_bank.loc[lambda x: ~x["bank_type_adj"].isin([np.nan])]
            .loc[lambda x: x["bank_type_details"].isin(["S"])]
            .reindex(
                columns=[
                    "id",
                    "certcode",
                    "bank_type_adj",
                    "bank_type_1",
                    "bank_type",
                    "bank_type_details",
                    "unknown_code",
                    "fullname",
                    "registration_type",
                    "bank_full_name",
                    "citycode",
                    "bank_code",
                    "year_setdate",
                ]
            )
            .assign(status="no CCB",)
        ),
        ### rural
        (
            df_bank.loc[lambda x: x["bank_type_adj"].isin([np.nan])]
            .loc[lambda x: x["bank_type_details"].isin(["S"])]
            .loc[lambda x: x['fullname'].str.contains('村镇')]
            .assign(bank_type="农村商业银行", status="no CCB")
            .reindex(
                columns=[
                    "id",
                    "certcode",
                    "bank_type_adj",
                    "bank_type_1",
                    "bank_type",
                    "bank_type_details",
                    "unknown_code",
                    "fullname",
                    "registration_type",
                    "bank_full_name",
                    "citycode",
                    "geocode4_corr",
                    "bank_code",
                    "year_setdate",
                    "status",
                ]
            )
        )
    ]
)
    .rename(
    columns = {
        #'citycode':'geocode4_corr',
        'year_setdate':'year'
    })
)

In [None]:
from polyfuzz.models import RapidFuzz
from polyfuzz import PolyFuzz
rapidfuzz_matcher = RapidFuzz(n_jobs=1)

In [None]:
# Distinct bank names of the "no CCB" branches: names of length <= 1 are
# skipped, "（中国）" is removed and surrounding whitespace is trimmed.
banks_no_ccb = [
    raw_name.replace('（中国）', '').strip()
    for raw_name in df_bank_concat['bank_full_name'].dropna().drop_duplicates().to_list()
    if len(raw_name) > 1
]
# Reference short names of every bank whose nature is not "城市商业银行".
bank_info = df_bank_info.loc[
    ~df_bank_info['bnature'].isin(['城市商业银行']), 'shortbnm'
].to_list()
# Fuzzy-match the cleaned names against the reference names.
no_ccb = PolyFuzz(rapidfuzz_matcher).match(banks_no_ccb, bank_info)

In [None]:
# Distinct bank names of the CCB branches: names of length <= 1 are skipped,
# "（中国）" is removed and surrounding whitespace is trimmed.
banks = [
    raw_name.replace('（中国）', '').strip()
    for raw_name in temp_city_branch['bank_full_name'].dropna().drop_duplicates().to_list()
    if len(raw_name) > 1
]
# Reference short names restricted to banks whose nature is "城市商业银行".
bank_info = df_bank_info.loc[
    df_bank_info['bnature'].isin(['城市商业银行']), 'shortbnm'
].to_list()
# Fuzzy-match the cleaned names against the reference names.
model = PolyFuzz(rapidfuzz_matcher).match(banks, bank_info)

In [None]:
# City-code lookup: raw code (`extra_code`) -> normalised `geocode4_corr`
# and English province name. No GROUP BY here, so duplicate rows may be
# returned (they are dropped where the frame is used).
query = """
SELECT 
        extra_code, 
        geocode4_corr, 
        province_en 
      FROM 
        chinese_lookup.china_city_code_normalised
"""
df_citycode = (s3.run_query(
            query=query,
            database="china",
            s3_output='SQL_OUTPUT_ATHENA',
            filename=filename,  # Add filename to print dataframe
            destination_key='SQL_OUTPUT_ATHENA/CSV',  #Use it temporarily
            dtype = dtypes
        )
                )
df_citycode.head()

In [None]:
# Branch table restricted to banks whose cleaned name fuzzy-matches a
# reference bank with similarity above 0.80, with the matched reference name
# stored as `bank_name` and the city code normalised.
df_bank_status = (
    pd.concat(
        [
            (
                # No `on=`: pandas joins on the shared column names
                # (here `bank_full_name`, renamed from "From").
                df_bank_concat.merge(
                    no_ccb.get_matches()
                    .sort_values(by=["Similarity"])
                    .loc[lambda x: x["Similarity"] > 0.80]
                    .rename(columns={"From": "bank_full_name"})
                )
            ),
            (
                temp_city_branch.merge(
                    model.get_matches()
                    .sort_values(by=["Similarity"])
                    .loc[lambda x: x["Similarity"] > 0.80]
                    .rename(columns={"From": "bank_full_name"})
                )
            ),
        ]
    )
    .rename(columns={"To": "bank_name"})
    .assign(
        # Fall back to the code sliced from the certificate when
        # geocode4_corr is missing or the literal string "<NA>".
        geocode4_corr=lambda x: np.where(
            x["geocode4_corr"].isin(["<NA>", np.nan]), x["citycode"], x["geocode4_corr"]
        )
    )
    .assign(
        # Fill the English category for rows labelled rural commercial bank.
        bank_type_adj=lambda x: np.where(
            np.logical_and(
                x["bank_type_adj"].isin([np.nan]), x["bank_type"] == "农村商业银行"
            ),
            "rural commercial bank",
            x["bank_type_adj"],
        )
    )
    .assign(
        # Fill the English category for rows labelled city commercial bank.
        bank_type_adj=lambda x: np.where(
            np.logical_and(
                x["bank_type_adj"].isin([np.nan]), x["bank_type"] == "城市商业银行"
            ),
            "city commercial bank",
            x["bank_type_adj"],
        )
    )
    # Inner merge on the raw code: rows whose code is absent from the lookup
    # are dropped; the normalised code then replaces the raw one.
    .merge(
        df_citycode.drop_duplicates()
        .assign(extra_code=lambda x: x["extra_code"].astype(str))
        .rename(columns={"geocode4_corr": "geocode4_corr_adj"}),
        right_on=["extra_code"],
        left_on=["geocode4_corr"],
    )
    .drop(columns = ['geocode4_corr','extra_code'])
    .rename(columns={"geocode4_corr_adj": "geocode4_corr"})
)
df_bank_status.head()

In [None]:
import janitor

In [None]:
# Per (year, city): cumulative number of branches and number of banks with
# at least one branch, split by status (CCB / no CCB), plus the CCB share
# of each.
df_ccb = (
    #### Number of CCB per city
    # Distinct branch ids opened per year for each (status, bank, city).
    df_bank_status.pivot_table(
        values="id",
        index=["status", "bank_name", "geocode4_corr"],
        columns="year",
        aggfunc="nunique",
        fill_value=0,
    )
    .stack()
    .reset_index()
    .rename(columns={0: "count"})
    .assign(
        # Running total over the years within each (status, bank, city);
        # `active` is 1 once the running total is positive.
        count=lambda x: x.groupby(["status", "bank_name", "geocode4_corr"])[
            "count"
        ].transform("cumsum"),
        active=lambda x: np.where(x["count"] > 0, 1, 0),
    )
    .groupby(["year", "geocode4_corr", "status"])
    .agg({"count": "sum", "active": "sum"})
    .unstack(-1)
    # `collapse_levels` comes from pyjanitor: it flattens the
    # (measure, status) column MultiIndex into "measure_status" names.
    .collapse_levels(sep="_")
    .rename(columns={"count_no CCB": "count_no_CCB", "active_no CCB": "active_no_CCB",})
    .fillna(0)
    .assign(
        share_count_ccb=lambda x: (x["count_CCB"] / (x["count_CCB"] + x["count_no_CCB"])),
        share_active_ccb=lambda x: (x["active_CCB"] / (x["active_CCB"] + x["active_no_CCB"])),
    )
    # A 0/0 share is NaN; it is set to 0 here.
    .fillna(0)
)
df_ccb.shape

In [None]:
# Per (year, city): sum of squared bank shares, computed once on cumulative
# branch counts and once on the 0/1 "bank is present" indicator.
# Note that `hhi_branches` / `hhi_branches_name` store ONE MINUS that sum.
df_hhi = (
            #### HHI
            df_bank_status.pivot_table(
                values="id",
                index=["bank_name", "geocode4_corr"],
                columns="year",
                aggfunc="nunique",
                fill_value=0,
            )
            .stack()
            .reset_index()
            .rename(columns={0: "count"})
            .assign(
                # Running branch total per (bank, city) over the years.
                count=lambda x: x.groupby(["bank_name", "geocode4_corr"])[
                    "count"
                ].transform("cumsum"),
                active=lambda x: np.where(x["count"] > 0, 1, 0),
                # City-year totals used as denominators of the shares.
                total_city=lambda x: x.groupby(["year", "geocode4_corr"])[
                    "count"
                ].transform("sum"),
                total_city_active=lambda x: x.groupby(["year", "geocode4_corr"])[
                    "active"
                ].transform("sum"),
                score_count=lambda x: (x["count"] / x["total_city"]) ** 2,
                score_active=lambda x: (x["active"] / x["total_city_active"]) ** 2,
            )
            .groupby(["year", "geocode4_corr"])
            .agg({"score_count": "sum", "score_active": "sum"})
            .assign(
                hhi_branches=lambda x: 1 - x["score_count"],
                hhi_branches_name=lambda x: 1 - x["score_active"],
            )
        )
df_hhi.shape

In [None]:
def big_four(x):
    """Return ``x`` when it is one of four listed bank names, else ``'other'``.

    The comparison is an exact equality test against each of the four names,
    in order.
    """
    listed_names = (
        "中国农业银行股份",
        "中国工商银行股份",
        "中国建设银行股份",
        "中国银行股份",
    )
    return next((name for name in listed_names if x == name), 'other')

In [None]:
# Per (city, year): branches of the four banks named in `big_four`
# (`total_sob`), their share of all branches in the city
# (`concentration_sob`), and `concentration = log((1 - share) + 1)`.
df_concentration = (
            df_bank_status.assign(
                # Bank label: one of the four listed names, or "other".
                sob=lambda x: x.apply(lambda x: big_four(x["bank_full_name"]), axis =1)
            )
    .groupby(["year", "geocode4_corr", "sob"])
            .agg({"id": "count"})
            .sort_values(by=["sob", "geocode4_corr", "year"])
            .reset_index()
            # Wide-by-year pivot with zero fill, then stacked back, so each
            # (sob, city) pair has a row for every year column.
            .pivot_table(
                values="id",
                index=["sob", "geocode4_corr"],
                columns="year",
                aggfunc=np.sum,
                fill_value=0,
            )
    .stack()
            .reset_index()
            .rename(columns={0: "count"})
    .assign(
                # temp: running branch total; temp_1: earliest year at each
                # running-total level; first_entry: minimum of temp_1.
                # NOTE(review): years with a zero running total form their
                # own group, so first_entry looks like the earliest year of
                # the pivot and the filter below may keep every row — confirm
                # this is intended.
                temp=lambda x: x.groupby(["sob", "geocode4_corr"])[
                    "count"
                ].transform("cumsum"),
                temp_1=lambda x: x.groupby(["sob", "geocode4_corr", "temp"])[
                    "year"
                ]
                .transform("min")
                .fillna("2222")
                .astype("int"),
                first_entry=lambda x: x.groupby(["sob", "geocode4_corr"])[
                    "temp_1"
                ].transform("min"),
                count=lambda x: x["count"].fillna(0),
            )
    .loc[lambda x: x["year"].astype("int") >= x["first_entry"]]
    .drop(columns=["temp", "temp_1", "first_entry"])
            .assign(
                # Cumulative branches per (sob, city), then the city-year
                # total across all labels, "other" included.
                totalBranchBank=lambda x: x.groupby(["sob", "geocode4_corr"])[
                    "count"
                ].transform("cumsum"),
                totalBranchCity=lambda x: x.groupby(["geocode4_corr", "year"])[
                    "totalBranchBank"
                ].transform("sum"),
            )
    # "other" is dropped only after it has been counted in totalBranchCity.
    .loc[lambda x: x["sob"] != "other"]
    .set_index(["sob", "geocode4_corr", "year", "totalBranchCity"])
            .drop(columns=["count"])
            # One column per bank label, summed row-wise into total_sob.
            .unstack(0)
            .assign(total_sob=lambda x: x.sum(axis=1))
            .reset_index(["totalBranchCity"])
            .assign(
                concentration_sob=lambda x: x[("total_sob", "")]
                / x[("totalBranchCity", "")]
            )
            .reindex(columns=[("total_sob", ""), ("concentration_sob", "")])
            .droplevel(axis=1, level=1)
            .reset_index()
            .dropna(subset=["concentration_sob"])
            .assign(concentration=lambda x: np.log((1 - x["concentration_sob"]) +1)
                   )
)
df_concentration.shape

In [None]:
# Combine the CCB counts/shares and the HHI measures column-wise on their
# shared index, then left-join the concentration measures on
# (year, geocode4_corr).
df_deregulation = (
    pd.concat([df_ccb,df_hhi], axis =1)
    .merge(df_concentration.set_index(['year','geocode4_corr']),
           how = 'left', left_index = True, right_index = True)
)

#df_deregulation =  pd.concat([df_ccb,df_hhi,df_concentration], axis =1)
# Add lag_<column> for every column: the previous row's value within the
# same city ("geocode4_corr" is resolved as an index level by groupby),
# taken in the frame's current row order.
df_deregulation = (
    df_deregulation.assign(
        **{
            "lag_{}".format(i): df_deregulation.groupby(["geocode4_corr"])[i].transform(
                "shift"
            )
            for i in df_deregulation.columns
        }
    )
)
df_deregulation.shape

In [None]:
# Keep a fixed set of columns (with their lags), restore year/city as
# columns, keep years after 1997 and normalise the column names.
# NOTE(review): no visible step creates "count_False" / "count_True" (or
# their lags); `reindex` adds any missing column filled with NaN — confirm
# these entries are still wanted.
df_deregulation = (
    df_deregulation.reindex(
        columns=[
            "count_CCB",
            "lag_count_CCB",
            "count_no_CCB",
            "lag_count_no_CCB",
            "active_CCB",
            "lag_active_CCB",
            "active_no_CCB",
            "lag_active_no_CCB",
            "share_count_ccb",
            "lag_share_count_ccb",
            "share_active_ccb",
            "lag_share_active_ccb",
            #"score_count",
            #"lag_score_count",
            #"score_active",
            #"lag_score_active",
            "hhi_branches",
            "lag_hhi_branches",
            "hhi_branches_name",
            "lag_hhi_branches_name",
            "count_False",
            "lag_count_False",
            "count_True",
            "lag_count_True",
            "concentration",
            "lag_concentration",
        ]
    )
    .reset_index()
    # `year` is compared as text here.
    .loc[lambda x: x["year"] > "1997"]
)
# Column names: trimmed, spaces and hyphens turned into underscores, lower-cased.
df_deregulation.columns = (
    df_deregulation.columns.str.strip()
    .str.replace(" ", "_")
    .str.replace("-", "_")
    .str.lower()
)
df_deregulation.tail()

In [None]:
def is_sequence_continuous(years):
    """Tell whether the distinct values of ``years`` form a gap-free run.

    True when the number of distinct years equals ``last - first + 1``.
    """
    ordered = sorted(years.unique())
    span = ordered[-1] - ordered[0] + 1
    return span == len(ordered)


# Years every firm is expected to cover: 1999 to 2007 inclusive.
expected_years = list(range(1999, 2008))


def has_missing_years(x):
    """Count how many of ``expected_years`` do not appear in ``x``.

    Despite the name, the result is an integer count, not a boolean.
    """
    observed = set(x.unique())
    return sum(1 for year in expected_years if year not in observed)

In [None]:
# Switch read by the export cell: when True the estimation tables are written
# to CSV files in the working directory.
to_download = False

# Main estimation table, built by construct_table (defined earlier in the
# notebook).
df_final1 = construct_table(table="firm_financial_ratio_from_pollution1")

In [None]:
# For each ownership indicator, record how many distinct values a firm takes
# over its observations: 1 means the status never changes, >1 means the firm
# switched status at least once.
df_final1 = df_final1.assign(
    **{
        "change_status_{}".format(status.lower()): df_final1.groupby(["firm"])[
            status
        ].transform(lambda values: values.nunique())
        for status in ["SOE", "FOREIGN"]
    }
)

In [None]:
# Panel-quality flags computed per firm, then enrichment of df_final1 with
# innovation, pollution-intensity, banking-deregulation and credit-supply
# variables.
#
# NOTE(review): `~` binds after `.astype(int)`, so it is a bitwise NOT on the
# 0/1 integers: continuous firms get -2 and discontinuous firms get -1 (not
# 0/1). The inspection cell further down filters on `== -1` accordingly;
# change both together if this is ever turned into a proper 0/1 flag.
df_final1['flag_discontinuous'] = ~df_final1.groupby('firm')['year'].transform(is_sequence_continuous).astype(int)
# Number of years of the expected_years window the firm is never observed in.
df_final1['missing_years'] = df_final1.groupby('firm')['year'].transform(has_missing_years)
# Add a column to flag firms with missing years
df_final1['flag'] = np.where(df_final1['missing_years'] > 0, 1, 0)
df_final1 = (
    df_final1
    # Innovation indices, matched on year / city / industry.
    .merge(df_innovation, how = 'left', on =['year', 'geocode4_corr', 'cic_adj'])
    .assign(
        # Rows without a match in df_innovation get an index of 0.
        innovation_index =lambda x: x['innovation_index'].fillna(0),
        innovation_index_1=lambda x: x['innovation_index_1'].fillna(0),
        innovation_index_2=lambda x: x['innovation_index_2'].fillna(0),
        # Pollution intensities: each pollutant divided by output, sales,
        # capital, employment and total assets.
        # NOTE(review): a zero denominator yields inf/NaN here; the R cells
        # only filter sales and total_asset to be > 0 -- confirm for the rest.
        so2_output = lambda x: x['so2']/x['output'],
        so2_sales = lambda x: x['so2']/x['sales'],
        so2_capital = lambda x: x['so2']/x['capital'],
        so2_emp = lambda x: x['so2']/x['employment'],
        so2_asset = lambda x: x['so2']/x['total_asset'],
        
        cod_output = lambda x: x['cod']/x['output'],
        cod_sales = lambda x: x['cod']/x['sales'],
        cod_capital = lambda x: x['cod']/x['capital'],
        cod_emp = lambda x: x['cod']/x['employment'],
        cod_asset = lambda x: x['cod']/x['total_asset'],
        
        waste_water_output = lambda x: x['waste_water']/x['output'],
        waste_water_sales = lambda x: x['waste_water']/x['sales'],
        waste_water_capital = lambda x: x['waste_water']/x['capital'],
        waste_water_emp = lambda x: x['waste_water']/x['employment'],
        waste_water_asset = lambda x: x['waste_water']/x['total_asset'],
    )
    # Banking deregulation measures; year is cast to int before the join.
    # NOTE(review): no `on=` is given, so pandas joins on every column name the
    # two frames share (presumably year and geocode4_corr) -- confirm.
    .merge(df_deregulation.assign(year = lambda x: x['year'].astype(int)), how = 'left')
     .merge(
        (
            # Credit supply with its one-row lag within each province, after
            # sorting by province and year.
            df_credit_supply.sort_values(by=["province_en", "year"])
            .assign(
                lag_credit_supply_long_term=lambda x: x.groupby(["province_en"])[
                    "credit_supply_long_term"
                ].transform("shift")
            )
            .reindex(columns=["year", "province_en", "lag_credit_supply_long_term",'credit_supply_long_term'])
        ),
         # NOTE(review): implicit join keys again (presumably year and
         # province_en) -- confirm.
         how = 'left'
    )
)

In [None]:
df_final1.shape

In [None]:

# Optional export of the estimation tables to CSV in the working directory.
# NOTE(review): the R cell below reads 'df_asif1.csv' unconditionally, so that
# file must already exist whenever to_download is False.
if to_download:
    df_final1.to_csv('df_asif1.csv')

    # The two alternative tables are built first, then both are written.
    table_name_2 = "firm_financial_ratio_from_pollution2"
    table_name_3 = "firm_financial_ratio_from_pollution3"
    df_final2 = construct_table(table=table_name_2)
    df_final3 = construct_table(table=table_name_3)

    for frame, csv_name in ((df_final2, 'df_asif2.csv'), (df_final3, 'df_asif3.csv')):
        frame.to_csv(csv_name)

In [None]:
df_final1['change_status_soe'].value_counts()

In [None]:
df_final1['change_status_foreign'].value_counts()

In [None]:
df_final1['flag_discontinuous'].value_counts()

In [None]:
df_final1['missing_years'].value_counts()

In [None]:
df_final1.loc[lambda x: x['flag_discontinuous'] == -1]

## Schema Latex table

To rename a variable, please use the following template:

```
{
    'old':'XX',
    'new':'XX_1'
    }
```

If you need to pass a LaTeX command containing `\`, double the backslash; for instance, `\text` becomes `\\text`:

```
{
    'old':'working\_capital\_i',
    'new':'\\text{working capital}_i'
    }
```

Then add it to the key `to_rename`

In [None]:
# Rebuild schema_table.json, the list of variable renames used when the
# regression tables are beautified. Disabled by default: switch add_to_dic to
# True to regenerate the file from scratch.
add_to_dic = False
if add_to_dic:
    if os.path.exists("schema_table.json"):
        os.remove("schema_table.json")
    data = {'to_rename':[], 'to_remove':[]}
    # Backslashes are written doubled so that every entry is a valid Python
    # string literal ('\_' is an invalid escape sequence and triggers a
    # warning on recent Python versions). The resulting values are unchanged:
    # one backslash followed by the next character.
    dic_rename = [
        {
        'old':'working\\_capital\\_i',
        'new':'\\text{working capital}_i'
        },
        {
        'old':'periodTRUE',
        'new':'\\text{period}'
        },
        {
        # NOTE(review): the new label ends with a bare '_' -- the subscript
        # looks like it is missing; confirm the intended LaTeX.
        'old':'tso2\\_mandate\\_c',
        'new':'\\text{policy mandate}_'
        },
    ]

    data['to_rename'].extend(dic_rename)
    with open('schema_table.json', 'w') as outfile:
        json.dump(data, outfile)

In [None]:
sys.path.append(os.path.join(parent_path, 'utils'))
import latex.latex_beautify as lb
#%load_ext autoreload
#%autoreload 2

In [1]:
options(warn=-1)
library(tidyverse)
library(lfe)
#library(lazyeval)
library('progress')
path = "../../../utils/latex/table_golatex.R"
source(path)

ERROR: Error: package or namespace load failed for ‘tidyverse’ in loadNamespace(i, c(lib.loc, .libPaths()), versionCheck = vI[[i]]):
 there is no package called ‘dplyr’


In [None]:
%get df_path
# Load the estimation table exported by the Python kernel.
# - character columns and every column whose name starts with "fe" become
#   factors;
# - count is the number of rows of each firm;
# - quick_ratio1 is the quick ratio with its sign flipped.
# The grouping by firm is only needed for n(): it is dropped again with
# ungroup(). Leaving the tibble grouped made every later quantile()-based
# trim and size dummy in the regression cells be evaluated firm by firm
# instead of over the whole sample.
df_final1 <- read_csv('df_asif1.csv') %>%
  mutate_if(is.character, as.factor) %>%
  mutate_at(vars(starts_with("fe")), as.factor) %>%
  group_by(firm) %>%
  mutate(count = n(), quick_ratio1 = -quick_ratio) %>%
  ungroup()

In [None]:
#df_final %>%select(so2)

# Main results

### Summary:

**Baseline Observations**:
- Firms with a higher proportion of tangible assets tend to produce more SO2 emissions, likely due to more intensive manufacturing activities.
- Efficient and financially healthy firms, especially older ones, tend to emit less SO2. This might be attributed to the adoption of cleaner technologies and better processes.
- As expected, firms with SO2 reducing equipment have fewer emissions.
- State-owned enterprises emit more SO2 than their private counterparts.

**Heterogeneity Observations**:
- Cash-rich firms are proactive about reducing emissions when facing industry-wide financial constraints or localized banking biases. 
- Firms with stronger cashflows in cities with SO2 reduction policies (TCZ cities) or in growth-focused zones (SPZ cities) are more proactive in reducing emissions.

**Mechanism Observations**:
- Firms with higher cashflows tend to invest less in tangible assets and more in R&D, emphasizing a move towards innovation over traditional asset acquisition.
- A financial shift is observed where liquidity-driven firms prioritize long-term growth and efficiency, especially when facing challenges in accessing traditional bank loans.

**Investment in Pollution Abatement**:
- Firms with ample liquidity invest more in pollution control technologies, leading to a direct increase in their capacity to reduce pollution.
- Bigger firms, both in terms of assets and age, demonstrate a better capability to manage pollution, whereas firms with larger employee counts seem to face challenges in efficient pollution control.
- State-owned enterprises lag behind in pollution abatement capacities compared to private firms.

In essence, the study underscores the integral role of a firm's financial health, particularly cashflows, in its environmental responsibilities and practices. The balance between financial constraints and proactive investments in cleaner technologies and processes is evident. Additionally, the nature, size, age, and ownership of a firm play nuanced roles in determining its environmental responsiveness, especially in the context of SO2 emissions and controls.

In [None]:
# Output location of the next LaTeX table: <folder>/table_<table_nb>.txt.
folder = 'Tables_0'
table_nb = 1
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')

# Create the folder when needed (no error if it already exists) ...
os.makedirs(folder, exist_ok=True)
# ... and clear previously generated tables so stale output is never reused.
for leftover in os.listdir(folder):
    if leftover.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, leftover))

In [None]:
df_final1.head(1)#[['tfp_op','cashflow','current_ratio']].isna().sum()
# total_coal_used, waste_gas, nox, smoke_dust, soot

In [None]:
# Exploratory print-out of the baseline SO2-intensity regression (cash-flow
# specification). Nothing from this cell is saved to a table.

# Estimation sample: derived regressors, strictly positive inputs, then
# trimming of asset tangibility (0 to 99th percentile) and SO2 (1 to 99th
# percentile), and finally rows with a TFP estimate.
so2_baseline_sample <- df_final1 %>%
  mutate(
    age_sqr = log(age) ** 2,
    dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
  ) %>%
  filter(
    so2 > 0,
    asset_tangibility_tot_asset > 0,
    sales > 0,
    total_asset > 0,
    lag_liabilities_tot_asset > 0,
    lag_sales_tot_asset > 0,
    cashflow_tot_asset > 0,
    lag_concentration > 0
  ) %>%
  filter(
    between(
      asset_tangibility_tot_asset,
      0,
      quantile(asset_tangibility_tot_asset, 0.99, na.rm = TRUE)
    )
  ) %>%
  filter(between(so2, 1, quantile(so2, 0.99, na.rm = TRUE))) %>%
  filter(!is.na(tfp_op))

# Firm, city and 2-digit industry fixed effects; standard errors clustered by
# 2-digit industry.
so2_baseline_fit <- felm(
  log(so2_output) ~
    log(asset_tangibility_tot_asset) +
    log(tfp_op) +
    log(cashflow_tot_asset) +
    log(lag_liabilities_tot_asset) +
    log(lag_sales_tot_asset) +
    log(age) +
    age_sqr +
    dummy_dso2_equip +
    SOE |
    firm + geocode4_corr + indu_2 | 0 | indu_2,
  so2_baseline_sample,
  exactDOF = TRUE
)
summary(so2_baseline_fit)

In [None]:
exp(1.342730/0.409474)

In [None]:
%get path table
# Baseline table: SO2 intensity on three alternative internal-finance
# measures (cash flow, current ratio, coverage ratio).

# Build the estimation sample for one specification.
#   data          : firm-year table (df_final1)
#   liquidity_var : name of the internal-finance column that must be > 0
# Steps, in this order (the percentile trims are computed on the rows that
# survive the positivity filter, so the order matters):
#   1. derived regressors (squared log age, SO2-equipment dummy)
#   2. strictly positive inputs, including `liquidity_var`
#   3. asset tangibility kept between 0 and its 99th percentile
#   4. SO2 kept between 1 and its 99th percentile
#   5. rows with a TFP estimate
baseline_sample <- function(data, liquidity_var) {
  positive_cols <- c(
    "so2",
    "asset_tangibility_tot_asset",
    "sales",
    "total_asset",
    "lag_liabilities_tot_asset",
    "lag_sales_tot_asset",
    liquidity_var,
    "lag_concentration"
  )
  data %>%
    mutate(
      age_sqr = log(age) ** 2,
      dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
    ) %>%
    filter_at(positive_cols, all_vars(. > 0)) %>%
    filter_at(
      vars(asset_tangibility_tot_asset),
      all_vars(between(., 0, quantile(., 0.99, na.rm = TRUE)))
    ) %>%
    filter_at(
      vars(so2),
      all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
    ) %>%
    filter(!is.na(tfp_op))
}

# All models: firm, city and 2-digit industry fixed effects, standard errors
# clustered by 2-digit industry.
t_0 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) +
  log(tfp_op) +
  log(cashflow_tot_asset) +
  log(lag_liabilities_tot_asset) +
  log(lag_sales_tot_asset) +
  log(age) +
  age_sqr +
  dummy_dso2_equip +
  SOE |
  firm + geocode4_corr + indu_2 | 0 | indu_2,
  baseline_sample(df_final1, "cashflow_tot_asset"),
  exactDOF = TRUE)
t_1 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) +
  log(tfp_op) +
  log(current_ratio) +
  log(lag_liabilities_tot_asset) +
  log(lag_sales_tot_asset) +
  log(age) +
  age_sqr +
  dummy_dso2_equip +
  SOE |
  firm + geocode4_corr + indu_2 | 0 | indu_2,
  baseline_sample(df_final1, "current_ratio"),
  exactDOF = TRUE)
t_2 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) +
  log(tfp_op) +
  log(coverage_ratio) +
  log(lag_liabilities_tot_asset) +
  log(lag_sales_tot_asset) +
  log(age) +
  age_sqr +
  dummy_dso2_equip +
  SOE |
  firm + geocode4_corr + indu_2 | 0 | indu_2,
  baseline_sample(df_final1, "coverage_ratio"),
  exactDOF = TRUE)

dep <- "Dependent variable: Pollution emissions"
# NOTE(review): the models absorb firm, city (geocode4_corr) and industry
# (indu_2) effects; no year effect is estimated, yet a "year" row is reported
# below -- confirm whether the label or the specification should change.
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes")
             )

table_1 <- go_latex(list(
    t_0,t_1, t_2
),
    title="Determinant of pollution emissions",
    dep_var = dep,
    addFE=fe1,
    save=TRUE,
    note = FALSE,
    name=path
)

### Baseline

1. **Asset Tangibility**: A positive relationship with SO2 emissions suggests that firms with a higher proportion of tangible assets tend to emit more SO2. This might be because companies with more physical assets (like factories) are more involved in manufacturing, which can lead to more emissions.

2. **Total Factor Productivity (tfp_op)**: A negative coefficient indicates that firms with higher productivity emit less SO2. This is interesting because it suggests that more efficient firms might be using cleaner technologies or better processes, leading to lower emissions.

3. **Cashflow to Asset**: A negative coefficient for this variable suggests that firms with more internal financing capabilities (as measured by their cashflows relative to their assets) emit less SO2. This could be because these firms might be more capable of investing in cleaner technologies.

4. **Current Ratio**: The negative relationship with SO2 emissions implies that firms with more current assets relative to their current liabilities might be less pollutive. This might be an indication of the financial health of a company, and financially healthier companies might be more environmentally responsible.

5. **Coverage Ratio**: The negative coefficient suggests that firms with higher abilities to cover their interest expenses emit less SO2, possibly due to better financial health and the capability to adopt cleaner technologies.

6. **Liabilities to Asset**: The mixed signs across different models suggest the relationship between the leverage of a firm and its emissions is not straightforward and needs further investigation.

7. **Sales to Asset**: The strong negative relationship suggests that more efficient firms in terms of sales per unit of asset tend to emit less SO2. This could again be an indication of the efficiency and possible adoption of cleaner technologies.

8. **Age Square (age_sqr)**: The negative coefficient indicates that older firms, possibly more established, emit less SO2. This could be due to older firms having more experience and resources to invest in cleaner technologies.

9. **Dummy for SO2 Equipment (dummy_dso2_equip)**: The negative sign suggests that firms that have invested in SO2 reducing equipment emit less SO2, as expected.

10. **State-Owned Enterprises (SOESOE)**: The positive relationship indicates that state-owned enterprises emit more SO2 compared to their counterparts.

In [None]:
# Footnote printed under the table. The pieces are joined by implicit string
# concatenation, so each one carries its own trailing space (they used to run
# together as "errorsclustered" and "inparentheses.\sym"). The LaTeX parts are
# raw strings so that "\s" and "\%" are not treated as (invalid) escapes.
# NOTE(review): the regressions cluster on indu_2; confirm that "product
# level" is the wording wanted here.
tbe1 = (
    "This table estimates eq(3). "
    "Heteroskedasticity-robust standard errors "
    "clustered at the product level appear in parentheses. "
    r"\sym{*} Significance at the 10\%, "
    r"\sym{**} Significance at the 5\%, "
    r"\sym{***} Significance at the 1\%."
)

# Prepared header row; currently not passed to beautify (new_row is disabled,
# as are reorder_var, multi_lines_dep and multicolumn).
new_r = ['& SO2', 'COD', "Waste water"]
lb.beautify(table_number = table_nb,
            table_nte = tbe1,
            jupyter_preview = True,
            resolution = 150,
            folder = folder)

In [None]:
# Exploratory print-out: the baseline specification with COD intensity as the
# dependent variable and the waste-water equipment dummy as control.
# Firm, city and 2-digit industry fixed effects; standard errors clustered by
# 2-digit industry.
summary(felm(log(cod_output) ~
  log(asset_tangibility_tot_asset) +
  log(tfp_op) +
  log(cashflow_tot_asset) +
  log(lag_liabilities_tot_asset) +
  log(lag_sales_tot_asset) +
  log(age) +
  age_sqr +
  dummy_dwastewater_equip +
  SOE |
  firm + geocode4_corr + indu_2 | 0 | indu_2,
  df_final1 %>%
    mutate(
      age_sqr = log(age) ** 2,
      dummy_dwastewater_equip = ifelse(dwastewater_equip > 0, 1, 0)
    ) %>%
    # Strictly positive inputs.
    filter_at(
      vars(
        cod,
        asset_tangibility_tot_asset,
        sales,
        total_asset,
        lag_liabilities_tot_asset,
        lag_sales_tot_asset,
        cashflow_tot_asset,
        lag_concentration
      ),
      all_vars(. > 0)
    ) %>%
    filter_at(
      vars(asset_tangibility_tot_asset),
      all_vars(between(., 0, quantile(., 0.99, na.rm = TRUE)))
    ) %>%
    # Trim outliers of the pollutant being modelled. This trim used to be
    # applied to so2 (copied from the SO2 cell), which restricted the COD
    # sample on an unrelated pollutant.
    # NOTE(review): the lower bound of 1 is carried over from the SO2
    # specification -- confirm it suits the unit COD is measured in.
    filter_at(
      vars(cod),
      all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
    ) %>%
    filter(!is.na(tfp_op)),
  exactDOF = TRUE))

## Heterogeneity effect

- heterogeneity by sector and by city

In [None]:
df_final1.head(1)

In [None]:
# Output location of the next LaTeX table: <folder>/table_<table_nb>.txt.
folder = 'Tables_0'
table_nb = 1
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')

# Create the folder when needed (no error if it already exists) ...
os.makedirs(folder, exist_ok=True)
# ... and clear previously generated tables so stale output is never reused.
for leftover in os.listdir(folder):
    if leftover.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, leftover))

In [None]:
# Exploratory print-out: all cash-flow interactions entered in one model
# (financial dependence, innovation, bank concentration, TCZ, SPZ, SOE).
# Nothing from this cell is saved to a table.

# Estimation sample: derived regressors, strictly positive inputs, then
# trimming of asset tangibility (0 to 99th percentile) and SO2 (1 to 99th
# percentile), and finally rows with a TFP estimate.
so2_interaction_sample <- df_final1 %>%
  mutate(
    # Size dummy kept for the (currently disabled) size interaction.
    total_asset_dummy = ifelse(total_asset > quantile(total_asset, .75, na.rm = TRUE), 1, 0),
    age_sqr = log(age) ** 2,
    dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
  ) %>%
  filter(
    so2 > 0,
    asset_tangibility_tot_asset > 0,
    sales > 0,
    total_asset > 0,
    lag_liabilities_tot_asset > 0,
    lag_sales_tot_asset > 0,
    cashflow_tot_asset > 0,
    lag_concentration > 0
  ) %>%
  filter(
    between(
      asset_tangibility_tot_asset,
      0,
      quantile(asset_tangibility_tot_asset, 0.99, na.rm = TRUE)
    )
  ) %>%
  filter(between(so2, 1, quantile(so2, 0.99, na.rm = TRUE))) %>%
  filter(!is.na(tfp_op))

# Firm, city and 2-digit industry fixed effects; standard errors clustered by
# 2-digit industry.
so2_interaction_fit <- felm(
  log(so2_output) ~
    log(asset_tangibility_tot_asset) +
    log(tfp_op) +
    log(cashflow_tot_asset) * financial_dep_china1 +
    log(cashflow_tot_asset) * innovation_index +
    log(cashflow_tot_asset) * concentration +
    log(cashflow_tot_asset) * tcz +
    log(cashflow_tot_asset) * spz +
    log(cashflow_tot_asset) * SOE +
    log(lag_liabilities_tot_asset) +
    log(lag_sales_tot_asset) +
    log(age) +
    age_sqr +
    dummy_dso2_equip +
    SOE |
    firm + geocode4_corr + indu_2 | 0 | indu_2,
  so2_interaction_sample,
  exactDOF = TRUE
)
summary(so2_interaction_fit)

In [None]:
%get path table
# Heterogeneity table: the cash-flow effect on SO2 intensity interacted, one
# model at a time, with firm size, industry financial dependence, innovation,
# bank concentration, TCZ and SPZ.

# Estimation sample shared by the six models (the pipeline used to be
# repeated in every felm call):
#   1. derived regressors: size dummy (assets above the 75th percentile),
#      squared age, SO2-equipment dummy
#   2. strictly positive inputs
#   3. asset tangibility kept between 0 and its 99th percentile
#   4. SO2 kept between 1 and its 99th percentile -- the TCZ model used 0 as
#      lower bound while the five others used 1; it is aligned here so that
#      all columns of the table share one sample
#   5. rows with a TFP estimate
het_sample <- df_final1 %>%
  mutate(
    total_asset_dummy = ifelse(total_asset > quantile(total_asset, .75, na.rm = TRUE), 1, 0),
    age_sqr = age ** 2,
    dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
  ) %>%
  filter_at(
    vars(
      so2,
      asset_tangibility_tot_asset,
      sales,
      total_asset,
      lag_liabilities_tot_asset,
      lag_sales_tot_asset,
      cashflow_tot_asset,
      lag_concentration
    ),
    all_vars(. > 0)
  ) %>%
  filter_at(
    vars(asset_tangibility_tot_asset),
    all_vars(between(., 0, quantile(., 0.99, na.rm = TRUE)))
  ) %>%
  filter_at(
    vars(so2),
    all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
  ) %>%
  filter(!is.na(tfp_op))

# NOTE(review): these models use log(age_sqr) with age_sqr = age^2 and no
# separate log(age) term, whereas the exploratory cell above uses
# log(age) + log(age)^2 -- confirm which age specification is intended.

### Size
t_0 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) +
  log(tfp_op) +
  log(cashflow_tot_asset) * total_asset_dummy +
  log(lag_liabilities_tot_asset) +
  log(lag_sales_tot_asset) +
  log(age_sqr) +
  dummy_dso2_equip +
  SOE |
  firm + geocode4_corr + indu_2 | 0 | indu_2,
  het_sample,
  exactDOF = TRUE)
### Industry financial dependence
t_1 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) +
  log(tfp_op) +
  log(cashflow_tot_asset) * financial_dep_china1 +
  log(lag_liabilities_tot_asset) +
  log(lag_sales_tot_asset) +
  log(age_sqr) +
  dummy_dso2_equip +
  SOE |
  firm + geocode4_corr + indu_2 | 0 | indu_2,
  het_sample,
  exactDOF = TRUE)
### Innovation
t_2 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) +
  log(tfp_op) +
  log(cashflow_tot_asset) * innovation_index +
  log(lag_liabilities_tot_asset) +
  log(lag_sales_tot_asset) +
  log(age_sqr) +
  dummy_dso2_equip +
  SOE |
  firm + geocode4_corr + indu_2 | 0 | indu_2,
  het_sample,
  exactDOF = TRUE)
### Bank concentration
# This model absorbs and clusters on cic_adj instead of indu_2.
t_3 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) +
  log(tfp_op) +
  log(cashflow_tot_asset) * concentration +
  log(lag_liabilities_tot_asset) +
  log(lag_sales_tot_asset) +
  log(age_sqr) +
  dummy_dso2_equip +
  SOE |
  firm + geocode4_corr + cic_adj | 0 | cic_adj,
  het_sample,
  exactDOF = TRUE)
### TCZ
t_4 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) +
  log(tfp_op) +
  log(cashflow_tot_asset) * tcz +
  log(lag_liabilities_tot_asset) +
  log(lag_sales_tot_asset) +
  log(age_sqr) +
  dummy_dso2_equip +
  SOE |
  firm + geocode4_corr + indu_2 | 0 | indu_2,
  het_sample,
  exactDOF = TRUE)
### SPZ
t_5 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) +
  log(tfp_op) +
  log(cashflow_tot_asset) * spz +
  log(lag_liabilities_tot_asset) +
  log(lag_sales_tot_asset) +
  log(age_sqr) +
  dummy_dso2_equip +
  SOE |
  firm + geocode4_corr + indu_2 | 0 | indu_2,
  het_sample,
  exactDOF = TRUE)

dep <- "Dependent variable: Pollution emissions"
# One entry per model: six models are tabulated, so each fixed-effect row
# needs six "Yes" cells (there were only five).
# NOTE(review): the models absorb firm, city and industry effects; no year
# effect is estimated, yet a "year" row is reported -- confirm whether the
# label or the specification should change.
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes")
             )

table_1 <- go_latex(list(
    t_0,t_1, t_2, t_3, t_4, t_5
),
    title="Determinant of pollution emissions",
    dep_var = dep,
    addFE=fe1,
    save=TRUE,
    note = FALSE,
    name=path
)

### Heterogeneity

1. **Cashflow × financial_dep_china1:** A higher `financial_dep_china1` value indicates that the industry is more financially constrained, meaning it's difficult for firms to borrow money. The negative coefficient for this interaction indicates that in industries with tougher borrowing conditions, firms with higher cashflows are more inclined to reduce their SO2 emissions compared to those in less constrained industries. These firms may use their stronger cashflow to invest in cleaner technologies, production processes, or other measures that decrease their environmental footprint. This could be a way to mitigate potential regulatory risks, reduce future liabilities, or to proactively adopt sustainable practices given their limited access to external financing.

2. **Cashflow × concentration:** The negative coefficient of this interaction suggests that in cities where the big 4 banks dominate (and where these banks are biased towards lending to state-owned enterprises), private firms with more substantial cashflows are more proactive in reducing their emissions than in cities with a less concentrated banking sector. This implies that in areas where there's an implicit banking bias against private entities, such entities use their cashflows more efficiently to invest in emission-reducing measures. This could be a strategy for these private firms to demonstrate environmental responsibility, making them potentially more attractive for alternate financing, partnerships, or to appeal to a more environmentally-conscious customer base.

3. **Cashflow × tcz:** Firms in cities with a policy to reduce SO2 emissions (i.e., TCZ cities) tend to reduce their SO2 emissions more as their cashflow relative to assets increases. This suggests that cash-rich firms in environmentally conscious cities are more proactive or efficient in leveraging their cashflows to invest in reducing emissions. This might be because these firms face greater regulatory scrutiny, have more public pressure to be environmentally responsible, or see economic benefits in compliance due to potential incentives provided by such cities.

4. **Cashflow × spz:** The negative coefficient indicates that in SPZ cities, which are focused on economic growth, firms with higher cashflows tend to reduce their SO2 emissions more than those with weaker cashflows. This can be seen as evidence that even in growth-oriented regions, companies with more substantial financial strength (in terms of cashflow) prioritize or at least do not neglect environmental considerations, possibly as a strategy to ensure long-term sustainable growth.

In essence, the interactions with cashflow suggest that firms with better cash positions are proactive about reducing emissions, especially when faced with industry-wide financial constraints or localized banking biases. Their stronger cash position allows them to navigate and adapt to both financial and environmental challenges more effectively.

In [None]:
# Footnote printed under the table. The pieces are joined by implicit string
# concatenation, so each one carries its own trailing space (they used to run
# together as "errorsclustered" and "inparentheses.\sym"). The LaTeX parts are
# raw strings so that "\s" and "\%" are not treated as (invalid) escapes.
# NOTE(review): the regressions cluster on indu_2 / cic_adj; confirm that
# "product level" is the wording wanted here.
tbe1 = (
    "This table estimates eq(3). "
    "Heteroskedasticity-robust standard errors "
    "clustered at the product level appear in parentheses. "
    r"\sym{*} Significance at the 10\%, "
    r"\sym{**} Significance at the 5\%, "
    r"\sym{***} Significance at the 1\%."
)

# Prepared header row; currently not passed to beautify (new_row is disabled,
# as are reorder_var, multi_lines_dep and multicolumn).
new_r = ['& SO2', 'COD', "Waste water"]
lb.beautify(table_number = table_nb,
            table_nte = tbe1,
            jupyter_preview = True,
            resolution = 150,
            folder = folder)

### Transmission channel

Faire avec taille

Regarder l'hétérogénéité : SOE/foreign/taille

- Changer de polluants

Est-ce que l'accès à la finance interne affecte les émissions ?

Une entreprise publique/large est moins sensible à la contrainte de crédit.

In [None]:
# Output location of the next LaTeX table: <folder>/table_<table_nb>.txt.
folder = 'Tables_0'
table_nb = 1
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')

# Create the folder when needed (no error if it already exists) ...
os.makedirs(folder, exist_ok=True)
# ... and clear previously generated tables so stale output is never reused.
for leftover in os.listdir(folder):
    if leftover.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, leftover))

In [None]:
%get path table
# Transmission-channel table: cash flow (relative to tangible assets) as a
# driver of asset tangibility, R&D intensity and TFP.

# Rows shared by the three models: derived regressors plus strictly positive
# inputs.
mechanism_base <- df_final1 %>%
  mutate(
    age_sqr = log(age) ** 2,
    dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
  ) %>%
  filter_at(
    vars(
      so2,
      asset_tangibility_tot_asset,
      sales,
      total_asset,
      lag_liabilities_tot_asset,
      lag_sales_tot_asset,
      cashflow_tot_asset,
      lag_concentration
    ),
    all_vars(. > 0)
  )

# Sample of the tangibility and TFP models: asset tangibility kept between 0
# and its 99th percentile, SO2 between 1 and its 99th percentile, rows with a
# TFP estimate, and year 1998 excluded.
mechanism_trimmed <- mechanism_base %>%
  filter_at(
    vars(asset_tangibility_tot_asset),
    all_vars(between(., 0, quantile(., 0.99, na.rm = TRUE)))
  ) %>%
  filter_at(
    vars(so2),
    all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
  ) %>%
  filter(!is.na(tfp_op)) %>%
  filter(!year %in% list("1998"))

# Sample of the R&D model: years 2005-2007 only, without the percentile trims
# and without the TFP filter.
mechanism_rd <- mechanism_base %>%
  filter(year %in% list("2005", "2006", "2007"))

# All models: firm, city and 2-digit industry fixed effects, standard errors
# clustered by 2-digit industry.
t_0 <- felm(log(asset_tangibility_tot_asset) ~
  log(cashflow_to_tangible) +
  log(liabilities_tot_asset) +
  log(total_asset) +
  log(employment) +
  log(age) +
  age_sqr +
  SOE |
  firm + geocode4_corr + indu_2 | 0 | indu_2,
  mechanism_trimmed,
  exactDOF = TRUE)
t_1 <- felm(rd_tot_asset_trick ~
  log(cashflow_to_tangible) +
  log(liabilities_tot_asset) +
  log(total_asset) +
  log(employment) +
  log(age) +
  age_sqr +
  SOE |
  firm + geocode4_corr + indu_2 | 0 | indu_2,
  mechanism_rd,
  exactDOF = TRUE)
t_2 <- felm(log(tfp_op) ~
  log(cashflow_to_tangible) +
  log(liabilities_tot_asset) +
  log(total_asset) +
  log(employment) +
  log(age) +
  age_sqr +
  SOE |
  firm + geocode4_corr + indu_2 | 0 | indu_2,
  mechanism_trimmed,
  exactDOF = TRUE)

dep <- "Dependent variable: "
# NOTE(review): the models absorb firm, city and industry effects; no year
# effect is estimated, yet a "year" row is reported -- confirm. The title
# below is also the one used for the emission tables.
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes")
             )

table_1 <- go_latex(list(
    t_0,t_1, t_2
),
    title="Determinant of pollution emissions",
    dep_var = dep,
    addFE=fe1,
    save=TRUE,
    note = FALSE,
    name=path
)

In [None]:
# Footnote printed under the table. The pieces are joined by implicit string
# concatenation, so each one carries its own trailing space (they used to run
# together as "errorsclustered" and "inparentheses.\sym"). The LaTeX parts are
# raw strings so that "\s" and "\%" are not treated as (invalid) escapes.
# NOTE(review): the regressions cluster on indu_2; confirm that "product
# level" is the wording wanted here.
tbe1 = (
    "This table estimates eq(3). "
    "Heteroskedasticity-robust standard errors "
    "clustered at the product level appear in parentheses. "
    r"\sym{*} Significance at the 10\%, "
    r"\sym{**} Significance at the 5\%, "
    r"\sym{***} Significance at the 1\%."
)

# Prepared header row; currently not passed to beautify (new_row is disabled,
# as are reorder_var, multi_lines_dep and multicolumn).
new_r = ['& SO2', 'COD', "Waste water"]
lb.beautify(table_number = table_nb,
            table_nte = tbe1,
            jupyter_preview = True,
            resolution = 150,
            folder = folder)

In [None]:
# Prepare a clean output location for the next regression table.
folder = 'Tables_0'
table_nb = 1
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')
# Create the folder if needed (no separate existence check required).
os.makedirs(folder, exist_ok=True)
# Remove table outputs left by a previous run so stale files are not reused.
for stale_name in os.listdir(folder):
    if stale_name.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, stale_name))

In [None]:
%get path table
# Mechanism regressions with the cash-flow x `financial_dep_china1` interaction.
# felm formula layout: outcome ~ covariates | absorbed fixed effects | IV (0 = none) | cluster.
# The estimation samples are built once and reused by the models below.

# Derived regressors plus the "strictly positive" screen shared by all models.
mechanism_sample_positive <- df_final1 %>%
    mutate(
        age_sqr = log(age) ** 2,
        dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
    ) %>%
    filter_at(
        vars(
            so2,
            asset_tangibility_tot_asset,
            sales,
            total_asset,
            lag_liabilities_tot_asset,
            lag_sales_tot_asset,
            cashflow_tot_asset,
            lag_concentration
        ),
        all_vars(. > 0)
    )

# Sample for the tangibility and TFP models: trim asset tangibility and SO2 at
# their 99th percentile, require a TFP value and drop 1998.
mechanism_sample_trimmed <- mechanism_sample_positive %>%
    filter_at(
        vars(asset_tangibility_tot_asset),
        all_vars(between(., 0, quantile(., 0.99, na.rm = TRUE)))
    ) %>%
    filter_at(
        vars(so2),
        all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
    ) %>%
    filter(!is.na(tfp_op)) %>%
    filter(!year %in% list("1998"))

# Sample for the R&D model: only the years 2005-2007, no percentile trimming.
mechanism_sample_rd <- mechanism_sample_positive %>%
    filter(year %in% list("2005", "2006", "2007"))

# Outcome: log asset tangibility.
t_0 <- felm(
    log(asset_tangibility_tot_asset) ~
        log(cashflow_to_tangible) * financial_dep_china1 +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age) +
        age_sqr +
        SOE
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    mechanism_sample_trimmed,
    exactDOF = TRUE
)

# Outcome: rd_tot_asset_trick (in levels).
t_1 <- felm(
    rd_tot_asset_trick ~
        log(cashflow_to_tangible) * financial_dep_china1 +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age) +
        age_sqr +
        SOE
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    mechanism_sample_rd,
    exactDOF = TRUE
)

# Outcome: log TFP.
t_2 <- felm(
    log(tfp_op) ~
        log(cashflow_to_tangible) * financial_dep_china1 +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age) +
        age_sqr +
        SOE
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    mechanism_sample_trimmed,
    exactDOF = TRUE
)

dep <- "Dependent variable: "
# Fixed-effect rows appended to the table, one "Yes" per model column.
# NOTE(review): the formulas absorb firm, geocode4_corr and indu_2; the "year"
# row has no matching term in the formulas - confirm the labels.
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes")
)

# Write the table to `path` (passed in from the Python kernel).
table_1 <- go_latex(
    list(t_0, t_1, t_2),
    title = "Determinant of pollution emissions",
    dep_var = dep,
    addFE = fe1,
    save = TRUE,
    note = FALSE,
    name = path
)

In [None]:
# First ten rows of the `sales` summary computed on a 0%-99% percentile grid.
sales_percentile_grid = np.arange(0, 1, .01)
df_final1['sales'].describe(percentiles=sales_percentile_grid).head(10)

In [None]:
# Last ten rows of the `employment` summary computed on a 0%-99% percentile grid.
employment_percentile_grid = np.arange(0, 1, .01)
df_final1['employment'].describe(percentiles=employment_percentile_grid).tail(10)

In [None]:
# First ten rows of the `cashflow_to_tangible` summary on a 0%-99% percentile grid.
cashflow_percentile_grid = np.arange(0, 1, .01)
df_final1['cashflow_to_tangible'].describe(percentiles=cashflow_percentile_grid).head(10)

In [None]:
# First ten rows of the `employment` summary computed on a 0%-99% percentile grid.
employment_low_grid = np.arange(0, 1, .01)
df_final1['employment'].describe(percentiles=employment_low_grid).head(10)

In [None]:
# Exploratory check: re-estimate the TFP model after trimming selected variables
# to the open interval between their 1st and 99th percentiles.
# Columns whose percentiles drive the trimming (commented names are excluded).
df_filtered_var <- df_final1%>% ungroup()%>% 
select(c(
    #so2,
    cashflow_to_tangible,
    #liabilities_tot_asset,
    total_asset,
    #employment,
    #age,
    tfp_op
)
      )
# Calculate the 1st and 99th percentiles for each variable
# (result: one column per variable, row 1 = lower bound, row 2 = upper bound).
bounds <- sapply(df_filtered_var, function(x) {
  q <- quantile(x, c(0.01, 0.99), na.rm = TRUE)
  # A negative 1st percentile is replaced by 0 as the lower bound.
  if (q[1] < 0) {
    q[1] <- 0
  }
  return(q)
})

# Create a filtering function
# NOTE(review): `filter_outliers` is defined but not called anywhere in this cell.
filter_outliers <- function(x, variable_name) {
  lb <- bounds[1, variable_name]
  ub <- bounds[2, variable_name]
  x > lb & x < ub
}
selected_cols <- names(df_filtered_var)
# Build one textual condition per column: "(col > lower) & (col < upper)".
# NOTE(review): the numeric bounds are converted to text here, so the parsed
# filter may differ from the exact quantiles in the last digits - confirm this
# is acceptable.
filter_conditions <- sapply(selected_cols, function(col) {
  lb <- bounds[1, col]
  ub <- bounds[2, col]
  paste0("(", col, " > ", lb, ") & (", col, " < ", ub, ")")
})

# Join all per-column conditions into a single expression string.
filter_expression <- paste(filter_conditions, collapse = " & ")
# Parse the string back into an expression, filter df_final1 with it, then
# estimate log(TFP) with firm, geocode4_corr and indu_2 fixed effects, clustered on indu_2.
# Unlike the table cells, this call does not pass exactDOF.
summary(felm(log(tfp_op) ~
            log(cashflow_to_tangible+1) * financial_dep_china1+ 
            log(liabilities_tot_asset) + 
            log(total_asset) + 
            log(employment)+
            log(age) + 
            age_sqr+
             SOE 
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1 %>% filter(!!rlang::parse_expr(filter_expression)) %>%
            mutate(
             age_sqr = log(age) ** 2,
             dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
         )%>% filter(!is.na(tfp_op))%>%
                 filter(!year %in% list("1998"))
       )
       )

In [None]:
# Footnote printed under the LaTeX table.
# Each literal that is followed by more text ends with a space, because
# adjacent literals are concatenated with nothing in between. Raw strings keep
# the LaTeX backslashes (\sym, \%) literal.
# NOTE(review): the felm models in this notebook cluster on `indu_2`; confirm
# that "product level" is the intended wording.
tbe1 = (
    "This table estimates eq(3). "
    "Heteroskedasticity-robust standard errors "
    "clustered at the product level appear in parentheses. "
    r"\sym{*} Significance at the 10\%, \sym{**} Significance at the 5\%, "
    r"\sym{***} Significance at the 1\%."
)

#multicolumn ={
#    'Eligible': 2,
#    'Non-Eligible': 1,
#    'All': 1,
#    'All benchmark': 1,
#}

#multi_lines_dep = '(city/product/trade regime/year)'
# Column labels; only applied when `new_row` is enabled in the call below.
new_r = ['& SO2', 'COD', "Waste water"]
# Post-process the table file numbered `table_nb` stored in `folder`.
lb.beautify(table_number = table_nb,
            #reorder_var = reorder,
            #multi_lines_dep = multi_lines_dep,
            #new_row= new_r,
            #multicolumn = multicolumn,
            table_nte = tbe1,
            jupyter_preview = True,
            resolution = 150,
            folder = folder)

In [None]:
# Prepare a clean output location for the next regression table.
folder = 'Tables_0'
table_nb = 1
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')
# Create the folder if needed (no separate existence check required).
os.makedirs(folder, exist_ok=True)
# Remove table outputs left by a previous run so stale files are not reused.
for stale_name in os.listdir(folder):
    if stale_name.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, stale_name))

In [None]:
%get path table
# Mechanism regressions with the cash-flow x `SOE` interaction.
# felm formula layout: outcome ~ covariates | absorbed fixed effects | IV (0 = none) | cluster.
# The estimation samples are built once and reused by the models below.

# Derived regressors plus the "strictly positive" screen shared by all models.
mechanism_sample_positive <- df_final1 %>%
    mutate(
        age_sqr = log(age) ** 2,
        dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
    ) %>%
    filter_at(
        vars(
            so2,
            asset_tangibility_tot_asset,
            sales,
            total_asset,
            lag_liabilities_tot_asset,
            lag_sales_tot_asset,
            cashflow_tot_asset,
            lag_concentration
        ),
        all_vars(. > 0)
    )

# Sample for the tangibility and TFP models: trim asset tangibility and SO2 at
# their 99th percentile, require a TFP value and drop 1998.
mechanism_sample_trimmed <- mechanism_sample_positive %>%
    filter_at(
        vars(asset_tangibility_tot_asset),
        all_vars(between(., 0, quantile(., 0.99, na.rm = TRUE)))
    ) %>%
    filter_at(
        vars(so2),
        all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
    ) %>%
    filter(!is.na(tfp_op)) %>%
    filter(!year %in% list("1998"))

# Sample for the R&D model: only the years 2005-2007, no percentile trimming.
mechanism_sample_rd <- mechanism_sample_positive %>%
    filter(year %in% list("2005", "2006", "2007"))

# Outcome: log asset tangibility.
t_0 <- felm(
    log(asset_tangibility_tot_asset) ~
        log(cashflow_to_tangible) * SOE +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age) +
        age_sqr
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    mechanism_sample_trimmed,
    exactDOF = TRUE
)

# Outcome: rd_tot_asset_trick (in levels).
t_1 <- felm(
    rd_tot_asset_trick ~
        log(cashflow_to_tangible) * SOE +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age) +
        age_sqr
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    mechanism_sample_rd,
    exactDOF = TRUE
)

# Outcome: log TFP.
t_2 <- felm(
    log(tfp_op) ~
        log(cashflow_to_tangible) * SOE +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age) +
        age_sqr
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    mechanism_sample_trimmed,
    exactDOF = TRUE
)

dep <- "Dependent variable: "
# Fixed-effect rows appended to the table, one "Yes" per model column.
# NOTE(review): the formulas absorb firm, geocode4_corr and indu_2; the "year"
# row has no matching term in the formulas - confirm the labels.
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes")
)

# Write the table to `path` (passed in from the Python kernel).
table_1 <- go_latex(
    list(t_0, t_1, t_2),
    title = "Determinant of pollution emissions",
    dep_var = dep,
    addFE = fe1,
    save = TRUE,
    note = FALSE,
    name = path
)

### Mechanism

1. **Asset Tangibility**:

   * **Log(cashflow)**: Firms with higher cashflow tend to decrease their investments in tangible assets. This resonates with the understanding that when firms possess ample cash flow, they exhibit greater discretion in investment choices and are not necessitated to invest heavily in tangible assets for the sole purpose of collateralization to secure bank loans. This dynamic is particularly accentuated in environments like China where tangible assets are frequently leveraged as collateral for loan procurements.

2. **R&D (Research and Development)**:

   * **Log(cashflow)**: An upsurge in cashflow is associated with increased investments in R&D. This underscores the perspective that banks often exhibit reservations toward financing innovative endeavors due to the intangible nature of intellectual property, which cannot be readily collateralized like tangible assets. Hence, firms flush with cash are inclined to self-finance their innovative ventures, leading to elevated R&D investments.

3. **TFP (Total Factor Productivity)**:

   * **Log(cashflow)**: Enhanced cashflow correlates with a surge in a firm's total factor productivity. This can be attributed to the possibility that firms with substantial liquidity can channel investments into state-of-the-art technologies or processes, and effectively allocate resources towards innovative avenues that promise growth.

From the outlined mechanisms:

- Financial constraints, as proxied by cash flow, wield significant influence over a firm's investment strategies. With augmented cash flow, firms, especially private entities that might be marginalized by predominant banks, exercise autonomy in investment decisions. Rather than allocating resources to tangible assets, predominantly viewed as a vehicle to procure loans, there's a palpable shift towards innovation (R&D) — an avenue that holds the promise of sustained growth and heightened efficiency.

- This trend is emblematic of a broader economic shift where firms suffering from financial constraints prioritize long-term growth, innovation, and efficiency over transient needs or the acquisition of external finance. Within a framework where traditional bank loans could be elusive for a subset of firms, especially against a landscape of credit concentration and preferential lending patterns favoring SOEs, internal cash reserves assume paramount importance in guiding investment choices.

In [None]:
# Footnote printed under the LaTeX table.
# Each literal that is followed by more text ends with a space, because
# adjacent literals are concatenated with nothing in between. Raw strings keep
# the LaTeX backslashes (\sym, \%, \&) literal.
# NOTE(review): the felm models in this notebook cluster on `indu_2`; confirm
# that "product level" is the intended wording.
tbe1 = (
    "This table estimates eq(3). "
    "Heteroskedasticity-robust standard errors "
    "clustered at the product level appear in parentheses. "
    r"\sym{*} Significance at the 10\%, \sym{**} Significance at the 5\%, "
    r"\sym{***} Significance at the 1\%."
)

# NOTE(review): not passed to beautify below (the `multicolumn` argument is
# commented out) and the labels refer to pollutants, not to this table's outcomes.
multicolumn ={
    'SO2': 2,
    'COD': 2,
    'Waste water': 2
}

#multi_lines_dep = '(city/product/trade regime/year)'
# Column headers, one per model of the mechanism table.
new_r = ['& Asset tangibility', r'R\&D', "TFP"]
# Post-process the table file numbered `table_nb` stored in `folder`.
lb.beautify(table_number = table_nb,
            #reorder_var = reorder,
            #multi_lines_dep = multi_lines_dep,
            new_row= new_r,
            #multicolumn = multicolumn,
            table_nte = tbe1,
            jupyter_preview = True,
            resolution = 150,
            folder = folder)

In [None]:
# Show a single row to inspect the available columns.
# Column names noted while inspecting:
#   fe_indu2_year
#   fe_city_year
df_final1.head(n=1)

In [None]:
# Preview the first five rows of the estimation data.
df_final1.head(n=5)

Removal capacity

In [None]:
# Show a single row to look up the removal-capacity column names.
df_final1.head(n=1)

In [None]:
# Pollution-related columns summarised in the next cell.
# Each name appears once: a repeated name would select the same column twice
# when used as `df_final1[list_vars]`.
list_vars = [
    'rlmxf',
    'ylmxf',
    'rlmpjlf',
    'rlyxf',
    'zyxf',
    'cyxf',
    'rlypjlf',
    'zypjlf',
    'clean_gas_used',
    'waste_water',
    'cod',
    'ad',
    'waste_gas',
    'so2',
    'nox',
    'smoke_dust',
    'soot',
    'yfc',
    'gyfscll',
    'hxxyqcl',
    'xzssqcl',
    'adqcl',
    'eyhlqcl',
    'dyhwqcl',
    'ycqcl',
    'gyfcqcl',
    'dwastewater_equip',
    'fszlssnl',
    'fszlssfee',
    'dwastegas_equip',
    'dso2_equip',
    'fqzlssnl',
    'tlssnl',
    'hxxycsl',
    'adcsl',
    'eyhlcsl',
    'dyhwcsl',
    'yfccsl',
]

In [None]:
# Summary statistics for the columns named in `list_vars`.
pollution_columns = df_final1[list_vars]
pollution_columns.describe()

In [None]:
# Prepare a clean output location for the next regression table.
folder = 'Tables_0'
table_nb = 1
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')
# Create the folder if needed (no separate existence check required).
os.makedirs(folder, exist_ok=True)
# Remove table outputs left by a previous run so stale files are not reused.
for stale_name in os.listdir(folder):
    if stale_name.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, stale_name))

In [None]:
%get path table
# Abatement regressions: four outcomes (dso2_equip, eyhlqcl, eyhlqcl/so2, tlssnl),
# each as log(1 + outcome), on log cash flow plus firm controls.
# felm formula layout: outcome ~ covariates | absorbed fixed effects | IV (0 = none) | cluster.
# The estimation sample is built once and reused by the models below.

# Shared sample: strictly positive core variables, asset tangibility and SO2
# trimmed at their 99th percentile, TFP available, 1998 dropped.
abatement_sample <- df_final1 %>%
    mutate(age_sqr = age ** 2) %>%
    filter_at(
        vars(
            so2,
            asset_tangibility_tot_asset,
            sales,
            total_asset,
            lag_liabilities_tot_asset,
            lag_sales_tot_asset,
            cashflow_tot_asset,
            lag_concentration
        ),
        all_vars(. > 0)
    ) %>%
    filter_at(
        vars(asset_tangibility_tot_asset),
        all_vars(between(., 0, quantile(., 0.99, na.rm = TRUE)))
    ) %>%
    filter_at(
        vars(so2),
        all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
    ) %>%
    filter(!is.na(tfp_op)) %>%
    filter(!year %in% list("1998"))

# Same sample with the removal ratio added; rows without a ratio are dropped.
abatement_sample_ratio <- abatement_sample %>%
    mutate(ratio = eyhlqcl / so2) %>%
    filter(!is.na(ratio))

# Outcome: dso2_equip.
t_0 <- felm(
    log(dso2_equip + 1) ~
        log(cashflow_to_tangible) +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age_sqr) +
        SOE
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    abatement_sample,
    exactDOF = TRUE
)

# Outcome: eyhlqcl.
t_1 <- felm(
    log(eyhlqcl + 1) ~
        log(cashflow_to_tangible) +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age_sqr) +
        SOE
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    abatement_sample,
    exactDOF = TRUE
)

# Outcome: ratio = eyhlqcl / so2.
t_2 <- felm(
    log(ratio + 1) ~
        log(cashflow_to_tangible) +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age_sqr) +
        SOE
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    abatement_sample_ratio,
    exactDOF = TRUE
)

# Outcome: tlssnl.
t_3 <- felm(
    log(tlssnl + 1) ~
        log(cashflow_to_tangible) +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age_sqr) +
        SOE
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    abatement_sample,
    exactDOF = TRUE
)

dep <- "Dependent variable: "
# Fixed-effect rows appended to the table, one "Yes" per model column.
# NOTE(review): the formulas absorb firm, geocode4_corr and indu_2; the "year"
# row has no matching term in the formulas - confirm the labels.
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes", "Yes")
)

# Write the table to `path` (passed in from the Python kernel).
table_1 <- go_latex(
    list(t_0, t_1, t_2, t_3),
    title = "Determinant of pollution emissions",
    dep_var = dep,
    addFE = fe1,
    save = TRUE,
    note = FALSE,
    name = path
)
# Column order: dso2_equip eyhlqcl  ratio tlssnl

### Investment in Pollution Abatement:

1. **Abatement capacity**:
   
   * **Log(cashflow)**: Firms with a higher cashflow are found to increase their abatement capacity, implying that an increase in liquidity allows firms to invest in technologies and systems that enhance their capacity to reduce pollution. This suggests a proactive response by liquidity-rich firms in environmental stewardship.
   
2. **SO2 removed**:

   * **Log(cashflow)**: Higher cashflows lead to a significant increase in the amount of SO2 removed. Firms that possess ample liquidity are, evidently, deploying it to improve their environmental performance by adopting more efficient pollution abatement technologies.

3. **SO2 removed over yield**:

   * **Log(cashflow)**: Firms with increased cashflow are more effective at removing SO2 for each unit of their output. This emphasizes the efficiency gains that liquidity can bring about in pollution control, further supporting the premise that cash-rich firms are more environmentally efficient per unit of their production.

4. **SO2 removal per hour**:

   * **Log(cashflow)**: An increase in cashflow correlates with more efficient hourly SO2 removal rates. This implies that liquidity not only aids in sheer volume but also in the efficiency of the pollution control mechanisms.

Apart from the effects of cashflow:

- **Log(liabilities to asset)**: Higher liabilities relative to assets lead to an increase in abatement capacity, SO2 removed over yield, and SO2 removal per hour. This may be indicative of firms with higher leverage taking more proactive environmental actions, possibly to project a more responsible image to stakeholders or to meet certain regulatory compliances associated with their liabilities.

- **Log(total asset)**: Larger firms, in terms of total assets, have markedly higher abatement capacities, remove more SO2, have improved SO2 removal efficiency over yield, and remove SO2 more efficiently per hour. This highlights that bigger firms, potentially due to economies of scale and greater resources, are better equipped to invest in and manage pollution control measures.

- **Log(employment)**: Larger firms in terms of employment, conversely, are found to have a reduced abatement capacity and lower SO2 removal efficiency per hour. It might suggest that firms with more employees face challenges in managing and deploying pollution control measures as effectively as asset-rich firms.

- **Log(age_sqr)**: Older firms tend to have a greater abatement capacity, remove more SO2, and display improved efficiency in terms of SO2 removed over yield and per hour. Long-standing firms might possess more established processes and infrastructures to deal with pollution control.

- **SOE (State-Owned Enterprises)**: Compared to private entities, state-owned enterprises showcase a decreased abatement capacity and are less efficient in terms of SO2 removal per hour. This could be reflective of differing priorities or operational efficiencies between state-run and private enterprises.

In summary, the results shed light on the pivotal role that liquidity, in the form of cashflow, plays in a firm's environmental endeavors, specifically in the realm of SO2 abatement. It further underscores the nuanced ways in which firm size, leverage, age, and ownership influence environmental responsiveness in the context of SO2 pollution control.

In [None]:
# Footnote printed under the LaTeX table.
# Each literal that is followed by more text ends with a space, because
# adjacent literals are concatenated with nothing in between. Raw strings keep
# the LaTeX backslashes (\sym, \%) literal.
# NOTE(review): the felm models in this notebook cluster on `indu_2`; confirm
# that "product level" is the intended wording.
tbe1 = (
    "This table estimates eq(3). "
    "Heteroskedasticity-robust standard errors "
    "clustered at the product level appear in parentheses. "
    r"\sym{*} Significance at the 10\%, \sym{**} Significance at the 5\%, "
    r"\sym{***} Significance at the 1\%."
)

# NOTE(review): not passed to beautify below (the `multicolumn` argument is
# commented out).
multicolumn ={
    'SO2': 2,
    'COD': 2,
    'Waste water': 2
}

#multi_lines_dep = '(city/product/trade regime/year)'
# Column headers, one per model of the abatement table.
new_r = ['& Abatement capacity', 'SO2 removed', "SO2 removed over yield", 'SO2 removal per hour']
# Post-process the table file numbered `table_nb` stored in `folder`.
lb.beautify(table_number = table_nb,
            #reorder_var = reorder,
            #multi_lines_dep = multi_lines_dep,
            new_row= new_r,
            #multicolumn = multicolumn,
            table_nte = tbe1,
            jupyter_preview = True,
            resolution = 150,
            folder = folder)

In [None]:
# Prepare a clean output location for the next regression table.
folder = 'Tables_0'
table_nb = 1
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')
# Create the folder if needed (no separate existence check required).
os.makedirs(folder, exist_ok=True)
# Remove table outputs left by a previous run so stale files are not reused.
for stale_name in os.listdir(folder):
    if stale_name.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, stale_name))

In [None]:
%get path table
# Abatement regressions with the cash-flow x `constraint` interaction: four
# outcomes (dso2_equip, eyhlqcl, eyhlqcl/so2, tlssnl), each as log(1 + outcome).
# felm formula layout: outcome ~ covariates | absorbed fixed effects | IV (0 = none) | cluster.
# The estimation sample is built once and reused by the models below.

# Shared sample: strictly positive core variables, asset tangibility and SO2
# trimmed at their 99th percentile, TFP available, 1998 dropped.
abatement_sample <- df_final1 %>%
    mutate(age_sqr = age ** 2) %>%
    filter_at(
        vars(
            so2,
            asset_tangibility_tot_asset,
            sales,
            total_asset,
            lag_liabilities_tot_asset,
            lag_sales_tot_asset,
            cashflow_tot_asset,
            lag_concentration
        ),
        all_vars(. > 0)
    ) %>%
    filter_at(
        vars(asset_tangibility_tot_asset),
        all_vars(between(., 0, quantile(., 0.99, na.rm = TRUE)))
    ) %>%
    filter_at(
        vars(so2),
        all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
    ) %>%
    filter(!is.na(tfp_op)) %>%
    filter(!year %in% list("1998"))

# Same sample with the removal ratio added; rows without a ratio are dropped.
abatement_sample_ratio <- abatement_sample %>%
    mutate(ratio = eyhlqcl / so2) %>%
    filter(!is.na(ratio))

# Outcome: dso2_equip.
t_0 <- felm(
    log(dso2_equip + 1) ~
        log(cashflow_to_tangible) * constraint +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age_sqr) +
        SOE
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    abatement_sample,
    exactDOF = TRUE
)

# Outcome: eyhlqcl.
t_1 <- felm(
    log(eyhlqcl + 1) ~
        log(cashflow_to_tangible) * constraint +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age_sqr) +
        SOE
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    abatement_sample,
    exactDOF = TRUE
)

# Outcome: ratio = eyhlqcl / so2.
t_2 <- felm(
    log(ratio + 1) ~
        log(cashflow_to_tangible) * constraint +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age_sqr) +
        SOE
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    abatement_sample_ratio,
    exactDOF = TRUE
)

# Outcome: tlssnl.
t_3 <- felm(
    log(tlssnl + 1) ~
        log(cashflow_to_tangible) * constraint +
        log(liabilities_tot_asset) +
        log(total_asset) +
        log(employment) +
        log(age_sqr) +
        SOE
    | firm + geocode4_corr + indu_2 | 0 | indu_2,
    abatement_sample,
    exactDOF = TRUE
)

dep <- "Dependent variable: "
# Fixed-effect rows appended to the table, one "Yes" per model column.
# NOTE(review): the formulas absorb firm, geocode4_corr and indu_2; the "year"
# row has no matching term in the formulas - confirm the labels.
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes", "Yes")
)

# Write the table to `path` (passed in from the Python kernel).
table_1 <- go_latex(
    list(t_0, t_1, t_2, t_3),
    title = "Determinant of pollution emissions",
    dep_var = dep,
    addFE = fe1,
    save = TRUE,
    note = FALSE,
    name = path
)
# Column order: dso2_equip eyhlqcl  ratio tlssnl

In [None]:
# Footnote printed under the LaTeX table.
# Each literal that is followed by more text ends with a space, because
# adjacent literals are concatenated with nothing in between. Raw strings keep
# the LaTeX backslashes (\sym, \%) literal.
# NOTE(review): the felm models in this notebook cluster on `indu_2`; confirm
# that "product level" is the intended wording.
tbe1 = (
    "This table estimates eq(3). "
    "Heteroskedasticity-robust standard errors "
    "clustered at the product level appear in parentheses. "
    r"\sym{*} Significance at the 10\%, \sym{**} Significance at the 5\%, "
    r"\sym{***} Significance at the 1\%."
)

# NOTE(review): not passed to beautify below (the `multicolumn` argument is
# commented out).
multicolumn ={
    'SO2': 2,
    'COD': 2,
    'Waste water': 2
}

#multi_lines_dep = '(city/product/trade regime/year)'
# Column headers, one per model of the abatement table.
new_r = ['& Abatement capacity', 'SO2 removed', "SO2 removed over yield", 'SO2 removal per hour']
# Post-process the table file numbered `table_nb` stored in `folder`.
lb.beautify(table_number = table_nb,
            #reorder_var = reorder,
            #multi_lines_dep = multi_lines_dep,
            new_row= new_r,
            #multicolumn = multicolumn,
            table_nte = tbe1,
            jupyter_preview = True,
            resolution = 150,
            folder = folder)

# Generate reports

In [None]:
# Setup for the report-generation cells: imports, notebook location and the
# project helper modules.
import os, time, shutil, urllib, ipykernel, json
from pathlib import Path
from notebook import notebookapp
import sys
# Current working directory. This rebinds `path`, which the table cells above
# use for the table output file.
path = os.getcwd()
# Directory three levels above the working directory.
parent_path = str(Path(path).parent.parent.parent)
# Make the modules stored in <parent_path>/utils importable; must run before
# the two imports below.
sys.path.append(os.path.join(parent_path, 'utils'))
import make_toc
import create_report

In [None]:
# Name of the project parameter file used by the TOC/report helpers.
name_json = 'parameters_ETL_pollution_credit_constraint.json'
# Resolve it inside <parent_path>/utils, the same `utils` directory that is
# added to sys.path and used to build `path_parameter` in the TOC cell; the
# previous value pointed one directory level lower.
path_json = os.path.join(parent_path, 'utils', name_json)

In [None]:
# Build the report through the project helper with HTML as the requested
# extension; keep_code=False and notebookname=None are passed through as-is.
create_report.create_report(
    extension="html",
    keep_code=False,
    notebookname=None,
)

In [None]:
### Update TOC in Github
# Regenerate README.md (table of contents) in each listed directory.
# The parameter file is the same for every directory, so resolve it once.
path_parameter = os.path.join(parent_path,'utils', name_json)
for p in [parent_path,
          str(Path(path).parent),
          #os.path.join(str(Path(path).parent), "00_download_data_from"),
          #os.path.join(str(Path(path).parent.parent), "02_data_analysis"),
          #os.path.join(str(Path(path).parent.parent), "02_data_analysis", "00_statistical_exploration"),
          #os.path.join(str(Path(path).parent.parent), "02_data_analysis", "01_model_estimation"),
         ]:
    md_out_fn = os.path.join(p,'README.md')
    # Start from a fresh README; a missing file is the only expected failure,
    # so anything else (e.g. permissions) is allowed to surface.
    try:
        os.remove(md_out_fn)
    except FileNotFoundError:
        pass
    md_lines =  make_toc.create_index(cwd = p, path_parameter = path_parameter)
    # Header shown in the README: directory name with underscores as spaces.
    header = os.path.basename(p).replace('_', ' ')

    if p == parent_path:
        # Only the project root README also receives the description.
        make_toc.replace_index(md_out_fn, md_lines, Header = header, add_description = True, path_parameter = path_parameter)
    else:
        make_toc.replace_index(md_out_fn, md_lines, Header = header, add_description = False)

In [None]:
# Shell escape: convert the notebook to HTML with the code inputs hidden (--no-input).
# NOTE(review): the notebook file name is hard-coded; confirm it matches this notebook.
!jupyter nbconvert --no-input --to html 09_firm_level_estimation_pollution_2.ipynb