# US Name
Model estimate: estimate internal finance and pollution emissions at the firm level

# Description
None
# Metadata
- Key: 488_Financial_dependency_pollution
- Epic: Models
- US: Evaluate econometrics model
- Task tag: #internal-finance, #training-Financial-dependency-pollution
- Analytics reports: 
# Input
## Table/file
**Name**
- asif_financial_ratio_baseline_firm
- china_firm_pollution_data
**Github**
- https://github.com/thomaspernet/Financial_dependency_pollution/blob/master/02_data_analysis/01_model_train_evaluate/00_estimate_fin_ratio/08_firm_level_estimation_pollution.md



# Server connection

In [1]:
from awsPy.aws_authorization import aws_connector
from awsPy.aws_s3 import service_s3
from awsPy.aws_glue import service_glue
from pathlib import Path
import pandas as pd
import numpy as np
#import seaborn as sns
import os, shutil, json
import sys

from sklearn import preprocessing

# Shared label encoder; construct_table() reuses it to turn the firm,
# industry-year and city-year identifiers into integer codes.
le = preprocessing.LabelEncoder()

# Project root: three directory levels above the current working directory.
path = os.getcwd()
parent_path = str(Path(path).parent.parent.parent)

# AWS settings shared by the connections created below.
name_credential = 'rootkey.csv'
region = 'eu-west-2'
bucket = 'datalake-london'
# Anchor the credential file on the project root. The previous template
# "creds/{1}" never used parent_path, so the file was looked up relative to
# whatever directory the notebook happened to be started from.
path_cred = "{0}/creds/{1}".format(parent_path, name_credential)

In [2]:
# Build the AWS client from the credential file, then open the two service
# wrappers used throughout the notebook: `s3` (runs the Athena queries below)
# and `glue` (reads table schemas).
con = aws_connector.aws_instantiate(credential = path_cred,
                                       region = region)
client= con.client_boto()
s3 = service_s3.connect_S3(client = client,
                      bucket = bucket, verbose = False)
glue = service_glue.connect_glue(client = client) 

In [3]:
# When enabled, lift the column-count and column-width limits pandas applies
# when displaying a DataFrame.
pandas_setting = True
if pandas_setting:
    for option_name in ('display.max_columns', 'display.max_colwidth'):
        pd.set_option(option_name, None)

In [4]:
# Sets KMP_DUPLICATE_LIB_OK for this process.
# NOTE(review): presumably a workaround for duplicate OpenMP runtime errors — confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'


# Load tables

Since we load the data as a Pandas DataFrame, we want to pass the `dtypes`. We load the schema from Glue to guess the types

- 1=state -> 110 141 143 151
- 2=collective -> 120 130 142 149
- 3=private -> 171 172 173 174 190
- 4=foreign -> 210 220 230 240
- 5=Hong Kong, Macau and Taiwan (4 and 5 can be combined into a single "foreign" category) -> 310 320 330 340

In [None]:
# Database and table whose Glue schema is read below to build `dtypes`;
# `db` is also the default database passed to most Athena queries.
db = 'environment'
table = 'firm_financial_ratio_from_pollution1'

In [None]:
# Map every column of `table` (as described in the Glue catalog) to the dtype
# used when the query result is read: the listed varchar widths become
# 'string', numeric Glue types become 'float', and any other Glue type name is
# passed through unchanged.
string_types = ('varchar(12)', 'varchar(3)', 'varchar(14)', 'varchar(11)')
float_types = ('decimal(21,5)', 'double', 'bigint', 'int', 'float')

dtypes = {}
schema = glue.get_table_information(database=db, table=table)[
    'Table']['StorageDescriptor']['Columns']
for column in schema:
    glue_type = column['Type']
    if glue_type in string_types:
        dtypes[column['Name']] = 'string'
    elif glue_type in float_types:
        dtypes[column['Name']] = 'float'
    else:
        dtypes[column['Name']] = glue_type

In [None]:
# Naming of the Athena output. `filename` is derived from the module-level
# `table` and is reused by every run_query call below.
download_data = True
filename = 'df_{}'.format(table)
# NOTE(review): full_path_filename, path_local and df_path are not read by any
# of the cells visible here — confirm whether later cells still use them.
full_path_filename = 'SQL_OUTPUT_ATHENA/CSV/{}.csv'.format(filename)
path_local = os.path.join(str(Path(path).parent.parent.parent), 
                              "00_data_catalog/temporary_local_data")
df_path = 'df_asif.csv'#os.path.join(path_local, filename + '.csv')
if download_data:
    
    # Re-open the S3 wrapper with the same client and bucket as above.
    s3 = service_s3.connect_S3(client = client,
                          bucket = bucket, verbose = False)

In [None]:
def construct_table(table="firm_financial_ratio_from_pollution1"):
    """Build the firm-level estimation table from Athena.

    The query joins the firm pollution data (with normalised city codes) to
    the financial-ratio table `table`, the TCZ/SPZ city policy table and the
    firm TFP table. The result is then enriched with label-encoded fixed
    effects, the industry credit-constraint measures and within-firm lags of
    the financial variables.

    Relies on the module-level names `s3`, `db`, `filename`, `dtypes` and `le`.

    Args:
        table: name of the financial-ratio table in the "environment"
            database; it is interpolated into the SQL text.

    Returns:
        A DataFrame sorted by firm and year.
    """
    query = f"""
WITH temp as (
  SELECT 
    "firm", 
    "year", 
    geocode4_corr, 
    province_en, 
    "cic_adj", 
    "cic03", 
    "ownership_new", 
    "total_industrialwater_used", 
    "total_freshwater_used", 
    "gyqs", 
    "total_repeatedwater_used", 
    "total_coal_used", 
    "rlmxf", 
    "ylmxf", 
    "rlmpjlf", 
    "rlyxf", 
    "zyxf", 
    "cyxf", 
    "rlypjlf", 
    "zypjlf", 
    "clean_gas_used", 
    "waste_water", 
    "cod", 
    "ad", 
    "waste_gas", 
    "so2", 
    "nox", 
    "smoke_dust", 
    "soot", 
    "yfc", 
    "gyfscll", 
    "hxxyqcl", 
    "xzssqcl", 
    "adqcl", 
    "eyhlqcl", 
    "dyhwqcl", 
    "ycqcl", 
    "gyfcqcl", 
    "dwastewater_equip", 
    "fszlssnl", 
    "fszlssfee", 
    "dwastegas_equip", 
    "dso2_equip", 
    "fqzlssnl", 
    "tlssnl", 
    "hxxycsl", 
    "adcsl", 
    "eyhlcsl", 
    "dyhwcsl", 
    "yfccsl", 
    "age", 
    "bdat", 
    "export",
    c125,
    c133,
    c98,
    CASE 
    WHEN c98 <> 0 THEN CAST(c125 AS DOUBLE) / c98
    ELSE NULL
END AS interest_expense,
CASE
    WHEN c98 <> 0 THEN CAST(c133 AS DOUBLE) / c98
    ELSE NULL
END AS interest_expense1

    
  FROM 
    "environment"."china_firm_pollution_data" 
    INNER JOIN (
      SELECT 
        extra_code, 
        geocode4_corr, 
        province_en 
      FROM 
        chinese_lookup.china_city_code_normalised 
      GROUP BY 
        extra_code, 
        province_en, 
        geocode4_corr
    ) as no_dup_citycode ON china_firm_pollution_data.citycode = no_dup_citycode.extra_code
) 
SELECT 
  temp.firm, 
  temp.year, 
  temp.geocode4_corr, 
  tcz, 
  spz, 
  temp.province_en, 
  temp.cic_adj, 
  temp.cic03, 
  indu_2,
  temp.ownership_new, 
  "output", 
  "outputdefl", 
  "sales", 
  "employment", 
  "capital", 
  "total_industrialwater_used", 
  "total_freshwater_used", 
  "gyqs", 
  "total_repeatedwater_used", 
  "total_coal_used", 
  "rlmxf", 
  "ylmxf", 
  "rlmpjlf", 
  "rlyxf", 
  "zyxf", 
  "cyxf", 
  "rlypjlf", 
  "zypjlf", 
  "clean_gas_used", 
  "waste_water", 
  "cod", 
  "ad", 
  "waste_gas", 
  "so2", 
  "nox", 
  "smoke_dust", 
  "soot", 
  "yfc", 
  "gyfscll", 
  "hxxyqcl", 
  "xzssqcl", 
  "adqcl", 
  "eyhlqcl", 
  "dyhwqcl", 
  "ycqcl", 
  "gyfcqcl", 
  "dwastewater_equip", 
  "fszlssnl", 
  "fszlssfee", 
  "dwastegas_equip", 
  "dso2_equip", 
  "fqzlssnl", 
  "tlssnl", 
  "hxxycsl", 
  "adcsl", 
  "eyhlcsl", 
  "dyhwcsl", 
  "yfccsl", 
  "age", 
  "bdat", 
  "export", 
  "tfp_op", 
  "tfp_lp",
  CASE WHEN rd_tot_asset IS NULL THEN -1000 WHEN rd_tot_asset < 0 THEN 0 ELSE rd_tot_asset END AS rd_tot_asset_trick, 
  CASE WHEN temp.ownership_new in (1.0) THEN 'SOE' ELSE 'NOT_SOE' END AS SOE, 
  CASE WHEN temp.ownership_new in (4.0, 5.0) THEN 'FOREIGN' ELSE 'NOT_FOREIGN' END AS FOREIGN, 
  "current_asset", 
  "tofixed", 
  "total_liabilities", 
  "tangible", 
  "net_non_current", 
  "cashflow", 
  "working_capital", 
  "current_ratio", 
  "quick_ratio", 
  "liabilities_tot_asset", 
  "sales_tot_asset", 
  "total_asset", 
  "investment_tot_asset", 
  "rd_tot_asset", 
  "asset_tangibility_tot_asset", 
  "cashflow_tot_asset", 
  "cashflow_to_tangible", 
  "return_to_sale", 
  "coverage_ratio", 
  "working_capital_ratio", 
  "roa", 
  "roe", 
  "ros", 
  "liquidity", 
  "current_asset_tot_asset", 
  "tangible_tot_asset", 
  "net_non_current_tot_asset", 
  "working_capital_tot_asset", 
  "current_ratio_tot_asset", 
  "quick_ratio_tot_asset", 
  concat(indu_2, '-', temp.year) as fe_indu2_year, 
  concat(
    temp.geocode4_corr, '-', temp.year
  ) as fe_city_year ,
  interest_expense,
  interest_expense1,
  c125,
  c133,
  c98
FROM 
  temp 
  INNER JOIN "environment"."{table}" ON temp.firm = {table}.firm 
  AND temp.year = {table}.year 
  AND temp.geocode4_corr = {table}.geocode4_corr 
  AND temp.province_en = {table}.province_en 
  AND temp.cic_adj = {table}.cic_adj 
  AND temp.cic03 = {table}.cic03 
  AND temp.ownership_new = {table}.ownership_new 
  LEFT JOIN policy.china_city_tcz_spz ON temp.geocode4_corr = china_city_tcz_spz.geocode4_corr 
  INNER JOIN (
    SELECT 
      "firm", 
      "year", 
      "citycode_asifad" as geocode4_corr, 
      "tfp_op", 
      "tfp_lp" 
    FROM 
      firms_survey.firm_tfp_china
  ) as tfp_table on temp.firm = tfp_table.firm 
  and temp.geocode4_corr = tfp_table.geocode4_corr 
  and temp.year = tfp_table.year 
ORDER BY 
  firm, 
  year
    """
    # Run the main query, then derive the columns that are easier to build in
    # pandas: spz as a string (missing values become "0") and integer codes
    # for the firm, industry-year and city-year identifiers.
    df = s3.run_query(
        query=query,
        database=db,
        s3_output="SQL_OUTPUT_ATHENA",
        filename=filename,  # Add filename to print dataframe
        destination_key="SQL_OUTPUT_ATHENA/CSV",  # Use it temporarily
        dtype=dtypes,
    ).assign(
        #tcz=lambda x: x["tcz"].fillna(0).astype("int").astype("str"),
        spz=lambda x: x["spz"].fillna(0).astype("int").astype("str"),
        fe_fo=lambda x: le.fit_transform(x["firm"].astype("str")),
        fe_indu2_year=lambda x: le.fit_transform(x["fe_indu2_year"].astype("str")),
        fe_city_year=lambda x: le.fit_transform(x["fe_city_year"].astype("str")),
    )
    # Industry-level credit constraint measures.
    query = """
SELECT * FROM "industry"."china_credit_constraint"
"""
    df_credit = s3.run_query(
        query=query,
        database=db,
        s3_output="SQL_OUTPUT_ATHENA",
        filename=filename,  # Add filename to print dataframe
        destination_key="SQL_OUTPUT_ATHENA/CSV",  # Use it temporarily
        dtype=dtypes,
    )
    # No `on=` is given, so the left merge uses every column name the two
    # frames share once `cic` has been renamed to `indu_2`.
    # financial_dep_china1 is the sign-flipped measure; the two dummies flag
    # industries below the -0.44 and -0.26 thresholds (a missing measure
    # compares as False).
    df = df.merge(df_credit.rename(columns={"cic": "indu_2"}), how="left").assign(
        financial_dep_china1 = lambda x: -x['financial_dep_china'],
        constraint=lambda x: x["financial_dep_china"] < -0.44,
        constraint_1=lambda x: x["financial_dep_china"] < -0.26,
    )
    # lag_<col> is the value on the previous row of the same firm. This
    # assumes the rows still follow the query's ORDER BY firm, year; the lag
    # is the previous observation, not necessarily the previous calendar year.
    df_final = df.assign(
        **{
            f"lag_{c}": df.groupby(["firm"])[c].transform("shift")
            for c in [
                "sales",
                "tfp_op",
                "tfp_lp",
                "cashflow", 
                  "working_capital", 
                  "current_ratio", 
                  "quick_ratio", 
                  "liabilities_tot_asset", 
                  "sales_tot_asset", 
                  "total_asset", 
                  "investment_tot_asset", 
                  "rd_tot_asset", 
                  "asset_tangibility_tot_asset", 
                  "cashflow_tot_asset", 
                  "cashflow_to_tangible", 
                  "return_to_sale", 
                  "coverage_ratio", 
                  "working_capital_ratio", 
                  "roa", 
                  "roe", 
                  "ros", 
                  "liquidity", 
                  "current_asset_tot_asset", 
                  "tangible_tot_asset", 
                  "net_non_current_tot_asset", 
                  "working_capital_tot_asset", 
                  "current_ratio_tot_asset", 
                  "quick_ratio_tot_asset"
            ]
        }
    ).sort_values(by = ['firm','year'])
    #.dropna(
     #   subset=[
     #       "lag_cashflow_to_tangible",
     #       "lag_sales_tot_asset",
     #       "lag_liabilities_tot_asset",
     #   ]
    #)
    return df_final


In [None]:
# City-level SO2 reduction mandates, keyed by the normalised city code.
# The mandate table is matched to the city-code lookup on the English city name.
query_mandate = """
SELECT geocode4_corr,"tso2_mandate_c", "so2_perc_reduction_c", "target_reduction_so2_p"
FROM "policy"."china_city_reduction_mandate"
INNER JOIN (
      SELECT 
        extra_code, 
        geocode4_corr, 
        cityen
 
      FROM 
        chinese_lookup.china_city_code_normalised 
      GROUP BY 
        extra_code, 
        cityen, 
        geocode4_corr
    ) as no_dup_citycode ON china_city_reduction_mandate.cityen = no_dup_citycode.cityen
"""
# The lookup keeps one row per (extra_code, cityen, geocode4_corr), so a city
# with several extra codes yields repeated rows; drop_duplicates removes them.
df_mandate = (
    s3.run_query(
        query=query_mandate,
        database=db,
        s3_output="SQL_OUTPUT_ATHENA",
        filename=filename,  # Add filename to print dataframe
        destination_key="SQL_OUTPUT_ATHENA/CSV",  # Use it temporarily
    )
    .drop_duplicates()
)

In [None]:
# Save the mandate table in the working directory, without the index column.
df_mandate.to_csv('mandate.csv',index = False)

In [None]:
# Innovation index by year, city and industry. For each group the query
# returns the mean (innovation_index), the maximum (innovation_index_1) and
# the approximate median (innovation_index_2).
query = """
WITH temp AS (
SELECT year, no_dup_citycode.geocode4_corr,indus_code as cic_adj, 
substr(indus_code, 1,2) as ind2,
innovation_index
FROM innovation_city_industry
INNER JOIN (
    SELECT 
      extra_code, 
      geocode4_corr 
    FROM 
      chinese_lookup.china_city_code_normalised 
    GROUP BY 
      extra_code, 
      geocode4_corr
  ) as no_dup_citycode ON innovation_city_industry.geocode4_corr = no_dup_citycode.extra_code
  )
  SELECT year, geocode4_corr, 
  cic_adj,
  ind2,
  AVG(innovation_index) as innovation_index,
  MAX(innovation_index) as innovation_index_1,
  approx_percentile(innovation_index, ARRAY[0.50])[1] as innovation_index_2
  FROM temp
  GROUP BY year, geocode4_corr,
  cic_adj,
  ind2
"""
# Unlike most queries in this notebook, this one runs against the "china" database.
df_innovation = (s3.run_query(
            query=query,
            database="china",
            s3_output='SQL_OUTPUT_ATHENA',
            filename=filename,  # Add filename to print dataframe
            destination_key='SQL_OUTPUT_ATHENA/CSV',  #Use it temporarily
            dtype = dtypes
        )
                )
df_innovation.head()

In [None]:
# Save the innovation table in the working directory, without the index column.
(
df_innovation.to_csv('df_innovation.csv', index = False)
)

In [None]:
df_innovation.describe()

In [None]:
# Province-year credit supply: long-term and short-term loans, each divided
# by provincial GDP.
query = """
SELECT 
      province_loan_and_credit.year, 
      province_loan_and_credit.province_en, 
      CAST(
        total_long_term_loan AS DECIMAL(16, 5)
      )/ CAST(
        total_gdp AS DECIMAL(16, 5)
      ) AS credit_supply_long_term, 
      CAST(
        total_short_term AS DECIMAL(16, 5)
      )/ CAST(
        total_gdp AS DECIMAL(16, 5)
      ) AS credit_supply_short_term 
    FROM 
      almanac_bank_china.province_loan_and_credit
"""
df_credit_supply = (s3.run_query(
            query=query,
            database=db,
            s3_output='SQL_OUTPUT_ATHENA',
            filename=filename,  # Add filename to print dataframe
            destination_key='SQL_OUTPUT_ATHENA/CSV',  #Use it temporarily
            dtype = dtypes
        )
                )
df_credit_supply.head()

In [None]:
import re
# Registration / branch suffixes recognised in a bank's full name. Compiled
# once so the pattern is not rebuilt for every row.
_REGISTRATION_TYPE_RE = re.compile(
    r'有限公司|信用合作联社|有限责任公司|旧市支行|族自治州分行|县支行|市支行|村支行|市分行|农村合作银行|自治区分行|支行|合作社联合社'
    r'|分行|资金互助社|信用社联合社|股份公司|住宅金融事业部|国家开发银行|总行营业部|合作金融结算服务中心|信用合作社|信托投资公司'
)


def get_company_registration_type(x):
    """Return the first registration/branch suffix found in the name `x`.

    Args:
        x: full name of the institution. Values that are not strings
            (e.g. NaN or None for a missing name) are treated as "no match"
            instead of raising TypeError.

    Returns:
        The leftmost matching suffix, or ``np.nan`` when nothing matches.
    """
    if not isinstance(x, str):
        return np.nan
    found = _REGISTRATION_TYPE_RE.search(x)
    # The pattern has no capture groups, so group() is the same text that
    # findall(...)[0] used to return.
    return found.group() if found else np.nan
# Rules are tried in order; each entry is
# (pattern that selects the rule, pattern whose match is returned, label).
# City commercial banks are deliberately not classified here.
_BANK_TYPE_RULES = (
    (
        r"中国工商银行|中国建设银行|中国银行|中国农业银行|交通银行|中国邮政储蓄银行",
        r"中国工商银行|中国建设银行|中国银行|中国农业银行|交通银行|中国邮政储蓄银行",
        "SOB",
    ),
    (
        r"中国农业发展银行|国家开发银行|中国进出口银行",
        r"中国农业发展银行|国家开发银行|中国进出口银行",
        "policy bank",
    ),
    (
        r"股份制商业银行",
        r"股份制商业银行|银行股份",
        "joint-stock commercial bank",
    ),
    (r"农村商业银行", r"农村商业银行", "rural commercial bank"),
    (r"外资银行", r"外资银行", "foreign bank"),
)


def get_type(x):
    """Classify a bank name into a (matched text, bank category) pair.

    `x` is converted with ``str`` before matching. The first rule whose
    selecting pattern is found decides the category; the returned text is the
    leftmost match of that rule's extraction pattern. Returns ``np.nan`` when
    no rule applies.
    """
    name = str(x)
    for selector, extractor, category in _BANK_TYPE_RULES:
        if re.search(selector, name):
            return (re.search(extractor, name).group(), category)
    return np.nan

In [None]:
# Raw bank-branch registry, enriched with parsed dates, the registration
# suffix, the bank name, a bank category and the components of `certcode`.
query = """
SELECT *
FROM china.branches_raw_csv
"""
df_bank = (
    s3.run_query(
        query=query,
        database=db,
        s3_output="SQL_OUTPUT_ATHENA",
        filename="bank",  # Add filename to print dataframe
        destination_key="SQL_OUTPUT_ATHENA/CSV",  # Use it temporarily
        dtype = {'id':'str','geocode4_corr':'str', 'lostReason':'str',
                       'location':'str', 'city_temp':'str', 'points':'str'}
    )
    .assign(
        # Dates go through nullable int -> str before parsing; values that
        # cannot be parsed become NaT.
        setdate=lambda x: pd.to_datetime(x["setdate"].astype("Int64").astype(str), errors ='coerce'),
        printdate=lambda x: pd.to_datetime(x["printdate"].astype("Int64").astype(str), errors ='coerce'),
        # Opening year as text; a missing date yields the string "<NA>".
        year_setdate=lambda x: x["setdate"].dt.year.astype("Int64").astype(str),
        bank_temp=lambda x: x.apply(
            lambda x: get_company_registration_type(x["fullname"]), axis=1
        ),
        geocode4_corr = lambda x: x['geocode4_corr'].astype("Int64").astype(str)
    )
     .assign(
         # Copy of bank_temp (NaN stays NaN).
         registration_type=lambda x: x.apply(
            lambda x: np.nan if pd.isna(x["bank_temp"]) else x["bank_temp"], axis=1
        ),
        # NOTE(review): registration_type is a string, so [0] is its first
        # character; the name is cut at the first occurrence of that single
        # character — confirm this is intended rather than splitting on the
        # whole suffix.
        bank_full_name=lambda x: x.apply(
            lambda x: x["fullname"]
            if pd.isna(x["bank_temp"])
            else x["fullname"].split(x["registration_type"][0])[0],
            axis=1,
        ),
        # get_type returns a (matched text, category) tuple or NaN; the two
        # columns below unpack it.
        list_bank_type=lambda x: x.apply(lambda x: get_type(x["bank_full_name"]), axis=1),
        bank_type=lambda x: x.apply(
            lambda x: np.nan if pd.isna(x["list_bank_type"]) else x["list_bank_type"][0], axis=1
        ),
        bank_type_adj=lambda x: x.apply(
            lambda x: np.nan if pd.isna(x["list_bank_type"]) else x["list_bank_type"][1], axis=1
        )
    )
    .assign(
        # Fixed-position slices of certcode: character 0, characters 0-6,
        # character 5, characters 7-10 and everything from character 11 on.
        bank_type_1 = lambda x: x['certcode'].str.slice(stop = 1),
        bank_code = lambda x: x['certcode'].str.slice(stop = 7),
        bank_type_details = lambda x: x['certcode'].str.slice(start = 5, stop = 6),
        citycode = lambda x: x['certcode'].str.slice(start = 7, stop = 11),
        unknown_code = lambda x: x['certcode'].str.slice(start = 11),
    )
    .drop(columns=["bank_temp"])
)

In [None]:
# Reference list of banks: one row per short bank name, with the numeric
# `bnature` code replaced by its Chinese label.
query = """
SELECT *
FROM china.china_bank_information
"""
df_bank_info = (
    s3.run_query(
        query=query,
        database=db,
        s3_output="SQL_OUTPUT_ATHENA",
        filename="bank",  # Add filename to print dataframe
        destination_key="SQL_OUTPUT_ATHENA/CSV",  # Use it temporarily
        dtype=dtypes,
    )
    .drop_duplicates(subset=["shortbnm"])
    .replace(
        {
            "bnature": {
                1: "政策性银行",
                2: "国有控股大型商业银行",
                3: "股份制商业银行",
                4: "城市商业银行",
                5: "农村商业银行",
                6: "外资银行",
                7: "其他",
                8: "农合行",
                9: "农信社",
                10: "三类新型农村金融机构",
            }
        }
    )
)
df_bank_info.shape

In [None]:
### CCB
# Branches treated as city commercial banks: no category from get_type,
# bank_type_details "S", bank_type_1 "B" or "L", and a full name that does
# not contain "村镇".
temp_city_branch = (
    df_bank.loc[lambda x: x["bank_type_adj"].isin([np.nan])]
    .loc[lambda x: x["bank_type_details"].isin(["S"])]
    .loc[lambda x: x["bank_type_1"].isin(["B", "L"])]
    .loc[lambda x: ~x["fullname"].str.contains("村镇")]
    .assign(bank_type="城市商业银行", status="CCB")
    .reindex(
        columns=[
            "id",
            "certcode",
            "bank_type_adj",
            "bank_type_1",
            "bank_type",
            "bank_type_details",
            "unknown_code",
            "fullname",
            "registration_type",
            "bank_full_name",
            "citycode",
            'geocode4_corr',
            "bank_code",
            "year_setdate",
            "status",
        ]
    )
    .rename(columns={
        #"citycode": "geocode4_corr",
        "year_setdate": "year"})
)

In [None]:
#### no city bank
# Branches labelled "no CCB": (1) every branch that get_type classified, and
# (2) unclassified branches whose full name contains "村镇", relabelled as
# rural commercial banks.
df_bank_concat = (
    pd.concat(
    [
        (
            df_bank.loc[lambda x: ~x["bank_type_adj"].isin([np.nan])]
            .loc[lambda x: x["bank_type_details"].isin(["S"])]
            # NOTE(review): this column list has no "geocode4_corr", so the
            # column is NaN for these rows after the concat — confirm that
            # the later fallback to `citycode` is intended.
            .reindex(
                columns=[
                    "id",
                    "certcode",
                    "bank_type_adj",
                    "bank_type_1",
                    "bank_type",
                    "bank_type_details",
                    "unknown_code",
                    "fullname",
                    "registration_type",
                    "bank_full_name",
                    "citycode",
                    "bank_code",
                    "year_setdate",
                ]
            )
            .assign(status="no CCB",)
        ),
        ### rural
        (
            df_bank.loc[lambda x: x["bank_type_adj"].isin([np.nan])]
            .loc[lambda x: x["bank_type_details"].isin(["S"])]
            .loc[lambda x: x['fullname'].str.contains('村镇')]
            .assign(bank_type="农村商业银行", status="no CCB")
            .reindex(
                columns=[
                    "id",
                    "certcode",
                    "bank_type_adj",
                    "bank_type_1",
                    "bank_type",
                    "bank_type_details",
                    "unknown_code",
                    "fullname",
                    "registration_type",
                    "bank_full_name",
                    "citycode",
                    "geocode4_corr",
                    "bank_code",
                    "year_setdate",
                    "status",
                ]
            )
        )
    ]
)
    .rename(
    columns = {
        #'citycode':'geocode4_corr',
        'year_setdate':'year'
    })
)

In [None]:
from polyfuzz.models import RapidFuzz
from polyfuzz import PolyFuzz
# Matcher shared by the two fuzzy name matches below (n_jobs=1).
rapidfuzz_matcher = RapidFuzz(n_jobs=1)

In [None]:
# Fuzzy-match the non-CCB bank names against the reference banks that are not
# city commercial banks.
no_ccb_names = df_bank_concat['bank_full_name'].dropna().drop_duplicates().to_list()
# Names of a single character are dropped; the "（中国）" marker is removed.
banks_no_ccb = [
    name.replace('（中国）', '').strip()
    for name in no_ccb_names
    if len(name) > 1
]
is_city_bank = df_bank_info['bnature'].isin(['城市商业银行'])
bank_info = df_bank_info.loc[~is_city_bank]['shortbnm'].to_list()
no_ccb = PolyFuzz(rapidfuzz_matcher).match(banks_no_ccb, bank_info)

In [None]:
# Fuzzy-match the city commercial bank names against the reference banks that
# are city commercial banks. Note that `bank_info` is reassigned here.
ccb_names = temp_city_branch['bank_full_name'].dropna().drop_duplicates().to_list()
# Names of a single character are dropped; the "（中国）" marker is removed.
banks = [
    name.replace('（中国）', '').strip()
    for name in ccb_names
    if len(name) > 1
]
is_city_bank = df_bank_info['bnature'].isin(['城市商业银行'])
bank_info = df_bank_info.loc[is_city_bank]['shortbnm'].to_list()
model = PolyFuzz(rapidfuzz_matcher).match(banks, bank_info)

In [None]:
# City-code lookup: maps each `extra_code` to the normalised `geocode4_corr`
# and its province.
query = """
SELECT 
        extra_code, 
        geocode4_corr, 
        province_en 
      FROM 
        chinese_lookup.china_city_code_normalised
"""
df_citycode = (s3.run_query(
            query=query,
            database="china",
            s3_output='SQL_OUTPUT_ATHENA',
            filename=filename,  # Add filename to print dataframe
            destination_key='SQL_OUTPUT_ATHENA/CSV',  #Use it temporarily
            dtype = dtypes
        )
                )
df_citycode.head()

In [None]:
# Branch-level table with a matched reference bank name, a CCB / no CCB
# status and a normalised city code.
df_bank_status = (
    pd.concat(
        [
            (
                # Keep only branches whose name matched a reference bank with
                # similarity above 0.80 (inner merge on the shared columns).
                df_bank_concat.merge(
                    no_ccb.get_matches()
                    .sort_values(by=["Similarity"])
                    .loc[lambda x: x["Similarity"] > 0.80]
                    .rename(columns={"From": "bank_full_name"})
                )
            ),
            (
                temp_city_branch.merge(
                    model.get_matches()
                    .sort_values(by=["Similarity"])
                    .loc[lambda x: x["Similarity"] > 0.80]
                    .rename(columns={"From": "bank_full_name"})
                )
            ),
        ]
    )
    .rename(columns={"To": "bank_name"})
    .assign(
        # Fall back to the city code taken from certcode when geocode4_corr
        # is missing or is the string "<NA>".
        geocode4_corr=lambda x: np.where(
            x["geocode4_corr"].isin(["<NA>", np.nan]), x["citycode"], x["geocode4_corr"]
        )
    )
    .assign(
        # Fill the English category for the rows relabelled as rural banks.
        bank_type_adj=lambda x: np.where(
            np.logical_and(
                x["bank_type_adj"].isin([np.nan]), x["bank_type"] == "农村商业银行"
            ),
            "rural commercial bank",
            x["bank_type_adj"],
        )
    )
    .assign(
        # Same for the rows labelled as city commercial banks.
        bank_type_adj=lambda x: np.where(
            np.logical_and(
                x["bank_type_adj"].isin([np.nan]), x["bank_type"] == "城市商业银行"
            ),
            "city commercial bank",
            x["bank_type_adj"],
        )
    )
    # Inner merge with the lookup: rows whose code is not an `extra_code` are
    # dropped, and geocode4_corr is replaced by the normalised code.
    .merge(
        df_citycode.drop_duplicates()
        .assign(extra_code=lambda x: x["extra_code"].astype(str))
        .rename(columns={"geocode4_corr": "geocode4_corr_adj"}),
        right_on=["extra_code"],
        left_on=["geocode4_corr"],
    )
    .drop(columns = ['geocode4_corr','extra_code'])
    .rename(columns={"geocode4_corr_adj": "geocode4_corr"})
)
df_bank_status.head()

In [None]:
import janitor

In [None]:
# City-year counts of CCB and non-CCB branches ("count") and of banks with at
# least one branch ("active"), plus the CCB share of each.
df_ccb = (
    #### Number of CCB per city
    # Branches opened per (status, bank, city) and year; absent years count 0.
    df_bank_status.pivot_table(
        values="id",
        index=["status", "bank_name", "geocode4_corr"],
        columns="year",
        aggfunc="nunique",
        fill_value=0,
    )
    .stack()
    .reset_index()
    .rename(columns={0: "count"})
    .assign(
        # Running total of branches over the years; a bank is "active" once
        # the running total is positive.
        count=lambda x: x.groupby(["status", "bank_name", "geocode4_corr"])[
            "count"
        ].transform("cumsum"),
        active=lambda x: np.where(x["count"] > 0, 1, 0),
    )
    .groupby(["year", "geocode4_corr", "status"])
    .agg({"count": "sum", "active": "sum"})
    .unstack(-1)
    # NOTE(review): collapse_levels is not a pandas method; presumably it is
    # provided by the `janitor` import above.
    .collapse_levels(sep="_")
    .rename(columns={"count_no CCB": "count_no_CCB", "active_no CCB": "active_no_CCB",})
    .fillna(0)
    .assign(
        share_count_ccb=lambda x: (x["count_CCB"] / (x["count_CCB"] + x["count_no_CCB"])),
        share_active_ccb=lambda x: (x["active_CCB"] / (x["active_CCB"] + x["active_no_CCB"])),
    )
    # A 0/0 share (no branch of either kind yet) is set to 0.
    .fillna(0)
)
df_ccb.shape

In [None]:
# City-year concentration measures built from squared bank shares.
# Note that hhi_branches / hhi_branches_name are stored as one minus the sum
# of squared shares.
df_hhi = (
            #### HHI
            df_bank_status.pivot_table(
                values="id",
                index=["bank_name", "geocode4_corr"],
                columns="year",
                aggfunc="nunique",
                fill_value=0,
            )
            .stack()
            .reset_index()
            .rename(columns={0: "count"})
            .assign(
                # Running total of branches per bank and city.
                count=lambda x: x.groupby(["bank_name", "geocode4_corr"])[
                    "count"
                ].transform("cumsum"),
                active=lambda x: np.where(x["count"] > 0, 1, 0),
                total_city=lambda x: x.groupby(["year", "geocode4_corr"])[
                    "count"
                ].transform("sum"),
                total_city_active=lambda x: x.groupby(["year", "geocode4_corr"])[
                    "active"
                ].transform("sum"),
                # Squared share of each bank in the city total.
                score_count=lambda x: (x["count"] / x["total_city"]) ** 2,
                score_active=lambda x: (x["active"] / x["total_city_active"]) ** 2,
            )
            .groupby(["year", "geocode4_corr"])
            .agg({"score_count": "sum", "score_active": "sum"})
            .assign(
                hhi_branches=lambda x: 1 - x["score_count"],
                hhi_branches_name=lambda x: 1 - x["score_active"],
            )
        )
df_hhi.shape

In [None]:
# Names (as they appear in `bank_full_name`) that big_four keeps as-is.
_BIG_FOUR_NAMES = (
    "中国农业银行股份",
    "中国工商银行股份",
    "中国建设银行股份",
    "中国银行股份",
)


def big_four(x):
    """Return the bank name if `x` equals one of the four listed names, else 'other'."""
    for name in _BIG_FOUR_NAMES:
        if x == name:
            return name
    return 'other'

In [None]:
# City-year share of branches held by the four banks kept by big_four
# (concentration_sob), and concentration = log((1 - concentration_sob) + 1).
df_concentration = (
            df_bank_status.assign(
                sob=lambda x: x.apply(lambda x: big_four(x["bank_full_name"]), axis =1)
            )
    # Rows per year, city and bank group ("other" gathers every other bank).
    .groupby(["year", "geocode4_corr", "sob"])
            .agg({"id": "count"})
            .sort_values(by=["sob", "geocode4_corr", "year"])
            .reset_index()
            # Reshape so that every (bank group, city) has a row for every
            # year, with 0 where nothing was recorded.
            .pivot_table(
                values="id",
                index=["sob", "geocode4_corr"],
                columns="year",
                aggfunc=np.sum,
                fill_value=0,
            )
    .stack()
            .reset_index()
            .rename(columns={0: "count"})
    .assign(
                # temp: running total; temp_1: first year at which each
                # running-total value is reached; first_entry: earliest of
                # those years for the bank group in the city.
                temp=lambda x: x.groupby(["sob", "geocode4_corr"])[
                    "count"
                ].transform("cumsum"),
                temp_1=lambda x: x.groupby(["sob", "geocode4_corr", "temp"])[
                    "year"
                ]
                .transform("min")
                .fillna("2222")
                .astype("int"),
                first_entry=lambda x: x.groupby(["sob", "geocode4_corr"])[
                    "temp_1"
                ].transform("min"),
                count=lambda x: x["count"].fillna(0),
            )
    .loc[lambda x: x["year"].astype("int") >= x["first_entry"]]
    .drop(columns=["temp", "temp_1", "first_entry"])
            .assign(
                # Running total per bank group, and the city-year total
                # across all bank groups (including "other").
                totalBranchBank=lambda x: x.groupby(["sob", "geocode4_corr"])[
                    "count"
                ].transform("cumsum"),
                totalBranchCity=lambda x: x.groupby(["geocode4_corr", "year"])[
                    "totalBranchBank"
                ].transform("sum"),
            )
    # "other" is only needed for the city total; drop it before summing the
    # four named banks.
    .loc[lambda x: x["sob"] != "other"]
    .set_index(["sob", "geocode4_corr", "year", "totalBranchCity"])
            .drop(columns=["count"])
            .unstack(0)
            .assign(total_sob=lambda x: x.sum(axis=1))
            .reset_index(["totalBranchCity"])
            .assign(
                concentration_sob=lambda x: x[("total_sob", "")]
                / x[("totalBranchCity", "")]
            )
            .reindex(columns=[("total_sob", ""), ("concentration_sob", "")])
            .droplevel(axis=1, level=1)
            .reset_index()
            .dropna(subset=["concentration_sob"])
            .assign(concentration=lambda x: np.log((1 - x["concentration_sob"]) +1)
                   )
)
df_concentration.shape

In [None]:
# Combine the three city-year tables: df_ccb and df_hhi side by side on their
# shared index, then df_concentration left-joined on (year, geocode4_corr).
df_deregulation = (
    pd.concat([df_ccb,df_hhi], axis =1)
    .merge(df_concentration.set_index(['year','geocode4_corr']),
           how = 'left', left_index = True, right_index = True)
)

#df_deregulation =  pd.concat([df_ccb,df_hhi,df_concentration], axis =1)
# Add lag_<col> for every column: the value on the previous row of the same
# city (geocode4_corr is an index level here).
df_deregulation = (
    df_deregulation.assign(
        **{
            "lag_{}".format(i): df_deregulation.groupby(["geocode4_corr"])[i].transform(
                "shift"
            )
            for i in df_deregulation.columns
        }
    )
)
df_deregulation.shape

In [None]:
# Keep the deregulation indicators and their lags, expose year and
# geocode4_corr as ordinary columns, and drop the years up to 1997.
# NOTE(review): "count_False"/"count_True" (and their lags) are not produced
# by the cells that build df_ccb, df_hhi or df_concentration, so reindex
# creates them as all-NaN columns — confirm they are still needed.
deregulation_columns = [
    "count_CCB",
    "lag_count_CCB",
    "count_no_CCB",
    "lag_count_no_CCB",
    "active_CCB",
    "lag_active_CCB",
    "active_no_CCB",
    "lag_active_no_CCB",
    "share_count_ccb",
    "lag_share_count_ccb",
    "share_active_ccb",
    "lag_share_active_ccb",
    #"score_count",
    #"lag_score_count",
    #"score_active",
    #"lag_score_active",
    "hhi_branches",
    "lag_hhi_branches",
    "hhi_branches_name",
    "lag_hhi_branches_name",
    "count_False",
    "lag_count_False",
    "count_True",
    "lag_count_True",
    "concentration",
    "lag_concentration",
]
df_deregulation = (
    df_deregulation.reindex(columns=deregulation_columns)
    .reset_index()
    # Compare the years as numbers. The previous string comparison
    # (> "1997") also kept non-numeric labels such as "<NA>" (produced when
    # a branch has no opening date), because "<" sorts after the digits;
    # such rows later break the integer cast of `year`.
    .loc[lambda x: pd.to_numeric(x["year"], errors="coerce") > 1997]
)
# Normalise the column names: no surrounding blanks, "_" instead of spaces
# and hyphens, lower case.
df_deregulation.columns = (
    df_deregulation.columns.str.strip()
    .str.replace(" ", "_")
    .str.replace("-", "_")
    .str.lower()
)
df_deregulation.tail()

In [None]:
# Save the deregulation table in the working directory, without the index column.
df_deregulation.to_csv('df_deregulation.csv', index = False)

In [None]:
def is_sequence_continuous(years):
    """Tell whether the distinct values of `years` form a run without gaps.

    The run is gap-free when the number of distinct values equals the span
    between the smallest and the largest value, both included.
    """
    distinct = sorted(years.unique())
    first, last = distinct[0], distinct[-1]
    span = last - first + 1
    return len(distinct) == span
# Years each firm is checked against: 1999 to 2007 inclusive.
expected_years = list(range(1999, 2008))


def has_missing_years(x):
    """Return how many of the `expected_years` do not appear in the series `x`."""
    observed = set(x.unique())
    return sum(1 for year in expected_years if year not in observed)

In [None]:
# `to_download` controls the optional export of all three tables further
# down; the first table is always built.
to_download = False
df_final1 = (
    construct_table(table="firm_financial_ratio_from_pollution1")
)

In [None]:
# Number of distinct SOE / FOREIGN labels each firm takes over its years;
# a value above 1 means the firm changed status at some point.
firm_groups = df_final1.groupby(["firm"])
df_final1 = df_final1.assign(
    change_status_soe=firm_groups["SOE"].transform("nunique"),
    change_status_foreign=firm_groups["FOREIGN"].transform("nunique"),
)

In [None]:
# NOTE(review): `~` is applied after `.astype(int)`, so it is a bitwise NOT on
# integers: firms with a gap-free year sequence get -2 and firms with gaps get
# -1 (not 0/1). The filter on `flag_discontinuous == -1` further down relies
# on this encoding.
df_final1['flag_discontinuous'] = ~df_final1.groupby('firm')['year'].transform(is_sequence_continuous).astype(int)
# Number of years in `expected_years` the firm never reports.
df_final1['missing_years'] = df_final1.groupby('firm')['year'].transform(has_missing_years)
# Add a column to flag firms with missing years
df_final1['flag'] = np.where(df_final1['missing_years'] > 0, 1, 0)
df_final1 = (
    df_final1
    # City-industry innovation indices; unmatched rows get 0 below.
    .merge(df_innovation, how = 'left', on =['year', 'geocode4_corr', 'cic_adj'])
    .assign(
        innovation_index =lambda x: x['innovation_index'].fillna(0),
        innovation_index_1=lambda x: x['innovation_index_1'].fillna(0),
        innovation_index_2=lambda x: x['innovation_index_2'].fillna(0),
        # Emission intensities: each pollutant divided by output, sales,
        # capital, employment and total assets.
        so2_output = lambda x: x['so2']/x['output'],
        so2_sales = lambda x: x['so2']/x['sales'],
        so2_capital = lambda x: x['so2']/x['capital'],
        so2_emp = lambda x: x['so2']/x['employment'],
        so2_asset = lambda x: x['so2']/x['total_asset'],
        
        cod_output = lambda x: x['cod']/x['output'],
        cod_sales = lambda x: x['cod']/x['sales'],
        cod_capital = lambda x: x['cod']/x['capital'],
        cod_emp = lambda x: x['cod']/x['employment'],
        cod_asset = lambda x: x['cod']/x['total_asset'],
        
        waste_water_output = lambda x: x['waste_water']/x['output'],
        waste_water_sales = lambda x: x['waste_water']/x['sales'],
        waste_water_capital = lambda x: x['waste_water']/x['capital'],
        waste_water_emp = lambda x: x['waste_water']/x['employment'],
        waste_water_asset = lambda x: x['waste_water']/x['total_asset'],
    )
    # No `on=` is given: the merge uses every column name the two frames
    # share. `year` is cast to int first and the cast fails on non-numeric labels.
    .merge(df_deregulation.assign(year = lambda x: x['year'].astype(int)), how = 'left')
     .merge(
        (
            # Province credit supply with its lag (previous row of the same
            # province after sorting by province and year).
            df_credit_supply.sort_values(by=["province_en", "year"])
            .assign(
                lag_credit_supply_long_term=lambda x: x.groupby(["province_en"])[
                    "credit_supply_long_term"
                ].transform("shift")
            )
            .reindex(columns=["year", "province_en", "lag_credit_supply_long_term",'credit_supply_long_term'])
        ),
         how = 'left'
    )
)

In [None]:
#df_final1.to_csv('df_asif1.csv')

In [None]:
# (rows, columns) of df_final1.
df_final1.shape

In [None]:

# When to_download is set, dump the three firm-level tables to local CSV files.
if to_download:
    df_final1.to_csv('df_asif1.csv')
    # Tables 2 and 3 are built (in that order) before either is written.
    df_final2, df_final3 = (
        construct_table(table=f"firm_financial_ratio_from_pollution{suffix}")
        for suffix in (2, 3)
    )

    df_final2.to_csv('df_asif2.csv')
    df_final3.to_csv('df_asif3.csv')

In [None]:
# Frequency of each distinct value of change_status_soe.
df_final1['change_status_soe'].value_counts()

In [None]:
# Frequency of each distinct value of change_status_foreign.
df_final1['change_status_foreign'].value_counts()

In [None]:
# Frequency of each distinct value of flag_discontinuous.
df_final1['flag_discontinuous'].value_counts()

In [None]:
# Frequency of each distinct value of missing_years.
df_final1['missing_years'].value_counts()

In [None]:
# Rows whose flag_discontinuous equals -1.
df_final1.loc[lambda x: x['flag_discontinuous'] == -1]

## Schema Latex table

To rename a variable, please use the following template:

```
{
    'old':'XX',
    'new':'XX_1'
    }
```

If you need to pass a LaTeX command containing `\`, you need to double the backslash; for instance, `\text` becomes `\\text`:

```
{
    'old':'working\_capital\_i',
    'new':'\\text{working capital}_i'
    }
```

Then add it to the key `to_rename`

In [None]:
# (Re)create schema_table.json, which lists the variable-name substitutions
# for the LaTeX tables under the key `to_rename`.
add_to_dic = True
if add_to_dic:
    data = {'to_rename': [], 'to_remove': []}
    # (old, new) pairs, kept in their original order. Raw strings hold the
    # LaTeX backslashes literally: r'\_' is backslash + underscore and
    # r'\text' is backslash + 'text'. The previous '\_' form relied on an
    # invalid escape sequence, which Python warns about.
    rename_pairs = [
        (r'current\_ratio', r'\text{current ratio}'),
        (r'cashflow\_to\_tangible', r'\text{cashflow}'),
        (r'coverage\_ratio', r'\text{coverage ratio}'),
        (r'asset\_tangibility\_tot\_asset', r'\text{asset tangibility}'),
        (r'financial\_dep\_china1', r'\text{credit constraint}'),
        (r'financial\_dep\_china', r'\text{credit constraint}'),
        (r'lag\_concentration', r'\text{bank regulation}'),
        (r'tcz', r'\text{Two Control Zone}'),
        (r'spz', r'\text{Special Policy Zone}'),
        (r'tfp\_op', r'\text{tfp}'),
        (r'tofixed', r'\text{total asset}'),
        (r'lag\_sales\_tot\_asset', r'\text{sales to asset}'),
        (r'total\_asset', r'\text{total asset}'),
        (r'employment', r'\text{employment}'),
        (r'age', r'\text{age}'),
        (r'age\_sqr', r'\text{age sqr}'),
        (r'dummy\_dso2\_equip', r'\text{$SO_{2}$ removing capacity}'),
        (r'SOESOE', r'\text{soe}'),
        (r'liabilities\_tot\_asset', r'\text{liabilities to asset}'),
        (r'concentratedTRUE', r'\text{bank regulation}'),
        (r'concentrated', r'\text{Bank regulation}'),
        (r'innovativeTRUE', r'\text{inn. capacity}'),
        (r'innovative', r'\text{inn. capacity}'),
    ]
    dic_rename = [{'old': old, 'new': new} for old, new in rename_pairs]

    data['to_rename'].extend(dic_rename)
    # Mode 'w' truncates an existing file, so no prior os.remove is needed.
    with open('schema_table.json', 'w') as outfile:
        json.dump(data, outfile)

In [None]:
sys.path.append(os.path.join(parent_path, 'utils'))
import latex.latex_beautify as lb
#%load_ext autoreload
#%autoreload 2

In [None]:
# R session setup. warn = -1 silences all R warnings from here on.
options(warn=-1)
library(tidyverse)
library(lfe)
#library(lazyeval)
library('progress')
# Source the table helper script (presumably where go_latex, used by the
# table cells below, is defined -- confirm).
path = "../../../utils/latex/table_golatex.R"
source(path)

In [5]:
%get df_path
# Load the CSV export into R, turn character columns and every column whose
# name starts with "fe" into factors, then add a per-firm row count and the
# negated quick ratio.
# NOTE(review): `ungroup` is commented out, so df_final1 stays grouped by firm;
# grouped verbs applied later (e.g. quantile() inside filter_at) would then be
# evaluated within each firm -- confirm this is intended.
df_final1 <- read_csv('df_asif1.csv') %>%
mutate_if(is.character, as.factor) %>%
    mutate_at(vars(starts_with("fe")), as.factor) %>%
  group_by(firm) %>%
  mutate(count = n(), quick_ratio1 = -quick_ratio) #%>% ungroup

Variable df_path does not exist


ERROR: Error in read_csv("df_asif1.csv") %>% mutate_if(is.character, as.factor) %>% : could not find function "%>%"


# Main results


In [8]:
# Reload df_final1 from the local CSV file.
df_final1 = pd.read_csv('df_asif1.csv')

In [None]:
# Preview the first rows.
df_final1.head()

In [12]:
# Number of distinct cic_adj values among rows with positive SO2, excluding
# the year 1998.
# `year` is compared numerically: after pd.read_csv the column is numeric, so
# the former `.isin(['1998'])` (a string) never matched and 1998 was kept.
(
    df_final1
    .loc[lambda x: x['so2'] > 0]
    .loc[lambda x: pd.to_numeric(x['year'], errors='coerce') != 1998]
    ['cic_adj']
    .nunique()
)

419

In [13]:
# Number of distinct geocode4_corr values among rows with positive SO2,
# excluding the year 1998.
# `year` is compared numerically: after pd.read_csv the column is numeric, so
# the former `.isin(['1998'])` (a string) never matched and 1998 was kept.
(
    df_final1
    .loc[lambda x: x['so2'] > 0]
    .loc[lambda x: pd.to_numeric(x['year'], errors='coerce') != 1998]
    ['geocode4_corr']
    .nunique()
)

286

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
sns.set_style("white")
# FIGURE 3: scatter plus linear fit of log(so2_output) against
# log(asset_tangibility_tot_asset).
# Sample: so2 > 0, log asset tangibility above -4, year 1998 excluded.
# The year is compared numerically because the CSV reload yields a numeric
# column; the former `.isin(['1998'])` string test never excluded anything.
os.makedirs("Figures", exist_ok=True)  # savefig fails if the folder is absent
chart = sns.lmplot(
    x="log_asset_tangibility_tot_asset",
    y="so2_output",
    data=(
        df_final1
        .assign(
            log_asset_tangibility_tot_asset=lambda x: np.log(
                x['asset_tangibility_tot_asset']
            ),
            so2_output=lambda x: np.log(x['so2_output']),
        )
        .loc[lambda x: x['so2'] > 0]
        .loc[lambda x: x['log_asset_tangibility_tot_asset'] > -4]
        .loc[lambda x: pd.to_numeric(x['year'], errors='coerce') != 1998]
    ),
)
plt.xlabel("log Asset tangibility")
plt.ylabel('log SO2 emissions')
plt.savefig("Figures/FIGURE_3.png",
            bbox_inches='tight',
            dpi=600)

In [None]:
# FIGURE 4: three side-by-side regression plots of log(so2_output) against the
# log of each internal-finance ratio.
fig_dims = (15, 10)
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, sharey=False, figsize=fig_dims)

# (ratio column, x-axis label, target axis) for each panel.
panels = [
    ("cashflow_to_tangible", "log Cashflow", ax1),
    ("current_ratio", "log Current ratio", ax2),
    ("coverage_ratio", "log coverage ratio", ax3),
]
for ratio, xlabel, ax in panels:
    log_ratio = "log_{}".format(ratio)
    sns.regplot(
        x=log_ratio,
        y="so2_output",
        data=(
            df_final1
            # Keep rows with positive SO2 and a positive ratio, and drop 1998.
            # The year is compared numerically: the CSV reload gives a numeric
            # column, so the former `.isin(['1998'])` string test was a no-op.
            .loc[lambda x: x['so2'] > 0]
            .loc[lambda x: x[ratio] > 0]
            .loc[lambda x: pd.to_numeric(x['year'], errors='coerce') != 1998]
            # Logs are taken after filtering; the plotted rows are the same.
            .assign(
                **{
                    log_ratio: lambda x: np.log(x[ratio]),
                    'so2_output': lambda x: np.log(x['so2_output']),
                }
            )
        ),
        ax=ax,
    )
    ax.set_xlabel(xlabel)
    ax.set_ylabel("log SO2 emissions")

os.makedirs("Figures", exist_ok=True)  # savefig fails if the folder is absent
plt.savefig("Figures/FIGURE_4.png",
            bbox_inches='tight',
            dpi=600)

In [None]:
# Output location for table 1; `path` and `table` are read by the R cell
# through `%get path table`.
folder = 'Tables_0'
table_nb = 1
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')
# Create the folder if needed (no exists/mkdir race).
os.makedirs(folder, exist_ok=True)
# Remove every .txt/.pdf already in the folder. This also deletes the files
# of other tables stored in the same folder.
for stale in os.listdir(folder):
    if stale.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, stale))

ERROR: Error in df_final1 %>% mutate(age_sqr = log(age)^2, dummy_dso2_equip = ifelse(dso2_equip > : could not find function "%>%"


In [None]:
%get path table

# Table 1, model t0: log(so2_output) on log asset tangibility and log cash
# flow, plus controls.
# felm formula parts: regressors | absorbed factors (firm, geocode4_corr,
# indu_2) | no instruments (0) | standard errors clustered on indu_2.
# Sample: cash flow, TFP and asset tangibility strictly positive; so2 between
# 1 and its 99th percentile; year 1998 dropped; financial_dep_china1 not NA.
# NOTE(review): if df_final1 is still grouped by firm (the loading cell leaves
# `ungroup` commented out), quantile() below is computed within each firm
# rather than over the whole sample -- confirm.
t0 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) + 
  log(cashflow_to_tangible) +
  log(tfp_op) +
  log(lag_sales_tot_asset) +  
  log(tofixed) +
  log(employment)+
  log(age) + 
  age_sqr+
             dummy_dso2_equip + 
  SOE  |
  firm + geocode4_corr + indu_2 | 0 | indu_2, df_final1 %>%
  mutate(
    age_sqr = log(age) ** 2,
      dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)

  ) %>%
        filter_at(
    vars(
        cashflow_to_tangible,
        tfp_op,
        asset_tangibility_tot_asset

    ),
    all_vars(. >0)
  ) %>% filter(!is.na(tfp_op))%>%
  filter_at(
    vars(
      so2
    ),
    all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
  )%>%filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1)),
  exactDOF = TRUE)

# Table 1, model t1: same specification as t0 with log(current_ratio) in place
# of log cash flow; the positivity filter is applied to current_ratio instead
# of cashflow_to_tangible.
t1 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) +           
  log(current_ratio) +
  log(tfp_op) +
  log(lag_sales_tot_asset) +   
  log(tofixed) +
  log(employment)+
  log(age) + 
  age_sqr+
             dummy_dso2_equip+
  SOE  |
  firm + geocode4_corr + indu_2 | 0 | indu_2, df_final1 %>%
  mutate(
    age_sqr = log(age) ** 2,
      dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)

  ) %>%
        filter_at(
    vars(
        current_ratio,
        tfp_op,
        asset_tangibility_tot_asset

    ),
    all_vars(. >0)
  ) %>% filter(!is.na(tfp_op))%>%
  filter_at(
    vars(
      so2
    ),
    all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
  )%>%filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1)),
  exactDOF = TRUE)

# Table 1, model t2: same specification as t0 with log(coverage_ratio) in
# place of log cash flow; the positivity filter is applied to coverage_ratio
# instead of cashflow_to_tangible.
t2 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) + 
  log(coverage_ratio) +
  log(tfp_op) +
  log(lag_sales_tot_asset) +  
  log(tofixed) +
  log(employment)+
  log(age) + 
  age_sqr+
  dummy_dso2_equip + 
  SOE  |
  firm + geocode4_corr + indu_2 | 0 | indu_2, df_final1 %>%
  mutate(
    age_sqr = log(age) ** 2,
      dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)

  ) %>%
        filter_at(
    vars(
        coverage_ratio,
        tfp_op,
        asset_tangibility_tot_asset

    ),
    all_vars(. >0)
  ) %>% filter(!is.na(tfp_op))%>%
  filter_at(
    vars(
      so2
    ),
    all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
  )%>%filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1)),
  exactDOF = TRUE)

# Table 1, model t3: all three internal-finance ratios (cash flow, current
# ratio, coverage ratio) entered together, with the same controls, absorbed
# factors and clustering as t0-t2.
# NOTE(review): unlike t0-t2, the positivity filter below does not include
# asset_tangibility_tot_asset although its log is a regressor -- confirm
# whether the omission is intended.
t3 <- felm(log(so2_output) ~
  log(asset_tangibility_tot_asset) + 
  log(cashflow_to_tangible) +
  log(current_ratio) +
  log(coverage_ratio) +
  log(tfp_op) +
  log(lag_sales_tot_asset) +  
  log(tofixed) +
  log(employment)+
  log(age) + 
  age_sqr+
  dummy_dso2_equip +
  SOE  |
  firm + geocode4_corr + indu_2 | 0 | indu_2, df_final1 %>%
  mutate(
    age_sqr = log(age) ** 2,
      dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)

  ) %>%
        filter_at(
    vars(
        cashflow_to_tangible,
        current_ratio,
        coverage_ratio,
        tfp_op

    ),
    all_vars(. >0)
  ) %>% filter(!is.na(tfp_op))%>%
  filter_at(
    vars(
      so2
    ),
    all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
  )%>%filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1)),
  exactDOF = TRUE)

# Hand the four models to go_latex with save=TRUE and name=path (the .txt
# path fetched from the Python kernel).
# fe1 holds the fixed-effect indicator rows: one label followed by one entry
# per model column.
# NOTE(review): the rows are labelled firm / year / city, but the models
# absorb firm, geocode4_corr and indu_2 and contain no year factor -- confirm
# whether the "year" row should describe indu_2 instead.
dep <- "Dependent variable: SO2 emission intensity"
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes", "Yes")
             )

table_1 <- go_latex(list(
    t0,t1, t2,t3
),
    title="Determinant of pollution emissions",
    dep_var = dep,
    addFE=fe1,
    save=TRUE,
    note = FALSE,
    name=path
) 

### Baseline

Notebook variable calculation: https://github.com/thomaspernet/Financial_dependency_pollution/blob/master/01_data_preprocessing/02_transform_tables/11_firm_pollution_financial_ratio_tfp.md

**Asset Tangibility**
The results indicate a significant positive correlation between asset tangibility and SO2 emissions. This finding aligns with the theoretical framework, suggesting that credit-constrained firms tend to invest more in tangible assets, which can be utilized as collateral. These assets are typically emission-intensive, such as heavy machinery or industrial plants, leading to higher levels of SO2 emissions. This has important policy implications, highlighting the need for regulators to reconsider the environmental costs of facilitating loans backed by tangible assets.

**Internal Finance (Cash Flow, Current Ratio, Coverage Ratio)**
Negative coefficients for variables related to internal financing capabilities—specifically cash flow, current ratio, and coverage ratio—suggest that firms with stronger internal financial health are less pollutive. Such firms are likely capable of investing in cleaner technologies or processes. This is noteworthy because these firms may be less reliant on tangible assets, reducing their environmental footprint.

**Total Factor Productivity (TFP)**
A negative coefficient for TFP indicates that firms with higher productivity levels emit less SO2. This may be due to more efficient utilization of resources per unit of output, possibly facilitated by cleaner technologies or processes. The findings related to TFP are particularly important, signaling that enhancements in productivity can be environmentally beneficial.

**Control Variables**
Although Sales to Asset, Total Asset, and Employment are used as control variables in the model, their negative correlations with SO2 emissions provide additional, albeit secondary, insights. Specifically, the findings suggest that more efficient firms in terms of resource utilization tend to have lower emissions, adding further nuance to the understanding of the interplay between corporate finance and environmental impact.

In summary, the empirical evidence supports the theoretical argument that credit constraints can have unintended environmental consequences, mediated through asset tangibility. This underscores the importance of understanding the environmental implications of corporate financial decisions, especially for credit-constrained firms. The findings provide a substantive contribution to the existing literature, bridging the gap between corporate finance and environmental economics.

In [None]:
# Footnote printed under table 1. Adjacent string literals are concatenated,
# so every fragment must carry its own trailing space and punctuation.
# Fixed: missing full stop after "SOE ownership", missing space before
# \sym{*}, and the spellings "Olley and Pakes" and "earnings".
tbe1 = (
    "This table estimates eq(3). "
    "\\textit{asset tangibility} denotes tangible assets over total assets. "
    "\\textit{cash flow} is defined as net income + depreciation over assets. "
    "\\textit{current ratio} is measured as current assets over current liabilities. "
    "\\textit{coverage ratio} is measured as the earnings before interest and taxes over interest expenses. "
    "\\textit{TFP} stands for Total Factor Productivity and is estimated using the Olley and Pakes algorithm. "
    "\\textit{$SO_{2}$ removing capacity} is the capacity to remove $SO_{2}$ emissions per hour divided by sales. "
    "All variables are in logs. Control variables are sales over asset, total asset, employment, age, age square, $SO_{2}$ removing capacity and SOE ownership. "
    "Heteroskedasticity-robust standard errors"
    " clustered at the product level appear in parentheses. "
    "\\sym{*} Significance at the 10\\%, \\sym{**} Significance at the 5\\%, \\sym{***} Significance at the 1\\%."
)
#multicolumn ={
#    'Eligible': 2,
#    'Non-Eligible': 1,
#    'All': 1,
#    'All benchmark': 1,
#}

#multi_lines_dep = '(city/product/trade regime/year)'
# Column labels; not passed to beautify while `new_row` stays commented out.
new_r = ['& SO2', 'COD', "Waste water"]
# Post-process table `table_nb` in `folder`, attaching the footnote.
lb.beautify(table_number = table_nb,
            #reorder_var = reorder,
            #multi_lines_dep = multi_lines_dep,
            #new_row= new_r,
            #multicolumn = multicolumn,
            table_nte = tbe1,
            jupyter_preview = True,
            resolution = 150,
            folder = folder)

## Heterogeneity effect

- heterogeneity by sector and by city

In [None]:
# Output location for table 2; `path` and `table` are read by the R cell
# through `%get path table`.
folder = 'Tables_0'
table_nb = 2
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')
# Create the folder if needed (no exists/mkdir race).
os.makedirs(folder, exist_ok=True)
# Remove every .txt/.pdf already in the folder. This also deletes the files
# of other tables stored in the same folder.
for stale in os.listdir(folder):
    if stale.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, stale))

In [None]:
# Build the city-level `innovative` dummy and write it to temp_innovative.csv.
# Steps: keep rows of `year`, average `variable` by geocode4_corr, then flag
# the cities whose average lies strictly above the `threshold` quantile.
year = 2001
variable = 'innovation_index'
threshold = .75
(
    df_innovation
    .loc[lambda x: x['year'].isin([year])]
    .groupby(['geocode4_corr'])
    # Aggregate the column named by `variable`. It was hard-coded to
    # 'innovation_index', so changing `variable` raised a KeyError below.
    .agg({variable: 'mean'})
    .reset_index()
    .assign(
        # Helper column holding the quantile cutoff (formerly named `median`
        # although it holds the `threshold` quantile); dropped by reindex.
        cutoff=lambda x: x[variable].quantile(threshold),
        innovative=lambda x: x[variable] > x['cutoff'],
    )
    .reindex(columns=['geocode4_corr', 'innovative'])
    .to_csv('temp_innovative.csv', index=False)
)

In [None]:
# Build the `concentrated` dummy and write it to temp_regulation.csv: among
# df_deregulation rows of `year`, flag those whose `variable` lies strictly
# above its `threshold` quantile.
year = '1998'
variable = 'hhi_branches_name'
threshold = .75
(
    df_deregulation[df_deregulation['year'].isin([year])]
    .pipe(
        lambda base: base.assign(
            concentrated=base[variable] > base[variable].quantile(threshold)
        )
    )
    .reindex(columns=['geocode4_corr', 'concentrated'])
    .to_csv('temp_regulation.csv', index=False)
)

In [None]:
%get path table
# Table 2, model t0: baseline cash-flow specification with log cash flow
# interacted with financial_dep_china1 (`a * b` expands to a + b + a:b).
# felm formula parts: regressors | absorbed factors (firm, geocode4_corr,
# indu_2) | no instruments (0) | standard errors clustered on indu_2.
# Sample: cash flow, TFP and asset tangibility strictly positive; so2 between
# 1 and its 99th percentile; year 1998 dropped; financial_dep_china1 not NA.
# NOTE(review): if df_final1 is still grouped by firm (the loading cell leaves
# `ungroup` commented out), quantile() below is computed within each firm
# rather than over the whole sample -- confirm.
t0 <- felm(
    log(so2_output) ~
      log(asset_tangibility_tot_asset) + 
      log(cashflow_to_tangible) * financial_dep_china1+
      log(tfp_op) +
      log(lag_sales_tot_asset) +
      log(tofixed) +
      log(employment) +
      log(age) +
      age_sqr +
      dummy_dso2_equip +
      SOE |
    firm + geocode4_corr + indu_2 | 0 | indu_2, 
    df_final1 %>%
      mutate(
        age_sqr = log(age) ** 2,
        dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
      ) %>%
      filter_at(
        vars(
          cashflow_to_tangible,
          tfp_op,
          asset_tangibility_tot_asset
        ),
        all_vars(. > 0)
      ) %>% 
      filter(!is.na(tfp_op)) %>%
      filter_at(
        vars(
          so2
        ),
        all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
      )%>%filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1)),
    exactDOF = TRUE
  )

# Table 2, model t1: log cash flow interacted with `concentrated`.
# `concentrated` is read from temp_regulation.csv and attached with a
# left_join that has no `by=`, so dplyr joins on the column names the two
# tables share; unmatched rows get concentrated = FALSE.
t1 <- felm(
    log(so2_output) ~
      log(asset_tangibility_tot_asset) + 
      log(cashflow_to_tangible) * concentrated+
      log(tfp_op) +
      log(lag_sales_tot_asset) +
      log(tofixed) +
      log(employment) +
      log(age) +
      age_sqr +
      dummy_dso2_equip +
      SOE |
    firm + geocode4_corr + indu_2 | 0 | indu_2, 
    df_final1 %>% left_join(read_csv('temp_regulation.csv') ) %>% mutate(concentrated = replace_na(concentrated, FALSE)) %>%
      mutate(
        age_sqr = log(age) ** 2,
        dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
      ) %>%
      filter_at(
        vars(
          asset_tangibility_tot_asset,
          cashflow_to_tangible,
          tfp_op
        ),
        all_vars(. > 0)
      ) %>% 
      filter(!is.na(tfp_op)) %>%
      filter_at(
        vars(
          so2
        ),
        all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
      )%>%filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1)),
    exactDOF = TRUE
  )

# Table 2, model t2: log cash flow interacted with `innovative`.
# `innovative` is read from temp_innovative.csv and attached with a left_join
# that has no `by=`, so dplyr joins on the column names the two tables share;
# unmatched rows get innovative = FALSE.
t2 <- felm(
    log(so2_output) ~
      log(asset_tangibility_tot_asset) + 
      log(cashflow_to_tangible) * innovative+ 
      log(tfp_op) +
      log(lag_sales_tot_asset) +
      log(tofixed) +
      log(employment) +
      log(age) +
      age_sqr +
      dummy_dso2_equip +
      SOE |
    firm + geocode4_corr + indu_2 | 0 | indu_2, 
    df_final1 %>% left_join(read_csv('temp_innovative.csv') ) %>% mutate(innovative = replace_na(innovative, FALSE)) %>%
      mutate(
        age_sqr = log(age) ** 2,
        dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
      ) %>%
      filter_at(
        vars(
          asset_tangibility_tot_asset,
          cashflow_to_tangible,
          tfp_op
        ),
        all_vars(. > 0)
      ) %>% 
      filter(!is.na(tfp_op)) %>%
      filter_at(
        vars(
          so2
        ),
        all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
      )%>%filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1)),
    exactDOF = TRUE
  )

# Table 2, model t3: log cash flow interacted with tcz.
# Missing tcz values are recoded to 0 before estimation.
t3 <- felm(
    log(so2_output) ~
      log(asset_tangibility_tot_asset) + 
      log(cashflow_to_tangible) * tcz+
      log(tfp_op) +
      log(lag_sales_tot_asset) +
      log(tofixed) +
      log(employment) +
      log(age) +
      age_sqr +
      dummy_dso2_equip +
      SOE |
    firm + geocode4_corr + indu_2 | 0 | indu_2, 
    df_final1 %>%
      mutate(
        age_sqr = log(age) ** 2,
        dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0),
        tcz = ifelse(is.na(tcz), 0,tcz)
      ) %>%
      filter_at(
        vars(
          cashflow_to_tangible,
          tfp_op,
          asset_tangibility_tot_asset
        ),
        all_vars(. > 0)
      ) %>% 
      filter(!is.na(tfp_op)) %>%
      filter_at(
        vars(
          so2
        ),
        all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
      )%>%filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1)),
    exactDOF = TRUE
  )

# Table 2, model t4: log cash flow interacted with spz.
# NOTE(review): the tcz model recodes missing tcz to 0, but no such recode is
# applied to spz here -- confirm whether spz can be NA.
t4 <- felm(
    log(so2_output) ~
      log(asset_tangibility_tot_asset) + 
      log(cashflow_to_tangible) * spz+
      log(tfp_op) +
      log(lag_sales_tot_asset) +
      log(tofixed) +
      log(employment) +
      log(age) +
      age_sqr +
      dummy_dso2_equip +
      SOE |
    firm + geocode4_corr + indu_2 | 0 | indu_2, 
    df_final1 %>%
      mutate(
        age_sqr = log(age) ** 2,
        dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
      ) %>%
      filter_at(
        vars(
          cashflow_to_tangible,
          tfp_op,
          asset_tangibility_tot_asset
        ),
        all_vars(. > 0)
      ) %>% 
      filter(!is.na(tfp_op)) %>%
      filter_at(
        vars(
          so2
        ),
        all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
      )%>%filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1)),
    exactDOF = TRUE
  )

# Table 2, model t5: all five cash-flow interactions entered jointly
# (financial_dep_china1, concentrated, innovative, tcz, spz).
# `concentrated` and `innovative` come from the two temp CSV files via
# left_join without `by=`; unmatched rows are set to FALSE.
# NOTE(review): the single-interaction tcz model recodes missing tcz to 0,
# whereas this model does not, so the two samples may differ -- confirm.
t5 <- felm(
    log(so2_output) ~
      log(asset_tangibility_tot_asset) + 
      log(cashflow_to_tangible) * financial_dep_china1+
      log(cashflow_to_tangible) * concentrated+
      log(cashflow_to_tangible) * innovative+
      log(cashflow_to_tangible) * tcz+
      log(cashflow_to_tangible) * spz+
      log(tfp_op) +
      log(lag_sales_tot_asset) +
      log(tofixed) +
      log(employment) +
      log(age) +
      age_sqr +
      dummy_dso2_equip +
      SOE |
    firm + geocode4_corr + indu_2 | 0 | indu_2, 
    df_final1 %>% left_join(read_csv('temp_regulation.csv') ) %>% mutate(concentrated = replace_na(concentrated, FALSE))%>%
    left_join(read_csv('temp_innovative.csv') ) %>% mutate(innovative = replace_na(innovative, FALSE)) %>%
      mutate(
        age_sqr = log(age) ** 2,
        dummy_dso2_equip = ifelse(dso2_equip > 0, 1, 0)
      ) %>%
      filter_at(
        vars(
          cashflow_to_tangible,
          tfp_op,
          asset_tangibility_tot_asset
        ),
        all_vars(. > 0)
      ) %>% 
      filter(!is.na(tfp_op)) %>%
      filter_at(
        vars(
          so2
        ),
        all_vars(between(., 1, quantile(., 0.99, na.rm = TRUE)))
      )%>%filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1)),
    exactDOF = TRUE
  )

# Hand the six models to go_latex with save=TRUE and name=path (the .txt
# path fetched from the Python kernel).
# fe1 holds the fixed-effect indicator rows: one label followed by one entry
# per model column.
# NOTE(review): the rows are labelled firm / year / city, but the models
# absorb firm, geocode4_corr and indu_2 and contain no year factor -- confirm
# whether the "year" row should describe indu_2 instead.
dep <- "Dependent variable: SO2 emission intensity"
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes")
             )

table_1 <- go_latex(list(
    t0,t1, t2,t3, t4, t5
),
    title="Heterogeneity effect",
    dep_var = dep,
    addFE=fe1,
    save=TRUE,
    note = FALSE,
    name=path
) 

In this table titled "Heterogeneity effect," the primary focus is on examining how SO2 emission intensity varies across different firms and situations. The idea is to test whether these variations can be explained by certain firm characteristics and interactions between these characteristics.

**Main Variables:**
1. log(cashflow): This variable measures the natural log of the firm's net income plus depreciation, scaled by assets. The coefficients across different specifications range from -0.091 to -0.078, all significant at the 1% level. This negative coefficient suggests that an increase in cashflow tends to lead to a decrease in SO2 emission intensity.
2. Interactions with log(cashflow):

    - log(cashflow) × credit constraint: The interaction term is significant and negative, with coefficients ranging from -0.037 to -0.028. This suggests that firms that are credit constrained tend to have lower SO2 emission intensity as cash flow increases, perhaps because they may invest in cleaner technologies when they have the financial means.
    - log(cashflow) × Bank regulation: The coefficient is -0.048, significant at the 1% level, implying that stricter bank regulation combined with higher cashflow leads to lower SO2 emissions.
    - log(cashflow) × inn. capacity: The coefficient is -0.035, significant at the 5% level. Firms with greater innovative capacity tend to reduce SO2 emissions more with an increase in cashflow.
    - log(cashflow) × Two Control Zone: The coefficient is -0.018 and not statistically significant, suggesting that being in a "Two Control Zone" does not strongly influence the relationship between cashflow and SO2 emissions.
    - log(cashflow) × Special Policy Zone: The coefficient is -0.040, significant at the 1% level, implying that firms in special policy zones tend to have lower SO2 emissions with higher cashflow, possibly due to targeted policies or incentives in these zones.

**Theoretical Interpretation**:
- log(cashflow): The negative relationship between cashflow and SO2 emission suggests that firms with more financial resources are more capable of adopting environmentally friendly practices. This aligns with the economic theory that firms with better financials can make long-term investments in cleaner technology, reducing environmental impact.
- Interactions:
    - Credit constraint: Firms that are credit constrained may be more sensitive to their cash situation and would invest in cleaner technologies when they have more cash, aligning with the resource-based view of the firm.
    - Bank regulation: Stricter bank regulation may encourage firms to be more responsible environmentally, as higher cash flows may be channeled into more sustainable practices.
    - Innovative Capacity: Firms with more innovative capacity may be more adept at deploying cash efficiently towards reducing emissions.

In [None]:
# Footnote printed under table 2. Adjacent string literals are concatenated
# into one note. Fixed the spelling of the Olley and Pakes algorithm.
footnote = ("This table estimates eq(3). "
            "\\textit{Asset tangibility} represents the proportion of tangible assets to total assets. "
            "\\textit{Cash flow} is net income plus depreciation, scaled by assets. "
            "\\textit{TFP} is Total Factor Productivity, estimated using the Olley and Pakes algorithm. "
            "\\textit{Credit constraints} is a dummy variable taking the value of 1 if the industry is financially dependent. "
            "\\textit{Bank regulation} is a dummy variable based on the 75th percentile of the Herfindahl-Hirschman Index calculated from the share of each bank branch relative to the total branches in that area in 1998. "
            "\\textit{inn. capacity} is a dummy variable based on the 75th percentile of a patent value-adjusted index at the city and 4-digit industry level in 2001. "
            "\\textit{Two Control Zone} and \\textit{Special Policy Zone} are policy variables indicating the firm's location. "
            "Heteroskedasticity-robust standard errors clustered at the product level are in parentheses. "
            "\\sym{*} Significance at the 10\\%, \\sym{**} Significance at the 5\\%, \\sym{***} Significance at the 1\\%.")

#multicolumn ={
#    'Eligible': 2,
#    'Non-Eligible': 1,
#    'All': 1,
#    'All benchmark': 1,
#}

#multi_lines_dep = '(city/product/trade regime/year)'
# Column labels; not passed to beautify while `new_row` stays commented out.
new_r = ['& SO2', 'COD', "Waste water"]
# Post-process table `table_nb` in `folder`, attaching the footnote.
lb.beautify(table_number = table_nb,
            #reorder_var = reorder,
            #multi_lines_dep = multi_lines_dep,
            #new_row= new_r,
            #multicolumn = multicolumn,
            table_nte = footnote,
            jupyter_preview = True,
            resolution = 150,
            folder = folder)

## Transmission channel

### Asset tangible, R&D and TFP

In [None]:
# Output location for table 3; `path` and `table` are read by the R cell
# through `%get path table`.
folder = 'Tables_0'
table_nb = 3
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')
# Create the folder if needed (no exists/mkdir race).
os.makedirs(folder, exist_ok=True)
# Remove every .txt/.pdf already in the folder. This also deletes the files
# of other tables stored in the same folder.
for stale in os.listdir(folder):
    if stale.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, stale))

In [None]:
%get path table
# Table 3, model t0: log asset tangibility on log cash flow plus controls.
# felm formula parts: regressors | absorbed factors (firm, geocode4_corr,
# indu_2) | no instruments (0) | standard errors clustered on indu_2.
# Sample: cash flow between 0.01 and its 99th percentile; year 1998 dropped;
# financial_dep_china1 not NA.
# NOTE(review): if df_final1 is still grouped by firm (the loading cell leaves
# `ungroup` commented out), quantile() below is computed within each firm
# rather than over the whole sample -- confirm.
t0 <- felm(log(asset_tangibility_tot_asset) ~
            log(cashflow_to_tangible) + 
            log(liabilities_tot_asset) +
            log(total_asset) + 
            log(employment)+
            log(age) +
            age_sqr +
             SOE 
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1 %>%
            mutate(
              age_sqr = log(age) **2,
         )%>%
             filter_at(
    vars(
        cashflow_to_tangible
    ),
all_vars(between(., 0.01, quantile(., 0.99, na.rm = TRUE)))
  )%>%
                 filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1))
             ,
            exactDOF = TRUE)

# Table 3, model t1: rd_tot_asset_trick (entered without a log) on log cash
# flow plus the same controls, absorbed factors and clustering as t0.
# Sample: cash flow strictly positive; years 2005-2007 only;
# financial_dep_china1 not NA.
# NOTE(review): t0 and t2 trim cash flow to [0.01, 99th percentile], whereas
# this model only requires it to be positive -- confirm the difference is
# intended.
t1 <- felm(rd_tot_asset_trick ~
            log(cashflow_to_tangible) + 
            log(liabilities_tot_asset) + 
            log(total_asset) + 
            log(employment)+
            log(age) +
            age_sqr +
             SOE 
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1 %>%
            mutate(
              age_sqr = log(age) **2,
         )%>%
        filter_at(
    vars(
        cashflow_to_tangible,

    ),
    all_vars(. >0)
  )%>%filter(year %in% list("2005","2006", "2007"))%>%filter(!is.na(financial_dep_china1))
             ,
            exactDOF = TRUE)

# Table 3, model t2: log(tfp_op) on log cash flow plus the same controls,
# absorbed factors, clustering and sample rules as t0.
t2 <- felm(log(tfp_op) ~
            log(cashflow_to_tangible) + 
            log(liabilities_tot_asset) + 
            log(total_asset) + 
            log(employment)+
            log(age) +
            age_sqr +
             SOE 
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1 %>%
            mutate(
              age_sqr = log(age) **2,
         )%>%
             filter_at(
    vars(
        cashflow_to_tangible
    ),
all_vars(between(., 0.01, quantile(., 0.99, na.rm = TRUE)))
  )%>%
                 filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1))
             ,
            exactDOF = TRUE)

# Hand the three models to go_latex with save=TRUE and name=path (the .txt
# path fetched from the Python kernel). `dep` carries no variable name here
# because each column has a different dependent variable.
# NOTE(review): the fe1 rows are labelled firm / year / city, but the models
# absorb firm, geocode4_corr and indu_2 and contain no year factor -- confirm
# whether the "year" row should describe indu_2 instead.
dep <- "Dependent variable: "
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes")
             )

table_1 <- go_latex(list(
    t0,t1, t2
),
    title="Determinants of Pollution Abatement Through Asset Tangibility, RD, and TFP",
    dep_var = dep,
    addFE=fe1,
    save=TRUE,
    note = FALSE,
    name=path
) 

**Asset Tangibility**: A 1% increase in cash flow leads to a 0.076% decrease in asset tangibility, holding other factors constant. This is significant at the 1% level.
- Theoretical Explanation: Firms with more cash flow may be more flexible in their investment choices and opt for less tangible assets. This could be a sign that these firms are diverting resources towards less tangible but potentially higher-yield investments like R&D.
- **Log(cashflow)**: Firms with higher cashflow tend to decrease their investments in tangible assets. This resonates with the understanding that when firms possess ample cash flow, they exhibit greater discretion in investment choices and are not necessitated to invest heavily in tangible assets for the sole purpose of collateralization to secure bank loans. This dynamic is particularly accentuated in environments like China where tangible assets are frequently leveraged as collateral for loan procurements.

**Research and Development (RD)**: A 1% increase in cash flow leads to a 0.0005% increase in RD. This is significant at the 5% level.
- Theoretical Explanation: The positive relationship suggests that firms with more cash flow are more likely to invest in R&D. This aligns with the notion that greater cash flow can free up resources for investments in innovation.
- **Log(cashflow)**: An upsurge in cashflow is associated with increased investments in R&D. This underscores the perspective that banks often exhibit reservations toward financing innovative endeavors due to the intangible nature of intellectual property, which cannot be readily collateralized like tangible assets. Hence, firms flush with cash are inclined to self-finance their innovative ventures, leading to elevated R&D investments.

**Total Factor Productivity (TFP)**: A 1% increase in cash flow leads to a 0.087% increase in TFP, significant at the 1% level.
- Theoretical Explanation: Greater cash flow may allow firms to invest in productivity-enhancing technologies or processes, thereby increasing their TFP.
All the results support the idea that firms with more cash flow are more agile in their capital allocation, preferring investments that are less tangible and potentially more innovative or productive.
- **Log(cashflow)**: Enhanced cashflow correlates with a surge in a firm's total factor productivity. This can be attributed to the possibility that firms with substantial liquidity can channel investments into state-of-the-art technologies or processes, and effectively allocate resources towards innovative or growth-promising avenues.


In [None]:
# Footnote printed under table 3. Adjacent string literals are concatenated
# into one note. Fixed the spelling of the Olley and Pakes algorithm.
footnote = ("This table estimates eq(3). "
            "\\textit{Asset tangibility} represents the proportion of tangible assets to total assets. "
            "\\textit{RD} measures research and development expenditure. "
            "\\textit{Cash flow} is net income plus depreciation, scaled by assets. "
            "\\textit{TFP} is Total Factor Productivity, estimated using the Olley and Pakes algorithm. "
            "\\textit{Log(cashflow)} is the natural logarithm of cash flow. "
            "Heteroskedasticity-robust standard errors clustered at the product level are in parentheses. "
            "\\sym{*} Significance at the 10\\%, \\sym{**} Significance at the 5\\%, \\sym{***} Significance at the 1\\%.")


#multicolumn ={
#    'Eligible': 2,
#    'Non-Eligible': 1,
#    'All': 1,
#    'All benchmark': 1,
#}

#multi_lines_dep = '(city/product/trade regime/year)'
# Column labels passed to beautify through `new_row`.
new_r = ['& Asset tangibility', 'RD', "TFP"]
# Post-process table `table_nb` in `folder`, attaching the footnote.
lb.beautify(table_number = table_nb,
            #reorder_var = reorder,
            #multi_lines_dep = multi_lines_dep,
            new_row= new_r,
            #multicolumn = multicolumn,
            table_nte = footnote,
            jupyter_preview = True,
            resolution = 150,
            folder = folder)

Credit constraint

In [None]:
# Output location for table 4: Tables_0/table_4.txt.
# `path` and `table` are read by the following R cell through `%get path table`.
folder = 'Tables_0'
table_nb = 4
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')

# Create the folder if needed; exist_ok avoids the check-then-create race.
os.makedirs(folder, exist_ok=True)

# Start from a clean folder: every .txt and .pdf already in it is deleted,
# including the files written for previously generated tables.
for stale_name in os.listdir(folder):
    if stale_name.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, stale_name))

In [None]:
%get path table

# Table 4, first model passed to go_latex: log(asset_tangibility_tot_asset) on
# log(cashflow_to_tangible) interacted with financial_dep_china1, plus controls
# (leverage, total assets, employment, log age, age_sqr, SOE).
# felm formula parts: regressors | fixed effects (firm, geocode4_corr, indu_2)
# | no instruments (0) | standard errors clustered on indu_2.
# Sample: cashflow_to_tangible kept between 0.01 and its 99th percentile (the
# percentile is computed before the year / missing-value filters are applied),
# year "1998" dropped, rows with missing financial_dep_china1 dropped.
# age_sqr is the square of log(age).
t0 <- felm(log(asset_tangibility_tot_asset) ~
            log(cashflow_to_tangible)* financial_dep_china1 + 
            log(liabilities_tot_asset) + 
            log(total_asset) + 
            log(employment)+
            log(age) +
            age_sqr +
             SOE 
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1 %>%
            mutate(
              age_sqr = log(age) **2,
         )%>%
             filter_at(
    vars(
        cashflow_to_tangible
    ),
all_vars(between(., 0.01, quantile(., 0.99, na.rm = TRUE)))
  )%>%
                 filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1))
             ,
            exactDOF = TRUE)

# Table 4, second model passed to go_latex: rd_tot_asset_trick (in levels, not
# logged) on log(cashflow_to_tangible) interacted with financial_dep_china1,
# plus the same controls, fixed effects (firm, geocode4_corr, indu_2) and
# clustering (indu_2) as the other two models of this table.
# Sample: cashflow_to_tangible strictly positive, years "2005"-"2007" only,
# rows with missing financial_dep_china1 dropped.
# NOTE(review): age_sqr is age**2 here, whereas the other two models of this
# table use log(age)**2 -- confirm which definition is intended.
# NOTE(review): the sample rule is `> 0` here rather than the
# [0.01, 99th percentile] window used by the other two models -- confirm.
t1 <- felm(rd_tot_asset_trick ~
            log(cashflow_to_tangible) * financial_dep_china1+ 
            log(liabilities_tot_asset) +  
            log(total_asset) + 
            log(employment)+
            log(age) +
            age_sqr +
             SOE 
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1 %>%
            mutate(
             age_sqr = age **2,
         )%>%
        filter_at(
    vars(
        cashflow_to_tangible,

    ),
    all_vars(. >0)
  )%>%filter(year %in% list("2005","2006", "2007"))%>%filter(!is.na(financial_dep_china1))
             ,
            exactDOF = TRUE)

# Table 4, third model passed to go_latex: log(tfp_op) on
# log(cashflow_to_tangible) interacted with financial_dep_china1, plus controls.
# Fixed effects: firm, geocode4_corr, indu_2; no instruments; standard errors
# clustered on indu_2.
# Sample: cashflow_to_tangible kept between 0.01 and its 99th percentile,
# year "1998" dropped, rows with missing financial_dep_china1 dropped.
# age_sqr is the square of log(age).
t2 <- felm(log(tfp_op) ~
            log(cashflow_to_tangible) * financial_dep_china1+ 
            log(liabilities_tot_asset) +  
            log(total_asset) + 
            log(employment)+
            log(age) +
            age_sqr +
             SOE 
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1 %>%
            mutate(
              age_sqr = log(age) **2,
         )%>%
             filter_at(
    vars(
        cashflow_to_tangible
    ),
all_vars(between(., 0.01, quantile(., 0.99, na.rm = TRUE)))
  )%>%
                 filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1))
             ,
            exactDOF = TRUE)

# Label placed in front of the dependent-variable names.
dep <- "Dependent variable: "
# Extra rows reporting the fixed effects: a label followed by one entry per
# model (three models here).
# NOTE(review): the labels read firm / year / city, but the models above absorb
# firm, geocode4_corr and indu_2 and no year effect -- confirm the labels.
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes")
             )

# Render the three models as one LaTeX table; `path` comes from the Python
# kernel through the `%get` line at the top of this cell and is passed as the
# output name (save=TRUE). The table note is disabled here (note = FALSE).
table_1 <- go_latex(list(
    t0,t1, t2
),
    title="Impact of Credit Constraints on Tangibility, RD, and TFP in Pollution Control",
    dep_var = dep,
    addFE=fe1,
    save=TRUE,
    note = FALSE,
    name=path
) 

From the outlined mechanisms:

- Financial constraints, as captured by cash flow, wield significant influence over a firm's investment strategies. With augmented cash flow, firms, especially private entities that might be marginalized by predominant banks, exercise autonomy in investment decisions. Rather than allocating resources to tangible assets, predominantly viewed as a vehicle to procure loans, there's a palpable shift towards innovation (R&D) — an avenue that holds the promise of sustained growth and heightened efficiency.

- This trend is emblematic of a broader economic shift where firms suffering from financial constraints prioritize long-term growth, innovation, and efficiency over transient needs or the acquisition of external finance. Within a framework where traditional bank loans could be elusive for a subset of firms, especially against a landscape of credit concentration and preferential lending patterns favoring SOEs, internal cash reserves assume paramount importance in guiding investment choices.

In [None]:
# LaTeX note attached to table `table_nb`: it defines every dependent variable,
# the cash-flow regressor and the credit-constraint dummy, then states the
# standard-error and significance conventions. Backslashes are doubled so that
# LaTeX receives a single one.
# NOTE(review): the regressions feeding this table cluster on `indu_2`; confirm
# that "product level" is the right wording for that variable.
footnote = ("This table estimates eq(3). "
            "\\textit{Asset tangibility} represents the proportion of tangible assets to total assets. "
            "\\textit{RD} measures research and development expenditure. "
            "\\textit{Cash flow} is net income plus depreciation, scaled by assets. "
            # The estimator is named after Olley and Pakes (the original text read "Pake").
            "\\textit{TFP} is Total Factor Productivity, estimated using the Olley and Pakes algorithm. "
            "\\textit{Credit constraints} is a dummy variable taking the value of 1 if the industry is financially dependent. "
            "Heteroskedasticity-robust standard errors clustered at the product level are in parentheses. "
            "\\sym{*} Significance at the 10\\%, \\sym{**} Significance at the 5\\%, \\sym{***} Significance at the 1\\%.")

# One header label per model column. The leading '& ' on the first label is
# presumably there to skip the variable-name column -- confirm against lb.beautify.
new_r = ['& Asset tangibility', 'RD', "TFP"]

# The optional arguments reorder_var, multi_lines_dep and multicolumn are not
# passed, so lb.beautify uses its own defaults for them.
lb.beautify(table_number = table_nb,
            new_row= new_r,
            table_nte = footnote,
            jupyter_preview = True,
            resolution = 150,
            folder = folder)

Removal capacity

In [None]:
# Variable names related to emissions, removal and abatement equipment
# (presumably columns of the firm pollution dataset -- confirm).
# Each name appears once: the original list repeated 'tlssnl' at the end, which
# would duplicate a column if the list were used for selection.
list_vars = [
    'rlmxf',
    'ylmxf',
    'rlmpjlf',
    'rlyxf',
    'zyxf',
    'cyxf',
    'rlypjlf',
    'zypjlf',
    'clean_gas_used',
    'waste_water',
    'cod',
    'ad',
    'waste_gas',
    'so2',
    'nox',
    'smoke_dust',
    'soot',
    'yfc',
    'gyfscll',
    'hxxyqcl',
    'xzssqcl',
    'adqcl',
    'eyhlqcl',
    'dyhwqcl',
    'ycqcl',
    'gyfcqcl',
    'dwastewater_equip',
    'fszlssnl',
    'fszlssfee',
    'dwastegas_equip',
    'dso2_equip',
    'fqzlssnl',
    'tlssnl',
    'hxxycsl',
    'adcsl',
    'eyhlcsl',
    'dyhwcsl',
    'yfccsl',
]

### Investment in Pollution Abatement:

- Abatement Capacity: This term refers to the collective technological and infrastructural capabilities a firm possesses for reducing sulfur dioxide (SO2) emissions. It captures the extent to which a company can minimize its environmental impact through various abatement strategies, whether through end-of-pipe solutions or more integrated, process-level innovations.
- SO2 Removed: This metric denotes the actual volume of sulfur dioxide (SO2) emissions that a firm has successfully eliminated during a specific time frame, such as an annual reporting period. This provides a concrete measure of a firm's effectiveness in reducing its environmental footprint and can serve as an indicator for regulatory compliance and social responsibility.
- SO2 Removal Per Hour: This variable quantifies the rate at which a company is capable of eliminating SO2 emissions. It provides an efficiency measure, allowing for the assessment of how quickly a firm can respond to environmental challenges and adjust its operations to reduce pollutants effectively.

In [None]:
# Output location for table 5: Tables_0/table_5.txt.
# `path` and `table` are read by the following R cell through `%get path table`.
folder = 'Tables_0'
table_nb = 5
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')

# Create the folder if needed; exist_ok avoids the check-then-create race.
os.makedirs(folder, exist_ok=True)

# Start from a clean folder: every .txt and .pdf already in it is deleted,
# including the files written for previously generated tables.
for stale_name in os.listdir(folder):
    if stale_name.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, stale_name))

In [None]:
%get path table

# Table 5, first model passed to go_latex: log(dso2_equip + 1) on
# log(cashflow_to_tangible) plus controls (leverage, total assets, employment,
# log age, age_sqr, SOE). The +1 keeps zero values inside the log.
# felm formula parts: regressors | fixed effects (firm, geocode4_corr, indu_2)
# | no instruments (0) | standard errors clustered on indu_2.
# Sample: cashflow_to_tangible kept between 0.01 and its 99th percentile,
# year "1998" dropped, rows with missing financial_dep_china1 dropped (that
# variable is not a regressor in this model).
t0 <- felm(log(dso2_equip + 1) ~
            log(cashflow_to_tangible) + 
            log(liabilities_tot_asset) +  
            log(total_asset) +  
            log(employment)+
            log(age) +
            age_sqr +
             SOE
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1 %>%
            mutate(
             age_sqr = log(age) **2,
         )%>%
             filter_at(
    vars(
        cashflow_to_tangible
    ),
all_vars(between(., 0.01, quantile(., 0.99, na.rm = TRUE)))
  )%>%
                 filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1))
             ,
            exactDOF = TRUE)

# Table 5, second model passed to go_latex: log(eyhlqcl + 1) on
# log(cashflow_to_tangible) plus the same controls, fixed effects (firm,
# geocode4_corr, indu_2) and clustering (indu_2) as the other two models.
# Sample: cashflow_to_tangible strictly positive, year "1998" dropped, rows
# with missing financial_dep_china1 dropped.
# NOTE(review): the sample rule is `> 0` here rather than the
# [0.01, 99th percentile] window used by the other two models of this table --
# confirm the difference is intended.
t1 <- felm(log(eyhlqcl + 1) ~
            log(cashflow_to_tangible) + 
            log(liabilities_tot_asset) +  
            log(total_asset) +  
            log(employment)+
            log(age) +
            age_sqr +
             SOE 
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1 %>%
            mutate(
              age_sqr = log(age) **2,
         )%>%
        filter_at(
    vars(
        cashflow_to_tangible,

    ),
    all_vars(. >0)
  )%>%
                 filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1))
     ,
     exactDOF = TRUE)

# Table 5, third model passed to go_latex: log(tlssnl + 1) on
# log(cashflow_to_tangible) plus controls.
# Fixed effects: firm, geocode4_corr, indu_2; no instruments; standard errors
# clustered on indu_2.
# Sample: cashflow_to_tangible kept between 0.01 and its 99th percentile,
# year "1998" dropped, rows with missing financial_dep_china1 dropped.
t2 <- felm(log(tlssnl+1) ~
            log(cashflow_to_tangible) + 
            log(liabilities_tot_asset) +  
            log(total_asset) +  
            log(employment)+
            log(age) +
            age_sqr +
             SOE 
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1%>%
            mutate(
              age_sqr = log(age) **2,
         )%>%
             filter_at(
    vars(
        cashflow_to_tangible
    ),
all_vars(between(., 0.01, quantile(., 0.99, na.rm = TRUE)))
  )%>%
                 filter(!year %in% list("1998"))%>%filter(!is.na(financial_dep_china1))
             ,
            exactDOF = TRUE)

# Label placed in front of the dependent-variable names.
dep <- "Dependent variable: "
# Extra rows reporting the fixed effects: a label followed by one entry per
# model (three models here).
# NOTE(review): the labels read firm / year / city, but the models above absorb
# firm, geocode4_corr and indu_2 and no year effect -- confirm the labels.
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes")
             )

# Render the three models as one LaTeX table; `path` comes from the Python
# kernel through the `%get` line at the top of this cell and is passed as the
# output name (save=TRUE). The table note is disabled here (note = FALSE).
table_1 <- go_latex(list(
    t0,t1, t2
),
    title="Role of Cashflows in Pollution Abatement",
    dep_var = dep,
    addFE=fe1,
    save=TRUE,
    note = FALSE,
    name=path
) 

**Abatement capacity**:

- Interpretation with Log(cashflow): The coefficient of log(cashflow) in predicting abatement capacity is 0.006 and is not statistically significant. This indicates that, within the scope of this dataset, an increase in cashflow does not necessarily lead to a measurable rise in a firm's pollution abatement capacity. The point estimate is positive, which is in line with the idea that an increase in liquidity allows firms to invest in technologies and systems that enhance their capacity to reduce pollution, but it is too imprecisely estimated to conclude that liquidity-rich firms respond proactively by expanding their abatement capacity.

- Theoretical Explanation: The theoretical framework suggests that firms with higher cashflow should have the financial flexibility to invest in pollution abatement technologies. However, the empirical evidence does not support this hypothesis for abatement capacity. It is possible that firms allocate their excess cash to investments deemed more urgent or beneficial, a topic that could be explored in future research.

**SO2 Removed**:
- Interpretation with Log(cashflow): A positive and highly statistically significant coefficient of 0.099 for log(cashflow) indicates that firms with higher cashflows are effectively utilizing their resources to remove a greater amount of SO2 emissions. Higher cashflows lead to a significant increase in the amount of SO2 removed. Firms that possess ample liquidity are, evidently, deploying it to reduce their environmental footprint by adopting more efficient pollution abatement technologies.

- Theoretical Explanation: This finding is consistent with the theoretical premise that liquidity-rich firms are more capable of allocating funds to environmentally beneficial technologies. The significant coefficient may reflect firms' proactive responses to social pressures and potential regulatory incentives.

**SO2 Removal Per Hour**:

- Interpretation with Log(cashflow): The coefficient of 0.022 for log(cashflow) is statistically significant at the 10% level, implying that firms with greater cashflow are not just increasing their capacity to remove SO2, but are doing so more efficiently on an hourly basis. An increase in cashflow correlates with more efficient hourly SO2 removal rates. This implies that liquidity not only aids in sheer volume but also in the efficiency of the pollution control mechanisms.

- Theoretical Explanation: The positive and statistically significant coefficient aligns with the theoretical expectation that firms with more liquidity invest not only in the scale but also in the efficiency of their pollution abatement activities. This could be interpreted as a strategic focus on sustainable operations, possibly motivated by regulatory pressures and internal corporate commitments to sustainability.

These interpretations and theoretical discussions aim to elucidate the relationships observed in the data, providing both empirical and conceptual insights into the determinants of pollution abatement activities among firms.

In summary, the results shed light on the pivotal role that liquidity, in the form of cashflow, plays in a firm's environmental endeavors, specifically in the realm of SO2 abatement. It further underscores the nuanced ways in which firm size, leverage, age, and ownership influence environmental responsiveness in the context of SO2 pollution control.

In [None]:
# Note printed under table `table_nb`, assembled from one sentence per entry.
# Every fragment ends with a space so that joining on "" yields running text.
# NOTE(review): the regressions feeding this table cluster on `indu_2`; confirm
# that "product level" is the right wording for that variable.
footnote = "".join([
    "This table estimates eq(3). ",
    "\\textit{Abatement Capacity} refers to the firm's equipment designed to mitigate SO2 emissions. ",
    "\\textit{SO2 Removed} quantifies the annual amount of SO2 mitigated by the firm through end-of-pipe solutions following production. ",
    "\\textit{SO2 Removal Per Hour} represents the rate at which SO2 is eliminated, measured in kilograms per hour. ",
    "\\textit{Cash flow} is net income plus depreciation, scaled by assets. ",
    "Heteroskedasticity-robust standard errors clustered at the product level are in parentheses. ",
    "\\sym{*} Significance at the 10\\%, \\sym{**} Significance at the 5\\%, \\sym{***} Significance at the 1\\%.",
])

# Column-group spec kept for reference only: it is defined but not handed to
# lb.beautify below.
multicolumn = {'SO2': 2, 'COD': 2, 'Waste water': 2}

# One header label per model column.
new_r = ['& Abatement capacity', 'SO2 removed', 'SO2 removal per hour']

lb.beautify(
    table_number=table_nb,
    new_row=new_r,
    table_nte=footnote,
    jupyter_preview=True,
    resolution=150,
    folder=folder,
)

Credit constraint

In [None]:
# Output location for table 6: Tables_0/table_6.txt.
# `path` and `table` are read by the following R cell through `%get path table`.
folder = 'Tables_0'
table_nb = 6
table = 'table_{}'.format(table_nb)
path = os.path.join(folder, table + '.txt')

# Create the folder if needed; exist_ok avoids the check-then-create race.
os.makedirs(folder, exist_ok=True)

# Start from a clean folder: every .txt and .pdf already in it is deleted,
# including the files written for previously generated tables.
for stale_name in os.listdir(folder):
    if stale_name.endswith(('.txt', '.pdf')):
        os.remove(os.path.join(folder, stale_name))

In [None]:
%get path table
# Table 6, first model passed to go_latex: log(dso2_equip + 1) on
# log(cashflow_to_tangible) interacted with financial_dep_china, plus controls.
# felm formula parts: regressors | fixed effects (firm, geocode4_corr, indu_2)
# | no instruments (0) | standard errors clustered on indu_2.
# Sample: cashflow_to_tangible kept between 0.01 and its 99th percentile,
# year "1998" dropped.
# NOTE(review): the interaction uses `financial_dep_china`, while the table 4
# models use `financial_dep_china1` and also drop rows where it is missing --
# confirm the variable and the absence of the NA filter are intended.
t0 <- felm(log(dso2_equip + 1) ~
            log(cashflow_to_tangible) * financial_dep_china+ 
            log(liabilities_tot_asset) +  
            log(total_asset) +  
            log(employment)+
            log(age) +
            age_sqr +
             SOE
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1 %>%
            mutate(
              age_sqr = log(age) **2,
         )%>%
             filter_at(
    vars(
        cashflow_to_tangible
    ),
all_vars(between(., 0.01, quantile(., 0.99, na.rm = TRUE)))
  )%>%
                 filter(!year %in% list("1998"))
             ,
            exactDOF = TRUE)

# Table 6, second model passed to go_latex: log(eyhlqcl + 1) on
# log(cashflow_to_tangible) interacted with financial_dep_china, plus the same
# controls, fixed effects (firm, geocode4_corr, indu_2) and clustering (indu_2)
# as the other two models of this table.
# Sample: cashflow_to_tangible strictly positive, year "1998" dropped.
# NOTE(review): the sample rule is `> 0` here rather than the
# [0.01, 99th percentile] window used by the other two models -- confirm.
t1 <- felm(log(eyhlqcl + 1) ~
            log(cashflow_to_tangible) * financial_dep_china+ 
            log(liabilities_tot_asset) +  
            log(total_asset) +  
            log(employment)+
            log(age) +
            age_sqr +
             SOE
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1 %>%
            mutate(
              age_sqr = log(age) **2,
         )%>%
        filter_at(
    vars(
        cashflow_to_tangible,

    ),
    all_vars(. >0)
  )%>%
                 filter(!year %in% list("1998"))
     ,
     exactDOF = TRUE)

# Table 6, third model passed to go_latex: log(tlssnl + 1) on
# log(cashflow_to_tangible) interacted with financial_dep_china, plus controls.
# Fixed effects: firm, geocode4_corr, indu_2; no instruments; standard errors
# clustered on indu_2.
# Sample: cashflow_to_tangible kept between 0.01 and its 99th percentile,
# year "1998" dropped.
t2<- felm(log(tlssnl+1) ~
            log(cashflow_to_tangible) * financial_dep_china+ 
            log(liabilities_tot_asset) +  
            log(total_asset) +  
            log(employment)+
            log(age) +
            age_sqr +
             SOE 
            |firm + geocode4_corr+indu_2|0 | indu_2, df_final1%>%
            mutate(
              age_sqr = log(age) **2,
         )%>%
             filter_at(
    vars(
        cashflow_to_tangible
    ),
all_vars(between(., 0.01, quantile(., 0.99, na.rm = TRUE)))
  )%>%
                 filter(!year %in% list("1998"))
             ,
            exactDOF = TRUE)

# Label placed in front of the dependent-variable names.
dep <- "Dependent variable: "
# Extra rows reporting the fixed effects: a label followed by one entry per
# model. Three models (t0, t1, t2) are passed below, so each row carries three
# "Yes" entries; the original rows had four, one more than the table has columns.
# NOTE(review): the labels read firm / year / city, but the models above absorb
# firm, geocode4_corr and indu_2 and no year effect -- confirm the labels.
fe1 <- list(
    c("firm", "Yes", "Yes", "Yes"),
    c("year", "Yes", "Yes", "Yes"),
    c("city", "Yes", "Yes", "Yes")
             )

# Render the three models as one LaTeX table; `path` comes from the Python
# kernel through the `%get` line at the top of this cell and is passed as the
# output name (save=TRUE). The table note is disabled here (note = FALSE).
table_1 <- go_latex(list(
    t0,t1, t2
),
    title="Effects of Cashflow on Pollution Abatement in Credit-Constrained Firms",
    dep_var = dep,
    addFE=fe1,
    save=TRUE,
    note = FALSE,
    name=path
) 

**Abatement capacity**:

- Interpretation with Cashflow: The coefficient for log(cashflow) has increased and is now significant at the 5% level. This indicates a stronger positive relationship between higher cashflows and the firm's investment in pollution abatement capacity.
- Theoretical Explanation: The presence of a significant interaction term between cashflow and credit constraint suggests that firms in credit-constrained sectors are even more likely to invest in pollution abatement when they have higher cashflows. This can be interpreted as firms in financially restricted sectors taking advantage of liquidity to address a pressing issue they otherwise would not have the resources for.

**SO2 Removed**:

- Interpretation with Cashflow: The coefficient is higher and remains highly significant, implying that firms with greater cashflows are able to significantly enhance their SO2 removal capabilities.
- Theoretical Explanation: The significant interaction term between cashflow and credit constraint underscores the importance of liquidity in enabling firms, particularly those in credit-constrained industries, to invest in effective pollution abatement technologies. This implies that for firms in such constrained sectors, liquidity may be an especially valuable resource for making environmentally responsible choices.

**SO2 Removal per Hour**:

- Interpretation with Cashflow: The coefficient has doubled compared to previous findings and is significant at the 5% level. This suggests that firms with higher liquidity are not just reducing more pollution but are doing so more efficiently.
- Theoretical Explanation: The significance of the interaction term between cashflow and credit constraint suggests that credit-constrained firms regard efficient pollution control not just as a social responsibility but as a strategic necessity. When such firms experience an increase in cashflow, they are more likely to allocate this liquidity towards enhancing their pollution control mechanisms' efficiency.

In this way, the results collectively emphasize that firms with better cashflow positions are more proactive in pollution control, especially in credit-constrained sectors. This underscores the importance of financial liquidity in the fight against industrial pollution.

In [None]:
# Note printed under table `table_nb`, assembled from one sentence per entry.
# Every fragment ends with a space so that joining on "" yields running text.
# NOTE(review): the regressions feeding this table cluster on `indu_2`; confirm
# that "product level" is the right wording for that variable.
footnote = "".join([
    "This table estimates eq(3). ",
    "\\textit{Abatement Capacity} refers to the firm's equipment designed to mitigate SO2 emissions. ",
    "\\textit{SO2 Removed} quantifies the annual amount of SO2 mitigated by the firm through end-of-pipe solutions following production. ",
    "\\textit{SO2 Removal Per Hour} represents the rate at which SO2 is eliminated, measured in kilograms per hour. ",
    "\\textit{Cash flow} is net income plus depreciation, scaled by assets. ",
    "\\textit{Credit constraints} is a dummy variable taking the value of 1 if the industry is financially dependent. ",
    "Heteroskedasticity-robust standard errors clustered at the product level are in parentheses. ",
    "\\sym{*} Significance at the 10\\%, \\sym{**} Significance at the 5\\%, \\sym{***} Significance at the 1\\%.",
])

# Column-group spec kept for reference only: it is defined but not handed to
# lb.beautify below.
multicolumn = {'SO2': 2, 'COD': 2, 'Waste water': 2}

# One header label per model column.
new_r = ['& Abatement capacity', 'SO2 removed', 'SO2 removal per hour']

lb.beautify(
    table_number=table_nb,
    new_row=new_r,
    table_nte=footnote,
    jupyter_preview=True,
    resolution=150,
    folder=folder,
)

# Generate reports

In [None]:
import os, time, shutil, urllib, ipykernel, json
from pathlib import Path
from notebook import notebookapp
import sys
# `path` is reset to the current working directory here (the table cells above
# reuse the same name for the table output file).
path = os.getcwd()
# Directory three levels above the working directory.
parent_path = str(Path(path).parent.parent.parent)
# Put <parent_path>/utils on the import path; the two imports below rely on it,
# so they must stay after this line.
sys.path.append(os.path.join(parent_path, 'utils'))
import make_toc
import create_report

In [None]:
# Name of the JSON parameter file and its expected location under the `utils`
# folder two levels above the working directory.
# NOTE(review): the TOC cell below builds its own parameter path from
# `parent_path` (three levels up) rather than from `path_json` -- confirm which
# location is the right one.
name_json = 'parameters_ETL_pollution_credit_constraint.json'
path_json = os.path.join(
    Path(path).parent.parent,  # os.path.join accepts path-like objects
    'utils',
    name_json,
)

In [None]:
# Build the HTML report through the project helper: code is not kept
# (keep_code=False) and no explicit notebook name is given (notebookname=None).
create_report.create_report(
    extension="html",
    keep_code=False,
    notebookname=None,
)

In [None]:
### Update TOC in Github
# For each listed directory: delete the current README.md, rebuild the index
# with make_toc.create_index and write it back with make_toc.replace_index.

# The parameter file does not depend on the directory, so resolve it once.
path_parameter = os.path.join(parent_path, 'utils', name_json)

for p in [parent_path,
          str(Path(path).parent),
          #os.path.join(str(Path(path).parent), "00_download_data_from"),
          #os.path.join(str(Path(path).parent.parent), "02_data_analysis"),
          #os.path.join(str(Path(path).parent.parent), "02_data_analysis", "00_statistical_exploration"),
          #os.path.join(str(Path(path).parent.parent), "02_data_analysis", "01_model_estimation"),
         ]:
    md_out_fn = os.path.join(p, 'README.md')
    try:
        os.remove(md_out_fn)
    except OSError:
        # Best effort: a missing or undeletable README must not stop the
        # refresh. Only filesystem errors are ignored, so interrupts and
        # programming errors are no longer swallowed.
        pass
    md_lines = make_toc.create_index(cwd = p, path_parameter = path_parameter)
    # README title: the directory name with underscores turned into spaces.
    header = os.path.basename(p).replace('_', ' ')

    if p == parent_path:
        # Only the top-level README gets a description, which is why the
        # parameter file is passed in this branch alone.
        make_toc.replace_index(md_out_fn, md_lines, Header = header, add_description = True, path_parameter = path_parameter)
    else:
        make_toc.replace_index(md_out_fn, md_lines, Header = header, add_description = False)

In [None]:
# Shell escape: convert the named notebook to HTML with code inputs hidden (--no-input).
# NOTE(review): the notebook header links to 08_firm_level_estimation_pollution,
# while this command targets 09_firm_level_estimation_pollution_2.ipynb --
# confirm the file name.
!jupyter nbconvert --no-input --to html 09_firm_level_estimation_pollution_2.ipynb