In [1]:
import os
import zipfile

import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy.stats.mstats import winsorize

from Constant import Constants as const
from OrganizeData.step02_merge_all_financial_data import sort_csmar_data

In [38]:
ldx_df: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20240905_ldy_china_data.pkl')).rename(columns={'code': const.TICKER, 'year': const.YEAR})
er_guarantee_df: DataFrame = pd.read_csv(os.path.join(r'D:\Onedrive\Projects\CapitalControl\data\KarXiong', 'df4.csv'), usecols=['Symbol', 'Year', 'TotalLoan', 'NumGuarantee']).rename(columns={"Symbol": const.TICKER, 'Year': const.YEAR})


In [40]:
for key in ['bsize', 'totalassets', 'firmage', 'fixedassets', 'listage']:
    ldx_df[f'ln_{key}'] = np.log(ldx_df[key] + 2)
    
ldx_df.sort_values(by=[const.TICKER, const.YEAR], ascending=True, inplace=True)
ldx_df['lagged_at'] = ldx_df.groupby(const.TICKER)['totalassets'].shift(1)
ldx_df['sale_diff'] = ldx_df.groupby(const.TICKER)['sales'].diff(1)
ldx_df['sale_growth'] = ldx_df['sale_diff'] / ldx_df['lagged_at']
ldx_df['fix_at'] = ldx_df['fixedassets'] / ldx_df['lagged_at']
ldx_df['salecost_at'] = ldx_df['cost'] / ldx_df['lagged_at']
ldx_df['rev_at'] = ldx_df['revenue'] / ldx_df['lagged_at']

ldx_df.replace([np.inf, -np.inf], np.nan, inplace=True)

for key in ['zscore', 'tbq3', 'mb', 'incometaxtate', 'tbq2', 'roa', 'growth', 'tbq1', 'lev', 'tbq4', 'sale_growth', 'fix_at', 'salecost_at', 'rev_at']:
    ldx_df.loc[ldx_df[key].notnull(), key] = winsorize(ldx_df[key].dropna(), limits=(0.01, 0.01))

In [45]:
reg_df: DataFrame = ldx_df.merge(er_guarantee_df, on=[const.TICKER, const.YEAR], how='left')
reg_df.loc[:, 'Post2017'] = (reg_df[const.YEAR] >= 2017).astype(int)

# Filter the DataFrame to include only the years 2014 to 2017
df5_filtered = er_guarantee_df[(er_guarantee_df[const.YEAR] >= 2014) & (er_guarantee_df[const.YEAR] <= 2017)]

# Create a new column "er_foreign_gua" and set it to 0 initially
reg_df['er_foreign_gua'] = 0

# Identify Symbols with at least one "NumGuarantee" greater than 0 between 2014 and 2017
symbols_with_gua = df5_filtered[df5_filtered['NumGuarantee'] > 0][const.TICKER].unique()

reg_df.loc[reg_df[const.TICKER].isin(symbols_with_gua), 'er_foreign_gua'] = 1
reg_df['has_guarantee'] = 0
reg_df.loc[reg_df['NumGuarantee'] > 0, 'has_guarantee'] = 1

In [46]:
reg_df.to_pickle(os.path.join(const.TEMP_PATH, '20240908_temp_base_reg_data.pkl'))

# Construct some financial data from CSMAR

In [19]:
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '财务指标文件.zip'), 'r') as zip_ref:
    with zip_ref.open('CSR_Finidx.csv') as csv_file:
        finidx_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Stkcd', 'Accper', 'Outcap', 'Surplus', 'D610000', 'B150101', 'B140204', 'B140101', 'B120101', 'A100000', 'A200000']).rename(columns={'D610000': 'OCF', 'B150101': 'NetIncome', 'B140204': 'TXPD', 'B140101': 'EarningBI', 'B120101': 'OperatingRevenue', 'A100000': 'TotalAssets', 'A200000': 'TotalLiabilities'})
        finidx_df: DataFrame = sort_csmar_data(finidx_df)

  gov_grant_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Stkcd', 'Accper', 'Item', 'Amount']).rename(


ValueError: Usecols do not match columns, columns expected but not found: ['RDInvest', 'RDSpendSumRatio', 'RDPersonRatio', 'RDInvestNetprofitRatio', 'RDInvestRatio', 'RDSpendSum', 'RDExpenses', 'RDPerson']

In [24]:
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '研发投入情况表.zip'), 'r') as zip_ref:
    with zip_ref.open('PT_LCRDSPENDING.csv') as csv_file:
        rd_spending_df: DataFrame = pd.read_csv(csv_file, usecols=['Symbol', 'EndDate', 'RDPerson', 'RDPersonRatio', 'RDSpendSum', 'RDSpendSumRatio', 'RDExpenses', 'RDInvest', 'RDInvestRatio', 'RDInvestNetprofitRatio'], dtype={'EndDate': 'str'})
        rd_spending_df['EndDate'] = pd.to_datetime(rd_spending_df['EndDate'], format='%Y-%m-%d', errors='coerce')
        rd_spending_df.dropna(subset=['EndDate', 'Symbol'], how='any', inplace=True)
        rd_spending_df[const.TICKER] = rd_spending_df['Symbol'].astype(int)
        rd_spending_df[const.YEAR] = rd_spending_df['EndDate'].dt.year
        for key in ['RDPerson', 'RDPersonRatio', 'RDSpendSum', 'RDSpendSumRatio', 'RDExpenses', 'RDInvest', 'RDInvestRatio', 'RDInvestNetprofitRatio']:
            rd_spending_df[key] = rd_spending_df[key].astype(np.float64)
        rd_spend_df1: DataFrame = rd_spending_df.groupby([const.TICKER, const.YEAR])[['RDPerson', 'RDSpendSum', 'RDExpenses', 'RDInvest']].sum()
        rd_spend_df2: DataFrame = rd_spending_df.groupby([const.TICKER, const.YEAR])[['RDPersonRatio', 'RDSpendSumRatio', 'RDInvestRatio', 'RDInvestNetprofitRatio']].mean()
        rd_spend_df: DataFrame = rd_spend_df1.merge(rd_spend_df2, left_index=True, right_index=True, how='outer').reset_index(drop=False)
        
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '政府补助.zip'), 'r') as zip_ref:
    with zip_ref.open('PT_LCGovGrants.csv') as csv_file:
        gov_grant_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Stkcd', 'Accper', 'Item', 'Amount']).rename(
            columns={'Stkcd': const.TICKER})
        gov_grant_df = gov_grant_df[gov_grant_df['Item'] == '合计']
        gov_grant_df['Accper'] = pd.to_datetime(gov_grant_df['Accper'], format='%Y-%m-%d', errors='coerce')
        gov_grant_df[const.YEAR] = gov_grant_df['Accper'].dt.year
        gov_grant_df['GovGrantAmount'] = gov_grant_df['Amount'].astype(float)
        gov_grant_df.dropna(subset=['GovGrantAmount'], inplace=True)
        gov_grant_df = gov_grant_df[gov_grant_df['GovGrantAmount'] > 0]
        gov_grant_df2: DataFrame = gov_grant_df.groupby([const.TICKER, const.YEAR])['GovGrantAmount'].sum().reset_index(drop=False)
        gov_grant_df2['lnGovGrantAmount'] = gov_grant_df2['GovGrantAmount'].apply(np.log)

  gov_grant_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Stkcd', 'Accper', 'Item', 'Amount']).rename(


In [26]:
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '财务指标.zip'), 'r') as zip_ref:
    with zip_ref.open('BDT_FinIndex.csv') as csv_file:
        finindex_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Symbol', 'EndDate', 'TotalAssets', 'TotalLiabilities', 'OperatingRevenue', 'IncomeTaxTate', 'TaxBearing', 'BankLoanRatio'])
        finindex_df[const.TICKER] = finindex_df['Symbol'].astype(int)
        finindex_df['EndDate'] = pd.to_datetime(finindex_df['EndDate'], format='%Y-%m-%d', errors='coerce')
        finindex_df[const.YEAR] = finindex_df['EndDate'].dt.year
        finindex_df.drop_duplicates(subset=[const.TICKER, const.YEAR], keep='last', inplace=True)
        finindex_df.drop(['Symbol', 'EndDate'], axis=1, inplace=True)
        
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '融资约束—WW指数.zip'), 'r') as zip_ref:
    with zip_ref.open('BDT_FinConstWW.csv') as csv_file:
        fcww_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Symbol', 'Enddate', 'WW', 'OperatingRevenueGrowth'])
        fcww_df[const.TICKER] = fcww_df['Symbol'].astype(int)
        fcww_df['Enddate'] = pd.to_datetime(fcww_df['Enddate'], format='%Y-%m-%d', errors='coerce')
        fcww_df[const.YEAR] = fcww_df['Enddate'].dt.year
        fcww_df.drop_duplicates(subset=[const.TICKER, const.YEAR], keep='last', inplace=True)
        fcww_df.drop(['Symbol', 'Enddate'], axis=1, inplace=True)
        
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '融资约束—SA指数.zip'), 'r') as zip_ref:
    with zip_ref.open('BDT_FinConstSA.csv') as csv_file:
        fcsa_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Symbol', 'Enddate', 'SA'])
        fcsa_df[const.TICKER] = fcsa_df['Symbol'].astype(int)
        fcsa_df['Enddate'] = pd.to_datetime(fcsa_df['Enddate'], format='%Y-%m-%d', errors='coerce')
        fcsa_df[const.YEAR] = fcsa_df['Enddate'].dt.year
        fcsa_df.drop_duplicates(subset=[const.TICKER, const.YEAR], keep='last', inplace=True)
        fcsa_df.drop(['Symbol', 'Enddate'], axis=1, inplace=True)
        
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '融资约束—KZ指数.zip'), 'r') as zip_ref:
    with zip_ref.open('BDT_FinConstKZ.csv') as csv_file:
        fckz_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Symbol', 'Enddate', 'KZ', 'TobinQ'])
        fckz_df[const.TICKER] = fckz_df['Symbol'].astype(int)
        fckz_df['Enddate'] = pd.to_datetime(fckz_df['Enddate'], format='%Y-%m-%d', errors='coerce')
        fckz_df[const.YEAR] = fckz_df['Enddate'].dt.year
        fckz_df.drop_duplicates(subset=[const.TICKER, const.YEAR], keep='last', inplace=True)
        fckz_df.drop(['Symbol', 'Enddate'], axis=1, inplace=True)
        
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '融资约束—FC指数.zip'), 'r') as zip_ref:
    with zip_ref.open('BDT_FinConstFC.csv') as csv_file:
        fcfc_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Symbol', 'Enddate', 'FC', 'EBIT'])
        fcfc_df[const.TICKER] = fcfc_df['Symbol'].astype(int)
        fcfc_df['Enddate'] = pd.to_datetime(fcfc_df['Enddate'], format='%Y-%m-%d', errors='coerce')
        fcfc_df[const.YEAR] = fcfc_df['Enddate'].dt.year
        fcfc_df.drop_duplicates(subset=[const.TICKER, const.YEAR], keep='last', inplace=True)
        fcfc_df.drop(['Symbol', 'Enddate'], axis=1, inplace=True)


In [6]:
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '资产负债表.zip'), 'r') as zip_ref:
    with zip_ref.open('FS_Combas.csv') as csv_file:
        combas_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Stkcd', 'Accper', 'A001212000', 'A001218000', 'A001219000']).rename(columns={'A001212000': 'FixedAssets', 'A001218000': 'Intangible', 'A001219000': 'RDSpend'})
        combas_df: DataFrame = sort_csmar_data(combas_df)

In [None]:
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '现金流分析.zip'), 'r') as zip_ref:
    with zip_ref.open('FI_T6.csv') as csv_file:
        t6_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Stkcd', 'Accper', 'F061201B']).rename(columns={'F061201B': 'DA'})
        t6_df: DataFrame = sort_csmar_data(t6_df)

In [34]:
csmar_data_df: DataFrame = finidx_df.merge(rd_spend_df, on=[const.TICKER, const.YEAR], how='outer').merge(gov_grant_df2, on=[const.TICKER, const.YEAR], how='outer').merge(finindex_df, on=[const.TICKER, const.YEAR], how='outer').merge(fcww_df, on=[const.TICKER, const.YEAR], how='outer').merge(fckz_df, on=[const.TICKER, const.YEAR], how='outer').merge(fcfc_df, on=[const.TICKER, const.YEAR], how='outer').merge(fcsa_df, on=[const.TICKER, const.YEAR], how='outer')

In [7]:
csmar_data_df: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20240908_csmar_temp_data.pkl'))
csmar_data_df = csmar_data_df.merge(t6_df, on=[const.TICKER, const.YEAR], how='outer').merge(combas_df, on=[const.TICKER, const.YEAR], how='outer')
csmar_data_df.to_pickle(os.path.join(const.TEMP_PATH, '20240909_csmar_temp_data.pkl'))


In [35]:
drop_keys = list()
for key in ['TotalAssets_x', 'TotalLiabilities_x', 'OperatingRevenue_x']:
    drop_keys.append(key)
    csmar_data_df.loc[:, key[:-2]] = csmar_data_df.loc[:, key].fillna(csmar_data_df[key.replace('_x', '_y')])
    drop_keys.append(key.replace('_x', '_y'))
    
csmar_data_df.drop(drop_keys, axis=1, inplace=True)
csmar_data_df.to_pickle(os.path.join(const.TEMP_PATH, '20240908_csmar_temp_data.pkl'))
csmar_data_df = csmar_data_df[csmar_data_df[const.YEAR] > 2010].copy()
csmar_data_df.shape

(56771, 31)

In [2]:
csmar_data_df: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20240909_csmar_temp_data.pkl'))
csmar_data_df = csmar_data_df[csmar_data_df[const.YEAR] > 2010].copy()
csmar_data_df.keys()

Index(['tic', 'EarningBI', 'TXPD', 'NetIncome', 'OCF', 'Surplus', 'Outcap',
       'year', 'RDPerson', 'RDSpendSum', 'RDExpenses', 'RDInvest',
       'RDPersonRatio', 'RDSpendSumRatio', 'RDInvestRatio',
       'RDInvestNetprofitRatio', 'GovGrantAmount', 'lnGovGrantAmount',
       'IncomeTaxTate', 'TaxBearing', 'BankLoanRatio',
       'OperatingRevenueGrowth', 'WW', 'TobinQ', 'KZ', 'EBIT', 'FC', 'SA',
       'TotalAssets', 'TotalLiabilities', 'OperatingRevenue', 'DA',
       'FixedAssets', 'Intangible', 'RDSpend'],
      dtype='object')

In [3]:
csmar_data_df.sort_values(by=[const.TICKER, const.YEAR], ascending=True, inplace=True)
csmar_data_df['lagged_at'] = csmar_data_df.groupby(const.TICKER)['TotalAssets'].shift(1)
csmar_data_df['EarningBI_lat'] = csmar_data_df['EarningBI'] / csmar_data_df['lagged_at']
csmar_data_df['NI_lat'] = csmar_data_df['NetIncome'] / csmar_data_df['lagged_at']
csmar_data_df['CAPEX_lat'] = csmar_data_df['Outcap'] / csmar_data_df['lagged_at']
csmar_data_df['RDExpenses_lat'] = csmar_data_df['RDExpenses'].fillna(0) / csmar_data_df['lagged_at']
csmar_data_df['RDSpendSum_lat'] = csmar_data_df['RDSpendSum'].fillna(0) / csmar_data_df['lagged_at']
csmar_data_df['RDSpend_lat'] = csmar_data_df['RDSpend'].fillna(0) / csmar_data_df['lagged_at']
csmar_data_df['RDInvest_lat'] = csmar_data_df['RDInvest'].fillna(0) / csmar_data_df['lagged_at']
csmar_data_df['GovGrantAmount_lat'] = csmar_data_df['GovGrantAmount'].fillna(0) / csmar_data_df['lagged_at']
csmar_data_df['LEV_lat'] = csmar_data_df['TotalLiabilities'] / csmar_data_df['lagged_at']
csmar_data_df['EBIT_lat'] = csmar_data_df['EBIT'] / csmar_data_df['lagged_at']
csmar_data_df['Int_lat'] = csmar_data_df['Intangible'] / csmar_data_df['lagged_at']
csmar_data_df['OCF_lat'] = csmar_data_df['OCF'] / csmar_data_df['lagged_at']
csmar_data_df['FA_lat'] = csmar_data_df['FixedAssets'] / csmar_data_df['lagged_at']
csmar_data_df['CAPEX_RDEs_lat'] = (csmar_data_df['Outcap'] + csmar_data_df['RDExpenses'].fillna(0)) / csmar_data_df['lagged_at']
csmar_data_df['CAPEX_RDS_lat'] = (csmar_data_df['Outcap'] + csmar_data_df['RDSpend'].fillna(0)) / csmar_data_df['lagged_at']
csmar_data_df['CAPEX_RDI_lat'] = (csmar_data_df['Outcap'] + csmar_data_df['RDInvest'].fillna(0)) / csmar_data_df['lagged_at']
csmar_data_df['FA_DA'] = csmar_data_df['FixedAssets'] + csmar_data_df['DA'].fillna(0)
csmar_data_df['FA_DA_diff'] = csmar_data_df.groupby(const.TICKER)['FA_DA'].diff()
csmar_data_df['DFIX'] = csmar_data_df['FA_DA_diff'] / csmar_data_df['lagged_at']

csmar_data_df['DFIX2'] = csmar_data_df.groupby(const.TICKER)['FixedAssets'].pct_change(1)

csmar_data_df['TXPD_Sales'] = csmar_data_df['TXPD'] / csmar_data_df['OperatingRevenue']


  csmar_data_df['DFIX2'] = csmar_data_df.groupby(const.TICKER)['FixedAssets'].pct_change(1)


In [4]:
csmar_data_df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [5]:
dep_vars = ['RDPersonRatio', 'RDSpendSumRatio', 'RDInvestRatio', 'RDInvestNetprofitRatio', 'lnGovGrantAmount', 'IncomeTaxTate', 'TaxBearing', 'BankLoanRatio', 'EarningBI_lat', 'NI_lat', 'CAPEX_lat', 'RDExpenses_lat', 'RDSpend_lat', 'RDInvest_lat', 'GovGrantAmount_lat', 'LEV_lat', 'Int_lat', 'OCF_lat', 'FA_lat', 'CAPEX_RDEs_lat', 'CAPEX_RDS_lat', 'CAPEX_RDI_lat', 'FA_DA', 'RDSpendSum_lat', 'FA_DA_diff', 'DFIX', 'DFIX2', 'TXPD_Sales', 'EBIT_lat', const.TICKER, const.YEAR]
win_vars = ['RDPersonRatio', 'RDSpendSumRatio', 'RDInvestRatio', 'RDInvestNetprofitRatio', 'IncomeTaxTate', 'TaxBearing', 'OperatingRevenueGrowth', 'WW', 'TobinQ', 'EarningBI_lat', 'NI_lat', 'CAPEX_lat', 'RDExpenses_lat', 'RDSpend_lat', 'RDInvest_lat', 'GovGrantAmount_lat', 'LEV_lat', 'Int_lat', 'OCF_lat', 'FA_lat', 'CAPEX_RDEs_lat', 'CAPEX_RDS_lat', 'CAPEX_RDI_lat', 'DFIX', 'DFIX2', 'TXPD_Sales', 'RDSpendSum_lat', 'EBIT_lat']

for key in win_vars:
    csmar_data_df.loc[csmar_data_df[key].notnull(), key] = winsorize(csmar_data_df[key].dropna(), limits=(0.01, 0.01))


In [6]:
reg_df: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20240908_temp_base_reg_data.pkl'))
reg_df2: DataFrame = reg_df.merge(csmar_data_df, how='left', on=[const.TICKER, const.YEAR])

In [7]:
dep_df = csmar_data_df[dep_vars]

for lead_year in range(1, 4):
    dep_df[const.YEAR] -= 1
    reg_df2: DataFrame = reg_df2.merge(dep_df, how='left', on=[const.TICKER, const.YEAR], suffixes=('', f'_{lead_year}'))
    
for year in range(2014, 2021):
    reg_df2[f'dummy_{year}'] = reg_df2[const.YEAR].apply(lambda x: int(x == year))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dep_df[const.YEAR] -= 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dep_df[const.YEAR] -= 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dep_df[const.YEAR] -= 1


In [8]:
reg_df2['NumGuarantee'] = reg_df2['NumGuarantee'].fillna(0)
reg_df2['TotalLoan'] = reg_df2['TotalLoan'].fillna(0)

In [9]:
reg_df2.to_stata(os.path.join(const.OUTPUT_PATH, '20240909_cc_reg_data.dta'), write_index=False, version=119)

In [10]:
print(' '.join(dep_vars))

RDPersonRatio RDSpendSumRatio RDInvestRatio RDInvestNetprofitRatio lnGovGrantAmount IncomeTaxTate TaxBearing BankLoanRatio EarningBI_lat NI_lat CAPEX_lat RDExpenses_lat RDSpend_lat RDInvest_lat GovGrantAmount_lat LEV_lat Int_lat OCF_lat FA_lat CAPEX_RDEs_lat CAPEX_RDS_lat CAPEX_RDI_lat FA_DA RDSpendSum_lat FA_DA_diff DFIX DFIX2 TXPD_Sales EBIT_lat tic year
