In [18]:
import os
import zipfile

import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy.stats.mstats import winsorize

from Constant import Constants as const
from OrganizeData.step02_merge_all_financial_data import sort_csmar_data

In [38]:
# Pickled panel stored under const.TEMP_PATH; its 'code'/'year' columns are
# renamed to the project-wide key column names.
ldx_panel_path = os.path.join(const.TEMP_PATH, '20240905_ldy_china_data.pkl')
ldx_df: DataFrame = pd.read_pickle(ldx_panel_path).rename(
    columns={'code': const.TICKER, 'year': const.YEAR}
)

# Guarantee file: only the key, loan and guarantee-count columns are read, and
# the key columns are renamed to match ldx_df.
# NOTE(review): the directory below is an absolute, machine-specific path.
guarantee_csv_path = os.path.join(r'D:\Onedrive\Projects\CapitalControl\data\KarXiong', 'df4.csv')
er_guarantee_df: DataFrame = pd.read_csv(
    guarantee_csv_path,
    usecols=['Symbol', 'Year', 'TotalLoan', 'NumGuarantee'],
).rename(columns={"Symbol": const.TICKER, 'Year': const.YEAR})


In [40]:
# Log transforms: each variable is shifted by 2 before taking the natural log.
for key in ['bsize', 'totalassets', 'firmage', 'fixedassets', 'listage']:
    ldx_df[f'ln_{key}'] = np.log(ldx_df[key] + 2)

ldx_df.sort_values(by=[const.TICKER, const.YEAR], ascending=True, inplace=True)

# groupby().shift(1)/diff(1) step back one ROW, not one YEAR. When a firm has a
# gap in its panel (or a repeated year), the previous row is not the previous
# fiscal year, so the lag/difference is blanked out unless the years are
# exactly consecutive.
firm_groups = ldx_df.groupby(const.TICKER)
is_consecutive_year = (ldx_df[const.YEAR] - firm_groups[const.YEAR].shift(1)) == 1
ldx_df['lagged_at'] = firm_groups['totalassets'].shift(1).where(is_consecutive_year)
ldx_df['sale_diff'] = firm_groups['sales'].diff(1).where(is_consecutive_year)

# Every ratio below is scaled by lagged total assets.
for ratio_name, numerator in [
    ('sale_growth', 'sale_diff'),
    ('fix_at', 'fixedassets'),
    ('salecost_at', 'cost'),
    ('rev_at', 'revenue'),
]:
    ldx_df[ratio_name] = ldx_df[numerator] / ldx_df['lagged_at']

# Division by a zero denominator yields +/-inf; treat those as missing.
ldx_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Winsorize the non-missing values of each variable at the 1st and 99th percentiles.
for key in ['zscore', 'tbq3', 'mb', 'incometaxtate', 'tbq2', 'roa', 'growth', 'tbq1', 'lev', 'tbq4', 'sale_growth', 'fix_at', 'salecost_at', 'rev_at']:
    ldx_df.loc[ldx_df[key].notnull(), key] = winsorize(ldx_df[key].dropna(), limits=(0.01, 0.01))

In [45]:
# Attach the guarantee columns to every firm-year of the main panel.
reg_df: DataFrame = ldx_df.merge(er_guarantee_df, on=[const.TICKER, const.YEAR], how='left')
reg_df['Post2017'] = (reg_df[const.YEAR] >= 2017).astype(int)

# Guarantee records restricted to the years 2014 to 2017 (both ends included).
df5_filtered = er_guarantee_df[er_guarantee_df[const.YEAR].between(2014, 2017)]

# Tickers with at least one positive "NumGuarantee" inside that window.
symbols_with_gua = df5_filtered.loc[df5_filtered['NumGuarantee'] > 0, const.TICKER].unique()

# Firm-level flag: 1 for every row of a ticker found above, otherwise 0.
reg_df['er_foreign_gua'] = reg_df[const.TICKER].isin(symbols_with_gua).astype('int64')

# Firm-year flag: 1 when this row's own "NumGuarantee" is positive; a missing
# value compares as False and therefore yields 0.
reg_df['has_guarantee'] = (reg_df['NumGuarantee'] > 0).astype('int64')

In [46]:
# Write the regression frame to the temp directory as a pickle.
base_reg_pickle_path = os.path.join(const.TEMP_PATH, '20240908_temp_base_reg_data.pkl')
reg_df.to_pickle(base_reg_pickle_path)

# Construct some financial data from CSMAR

In [19]:
# Columns read from CSR_Finidx.csv, and readable names for the coded ones.
finidx_columns = ['Stkcd', 'Accper', 'Outcap', 'Surplus', 'D610000', 'B150101', 'B140204', 'B140101', 'B120101', 'A100000', 'A200000']
finidx_renames = {
    'D610000': 'OCF',
    'B150101': 'NetIncome',
    'B140204': 'TXPD',
    'B140101': 'EarningBI',
    'B120101': 'OperatingRevenue',
    'A100000': 'TotalAssets',
    'A200000': 'TotalLiabilities',
}

finidx_zip_path = os.path.join(const.CSMAR_PATH, '财务指标文件.zip')
with zipfile.ZipFile(finidx_zip_path, 'r') as zip_ref, zip_ref.open('CSR_Finidx.csv') as csv_file:
    # Malformed lines are skipped rather than aborting the read.
    finidx_raw = pd.read_csv(csv_file, on_bad_lines='skip', usecols=finidx_columns)
    # sort_csmar_data is a project helper imported from step02_merge_all_financial_data.
    finidx_df: DataFrame = sort_csmar_data(finidx_raw.rename(columns=finidx_renames))

  gov_grant_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Stkcd', 'Accper', 'Item', 'Amount']).rename(


ValueError: Usecols do not match columns, columns expected but not found: ['RDInvest', 'RDSpendSumRatio', 'RDPersonRatio', 'RDInvestNetprofitRatio', 'RDInvestRatio', 'RDSpendSum', 'RDExpenses', 'RDPerson']

In [24]:
# R&D spending: read the raw records, then aggregate them to one row per
# ticker-year (level variables are summed, ratio variables are averaged).
rd_level_cols = ['RDPerson', 'RDSpendSum', 'RDExpenses', 'RDInvest']
rd_ratio_cols = ['RDPersonRatio', 'RDSpendSumRatio', 'RDInvestRatio', 'RDInvestNetprofitRatio']

with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '研发投入情况表.zip'), 'r') as zip_ref:
    with zip_ref.open('PT_LCRDSPENDING.csv') as csv_file:
        rd_spending_df: DataFrame = pd.read_csv(
            csv_file,
            usecols=['Symbol', 'EndDate', *rd_level_cols, *rd_ratio_cols],
            dtype={'EndDate': 'str'},
        )

# Unparseable dates become NaT; rows without a usable date or symbol are dropped.
rd_spending_df['EndDate'] = pd.to_datetime(rd_spending_df['EndDate'], format='%Y-%m-%d', errors='coerce')
rd_spending_df.dropna(subset=['EndDate', 'Symbol'], how='any', inplace=True)
rd_spending_df[const.TICKER] = rd_spending_df['Symbol'].astype(int)
rd_spending_df[const.YEAR] = rd_spending_df['EndDate'].dt.year

rd_value_cols = rd_level_cols + rd_ratio_cols
rd_spending_df[rd_value_cols] = rd_spending_df[rd_value_cols].astype(np.float64)

rd_groups = rd_spending_df.groupby([const.TICKER, const.YEAR])
# min_count=1: with the default (0) a ticker-year whose inputs are ALL missing
# sums to 0.0, which records "not reported" as "reported zero". Such groups now
# stay NaN; apply an explicit fillna(0) downstream if zero is really intended.
rd_spend_df1: DataFrame = rd_groups[rd_level_cols].sum(min_count=1)
rd_spend_df2: DataFrame = rd_groups[rd_ratio_cols].mean()
rd_spend_df: DataFrame = rd_spend_df1.merge(rd_spend_df2, left_index=True, right_index=True, how='outer').reset_index(drop=False)
        
# Government grants: keep the '合计' rows, then total the positive amounts per
# ticker-year and take the log of that total.
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '政府补助.zip'), 'r') as zip_ref:
    with zip_ref.open('PT_LCGovGrants.csv') as csv_file:
        # NOTE(review): the recorded cell output shows a warning pointing at this
        # read_csv call; passing explicit dtypes may be worth considering.
        gov_grant_raw: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Stkcd', 'Accper', 'Item', 'Amount']).rename(
            columns={'Stkcd': const.TICKER})

# .copy() makes the filtered rows an independent frame, so the column
# assignments below do not write into a slice of the raw frame
# (the SettingWithCopy pattern).
gov_grant_df: DataFrame = gov_grant_raw[gov_grant_raw['Item'] == '合计'].copy()
gov_grant_df['Accper'] = pd.to_datetime(gov_grant_df['Accper'], format='%Y-%m-%d', errors='coerce')
gov_grant_df[const.YEAR] = gov_grant_df['Accper'].dt.year
gov_grant_df['GovGrantAmount'] = gov_grant_df['Amount'].astype(float)

# A missing amount compares as False against 0, so this single mask removes
# both missing and non-positive amounts.
gov_grant_df = gov_grant_df[gov_grant_df['GovGrantAmount'] > 0]

gov_grant_df2: DataFrame = gov_grant_df.groupby([const.TICKER, const.YEAR])['GovGrantAmount'].sum().reset_index(drop=False)
# Every summed amount is strictly positive here, so the log is finite.
gov_grant_df2['lnGovGrantAmount'] = np.log(gov_grant_df2['GovGrantAmount'])

  gov_grant_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Stkcd', 'Accper', 'Item', 'Amount']).rename(


In [26]:
def _load_csmar_yearly(zip_name: str, csv_name: str, value_cols: list, date_col: str) -> DataFrame:
    """Read one CSMAR table from a zip archive and reduce it to one row per ticker-year.

    Args:
        zip_name: Archive file name inside ``const.CSMAR_PATH``.
        csv_name: Name of the CSV member inside the archive.
        value_cols: Data columns to keep, in addition to ``'Symbol'`` and ``date_col``.
        date_col: Name of the period-end date column (``'%Y-%m-%d'`` strings).

    Returns:
        A frame with ``value_cols`` plus ``const.TICKER`` and ``const.YEAR``,
        holding at most one row per (ticker, year): the row with the latest date.

    Raises:
        ValueError: If ``'Symbol'`` cannot be converted to int (raised by ``astype``).
    """
    with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, zip_name), 'r') as zip_ref:
        with zip_ref.open(csv_name) as csv_file:
            table_df = pd.read_csv(csv_file, on_bad_lines='skip', usecols=['Symbol', date_col, *value_cols])

    table_df[const.TICKER] = table_df['Symbol'].astype(int)
    table_df[date_col] = pd.to_datetime(table_df[date_col], format='%Y-%m-%d', errors='coerce')
    # Rows whose date failed to parse have no year and cannot be matched on the
    # (ticker, year) key, so they are dropped (as the R&D loader already does).
    table_df = table_df.dropna(subset=[date_col])
    table_df[const.YEAR] = table_df[date_col].dt.year
    # keep='last' follows row order; ordering by date first makes "last" the
    # latest period of each year instead of whatever row the file lists last.
    # The stable sort keeps the file order among rows that share a date.
    table_df = table_df.sort_values(date_col, kind='stable')
    table_df = table_df.drop_duplicates(subset=[const.TICKER, const.YEAR], keep='last')
    return table_df.drop(['Symbol', date_col], axis=1)


finindex_df: DataFrame = _load_csmar_yearly(
    '财务指标.zip', 'BDT_FinIndex.csv',
    ['TotalAssets', 'TotalLiabilities', 'OperatingRevenue', 'IncomeTaxTate', 'TaxBearing', 'BankLoanRatio'],
    date_col='EndDate',
)
fcww_df: DataFrame = _load_csmar_yearly(
    '融资约束—WW指数.zip', 'BDT_FinConstWW.csv', ['WW', 'OperatingRevenueGrowth'], date_col='Enddate')
fcsa_df: DataFrame = _load_csmar_yearly(
    '融资约束—SA指数.zip', 'BDT_FinConstSA.csv', ['SA'], date_col='Enddate')
fckz_df: DataFrame = _load_csmar_yearly(
    '融资约束—KZ指数.zip', 'BDT_FinConstKZ.csv', ['KZ', 'TobinQ'], date_col='Enddate')
fcfc_df: DataFrame = _load_csmar_yearly(
    '融资约束—FC指数.zip', 'BDT_FinConstFC.csv', ['FC', 'EBIT'], date_col='Enddate')


In [34]:
# Outer-merge every CSMAR frame onto the financial-index frame, one after the
# other, on the (ticker, year) key. Column names shared by two frames receive
# pandas' default _x/_y suffixes.
csmar_data_df: DataFrame = finidx_df
for frame_to_add in (rd_spend_df, gov_grant_df2, finindex_df, fcww_df, fckz_df, fcfc_df, fcsa_df):
    csmar_data_df = csmar_data_df.merge(frame_to_add, on=[const.TICKER, const.YEAR], how='outer')

In [35]:
# Collapse each _x/_y pair left by the merges into a single column: take the
# _x value and fall back to the _y value where _x is missing.
drop_keys = []
for base_name in ['TotalAssets', 'TotalLiabilities', 'OperatingRevenue']:
    primary_col = f'{base_name}_x'
    fallback_col = f'{base_name}_y'
    csmar_data_df[base_name] = csmar_data_df[primary_col].fillna(csmar_data_df[fallback_col])
    drop_keys.extend([primary_col, fallback_col])

csmar_data_df = csmar_data_df.drop(columns=drop_keys)

# The pickle holds every year; the in-memory frame is narrowed to years after 2010.
csmar_pickle_path = os.path.join(const.TEMP_PATH, '20240908_csmar_temp_data.pkl')
csmar_data_df.to_pickle(csmar_pickle_path)
csmar_data_df = csmar_data_df[csmar_data_df[const.YEAR] > 2010].copy()
csmar_data_df.shape

(56771, 31)

In [47]:
# Summary statistics of the numeric columns, shown as the cell's output.
csmar_data_df.describe()

Unnamed: 0,tic,EarningBI,TXPD,NetIncome,OCF,Surplus,Outcap,year,RDPerson,RDSpendSum,...,OperatingRevenueGrowth,WW,TobinQ,KZ,EBIT,FC,SA,TotalAssets,TotalLiabilities,OperatingRevenue
count,56771.0,48723.0,48448.0,48723.0,48714.0,47999.0,48656.0,56771.0,47887.0,47887.0,...,46443.0,40072.0,44085.0,44085.0,44086.0,44086.0,46501.0,48734.0,48734.0,48197.0
mean,350597.883233,1232104000.0,250232400.0,983274500.0,1897705000.0,951302600.0,836731500.0,2017.868348,447.083697,217698100.0,...,3.658458,-1.161971,2.167367,0.968429,826085400.0,0.511121,-3.828629,64086860000.0,53573590000.0,1893662000.0
std,281099.89975,11475310000.0,2292300000.0,9274405000.0,27331730000.0,7252134000.0,6239530000.0,3.582191,4211.965467,1126431000.0,...,628.534677,23.682564,5.807248,2.844719,5129697000.0,0.290494,0.323962,885160200000.0,812021300000.0,91507320000.0
min,1.0,-71301880000.0,-10755000000.0,-68742560000.0,-588009000000.0,-65174500000.0,-7556230.0,2011.0,0.0,0.0,...,-2.683831,-4712.153159,0.608519,-13.102681,-65174500000.0,0.0,-5.980174,3083701.0,-2033024.0,-19618490000000.0
25%,2604.0,37263700.0,5262122.0,30506910.0,10989590.0,46500480.0,35814830.0,2015.0,0.0,20521110.0,...,-0.045281,-1.066531,1.234627,-0.608559,49220630.0,0.25541,-4.0183,1550913000.0,423803500.0,166082500.0
50%,300815.0,128576000.0,19917750.0,107235900.0,117027600.0,148455600.0,115973800.0,2018.0,118.0,48228200.0,...,0.089951,-1.015326,1.58655,1.23607,154424900.0,0.546484,-3.836862,3439027000.0,1317950000.0,395552300.0
75%,601098.0,423748100.0,71382770.0,353457800.0,450100300.0,472622400.0,348368900.0,2021.0,332.0,122745000.0,...,0.248646,-0.968374,2.27761,2.759846,474501800.0,0.76743,-3.656814,9279478000.0,4646315000.0,1079574000.0
max,900957.0,424899000000.0,85515000000.0,365116000000.0,1825282000000.0,363235000000.0,330861000000.0,2023.0,836001.0,73839000000.0,...,134607.05841,0.250704,729.629281,15.431583,255568000000.0,0.998306,-0.287821,44697080000000.0,40920490000000.0,1627613000000.0
