In [2]:
import os
import zipfile

import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy.stats.mstats import winsorize

from Constant import Constants as const
from OrganizeData.step02_merge_all_financial_data import sort_csmar_data

# Sort Finance Cost Data

In [21]:
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '1990_2023_Income Statement.zip'), 'r') as zip_ref:
    with zip_ref.open('FS_Comins.csv') as csv_file:
        finidx_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip',
                                           usecols=['Stkcd', 'Accper', 'B001211000', 'B001211101',
                                                    'Bbd1102203']).rename(
            columns={'B001211000': 'FinaExpense', 'B001211101': 'InterestExpense',
                     'Bbd1102203': 'InterestExpenses'}).dropna(
            subset=['FinaExpense', 'InterestExpense', 'InterestExpenses'], how='all')
        finidx_df: DataFrame = sort_csmar_data(finidx_df)

In [28]:
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '1990_2023_Balance Sheet.zip'), 'r') as zip_ref:
    with zip_ref.open('FS_Combas.csv') as csv_file:
        fscombas_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip',
                                             usecols=['Stkcd', 'Accper', 'A001000000', 'A002000000']).rename(
            columns={'A001000000': 'at', 'A002000000': 'lt'})
        fscombas_df: DataFrame = sort_csmar_data(fscombas_df)

In [29]:
fc_df: DataFrame = finidx_df.merge(fscombas_df, on=[const.TICKER, const.YEAR], how='left')
fc_df.sort_values(by=[const.TICKER, const.YEAR], ascending=True, inplace=True)
fc_df.loc[:, 'lag_at'] = fc_df.groupby(const.TICKER)['at'].shift(1)
fc_df.loc[:, 'lag_lt'] = fc_df.groupby(const.TICKER)['lt'].shift(1)

fc_df['fe_at'] = fc_df['FinaExpense'] / fc_df['at']
fc_df['fe_lt'] = fc_df['FinaExpense'] / fc_df['lt']
fc_df['fe_lat'] = fc_df['FinaExpense'] / fc_df['lag_at']
fc_df['fe_llt'] = fc_df['FinaExpense'] / fc_df['lag_lt']

fc_df['ie_at'] = fc_df['InterestExpense'] / fc_df['at']
fc_df['ie_lt'] = fc_df['InterestExpense'] / fc_df['lt']
fc_df['ie_lat'] = fc_df['InterestExpense'] / fc_df['lag_at']
fc_df['ie_llt'] = fc_df['InterestExpense'] / fc_df['lag_lt']

fc_df['ies_at'] = fc_df['InterestExpenses'] / fc_df['at']
fc_df['ies_lt'] = fc_df['InterestExpenses'] / fc_df['lt']
fc_df['ies_lat'] = fc_df['InterestExpenses'] / fc_df['lag_at']
fc_df['ies_llt'] = fc_df['InterestExpenses'] / fc_df['lag_lt']



Unnamed: 0,tic,InterestExpenses,FinaExpense,InterestExpense,year,at,lt,lag_at,lag_lt,fe_at,...,fe_lat,fe_llt,ie_at,ie_lt,ie_lat,ie_llt,ies_at,ies_lt,ies_lat,ies_llt
0,1,3.972235e+08,,,1993,9.337871e+09,8.148741e+09,,,,...,,,,,,,0.042539,0.048747,,
1,1,5.994891e+08,,,1994,1.548841e+10,1.382858e+10,9.337871e+09,8.148741e+09,,...,,,,,,,0.038706,0.043351,0.064200,0.073568
2,1,8.678880e+08,,,1995,2.031248e+10,1.835194e+10,1.548841e+10,1.382858e+10,,...,,,,,,,0.042727,0.047291,0.056035,0.062760
3,1,1.154543e+09,,,1996,3.002203e+10,2.748234e+10,2.031248e+10,1.835194e+10,,...,,,,,,,0.038457,0.042010,0.056839,0.062911
4,1,1.221109e+09,,,1997,3.179863e+10,2.839795e+10,3.002203e+10,2.748234e+10,,...,,,,,,,0.038401,0.043000,0.040674,0.044432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70374,900957,,32709337.44,32271333.76,2019,5.549938e+08,9.741982e+07,5.520247e+08,1.156318e+08,5.893640e-02,...,5.925339e-02,0.282875,0.058147,0.331260,0.058460,0.279087,,,,
70375,900957,,30126704.08,30168772.78,2020,5.733646e+08,1.018166e+08,5.549938e+08,9.741982e+07,5.254371e-02,...,5.428296e-02,0.309246,0.052617,0.296305,0.054359,0.309678,,,,
70376,900957,,28805657.42,,2021,5.801044e+08,9.482549e+07,5.733646e+08,1.018166e+08,4.965599e-02,...,5.023969e-02,0.282917,,,,,,,,
70377,900957,,-2542.01,,2022,5.918676e+08,9.849196e+07,5.801044e+08,9.482549e+07,-4.294896e-06,...,-4.381987e-06,-0.000027,,,,,,,,


In [30]:
fc_df.replace([np.inf, -np.inf], np.nan, inplace=True)
fc_df.describe()

Unnamed: 0,tic,InterestExpenses,FinaExpense,InterestExpense,year,at,lt,lag_at,lag_lt,fe_at,...,fe_lat,fe_llt,ie_at,ie_lt,ie_lat,ie_llt,ies_at,ies_lt,ies_lat,ies_llt
count,70379.0,15028.0,69634.0,25157.0,70379.0,70221.0,70216.0,64551.0,64546.0,69474.0,...,63884.0,63876.0,25156.0,25157.0,23632.0,23631.0,15028.0,15022.0,14141.0,14136.0
mean,339479.682377,2598846000.0,83213860.0,146657600.0,2013.669461,39900820000.0,33247620000.0,37707960000.0,31454590000.0,0.032418,...,0.013242,0.347566,0.010208,0.3481,0.011657,0.454932,0.001353,0.560346,0.00221,0.136404
std,292799.994027,23776110000.0,514633100.0,694318000.0,7.627352,705289400000.0,648819400000.0,665236200000.0,611978900000.0,5.191898,...,0.189967,31.158398,0.024119,16.262805,0.024459,20.735176,0.010305,51.070826,0.057734,10.428371
min,1.0,-32809.45,-8605000000.0,-5812000000.0,1991.0,0.0,-2033024.0,0.0,-2033024.0,-0.181792,...,-0.383092,-1526.823132,-0.058643,-1.027385,-0.047658,-1.033818,-6.2e-05,-0.000221,-5.8e-05,-0.000174
25%,2230.0,0.0,-845122.3,2697814.0,2009.0,1009060000.0,242936100.0,967819300.0,233086800.0,-0.00065,...,-0.000595,-0.002568,0.001282,0.005457,0.001634,0.007273,0.0,0.0,0.0,0.0
50%,300651.0,0.0,8716583.0,16318780.0,2016.0,2155166000.0,668824400.0,2055436000.0,633164500.0,0.005445,...,0.006656,0.018614,0.005213,0.015677,0.006483,0.020578,0.0,0.0,0.0,0.0
75%,600713.0,0.0,40654970.0,69656840.0,2020.0,5102345000.0,1975373000.0,4800939000.0,1823902000.0,0.015247,...,0.017624,0.040931,0.012689,0.03219,0.014865,0.039401,0.0,0.0,0.0,0.0
max,900957.0,554819000000.0,27816000000.0,30409000000.0,2023.0,42437950000000.0,38952250000000.0,37739290000000.0,34393660000000.0,1364.361569,...,29.131217,5345.976906,1.570772,1568.057792,1.032464,1883.610966,0.353779,6022.492599,5.261082,1097.915367


In [59]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.OUTPUT_PATH, '20241020_cc_reg_data_v2.dta'))
fc_useful_df: DataFrame = fc_df.loc[:,
                          [const.TICKER, const.YEAR, 'fe_at', 'fe_lat', 'fe_llt', 'fe_lt', 'ie_at', 'ie_lat', 'ie_lt',
                           'ie_llt', 'ies_at', 'ies_lt', 'ies_lat', 'ies_llt']].copy()
reg_df2: DataFrame = reg_df.merge(fc_useful_df, on=[const.TICKER, const.YEAR], how='left')

fc_useful_df[const.YEAR] -= 1
reg_df2: DataFrame = reg_df2.merge(fc_useful_df, on=[const.TICKER, const.YEAR], how='left', suffixes=('', '_1'))

In [38]:
reg_df2.drop_duplicates(subset=[const.TICKER, const.YEAR]).describe()


Unnamed: 0,zscore,ppe1,concurrentposition,year,city4,insinvestorprop,tbq3,ppe2,tic,kz,...,fe_llt_1,fe_lt_1,ie_at_1,ie_lat_1,ie_llt_1,ie_llt_1.1,ies_at_1,ies_lt_1,ies_lat_1,ies_llt_1
count,37836.0,38948.0,38980.0,41072.0,40840.0,39668.0,38157.0,40536.0,41072.0,33050.0,...,40529.0,40532.0,13829.0,13830.0,13829.0,13829.0,13866.0,13860.0,13846.0,13841.0
mean,4.644661,0.371384,0.262442,2014.312159,3508.336914,45.814915,2.628775,0.431181,310949.572361,0.606748,...,0.492202,0.322825,0.014084,0.014857,0.600229,0.600229,0.00108,0.606451,0.001904,0.138329
std,5.626894,0.188788,0.439967,4.175722,1273.811035,24.337039,2.026632,0.216432,284214.839517,2.360656,...,37.959273,32.421451,0.030876,0.030155,21.852201,21.852201,0.009961,53.168464,0.058195,10.538799
min,-1.201388,0.0,0.0,2006.0,1101.0,0.0001,0.841669,0.0,1.0,-10.906232,...,-1526.823132,-2109.334712,-0.058643,-0.047658,-1.033818,-1.033818,-6.2e-05,-0.000221,-5.8e-05,-0.000174
25%,1.696103,0.231581,0.0,2011.0,3101.0,27.173825,1.35422,0.265134,2222.0,-0.679577,...,-0.001886,-0.001713,0.002378,0.002616,0.011162,0.011162,0.0,0.0,0.0,0.0
50%,2.916333,0.359517,0.0,2015.0,3402.0,47.88615,1.969714,0.416556,300274.0,0.886243,...,0.021406,0.018637,0.00823,0.009089,0.027766,0.027766,0.0,0.0,0.0,0.0
75%,5.231857,0.500105,1.0,2018.0,4403.0,64.751425,3.104649,0.583437,600587.0,2.145922,...,0.046242,0.041501,0.017628,0.018841,0.049246,0.049246,0.0,0.0,0.0,0.0
max,36.381415,0.994555,1.0,2020.0,6542.0,113.711,12.566607,1.0,900957.0,10.983211,...,5345.976906,4239.189086,1.570772,1.032464,1795.513574,1795.513574,0.353779,6022.492599,5.261082,1097.915367


In [58]:
print(fc_useful_df.dtypes)

tic          int64
year         int32
fe_at      float64
fe_lat     float64
fe_llt     float64
fe_lt      float64
ie_at      float64
ie_lat     float64
ie_llt     float64
ie_llt     float64
ies_at     float64
ies_lt     float64
ies_lat    float64
ies_llt    float64
dtype: object


In [60]:
for key in ['fe_at', 'fe_lat', 'fe_llt', 'fe_lt', 'ie_at', 'ie_lat', 'ie_llt', 'ie_llt', 'ies_at', 'ies_lt', 'ies_lat',
            'ies_llt']:
    reg_df2.loc[reg_df2[key].notnull(), key] = winsorize(reg_df2[key].dropna(), limits=(0.005, 0.005))
    # fc_useful_df.loc[fc_useful_df[key].notnull(), key] = winsorize(fc_useful_df[key].dropna(), limits=(0.005, 0.005))
    reg_df2.loc[reg_df2[f'{key}_1'].notnull(), f'{key}_1'] = winsorize(reg_df2[f'{key}_1'].dropna(),
                                                                       limits=(0.005, 0.005))

In [51]:
fc_useful_df

Unnamed: 0,tic,year,fe_at,fe_lat,fe_llt,fe_lt,ie_at,ie_lat,ie_llt,ie_llt.1,ies_at,ies_lt,ies_lat,ies_llt
0,1,1992,,,,,,,,,0.02858,0.048747,,
1,1,1993,,,,,,,,,0.02858,0.043351,0.03456,0.073568
2,1,1994,,,,,,,,,0.02858,0.047291,0.03456,0.062760
3,1,1995,,,,,,,,,0.02858,0.042010,0.03456,0.062911
4,1,1996,,,,,,,,,0.02858,0.043000,0.03456,0.044432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70374,900957,2018,5.893640e-02,5.925339e-02,0.282875,0.335757,0.058147,0.058460,0.279087,0.279087,,,,
70375,900957,2019,5.254371e-02,5.428296e-02,0.309246,0.295892,0.052617,0.054359,0.309678,0.309678,,,,
70376,900957,2020,4.965599e-02,5.023969e-02,0.282917,0.303775,,,,,,,,
70377,900957,2021,-4.294896e-06,-4.381987e-06,-0.000027,-0.000026,,,,,,,,


In [63]:
reg_df2.drop_duplicates(subset=[const.TICKER, const.YEAR]).to_stata(
    os.path.join(const.OUTPUT_PATH, '20241026_cc_reg_data.dta'), write_index=False, version=119
)

# Construct Bond level data

In [3]:
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '债券基本情况表.zip'), 'r') as zip_ref:
    with zip_ref.open('BND_Bndinfo.csv') as csv_file:
        bnd_info_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip').dropna(subset=['IssSymbol'])

  bnd_info_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip').dropna(subset=['IssSymbol'])


In [12]:
df_gov = pd.read_excel(os.path.join(const.DATABASE_PATH, 'resset', '1981_2024_国债数据.xls'))

In [29]:
# Convert dates to datetime format
bnd_info_df['Listdt'] = pd.to_datetime(bnd_info_df['Listdt'])
df_gov['首次信息发布时间_IInfoPubDt'] = pd.to_datetime(df_gov['首次信息发布时间_IInfoPubDt'])

df_gov2: DataFrame = df_gov[df_gov['初始票面年利率(%)_CoupRt'] > 0].copy()

In [17]:
from datetime import timedelta


# Function to match the government bond to corporate bond based on similar issuance time and period
def match_bonds(corp_row, gov_bonds):
    # Filter government bonds within one year of the corporate bond issuance date
    filtered_gov_bonds = gov_bonds[
        (gov_bonds['首次信息发布时间_IInfoPubDt'] >= corp_row['Listdt'] - timedelta(days=365)) &
        (gov_bonds['首次信息发布时间_IInfoPubDt'] <= corp_row['Listdt'] + timedelta(days=365))]

    if filtered_gov_bonds.empty:
        return None

    issuance_time_diff = abs(filtered_gov_bonds['首次信息发布时间_IInfoPubDt'] - corp_row['Listdt'])
    issuance_period_diff = abs(filtered_gov_bonds['债券期限_年(年)_Maturity'] - corp_row['Term'])
    total_diff = issuance_time_diff.dt.days + issuance_period_diff * 365

    matched_gov_bond = filtered_gov_bonds.loc[total_diff.idxmin()]
    return matched_gov_bond['初始票面年利率(%)_CoupRt']

In [30]:
# Add government bond interest rate to corporate bonds
bnd_info_df['GovInterestRate'] = bnd_info_df.apply(lambda x: match_bonds(x, df_gov2), axis=1)

In [24]:
corp_row = bnd_info_df.loc[68]
# Filter government bonds within one year of the corporate bond issuance date
filtered_gov_bonds = df_gov[(df_gov['首次信息发布时间_IInfoPubDt'] >= corp_row['Listdt'] - timedelta(days=365)) &
                            (df_gov['首次信息发布时间_IInfoPubDt'] <= corp_row['Listdt'] + timedelta(days=365))]

In [26]:
issuance_time_diff = abs(filtered_gov_bonds['首次信息发布时间_IInfoPubDt'] - corp_row['Listdt'])
issuance_period_diff = abs(filtered_gov_bonds['债券期限_年(年)_Maturity'] - corp_row['Term'])
total_diff = issuance_time_diff.dt.days + issuance_period_diff * 365

matched_gov_bond = filtered_gov_bonds.loc[total_diff.idxmin()]

In [36]:
import datetime

bnd_info_df['Spread'] = bnd_info_df['Intrrate'] - bnd_info_df['GovInterestRate']
for key in ['Crdeem', 'Crtsell', 'Creplm']:
    bnd_info_df[key].replace({'Y': 1, 'N': 0}, inplace=True)

bnd_info_df.drop(['Basrted', 'Bemkrate'], axis=1, inplace=True)
bnd_info_df.loc[:, 'isPut'] = (bnd_info_df['CallOrPut'] == 'P').astype(int)
bnd_info_df.loc[:, 'isCall'] = (bnd_info_df['CallOrPut'] == 'C').astype(int)

bnd_info_df.loc[:, 'Post'] = bnd_info_df['Listdt'].apply(lambda x: int(x > datetime.datetime(2014, 5, 19)))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  bnd_info_df[key].replace({'Y': 1, 'N': 0}, inplace=True)


In [52]:
bnd_info_df.keys()

Index(['Liscd', 'Abbrnme', 'Sctcd', 'Listdt', 'SecurityID', 'Varsortcd',
       'Varsort', 'Orgid', 'IssSymbol', 'Acisuquty', 'Pooprc', 'Term',
       'Intrrate', 'Crdrate', 'Crdeem', 'Crtsell', 'Creplm', 'OptType',
       'CallOrPut', 'GovInterestRate', 'Spread', 'isPut', 'isCall', 'Post',
       'year'],
      dtype='object')

In [39]:
bnd_info_df.loc[:, const.YEAR] = bnd_info_df['Listdt'].dt.year

In [40]:
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '财务指标文件.zip'), 'r') as zip_ref:
    with zip_ref.open('CSR_Finidx.csv') as csv_file:
        finidx_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip')
        finidx_df: DataFrame = sort_csmar_data(finidx_df)

In [42]:
finidx_df['ln_at'] = finidx_df['A100000'].apply(np.log)
finidx_df['lev'] = finidx_df['A200000'] / finidx_df['A100000']

In [66]:
# finidx_df[const.YEAR] += 1
bnd_reg_df = bnd_info_df.merge(finidx_df, how='left', left_on=['IssSymbol', const.YEAR], right_on=['tic', const.YEAR])

In [55]:
bnd_reg_df['Liscd'] = bnd_reg_df['Liscd'].astype(int)

In [67]:
reg_df = pd.read_stata(os.path.join(const.OUTPUT_PATH, '20241026_cc_reg_data_v2.dta'))[
    ['tic', const.YEAR, 'has_guarantee']]

bnd_reg_df = bnd_reg_df.drop(['tic'], axis=1).merge(reg_df, how='left', left_on=['IssSymbol', const.YEAR],
                                                    right_on=['tic', const.YEAR])
bnd_reg_df.loc[:, 'has_guarantee'] = bnd_reg_df['has_guarantee'].fillna(0)

In [60]:
bnd_reg_df['rate'] = bnd_reg_df['Crdrate'].apply(lambda x: 0 if pd.isna(x) else 1 if 'B' in x else 2)

In [63]:
with zipfile.ZipFile(os.path.join(const.CSMAR_PATH, '债券及主体评级情况表.zip'), 'r') as zip_ref:
    with zip_ref.open('BND_Rating.csv') as csv_file:
        bnd_rate_df: DataFrame = pd.read_csv(csv_file, on_bad_lines='skip')

bnd_rate_df['DeclareDate'] = pd.to_datetime(bnd_rate_df['DeclareDate'])
bnd_rate_df['Liscd'] = bnd_rate_df['Liscd'].astype(int)

In [64]:
# Merge rating information with bond basic information, using the rating closest to the bond Listdt
def match_rating(corp_row, ratings_df):
    filtered_ratings = ratings_df[ratings_df['Liscd'] == corp_row['Liscd']]
    if filtered_ratings.empty:
        return pd.Series([None, None])

    filtered_ratings['date_diff'] = abs(filtered_ratings['DeclareDate'] - corp_row['Listdt'])
    closest_rating = filtered_ratings.loc[filtered_ratings['date_diff'].idxmin()]
    return pd.Series([closest_rating['BtcrAdj'], closest_rating['CtcrAdj']])

In [65]:
bnd_info_df[['BtcrAdj', 'CtcrAdj']] = bnd_info_df.apply(lambda x: match_rating(x, bnd_rate_df), axis=1)


In [73]:
rating_dict = {'AAA+': 12,
               'AAA': 11,
               'AA+sf': 10,
               'AA+': 10,
               'AA': 9,
               'AA-': 8,
               'AA- ': 8,
               'A+': 7,
               'A': 6,
               'A-1': 5,
               'A-1 ': 5,
               'A-': 5,
               'BBB+': 4,
               'BBB': 3,
               'BBB-': 2,
               'B': 1,
               'C': 0}
for key in ['Crdrate', 'BtcrAdj', 'CtcrAdj']:
    bnd_reg_df[f'{key}_val'] = bnd_reg_df[key].replace(rating_dict)
    bnd_reg_df[key] = bnd_reg_df[key].fillna('C')
    bnd_reg_df[f'{key}_val'] = bnd_reg_df[f'{key}_val'].fillna(0)

  bnd_reg_df[f'{key}_val'] = bnd_reg_df[key].replace(rating_dict)
  bnd_reg_df[f'{key}_val'] = bnd_reg_df[key].replace(rating_dict)
  bnd_reg_df[f'{key}_val'] = bnd_reg_df[key].replace(rating_dict)


In [75]:
bnd_reg_df['Liscd'] = bnd_reg_df['Liscd'].astype(int)

In [76]:
bnd_reg_df.to_stata(os.path.join(const.OUTPUT_PATH, '20241027_corporate_bond_spread.dta'), write_index=False,
                    version=119)

# Calculate Investment Efficiency

Source: Baik, D. (Young-I., Chen, C. X., & Godsell, D. (2024). Board Gender Diversity and Investment Efficiency: Global Evidence from 83 Country-Level Interventions. The Accounting Review, 99(3), 1–36. https://doi.org/10.2308/TAR-2022-0251


In [81]:
import statsmodels.api as sm

full_reg_df = pd.read_stata(os.path.join(const.OUTPUT_PATH, '20241026_cc_reg_data_v2.dta'))

reg_df = full_reg_df[['indcd', 'sale_growth', 'tic', 'year', 'CAPEX_RDI_lat_1']].dropna(how='any')

# Add an indicator for negative sales growth
reg_df['negative_sales_growth'] = np.where(reg_df['sale_growth'] < 0, 1, 0)

# Prepare independent variables
# Sales growth and interaction term
reg_df['interaction'] = reg_df['negative_sales_growth'] * reg_df['sale_growth']

# Dependent variable
y = reg_df['CAPEX_RDI_lat_1']

# Estimate the model for each industry code (cross-sectional regression)
residuals = []
for indcd, group in reg_df.groupby('indcd'):
    X = group[['sale_growth', 'negative_sales_growth', 'interaction']]
    X = sm.add_constant(X)  # Adds a constant term for the intercept
    model = sm.OLS(group[y.name], X)
    results = model.fit()
    group['residuals'] = results.resid
    residuals.append(group)

# Combine residuals from all industry groups
df_with_residuals = pd.concat(residuals)

# Calculate investment efficiency
# Absolute value of residual * -1
df_with_residuals['investment_efficiency'] = -1 * df_with_residuals['residuals'].abs()

# Alternate definition: 1 if absolute value of residual * -1 is above median, otherwise 0
median_residual = df_with_residuals['investment_efficiency'].median()
df_with_residuals['investment_efficiency_alt'] = np.where(df_with_residuals['investment_efficiency'] > median_residual,
                                                          1, 0)

# Display the result
df_with_residuals[['indcd', 'CAPEX_RDI_lat_1', 'sale_growth', 'negative_sales_growth', 'investment_efficiency',
                   'investment_efficiency_alt']].head()


Unnamed: 0,indcd,CAPEX_RDI_lat_1,sale_growth,negative_sales_growth,investment_efficiency,investment_efficiency_alt
16794,,0.002151,0.364444,0,-0.04885,0
16795,,0.00676,0.0546,0,-0.043109,0
16796,,0.001904,0.066165,0,-0.048008,0
16797,,0.001213,0.013319,0,-0.048505,0
16798,,0.000509,0.097705,0,-0.049518,0


In [85]:
df_with_residuals.head()

Unnamed: 0,indcd,sale_growth,tic,year,CAPEX_RDI_lat_1,negative_sales_growth,interaction,residuals,investment_efficiency,investment_efficiency_alt
16794,,0.364444,200002,2007.0,0.002151,0,0.0,-0.04885,-0.04885,0
16795,,0.0546,200002,2008.0,0.00676,0,0.0,-0.043109,-0.043109,0
16796,,0.066165,200002,2009.0,0.001904,0,0.0,-0.048008,-0.048008,0
16797,,0.013319,200002,2010.0,0.001213,0,0.0,-0.048505,-0.048505,0
16798,,0.097705,200002,2011.0,0.000509,0,0.0,-0.049518,-0.049518,0


In [90]:
full_reg_df2['investment_efficiency'].astype(float)

0             NaN
1       -0.000211
2       -0.000140
3       -0.000101
4       -0.000716
           ...   
41067   -0.015118
41068   -0.013333
41069   -0.012785
41070   -0.008934
41071   -0.008259
Name: investment_efficiency, Length: 41072, dtype: float64

In [91]:
full_reg_df2 = full_reg_df.merge(
    df_with_residuals[[const.TICKER, const.YEAR, 'investment_efficiency', 'investment_efficiency_alt']],
    on=[const.TICKER, const.YEAR], how='left')
full_reg_df2['investment_efficiency'] = full_reg_df2['investment_efficiency'].astype(float)
full_reg_df2.to_stata(os.path.join(const.OUTPUT_PATH, '20241027_cc_reg_data.dta'), write_index=False, version=119)

In [84]:
df_with_residuals['indcd'].unique()

array(['', 'A01', 'A02', 'A03', 'A04', 'A05', 'B06', 'B07', 'B08', 'B09',
       'B10', 'B11', 'C13', 'C14', 'C15', 'C17', 'C18', 'C19', 'C20',
       'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27', 'C28', 'C29',
       'C30', 'C31', 'C32', 'C33', 'C34', 'C35', 'C36', 'C37', 'C38',
       'C39', 'C40', 'C41', 'C42', 'C43', 'D44', 'D45', 'D46', 'E47',
       'E48', 'E49', 'E50', 'F51', 'F52', 'G53', 'G54', 'G55', 'G56',
       'G58', 'G59', 'G60', 'H61', 'H62', 'I63', 'I64', 'I65', 'J66',
       'J67', 'J68', 'J69', 'K70', 'L71', 'L72', 'M73', 'M74', 'M75',
       'N77', 'N78', 'O79', 'O80', 'O81', 'P82', 'Q83', 'R85', 'R86',
       'R87', 'R88', 'S90'], dtype=object)