In [1]:
import os

import pandas as pd
from pandas import DataFrame
import numpy as np
from scipy.stats.mstats import winsorize

from Constants import Constants as const

In [2]:
pc3_df: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'firm_year_num_political_donar_3years.dta')).rename(
    columns={'recipient_ext_id': 'numDonation3year'}).drop(['index'], axis=1)
pc4_df: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'firm_year_num_political_donar_4year.dta')).rename(
    columns={'recipient_ext_id': 'numDonation4year'}).drop(['index'], axis=1)

In [7]:
pc3_avg = pc3_df.groupby([const.GVKEY])['numDonation3year'].mean()
pc4_avg = pc4_df.groupby([const.GVKEY])['numDonation4year'].mean()

In [9]:
pc3_avg_df: DataFrame = pc3_avg.reset_index(drop=False)
pc4_avg_df: DataFrame = pc4_avg.reset_index(drop=False)

In [10]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241014_stock_act_reg_data_v2.dta'))
gvkey_series = reg_df[const.GVKEY].unique()

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241014_stock_act_reg_data_v2.dta'))


In [13]:
# Update the function to handle numpy.ndarray as input
def ensure_gvkeys_with_ndarray(df, gvkey_array, num_col_name):
    # Convert the numpy array to a set for quick lookup
    gvkey_set = set(gvkey_array)
    # Identify missing gvkeys
    existing_gvkeys = set(df['gvkey'])
    missing_gvkeys = gvkey_set - existing_gvkeys
    # Create a DataFrame with missing gvkeys
    missing_rows = pd.DataFrame({
        'gvkey': list(missing_gvkeys),
        num_col_name: [None] * len(missing_gvkeys)
    })
    # Concatenate the original DataFrame with missing rows and sort by gvkey
    updated_df = pd.concat([df, missing_rows], ignore_index=True)
    updated_df = updated_df.sort_values('gvkey').reset_index(drop=True)
    return updated_df

In [14]:
# Update pc3_avg_df and pc4_avg_df
pc3_avg_df2 = ensure_gvkeys_with_ndarray(pc3_avg_df, gvkey_series, 'numDonation3year')
pc4_avg_df2 = ensure_gvkeys_with_ndarray(pc4_avg_df, gvkey_series, 'numDonation4year')


  updated_df = pd.concat([df, missing_rows], ignore_index=True)


In [16]:
pc_avg_df: DataFrame = pc3_avg_df2.merge(pc4_avg_df2, on=[const.GVKEY])
pc_avg_df.shape

(1837, 3)

In [17]:
for key in ['numDonation3year', 'numDonation4year']:
    pc_avg_df[key] = pc_avg_df[key].fillna(0)

In [20]:
pc_avg_df['highDonation3Year'] = (pc_avg_df['numDonation3year'] > pc_avg_df['numDonation3year'].median()).astype(int)
pc_avg_df['highDonation4Year'] = (pc_avg_df['numDonation4year'] > pc_avg_df['numDonation4year'].median()).astype(int)

In [22]:
reg_df2: DataFrame = reg_df.merge(pc_avg_df, on=[const.GVKEY], how='left')
reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20241212_stock_act_reg_data.dta'), write_index=False, version=119)

# Append Government Contract data

In [2]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241212_stock_act_reg_data.dta'))
gov_contract_df: DataFrame = pd.read_csv(os.path.join(const.DATA_PATH, 'fromZGY', 'gov_characters_data.csv')).drop(['tic'], axis=1)

In [3]:
gov_contract_df.rename(columns=lambda x: '{}_num'.format(x) if x.endswith('gov') else x, inplace=True)

In [4]:
gov_contract_df

Unnamed: 0,fiscal_year,fed_gov_num,gov_num,state_gov_num,loc_gov_num,gvkey
0,2008.0,0.0,0.0,0.0,0.0,151832
1,2009.0,0.0,0.0,0.0,0.0,151832
2,2010.0,0.0,0.0,0.0,0.0,151832
3,2011.0,0.0,0.0,0.0,0.0,151832
4,2012.0,0.0,0.0,0.0,0.0,151832
...,...,...,...,...,...,...
85811,2011.0,0.0,0.0,0.0,0.0,30165
85812,2012.0,0.0,0.0,0.0,0.0,30165
85813,2013.0,0.0,0.0,0.0,0.0,30165
85814,2014.0,0.0,0.0,0.0,0.0,30165


In [10]:
reg_df_gov_num = reg_df.merge(gov_contract_df, on=[const.GVKEY, const.YEAR], how='left')
for key in gov_contract_df.keys():
    if key not in {const.GVKEY, const.YEAR}:
        reg_df_gov_num[key].fillna(0, inplace=True)
        reg_df_gov_num[key.replace('num', 'dummy')] = reg_df_gov_num[key].apply(lambda x: int(x > 0))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reg_df_gov_num[key].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reg_df_gov_num[key].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

In [11]:
reg_df_gov_num.describe()

Unnamed: 0,index,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,...,highDonation3Year,highDonation4Year,fed_gov_num,gov_num,state_gov_num,loc_gov_num,fed_gov_dummy,gov_dummy,state_gov_dummy,loc_gov_dummy
count,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,...,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0
mean,6036.5,81220.205317,2011.54696,0.363508,4.817459,0.868412,0.153553,0.508945,6.51741,0.181687,...,0.439043,0.465629,0.263541,0.295511,0.023025,0.008945,0.152725,0.160593,0.016316,0.006957
min,0.0,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3018.25,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.012064,0.00093,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6036.5,62592.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.520895,0.148584,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,9054.75,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.976261,0.264176,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,12073.0,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,...,1.0,1.0,19.0,19.0,10.0,3.0,1.0,1.0,1.0,1.0
std,3485.60791,73644.923742,2.279738,0.481029,8.551352,1.2453,0.360535,0.499941,2.163776,0.273871,...,0.496291,0.498838,0.862415,0.934783,0.219107,0.114772,0.359737,0.36717,0.126693,0.083122


In [12]:
reg_df_gov_num.to_stata(os.path.join(const.RESULT_PATH, '20241223_stock_act_reg_data.dta'), write_index=False, version=119)

# append some annual data

In [50]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241223_stock_act_reg_data.dta'))
gov_contract: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'gov_contract.dta')).rename(
    columns={'year': const.YEAR, 'if_gov_contract': 'hasGovContractPanel'}).drop(
    ['index', 'if_cpdata', 'if_costplus', 'if_cas', 'if_noncomm'], axis=1).dropna(how='any').drop_duplicates(
    subset=[const.GVKEY, const.YEAR], keep='first')
govsales_percentage: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'govsales_percentage.dta')).rename(
    columns={'tic': const.TICKER, 'annual_per_sale': 'GovSalePercent'}).drop(
    ['index', 'gov_annual_sales', 'all_annual_sales'], axis=1).dropna(how='any')



In [51]:
dollar_amount = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'fec_2008_2015_firmlevel_dollarnum.dta')).rename(
    columns={'year': const.YEAR, 'amount': 'contrAmt'}).drop(['index'], axis=1).dropna(how='any')
people_num = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'fec_2008_2015_firmlevel_peoplenum.dta')).rename(
    columns={'year': const.YEAR, 'recipient_ext_id': 'numPolitician'}).drop(['index'], axis=1).dropna(how='any')

In [52]:
gov_contract[const.YEAR] = gov_contract[const.YEAR].astype(int)
govsales_percentage[const.YEAR] = govsales_percentage[const.YEAR].astype(int)
dollar_amount[const.YEAR] = dollar_amount[const.YEAR].astype(int)
people_num[const.YEAR] = people_num[const.YEAR].astype(int)
gov_contract[const.GVKEY] = gov_contract[const.GVKEY].astype(int)
dollar_amount[const.GVKEY] = dollar_amount[const.GVKEY].astype(int)
people_num[const.GVKEY] = people_num[const.GVKEY].astype(int)

In [63]:
reg_df2: DataFrame = reg_df.merge(govsales_percentage, on=[const.TICKER, const.YEAR], how='left').merge(
    gov_contract, on=[const.GVKEY, const.YEAR], how='left').merge(
    dollar_amount, on=[const.GVKEY, const.YEAR], how='left').merge(
    people_num, on=[const.GVKEY, const.YEAR], how='left')
reg_df2.loc[:, 'hasGovContractPanel'] = reg_df2['hasGovContractPanel'].fillna(0)


In [47]:
reg_df2[['hasGovContract', 'hasGovContractPanel']].describe()

Unnamed: 0,hasGovContract,hasGovContractPanel
count,12074.0,12074.0
mean,0.281762,0.430098
std,0.449877,0.49511
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,1.0,1.0
max,1.0,1.0


In [64]:
reg_df2['hasGovPanelMajor'] = reg_df2.apply(lambda x: int((x['hasGovContractPanel'] == 1) and (x['MajorGovCustomer'] == 0)), axis=1)
reg_df2['hasGovPanelFed'] = reg_df2.apply(lambda x: int((x['hasGovContractPanel'] == 1) and (x['fed_gov_dummy'] == 0)), axis=1)

In [55]:
reg_df2.loc[:, ['GovSalePercent', 'contrAmt', 'numPolitician']].describe()

Unnamed: 0,GovSalePercent,contrAmt,numPolitician
count,1362.0,4984.0,4984.0
mean,0.421257,30176.18,33.989767
std,0.341576,104663.4,114.45102
min,0.001727,-6618.0,1.0
25%,0.104907,1250.0,2.0
50%,0.327493,5300.0,6.0
75%,0.694477,22643.0,22.0
max,1.0,3602357.0,2581.0


In [56]:
def create_high_low_dummy(df: DataFrame, check_key: str):
    df_median = df[check_key].median()
    df[f'High{check_key}'] = 0
    df[f'Low{check_key}'] = 0
    df.loc[df[check_key] > df_median, f'High{check_key}'] = 1
    df.loc[df[check_key] < df_median, f'Low{check_key}'] = 1

    return df


In [65]:
reg_df3 = reg_df2.copy()
for key in ['GovSalePercent', 'contrAmt', 'numPolitician']:
    reg_df3: DataFrame = reg_df3.groupby(const.YEAR).apply(create_high_low_dummy, check_key=key).reset_index(drop=True)


  reg_df3: DataFrame = reg_df3.groupby(const.YEAR).apply(create_high_low_dummy, check_key=key).reset_index(drop=True)
  reg_df3: DataFrame = reg_df3.groupby(const.YEAR).apply(create_high_low_dummy, check_key=key).reset_index(drop=True)
  reg_df3: DataFrame = reg_df3.groupby(const.YEAR).apply(create_high_low_dummy, check_key=key).reset_index(drop=True)


In [71]:
reg_df3['GovSalePercent'] = reg_df3['GovSalePercent'].fillna(0)
reg_df3['contrAmt'] = reg_df3['contrAmt'].fillna(0)
reg_df3['numPolitician'] = reg_df3['numPolitician'].fillna(0)

In [72]:
reg_df3.to_stata(os.path.join(const.RESULT_PATH, '20241231_stock_act_data.dta'), write_index=False, version=117)