In [1]:
# Import libraries
import pandas as pd

In [2]:
# Read the raw JKP factor returns from the local CSV file and preview them
raw_csv_path = 'JKP_factors_data.csv'
data = pd.read_csv(raw_csv_path)
display(data)

Unnamed: 0,location,name,freq,weighting,direction,n_stocks,n_stocks_min,date,ret
0,usa,age,monthly,vw_cap,-1,502,8,1926-03-31,-0.086738
1,usa,age,monthly,vw_cap,-1,505,12,1926-04-30,-0.000572
2,usa,age,monthly,vw_cap,-1,510,17,1926-05-31,-0.006332
3,usa,age,monthly,vw_cap,-1,512,21,1926-06-30,-0.003012
4,usa,age,monthly,vw_cap,-1,503,23,1926-07-31,0.003211
...,...,...,...,...,...,...,...,...,...
144615,usa,zero_trades_252d,monthly,vw_cap,1,3627,1622,2024-08-31,0.031409
144616,usa,zero_trades_252d,monthly,vw_cap,1,3631,1630,2024-09-30,-0.011177
144617,usa,zero_trades_252d,monthly,vw_cap,1,3614,1634,2024-10-31,-0.008210
144618,usa,zero_trades_252d,monthly,vw_cap,1,3595,1650,2024-11-30,-0.052747


In [3]:
# Parse the date column and keep only observations dated 2000-01-01 or later
data['date'] = pd.to_datetime(data['date'])
is_2000_onwards = data['date'] >= '2000-01-01'
data = data.loc[is_2000_onwards].reset_index(drop=True)

# Mapping from the raw column names to the names used in the rest of the notebook
columns_names = {'name':'JKP_factor', 'date':'Date', 'ret':'Factor_return'}

# Keep the three relevant columns, rename them, and move every date
# to the last day of its month (MonthEnd(0) leaves month-end dates untouched)
df_jkp_factors = (
    data[['date', 'name', 'ret']]
    .rename(columns=columns_names)
    .assign(Date=lambda frame: frame['Date'] + pd.offsets.MonthEnd(0))
)

df_jkp_factors

Unnamed: 0,Date,JKP_factor,Factor_return
0,2000-01-31,age,0.030254
1,2000-02-29,age,0.259166
2,2000-03-31,age,-0.211559
3,2000-04-30,age,-0.225892
4,2000-05-31,age,-0.136842
...,...,...,...
45895,2024-08-31,zero_trades_252d,0.031409
45896,2024-09-30,zero_trades_252d,-0.011177
45897,2024-10-31,zero_trades_252d,-0.008210
45898,2024-11-30,zero_trades_252d,-0.052747


In [4]:
# Count the missing entries in each column of the long-format table
df_jkp_factors.isnull().sum()

Date             0
JKP_factor       0
Factor_return    0
dtype: int64

In [5]:
# Reshape from long to wide: one row per Date, one column per factor,
# then move Date back from the index into a regular column
df_jkp_factors_wide = (
    df_jkp_factors
    .pivot(index='Date', columns='JKP_factor', values='Factor_return')
    .reset_index()
)
df_jkp_factors_wide

JKP_factor,Date,age,aliq_at,aliq_mat,ami_126d,at_be,at_gr1,at_me,at_turnover,be_gr1a,...,taccruals_at,taccruals_ni,tangibility,tax_gr1a,turnover_126d,turnover_var_126d,z_score,zero_trades_126d,zero_trades_21d,zero_trades_252d
0,2000-01-31,0.030254,-0.015875,0.018536,0.058705,0.048212,0.008562,-0.046928,-0.040382,0.011158,...,-0.001340,-0.023067,0.048493,-0.020490,-0.007638,-0.031576,0.016389,-0.006343,-0.002099,-0.000643
1,2000-02-29,0.259166,-0.222644,0.138305,0.126145,0.302350,-0.107640,-0.347456,-0.003528,-0.120481,...,0.031784,-0.061379,0.181442,0.054843,-0.267259,-0.098292,0.227692,-0.258749,-0.261256,-0.233215
2,2000-03-31,-0.211559,0.113484,-0.059384,-0.097385,-0.161972,0.060418,0.241058,0.068863,0.021303,...,-0.054041,0.016980,-0.093263,-0.009733,0.132038,0.150523,-0.126733,0.126554,0.115848,0.094000
3,2000-04-30,-0.225892,0.130952,-0.035645,-0.057125,-0.124413,0.092468,0.199574,0.070121,0.057028,...,-0.059227,-0.005137,-0.077151,-0.018708,0.132768,0.098770,-0.103011,0.128177,0.100690,0.123308
4,2000-05-31,-0.136842,0.138348,-0.028966,-0.031415,-0.114599,0.114336,0.151526,-0.020165,0.102710,...,-0.009222,0.031111,-0.049643,-0.026389,0.112625,0.054487,-0.105252,0.110061,0.080989,0.103525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2024-08-31,-0.009159,0.010798,0.018197,-0.035742,-0.027581,-0.009432,-0.012035,-0.008847,0.004118,...,-0.001930,0.005331,-0.027875,0.028630,0.028526,-0.004074,0.000093,0.028476,0.030556,0.031409
296,2024-09-30,0.015388,0.005373,0.007651,-0.009591,-0.007449,0.003045,-0.009871,0.003660,0.008696,...,0.010853,0.011034,-0.003835,0.016282,-0.009747,0.005741,-0.006687,-0.009855,-0.015172,-0.011177
297,2024-10-31,0.030176,-0.017033,-0.009476,0.002699,-0.020468,-0.015150,0.006831,-0.030677,0.002992,...,-0.003090,0.006834,0.007804,0.025230,-0.007247,-0.012540,-0.015458,-0.007306,-0.001944,-0.008210
298,2024-11-30,0.055421,-0.058225,-0.021419,0.017654,-0.017568,-0.029310,0.003207,-0.008216,-0.014642,...,0.011638,0.013911,0.015177,0.020192,-0.056217,-0.023397,0.001817,-0.056044,-0.046449,-0.052747


In [6]:
# Replace the factor abbreviations with the descriptive names listed in the
# "Factor Details" spreadsheet published by Jensen, Kelly, and Pedersen
chars = pd.read_excel('https://github.com/bkelly-lab/ReplicationCrisis/raw/master/GlobalFactors/Factor%20Details.xlsx')

# Dictionary: factor abbreviation (abr_jkp) -> descriptive name (name_new)
var_new_name = chars.set_index("abr_jkp")["name_new"].to_dict()

df_jkp_factors_wide_new_name = df_jkp_factors_wide.rename(columns=var_new_name)
df_jkp_factors_wide_new_name

JKP_factor,Date,Firm age,Liquidity of book assets,Liquidity of market assets,Amihud Measure,Book leverage,Asset Growth,Assets-to-market,Capital turnover,Change in common equity,...,Total accruals,Percent total accruals,Asset tangibility,Tax expense surprise,Share turnover,Coefficient of variation for share turnover,Altman Z-score,Number of zero trades with turnover as tiebreaker (6 months),Number of zero trades with turnover as tiebreaker (1 month),Number of zero trades with turnover as tiebreaker (12 months)
0,2000-01-31,0.030254,-0.015875,0.018536,0.058705,0.048212,0.008562,-0.046928,-0.040382,0.011158,...,-0.001340,-0.023067,0.048493,-0.020490,-0.007638,-0.031576,0.016389,-0.006343,-0.002099,-0.000643
1,2000-02-29,0.259166,-0.222644,0.138305,0.126145,0.302350,-0.107640,-0.347456,-0.003528,-0.120481,...,0.031784,-0.061379,0.181442,0.054843,-0.267259,-0.098292,0.227692,-0.258749,-0.261256,-0.233215
2,2000-03-31,-0.211559,0.113484,-0.059384,-0.097385,-0.161972,0.060418,0.241058,0.068863,0.021303,...,-0.054041,0.016980,-0.093263,-0.009733,0.132038,0.150523,-0.126733,0.126554,0.115848,0.094000
3,2000-04-30,-0.225892,0.130952,-0.035645,-0.057125,-0.124413,0.092468,0.199574,0.070121,0.057028,...,-0.059227,-0.005137,-0.077151,-0.018708,0.132768,0.098770,-0.103011,0.128177,0.100690,0.123308
4,2000-05-31,-0.136842,0.138348,-0.028966,-0.031415,-0.114599,0.114336,0.151526,-0.020165,0.102710,...,-0.009222,0.031111,-0.049643,-0.026389,0.112625,0.054487,-0.105252,0.110061,0.080989,0.103525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2024-08-31,-0.009159,0.010798,0.018197,-0.035742,-0.027581,-0.009432,-0.012035,-0.008847,0.004118,...,-0.001930,0.005331,-0.027875,0.028630,0.028526,-0.004074,0.000093,0.028476,0.030556,0.031409
296,2024-09-30,0.015388,0.005373,0.007651,-0.009591,-0.007449,0.003045,-0.009871,0.003660,0.008696,...,0.010853,0.011034,-0.003835,0.016282,-0.009747,0.005741,-0.006687,-0.009855,-0.015172,-0.011177
297,2024-10-31,0.030176,-0.017033,-0.009476,0.002699,-0.020468,-0.015150,0.006831,-0.030677,0.002992,...,-0.003090,0.006834,0.007804,0.025230,-0.007247,-0.012540,-0.015458,-0.007306,-0.001944,-0.008210
298,2024-11-30,0.055421,-0.058225,-0.021419,0.017654,-0.017568,-0.029310,0.003207,-0.008216,-0.014642,...,0.011638,0.013911,0.015177,0.020192,-0.056217,-0.023397,0.001817,-0.056044,-0.046449,-0.052747


# Handling outliers

In [7]:
# Collect the factor names: every column except the first one (Date)
factors = df_jkp_factors_wide_new_name.columns[1:].tolist()
factors

['Firm age',
 'Liquidity of book assets',
 'Liquidity of market assets',
 'Amihud Measure',
 'Book leverage',
 'Asset Growth',
 'Assets-to-market',
 'Capital turnover',
 'Change in common equity',
 'Book-to-market equity',
 'Market Beta',
 'Dimson beta',
 'Frazzini-Pedersen market beta',
 'Downside beta',
 'Book-to-market enterprise value',
 'The high-low bid-ask spread',
 'Abnormal corporate investment',
 'CAPEX growth (1 year)',
 'CAPEX growth (2 years)',
 'CAPEX growth (3 years)',
 'Cash-to-assets',
 'Net stock issues',
 'Change in current operating assets',
 'Change in current operating liabilities',
 'Cash-based operating profits-to-book assets',
 'Cash-based operating profits-to-lagged book assets',
 'Market correlation',
 'Coskewness',
 'Change in current operating working capital',
 'Net debt issuance',
 'Growth in book debt (3 years)',
 'Debt-to-market',
 'Change gross margin minus change sales',
 'Dividend yield',
 'Dollar trading volume',
 'Coefficient of variation for dolla

Z-scores ($z_i = \frac{x_i - \mu}{\sigma}$, where $\mu$ and $\sigma$ are the mean and standard deviation of the series) are often used to detect outliers in datasets. Observations with a z-score below -3 or above 3 are commonly deemed to be outliers.

In [8]:
# Standardise each factor column: subtract its mean and divide by its standard deviation
factor_returns = df_jkp_factors_wide_new_name[factors]
z_scores = factor_returns.sub(factor_returns.mean()).div(factor_returns.std())

# Largest absolute z-score observed in each column
outlier_columns = z_scores.abs().max()

# The 20 columns whose most extreme observation is the furthest from the mean
biggest_outliers = outlier_columns.sort_values(ascending=False).head(20)
biggest_outliers

JKP_factor
R&D capital-to-book assets                                     8.202260
Change in noncurrent operating liabilities                     7.964524
Book leverage                                                  7.581838
Coefficient of variation for dollar trading volume             7.564661
Net debt-to-price                                              7.175027
Price momentum t-3 to t-1                                      7.174680
Debt-to-market                                                 7.095864
Book-to-market enterprise value                                7.089557
Equity duration                                                7.071814
Assets-to-market                                               7.032712
Asset tangibility                                              7.005187
Book-to-market equity                                          6.994425
Cash flow volatility                                           6.966034
Ebitda-to-market enterprise value                    

In [9]:
# Identify the columns whose largest absolute z-score is 7 or more, and remove them
columns_to_drop = outlier_columns.loc[outlier_columns >= 7].index
df_jkp_factors_wide_new_name_out = df_jkp_factors_wide_new_name.drop(columns_to_drop, axis='columns')

# Factor names that remain after the removal, in their original order
factors_updated = list(filter(lambda col: col not in columns_to_drop, factors))

# The shape shows how many columns were dropped (154-143=11)
df_jkp_factors_wide_new_name_out.shape

(300, 143)

In [10]:
# Recompute the z-scores using only the factors that are still in the DataFrame
remaining_returns = df_jkp_factors_wide_new_name_out[factors_updated]
z_scores = remaining_returns.sub(remaining_returns.mean()).div(remaining_returns.std())

# For each column, count the observations with an absolute z-score above 3
outliers_count = z_scores.abs().gt(3).sum()

# Show the 20 columns with the highest outlier counts
outliers_count.sort_values(ascending=False).head(20)

JKP_factor
Change in quarterly return on equity    9
Change in quarterly return on assets    8
Sales-to-market                         8
CAPEX growth (2 years)                  8
Net payout yield                        8
Return volatility                       8
Net equity issuance                     8
Sales growth (1 quarter)                8
Liquidity of book assets                8
Asset Growth                            8
Return on net operating assets          8
Altman Z-score                          8
Hiring rate                             7
Operating profits-to-book equity        7
Payout yield                            7
Operating cash flow-to-market           7
Earnings-to-price                       7
Ohlson O-score                          7
Operating cash flow to assets           7
Price momentum t-6 to t-1               7
dtype: int64

In [11]:
# Drop all columns that have 8 or more outliers (observations with |z-score| > 3)
columns_to_drop_ = outliers_count[outliers_count >= 8].index
df_jkp_factors_wide_new_name_outliers = df_jkp_factors_wide_new_name_out.drop(columns=columns_to_drop_)

# Update factors list.
# Build it from `factors_updated`, which already excludes the columns removed by the
# max-z-score filter. Building it from the original `factors` would leave those
# already-dropped columns in the list, so it would no longer match the DataFrame.
factors = [col for col in factors_updated if col not in columns_to_drop_]

# Sanity check: the list must match the remaining columns (everything after Date)
assert factors == list(df_jkp_factors_wide_new_name_outliers.columns[1:]), \
    "factors list is out of sync with the cleaned DataFrame columns"

# Print the shape of the DataFrame to know how many columns were dropped (143-131=12)
df_jkp_factors_wide_new_name_outliers.shape

(300, 131)

23 columns were dropped in total.

In [12]:
# Write the cleaned wide-format factor returns to disk, without the row index
output_csv_path = 'df_jkp_factors.csv'
df_jkp_factors_wide_new_name_outliers.to_csv(output_csv_path, index=False)