# Data Assignment 1 - `t54zheng` (20939203)


In [1]:
import pandas as pd
from pandasql import sqldf
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Part 1 - Downloading and Reading Raw Data

WRDS Queries
- [8076602 - crsp](https://wrds-www.wharton.upenn.edu/query-manager/query/8076602/)
- [8076622 - compustat](https://wrds-www.wharton.upenn.edu/query-manager/query/8076622/)


In [2]:
# import raw data
crsp = pd.read_sas("crsp_nasdaq_100.sas7bdat", encoding = 'ISO-8859-1')
compustat = pd.read_sas("compustat_nasdaq_100.sas7bdat", encoding = 'ISO-8859-1')

### Data Cleanup

- Notice that there are more tickers in the crsp data than there are companies
- This means that some companies change ticker over the observed time period
- this means that we should actually include sort stocks by permno (as suggested in the assignment)

In [3]:
# Demonstrate this value is greater than 99 (size of our nasdaq 100 data)
print(f'CRSP Tickers - {len(set(crsp.TICKER))}')
print(f'Compustat Tickers - {len(set(compustat.TIC))}')

CRSP Tickers - 119
Compustat Tickers - 99


In [4]:
# To fix this, we will just add permnos to our compustat data
translation_df = pd.read_excel("nasdaq100_pop.xlsx", sheet_name="all", converters={"GVKEY":str, "PERMNO":str})
translation_dict = translation_df[["GVKEY", "PERMNO"]].set_index("GVKEY").to_dict()["PERMNO"]

In [5]:
compustat['PERMNO'] = compustat.apply(lambda row: translation_dict[row.GVKEY], axis=1)

## Part 2 - Merging the data

In [6]:
# Enumerate CRSP data using monthid
crsp['YEAR'] = pd.DatetimeIndex(crsp['DATE']).year
crsp['MONTH'] = pd.DatetimeIndex(crsp['DATE']).month
crsp['monthid_crsp'] = ((crsp['YEAR']) - 1996) * 12 + crsp['MONTH']
sqldf("SELECT * FROM crsp LIMIT 5")

Unnamed: 0,PERMNO,DATE,CUSIP,COMNAM,TICKER,PRC,RET,RETX,SHROUT,VWRETD,SPRTRN,YEAR,MONTH,monthid_crsp
0,10107.0,1997-01-31 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,102.0,0.234493,0.234493,1198000.0,0.053473,0.061317,1997,1,13
1,10107.0,1997-02-28 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,97.5,-0.044118,-0.044118,1198000.0,-0.001067,0.005928,1997,2,14
2,10107.0,1997-03-31 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,91.6875,-0.059615,-0.059615,1191000.0,-0.044889,-0.042614,1997,3,15
3,10107.0,1997-04-30 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,121.5,0.325153,0.325153,1191000.0,0.042396,0.058406,1997,4,16
4,10107.0,1997-05-30 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,124.0,0.020576,0.020576,1191000.0,0.07164,0.058577,1997,5,17


In [7]:
# Enumerate compustat data using monthid
compustat['YEAR'] = pd.DatetimeIndex(compustat['DATADATE']).year
compustat['MONTH'] = pd.DatetimeIndex(compustat['DATADATE']).month
compustat['monthid_compustat'] = ((compustat['YEAR']) - 1996) * 12 + compustat['MONTH']
sqldf("SELECT * FROM compustat LIMIT 5")

Unnamed: 0,GVKEY,DATADATE,FYEARQ,FQTR,CONM,TIC,DATACQTR,DATAFQTR,ATQ,CEQQ,...,CONSOL,INDFMT,DATAFMT,POPSRC,CURCDQ,COSTAT,PERMNO,YEAR,MONTH,monthid_compustat
0,1161,1996-03-31 00:00:00.000000,1996.0,1.0,ADVANCED MICRO DEVICES,AMD,1996Q1,1996Q1,2954.809,2121.948,...,C,INDL,STD,D,USD,A,61241,1996,3,3
1,1161,1996-06-30 00:00:00.000000,1996.0,2.0,ADVANCED MICRO DEVICES,AMD,1996Q2,1996Q2,2845.025,2071.355,...,C,INDL,STD,D,USD,A,61241,1996,6,6
2,1161,1996-09-30 00:00:00.000000,1996.0,3.0,ADVANCED MICRO DEVICES,AMD,1996Q3,1996Q3,3053.992,2036.658,...,C,INDL,STD,D,USD,A,61241,1996,9,9
3,1161,1996-12-31 00:00:00.000000,1996.0,4.0,ADVANCED MICRO DEVICES,AMD,1996Q4,1996Q4,3145.283,2021.878,...,C,INDL,STD,D,USD,A,61241,1996,12,12
4,1161,1997-03-31 00:00:00.000000,1997.0,1.0,ADVANCED MICRO DEVICES,AMD,1997Q1,1997Q1,3426.126,2043.757,...,C,INDL,STD,D,USD,A,61241,1997,3,15


In [8]:
df_compustat_crsp = sqldf("SELECT a.*, b.* \
                           FROM crsp as a \
                           LEFT JOIN compustat as b \
                           ON a.PERMNO = b.PERMNO and a.monthid_crsp >= b.monthid_compustat + 4 and a.monthid_crsp <= b.monthid_compustat + 6") # 3-month lag

## Notice Duplicates
- Notice that our dataframe size exceeds our initial data size from CRSP of 23997 rows, so we have some duplicates
- And indeed we do (I found them using the command below, but showing them first so it's easier to understand)

In [9]:
df_compustat_crsp[6870:6881]

Unnamed: 0,PERMNO,DATE,CUSIP,COMNAM,TICKER,PRC,RET,RETX,SHROUT,VWRETD,...,CONSOL,INDFMT,DATAFMT,POPSRC,CURCDQ,COSTAT,PERMNO.1,YEAR,MONTH,monthid_compustat
6870,45911.0,2002-04-30 00:00:00.000000,83088M10,ALPHA INDUSTRIES INC,AHAA,12.25,-0.196721,-0.196721,44260.0,-0.0496,...,C,INDL,STD,D,USD,A,45911,2001.0,12.0,72.0
6871,45911.0,2002-05-31 00:00:00.000000,83088M10,ALPHA INDUSTRIES INC,AHAA,10.19,-0.168163,-0.168163,44291.0,-0.01051,...,C,INDL,STD,D,USD,A,45911,2001.0,12.0,72.0
6872,45911.0,2002-05-31 00:00:00.000000,83088M10,ALPHA INDUSTRIES INC,AHAA,10.19,-0.168163,-0.168163,44291.0,-0.01051,...,C,INDL,STD,D,USD,A,45911,2001.0,12.0,72.0
6873,45911.0,2002-06-28 00:00:00.000000,83088M10,SKYWORKS SOLUTIONS INC,SWKS,5.55,-0.455348,-0.455348,137368.0,-0.070259,...,C,INDL,STD,D,USD,A,45911,2001.0,12.0,72.0
6874,45911.0,2002-06-28 00:00:00.000000,83088M10,SKYWORKS SOLUTIONS INC,SWKS,5.55,-0.455348,-0.455348,137368.0,-0.070259,...,C,INDL,STD,D,USD,A,45911,2001.0,12.0,72.0
6875,45911.0,2002-07-31 00:00:00.000000,83088M10,SKYWORKS SOLUTIONS INC,SWKS,2.9,-0.477477,-0.477477,137510.0,-0.081125,...,C,INDL,STD,D,USD,A,45911,2002.0,3.0,75.0
6876,45911.0,2002-07-31 00:00:00.000000,83088M10,SKYWORKS SOLUTIONS INC,SWKS,2.9,-0.477477,-0.477477,137510.0,-0.081125,...,C,INDL,STD,D,USD,A,45911,2002.0,3.0,75.0
6877,45911.0,2002-08-30 00:00:00.000000,83088M10,SKYWORKS SOLUTIONS INC,SWKS,4.2,0.448276,0.448276,137510.0,0.007949,...,C,INDL,STD,D,USD,A,45911,2002.0,3.0,75.0
6878,45911.0,2002-08-30 00:00:00.000000,83088M10,SKYWORKS SOLUTIONS INC,SWKS,4.2,0.448276,0.448276,137510.0,0.007949,...,C,INDL,STD,D,USD,A,45911,2002.0,3.0,75.0
6879,45911.0,2002-09-30 00:00:00.000000,83088M10,SKYWORKS SOLUTIONS INC,SWKS,4.53,0.078572,0.078572,137589.0,-0.099923,...,C,INDL,STD,D,USD,A,45911,2002.0,3.0,75.0


In [10]:
# These are our dropped rows
df_compustat_crsp[~df_compustat_crsp.index.isin(df_compustat_crsp.drop_duplicates(["PERMNO", "DATE"], ignore_index=False).index)]

Unnamed: 0,PERMNO,DATE,CUSIP,COMNAM,TICKER,PRC,RET,RETX,SHROUT,VWRETD,...,CONSOL,INDFMT,DATAFMT,POPSRC,CURCDQ,COSTAT,PERMNO.1,YEAR,MONTH,monthid_compustat
6870,45911.0,2002-04-30 00:00:00.000000,83088M10,ALPHA INDUSTRIES INC,AHAA,12.25,-0.196721,-0.196721,44260.0,-0.0496,...,C,INDL,STD,D,USD,A,45911,2001.0,12.0,72.0
6872,45911.0,2002-05-31 00:00:00.000000,83088M10,ALPHA INDUSTRIES INC,AHAA,10.19,-0.168163,-0.168163,44291.0,-0.01051,...,C,INDL,STD,D,USD,A,45911,2001.0,12.0,72.0
6874,45911.0,2002-06-28 00:00:00.000000,83088M10,SKYWORKS SOLUTIONS INC,SWKS,5.55,-0.455348,-0.455348,137368.0,-0.070259,...,C,INDL,STD,D,USD,A,45911,2001.0,12.0,72.0
6876,45911.0,2002-07-31 00:00:00.000000,83088M10,SKYWORKS SOLUTIONS INC,SWKS,2.9,-0.477477,-0.477477,137510.0,-0.081125,...,C,INDL,STD,D,USD,A,45911,2002.0,3.0,75.0
6878,45911.0,2002-08-30 00:00:00.000000,83088M10,SKYWORKS SOLUTIONS INC,SWKS,4.2,0.448276,0.448276,137510.0,0.007949,...,C,INDL,STD,D,USD,A,45911,2002.0,3.0,75.0
6880,45911.0,2002-09-30 00:00:00.000000,83088M10,SKYWORKS SOLUTIONS INC,SWKS,4.53,0.078572,0.078572,137589.0,-0.099923,...,C,INDL,STD,D,USD,A,45911,2002.0,3.0,75.0


In [11]:
# So we drop the duplicates and we are good
df_compustat_crsp.drop_duplicates(["PERMNO", "DATE"], inplace=True)
df_compustat_crsp

Unnamed: 0,PERMNO,DATE,CUSIP,COMNAM,TICKER,PRC,RET,RETX,SHROUT,VWRETD,...,CONSOL,INDFMT,DATAFMT,POPSRC,CURCDQ,COSTAT,PERMNO.1,YEAR,MONTH,monthid_compustat
0,10107.0,1997-01-31 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,102.000000,0.234493,0.234493,1198000.0,0.053473,...,C,INDL,STD,D,USD,A,10107,1996.0,9.0,9.0
1,10107.0,1997-02-28 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,97.500000,-0.044118,-0.044118,1198000.0,-0.001067,...,C,INDL,STD,D,USD,A,10107,1996.0,9.0,9.0
2,10107.0,1997-03-31 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,91.687500,-0.059615,-0.059615,1191000.0,-0.044889,...,C,INDL,STD,D,USD,A,10107,1996.0,9.0,9.0
3,10107.0,1997-04-30 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,121.500000,0.325153,0.325153,1191000.0,0.042396,...,C,INDL,STD,D,USD,A,10107,1996.0,12.0,12.0
4,10107.0,1997-05-30 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,124.000000,0.020576,0.020576,1191000.0,0.071640,...,C,INDL,STD,D,USD,A,10107,1996.0,12.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23998,93436.0,2022-08-31 00:00:00.000000,88160R10,TESLA INC,TSLA,275.609985,-0.072489,-0.072489,3133470.0,-0.036233,...,C,INDL,STD,D,USD,A,93436,2022.0,3.0,315.0
23999,93436.0,2022-09-30 00:00:00.000000,88160R10,TESLA INC,TSLA,265.250000,-0.037589,-0.037589,3158000.0,-0.091323,...,C,INDL,STD,D,USD,A,93436,2022.0,3.0,315.0
24000,93436.0,2022-10-31 00:00:00.000000,88160R10,TESLA INC,TSLA,227.539993,-0.142168,-0.142168,3157752.0,0.077394,...,C,INDL,STD,D,USD,A,93436,2022.0,6.0,318.0
24001,93436.0,2022-11-30 00:00:00.000000,88160R10,TESLA INC,TSLA,194.699997,-0.144326,-0.144326,3157752.0,0.052354,...,C,INDL,STD,D,USD,A,93436,2022.0,6.0,318.0


Finally, we keep the cleaned data from 2000-2022 for our next set of questions.

In [12]:
df_compustat_crsp_2000_2022 = df_compustat_crsp.loc[df_compustat_crsp["DATE"] >= "2000-01-01"]

In [13]:
df_compustat_crsp_2000_2022.reset_index(inplace=True)

In [14]:
# df_compustat_crsp_2000_2022.dropna(axis=0, inplace=True)

In [15]:
df_compustat_crsp_2000_2022

Unnamed: 0,index,PERMNO,DATE,CUSIP,COMNAM,TICKER,PRC,RET,RETX,SHROUT,...,CONSOL,INDFMT,DATAFMT,POPSRC,CURCDQ,COSTAT,PERMNO.1,YEAR,MONTH,monthid_compustat
0,36,10107.0,2000-01-31 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,97.875000,-0.161670,-0.161670,5160025.0,...,C,INDL,STD,D,USD,A,10107,1999.0,9.0,45.0
1,37,10107.0,2000-02-29 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,89.375000,-0.086845,-0.086845,5160025.0,...,C,INDL,STD,D,USD,A,10107,1999.0,9.0,45.0
2,38,10107.0,2000-03-31 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,106.250000,0.188811,0.188811,5242000.0,...,C,INDL,STD,D,USD,A,10107,1999.0,9.0,45.0
3,39,10107.0,2000-04-28 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,69.750000,-0.343529,-0.343529,5262405.0,...,C,INDL,STD,D,USD,A,10107,1999.0,12.0,48.0
4,40,10107.0,2000-05-31 00:00:00.000000,59491810,MICROSOFT CORP,MSFT,62.562500,-0.103047,-0.103047,5262405.0,...,C,INDL,STD,D,USD,A,10107,1999.0,12.0,48.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22015,23998,93436.0,2022-08-31 00:00:00.000000,88160R10,TESLA INC,TSLA,275.609985,-0.072489,-0.072489,3133470.0,...,C,INDL,STD,D,USD,A,93436,2022.0,3.0,315.0
22016,23999,93436.0,2022-09-30 00:00:00.000000,88160R10,TESLA INC,TSLA,265.250000,-0.037589,-0.037589,3158000.0,...,C,INDL,STD,D,USD,A,93436,2022.0,3.0,315.0
22017,24000,93436.0,2022-10-31 00:00:00.000000,88160R10,TESLA INC,TSLA,227.539993,-0.142168,-0.142168,3157752.0,...,C,INDL,STD,D,USD,A,93436,2022.0,6.0,318.0
22018,24001,93436.0,2022-11-30 00:00:00.000000,88160R10,TESLA INC,TSLA,194.699997,-0.144326,-0.144326,3157752.0,...,C,INDL,STD,D,USD,A,93436,2022.0,6.0,318.0


***
## Task 4
We'll add the three additional variables `lnSize`, `bk2mkt`, `eP`

In [16]:
# There were some negative numbers in our data that was causing errors, so we will correct them (share price shouldn't be negative anyway)
# We also can support our hypothesis by looking at this data, the price stays in $4 range, so the -4 price seems like an input error

In [17]:
test = df_compustat_crsp_2000_2022[df_compustat_crsp_2000_2022["CUSIP"] == '61174X10']
test[test["DATE"] >= '2001-31-01'].head(3)

Unnamed: 0,index,PERMNO,DATE,CUSIP,COMNAM,TICKER,PRC,RET,RETX,SHROUT,...,CONSOL,INDFMT,DATAFMT,POPSRC,CURCDQ,COSTAT,PERMNO.1,YEAR,MONTH,monthid_compustat
17284,19231,88031.0,2002-01-31 00:00:00.000000,61174X10,HANSEN NATURAL CORP,HANS,-4.21,0.002381,0.002381,10045.0,...,C,INDL,STD,D,USD,A,88031,2001.0,9.0,69.0
17285,19232,88031.0,2002-02-28 00:00:00.000000,61174X10,HANSEN NATURAL CORP,HANS,4.14,-0.016627,-0.016627,10045.0,...,C,INDL,STD,D,USD,A,88031,2001.0,9.0,69.0
17286,19233,88031.0,2002-03-28 00:00:00.000000,61174X10,HANSEN NATURAL CORP,HANS,4.22,0.019324,0.019324,10053.0,...,C,INDL,STD,D,USD,A,88031,2001.0,9.0,69.0


In [18]:
# Fix data
# df_compustat_crsp_2000_2022["PRC"]
df_compustat_crsp_2000_2022.loc[:,"PRC"] = df_compustat_crsp_2000_2022["PRC"].abs()

# For CRSP, shares outstanding is in thousands
df_compustat_crsp_2000_2022.loc[:,"SHROUT"] *= 1_000

# We need to convert our compustat units to match that of CRSP
# Also note that IBQ, ATQ, CEQQ is reported in millions
df_compustat_crsp_2000_2022.loc[:,"ATQ"] *= 1_000_000
df_compustat_crsp_2000_2022.loc[:,"CEQQ"] *= 1_000_000
df_compustat_crsp_2000_2022.loc[:,"IBQ"] *= 1_000_000
df_compustat_crsp_2000_2022.loc[:,"SALEQ"] *= 1_000_000

In [19]:
# a. lnSize
df_compustat_crsp_2000_2022.loc[:,"mktcap"] = df_compustat_crsp_2000_2022["PRC"] * df_compustat_crsp_2000_2022["SHROUT"]
df_compustat_crsp_2000_2022.loc[:,"lnSize"] = np.log(df_compustat_crsp_2000_2022["mktcap"])

# b. bk2mkt - assuming [CEQQ] Common/Ordinary Equity – Total means Book Value of Equity
df_compustat_crsp_2000_2022.loc[:,"bk2mkt"] = df_compustat_crsp_2000_2022["mktcap"] / df_compustat_crsp_2000_2022["ATQ"]

# c. eP
df_compustat_crsp_2000_2022.loc[:,"eP1"] = df_compustat_crsp_2000_2022["IBQ"] / df_compustat_crsp_2000_2022["mktcap"]
df_compustat_crsp_2000_2022.loc[:,"eP2"] = df_compustat_crsp_2000_2022["EPSPXQ"] / df_compustat_crsp_2000_2022["PRC"]

In [20]:
df_compustat_crsp_2000_2022[["ATQ", "mktcap", "bk2mkt"]].mean()

ATQ       2.426044e+10
mktcap    5.575885e+10
bk2mkt    3.750454e+00
dtype: float64

In [21]:
# Now let's compare the standard deviation of the two methods of computing the earnings-to-price ratio

# eP1 is IBQ (Income) / Market Equity
print(f"eP1 - IBQ / Market Equity - mean: {df_compustat_crsp_2000_2022.eP1.mean():.4f} | stdev: {df_compustat_crsp_2000_2022.eP1.std():.4f}")

# eP2 is EPSPXQ (Earnings per share) / PRC (Share Price)
print(f"eP2 - EPSPXQ (Earnings per share) / Prc - mean: {df_compustat_crsp_2000_2022.eP2.mean():.4f} | stdev: {df_compustat_crsp_2000_2022.eP2.std():.4f}")

eP1 - IBQ / Market Equity - mean: 0.0037 | stdev: 0.1142
eP2 - EPSPXQ (Earnings per share) / Prc - mean: 0.0024 | stdev: 0.1539


## eP1 vs eP2

The two measures both illustrate how the market values the company's quarterly earnings. The difference is that `eP1`, which is calulated as total income / total market equity, represents the ratio at which the entire company is valued compared to its earnings. On the other hand, `eP2` represents how the company's earnings is valued at a per-share basis.

# TODO
Statistically, we may say that the latter (eP2) is the better comaprison because it has a smaller standard deviation 

## Analysis
- We will plot the distribution of each statistic, in an **aggregate** histogram including all stocks being analyzed.

## Question 5
- We wish to calculate some descriptive statistics for the variables we made above
- We again assume we are doing so by aggregating all of the stocks together.

In [22]:
statistics_df = pd.DataFrame(columns=["", "lnSize", "bk2mkt", "eP1", "eP2"])
statistics_df.loc[0] = "mean"
statistics_df.loc[1] = "median"
statistics_df.loc[2] = "variance"
statistics_df.loc[3] = "standard deviation"
statistics_df.loc[4] = "5th Percentile"
statistics_df.loc[5] = "25th Percentile"
statistics_df.loc[6] = "50th Percentile"
statistics_df.loc[7] = "75th Percentile"
statistics_df.loc[8] = "95th Percentile"

statistics_df.set_index("", inplace=True)

variables = ["lnSize", "bk2mkt", "eP1", "eP2"]


# Mean + Median
for var in variables:
    statistics_df.loc["mean", var] = df_compustat_crsp_2000_2022[var].mean()
    statistics_df.loc["median", var] = df_compustat_crsp_2000_2022[var].median()

# Variance + Std Deviation
for var in variables:
    statistics_df.loc["variance", var] = df_compustat_crsp_2000_2022[var].var()
    statistics_df.loc["standard deviation", var] = df_compustat_crsp_2000_2022[var].std()

# Percentiles
percentiles = [5, 25, 50, 75, 95]
for var in variables:
    for p in percentiles:
        statistics_df.loc[f"{p}th Percentile", var] = df_compustat_crsp_2000_2022[var].quantile(p / 100)

statistics_df.astype(float).round(4)

Unnamed: 0,lnSize,bk2mkt,eP1,eP2
,,,,
mean,23.5527,3.7505,0.0037,0.0024
median,23.6426,2.5864,0.0086,0.0085
variance,2.5257,17.5253,0.013,0.0237
standard deviation,1.5892,4.1863,0.1142,0.1539
5th Percentile,20.8414,0.4667,-0.0192,-0.0194
25th Percentile,22.5964,1.4123,0.0033,0.0033
50th Percentile,23.6426,2.5864,0.0086,0.0085
75th Percentile,24.5537,4.6855,0.0136,0.0132
95th Percentile,25.9621,10.7952,0.0267,0.0242


## Data Clean-Up

We will choose to **winsorize** our data, since the validity of our data should be good. I.e. the data that we have downloaded is not incorrect, but just reflects the notion that surprise stock events are a reality and should be reflected in the data. So, instead of truncating them to be removed, we just winsorize them so that they are included in the data, but we do not influence the skew of the data too much.

### Group by month
Since we compare cross-sectionally across all stocks in a given month, we need to group by monthid to winsorize.

In [25]:
df_compustat_crsp_2000_2022[["monthid_crsp", "monthid_compustat"]]

Unnamed: 0,monthid_crsp,monthid_compustat
0,49,45.0
1,50,45.0
2,51,45.0
3,52,48.0
4,53,48.0
...,...,...
22015,320,315.0
22016,321,315.0
22017,322,318.0
22018,323,318.0


In [41]:
# Let's use the crsp monthly data to winsortize.
# We'll create a new dataframe that includes the 

winsorize_limits_df = pd.DataFrame(columns = ["monthid", "lnSize std", "bk2mkt std", "eP1 std", "eP2 std"])
winsorize_limits_df.loc[:,"monthid"] = list(set(df_compustat_crsp_2000_2022["monthid_crsp"]))
winsorize_limits_df.set_index("monthid", inplace=True)

winsorize_limits_df

Unnamed: 0_level_0,lnSize std,bk2mkt std,eP1 std,eP2 std
monthid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
49,,,,
50,,,,
51,,,,
52,,,,
53,,,,
...,...,...,...,...
320,,,,
321,,,,
322,,,,
323,,,,


In [42]:
for monthid in winsorize_limits_df.index:
    cross_section_df = df_compustat_crsp_2000_2022[df_compustat_crsp_2000_2022["monthid_crsp"] == monthid]
    winsorize_limits_df.loc[monthid, "lnSize std"] = cross_section_df["lnSize"].std()
    winsorize_limits_df.loc[monthid, "bk2mkt std"] = cross_section_df["bk2mkt"].std()
    winsorize_limits_df.loc[monthid, "eP1 std"] = cross_section_df["eP1"].std()
    winsorize_limits_df.loc[monthid, "eP2 std"] = cross_section_df["eP2"].std()
winsorize_limits_df

Unnamed: 0_level_0,lnSize std,bk2mkt std,eP1 std,eP2 std
monthid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
49,1.860556,10.669781,0.016697,0.017858
50,1.815236,15.087059,0.017461,0.014015
51,1.856632,12.200536,0.01595,0.013016
52,1.868604,8.290153,0.016809,0.017197
53,1.82503,7.294563,0.022943,0.024512
...,...,...,...,...
320,1.06967,2.95932,0.029081,0.035761
321,1.08189,2.732847,0.040488,0.046024
322,1.090649,2.802746,0.051561,0.053423
323,1.08367,2.773613,0.038551,0.042522


In [50]:
# Winsorize
df_compustat_crsp_2000_2022.loc[:,"lnSize (winsorized)"] = df_compustat_crsp_2000_2022.lnSize
df_compustat_crsp_2000_2022.loc[:,"bk2mkt (winsorized)"] = df_compustat_crsp_2000_2022.bk2mkt

df_compustat_crsp_2000_2022.loc[:,"eP1 (winsorized)"] = df_compustat_crsp_2000_2022.eP1
df_compustat_crsp_2000_2022.loc[:,"eP2 (winsorized)"] = df_compustat_crsp_2000_2022.eP2

columns = ["lnSize (winsorized)", "bk2mkt (winsorized)", "eP1 (winsorized)", "eP2 (winsorized)"]

def winsorize(month_section):
    for col in columns:
        mean = month_section[col].mean()
        std = month_section[col].std()

        upper = mean + 3 * std
        lower = mean - 3 * std

        month_section[col] = np.clip(month_section[col], lower, upper)
    return month_section


# Winsorize
df_compustat_crsp_2000_2022 = df_compustat_crsp_2000_2022.groupby(df_compustat_crsp_2000_2022["monthid_crsp"]).apply(winsorize)

In [69]:
# Reset the index
df_compustat_crsp_2000_2022.set_index("index", inplace=True)

In [70]:
# Now we will redo the statistics df

winsorized_statistics_df = pd.DataFrame(columns=["", "lnSize (winsorized)", "bk2mkt (winsorized)", "eP1 (winsorized)", "eP2 (winsorized)"])
winsorized_statistics_df.loc[0] = "mean"
winsorized_statistics_df.loc[1] = "median"
winsorized_statistics_df.loc[2] = "variance"
winsorized_statistics_df.loc[3] = "standard deviation"
winsorized_statistics_df.loc[4] = "5th Percentile"
winsorized_statistics_df.loc[5] = "25th Percentile"
winsorized_statistics_df.loc[6] = "50th Percentile"
winsorized_statistics_df.loc[7] = "75th Percentile"
winsorized_statistics_df.loc[8] = "95th Percentile"

winsorized_statistics_df.set_index("", inplace=True)

variables = ["lnSize (winsorized)", "bk2mkt (winsorized)", "eP1 (winsorized)", "eP2 (winsorized)"]


# Mean + Median
for var in variables:
    winsorized_statistics_df.loc["mean", var] = df_compustat_crsp_2000_2022[var].mean()
    winsorized_statistics_df.loc["median", var] = df_compustat_crsp_2000_2022[var].median()

# Variance + Std Deviation
for var in variables:
    winsorized_statistics_df.loc["variance", var] = df_compustat_crsp_2000_2022[var].var()
    winsorized_statistics_df.loc["standard deviation", var] = df_compustat_crsp_2000_2022[var].std()

# Percentiles
percentiles = [5, 25, 50, 75, 95]
for var in variables:
    for p in percentiles:
        winsorized_statistics_df.loc[f"{p}th Percentile", var] = df_compustat_crsp_2000_2022[var].quantile(p / 100)

winsorized_statistics_df.astype(float).round(4)

Unnamed: 0,lnSize (winsorized),bk2mkt (winsorized),eP1 (winsorized),eP2 (winsorized)
,,,,
mean,23.552,3.6576,0.0057,0.0048
median,23.6426,2.5864,0.0086,0.0085
variance,2.5042,12.5595,0.0023,0.0039
standard deviation,1.5825,3.5439,0.0484,0.0621
5th Percentile,20.8421,0.4667,-0.0192,-0.0191
25th Percentile,22.5964,1.4123,0.0033,0.0033
50th Percentile,23.6426,2.5864,0.0086,0.0085
75th Percentile,24.5537,4.6855,0.0136,0.0132
95th Percentile,25.9621,10.6826,0.0267,0.0242


In [71]:
statistics_df.astype(float).round(4)

Unnamed: 0,lnSize,bk2mkt,eP1,eP2
,,,,
mean,23.5527,3.7505,0.0037,0.0024
median,23.6426,2.5864,0.0086,0.0085
variance,2.5257,17.5253,0.013,0.0237
standard deviation,1.5892,4.1863,0.1142,0.1539
5th Percentile,20.8414,0.4667,-0.0192,-0.0194
25th Percentile,22.5964,1.4123,0.0033,0.0033
50th Percentile,23.6426,2.5864,0.0086,0.0085
75th Percentile,24.5537,4.6855,0.0136,0.0132
95th Percentile,25.9621,10.7952,0.0267,0.0242


***
## Testing Asset Pricing Model Validity