In [3]:
import pandas as pd
import numpy as np

# Calculate the following quarterly factors:

- Volatility: the standard deviation of each stock's daily returns within each quarter;
- Dollar daily turnover: for each stock and each quarter, the average of daily (vol * prc / shares outstanding).

In [4]:
def load_daily_data(path='CRSP_daily.csv', min_price=5):
    """Read the daily CRSP extract and prepare it for the quarterly factors.

    Parameters
    ----------
    path : str or path-like, default 'CRSP_daily.csv'
        Location of the daily CSV file. The default keeps the original
        hard-coded file name, so existing calls are unaffected.
    min_price : float, default 5
        Rows whose ``PRC`` is below this value are dropped.

    Returns
    -------
    pandas.DataFrame
        Rows with ``EXCHCD`` in {1, 2, 3} and ``PRC >= min_price``, where
        ``date`` has been parsed to datetime and two columns were added:
        ``yyyyq`` (integer ``year * 10 + quarter``, e.g. 2020-01-20 -> 20201)
        and ``MKTCAP`` (``PRC * SHROUT``).
    """
    df = pd.read_csv(path, dtype={'CUSIP': 'str'}, low_memory=False)

    # EXCHCD is the exchange code; 1, 2, 3 stand for NYSE, AMEX and NASDAQ.
    on_listed_exchange = df['EXCHCD'].isin([1, 2, 3])
    # PRC is Price or Bid/Ask Average.
    # NOTE(review): CRSP documents negative PRC values as bid/ask averages; if
    # this extract keeps that sign convention, the filter below discards those
    # rows -- confirm that is intended.
    above_price_floor = df['PRC'] >= min_price

    # Explicit copy so the column assignments below act on an independent
    # frame rather than on a slice of the raw read.
    df = df[on_listed_exchange & above_price_floor].copy()

    df['date'] = pd.to_datetime(df['date'].astype(str))
    # "yyyyq" = year followed by quarter, computed column-wise instead of row by row.
    df['yyyyq'] = df['date'].dt.year * 10 + df['date'].dt.quarter
    # Market Capitalization = Price * Shares Outstanding
    df['MKTCAP'] = df['PRC'] * df['SHROUT']
    return df

def load_month_data(path="stock_monthly_2005.csv"):
    """Read the monthly stock file and normalise its ``date`` and ``RET`` columns.

    Parameters
    ----------
    path : str or path-like, default "stock_monthly_2005.csv"
        Location of the monthly CSV file. The default keeps the original
        hard-coded file name.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``SICCD`` read as text, ``date`` reduced to the
        first six characters of its text form, and ``RET`` converted to float
        (entries that cannot be parsed as numbers become NaN).
    """
    stock = pd.read_csv(path, dtype={'SICCD': 'str'})
    # Keep the first six characters of the date's text form, e.g. 20200120
    # becomes "202001" ("yyyymm"). This only yields year+month when the date
    # is stored without separators.
    stock['date'] = stock['date'].astype(str).str[:6]
    # Non-numeric return codes (letters) are coerced to NaN in one vectorised pass.
    stock['RET'] = pd.to_numeric(stock['RET'], errors='coerce')
    return stock

def quarter_size(data):
    """Quarter-end market capitalisation for each security.

    Parameters
    ----------
    data : pandas.DataFrame
        Daily data with ``yyyyq``, ``PERMNO`` and ``MKTCAP`` columns, as
        produced by ``load_daily_data()``. The frame is not modified.
        PERMNO is a unique permanent security identification number assigned
        by CRSP to each security.

    Returns
    -------
    pandas.DataFrame
        One row per (``yyyyq``, ``PERMNO``) pair, sorted by those keys, with
        columns ``['yyyyq', 'PERMNO', 'SIZE']``. ``SIZE`` is the ``MKTCAP`` of
        the last row of that pair in ``data`` (the last day of the quarter
        when rows are in date order).
    """
    keys = ['yyyyq', 'PERMNO']
    # tail(1) keeps the final row of every group. Unlike GroupBy.nth(-1), whose
    # result is indexed by the original rows on pandas >= 2.0 (which breaks a
    # later three-column rename), this behaves the same across versions.
    last_rows = data.groupby(keys).tail(1)
    result = (
        last_rows[keys + ['MKTCAP']]
        .sort_values(keys)          # group-key order, as the grouped result used to be
        .reset_index(drop=True)
    )
    result.columns = keys + ['SIZE']
    return result

def quarter_illiq(data):
    """Quarterly mean of daily illiquidity, defined here as return over volume.

    Parameters
    ----------
    data : pandas.DataFrame
        Daily data with ``yyyyq``, ``PERMNO``, ``RET`` and ``VOL`` columns, as
        produced by ``load_daily_data()``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (``yyyyq``, ``PERMNO``) pair with columns
        ``['yyyyq', 'PERMNO', 'illiq']``, where ``illiq`` is the mean of the
        daily ``RET / VOL`` ratio over the quarter.

    Notes
    -----
    * ``RET`` and ``VOL`` entries that are not numeric are treated as missing.
    * Days with zero volume produce an infinite ratio; these are treated as
      missing so that one such day does not turn the whole quarterly mean
      into ``inf``.
    * NOTE(review): the ratio uses the signed return, so the measure can be
      negative. Amihud-style illiquidity uses the absolute return; confirm
      which definition is wanted.
    """
    keys = ['yyyyq', 'PERMNO']
    ret = pd.to_numeric(data['RET'], errors='coerce')
    vol = pd.to_numeric(data['VOL'], errors='coerce')
    # define illiquidity as return over volume traded
    daily_ratio = (ret / vol).replace([np.inf, -np.inf], np.nan)

    # Only the two key columns are copied, not the whole daily frame.
    daily = data[keys].assign(illiq=daily_ratio)
    # Retain the quarterly mean of the illiquidity value.
    return daily.groupby(keys, as_index=False)['illiq'].mean()

def quarter_turnover(data):
    """Quarterly mean of daily turnover, defined as volume over shares outstanding.

    Parameters
    ----------
    data : pandas.DataFrame
        Daily data with ``yyyyq``, ``PERMNO``, ``VOL`` and ``SHROUT`` columns,
        as produced by ``load_daily_data()``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (``yyyyq``, ``PERMNO``) pair with columns
        ``['yyyyq', 'PERMNO', 'turnd']``, where ``turnd`` is the mean of the
        daily ``VOL / SHROUT`` ratio over the quarter.

    Notes
    -----
    * This is share turnover: price does not enter the calculation.
    * ``VOL`` and ``SHROUT`` entries that are not numeric are treated as
      missing.
    * Days with zero shares outstanding produce an infinite ratio; these are
      treated as missing so they do not turn the quarterly mean into ``inf``.
    """
    keys = ['yyyyq', 'PERMNO']
    vol = pd.to_numeric(data['VOL'], errors='coerce')
    shares = pd.to_numeric(data['SHROUT'], errors='coerce')
    # define turnover as volume traded (VOL) over shares outstanding (SHROUT)
    daily_ratio = (vol / shares).replace([np.inf, -np.inf], np.nan)

    # Only the two key columns are copied, not the whole daily frame.
    daily = data[keys].assign(turnd=daily_ratio)
    # Retain the quarterly mean of the turnover rate.
    return daily.groupby(keys, as_index=False)['turnd'].mean()

def month_mom(data, window=10, skip=2):
    """Add a monthly momentum column ``mom`` computed separately per security.

    Momentum is the compounded return ``prod(1 + RET) - 1`` over ``window``
    consecutive rows of one PERMNO, lagged by ``skip`` rows. With the defaults
    this is the previous 12 months excluding the latest 2 months (a 10-month
    cumulative return).

    Parameters
    ----------
    data : pandas.DataFrame
        Monthly data with numeric ``RET`` and a ``PERMNO`` column. Rows are
        used in the order given -- presumably chronological within each
        PERMNO; the function does not sort them. The frame is not modified.
    window : int, default 10
        Number of monthly returns compounded together.
    skip : int, default 2
        Number of rows the compounded return is lagged by.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with ``PERMNO`` moved to the first column and a
        ``mom`` column added; the row index is unchanged. The first
        ``window + skip - 1`` rows of every PERMNO have ``mom`` = NaN.
    """
    df = data.copy()

    # Rolling compounded return within each PERMNO; the result is indexed by
    # (PERMNO, original row label).
    compounded = (
        df.groupby('PERMNO')['RET']
        .rolling(window)
        .apply(lambda r: np.prod(1 + r) - 1, raw=True)
    )
    # Lag inside each PERMNO. Shifting the concatenated series instead would
    # hand the last values of one security to the first rows of the next.
    lagged = compounded.groupby(level=0).shift(skip)

    # Drop the PERMNO level so the values align with df's own row labels.
    df['mom'] = lagged.droplevel(0)

    # PERMNO is the leading column, followed by the remaining columns in order.
    ordered = ['PERMNO'] + [col for col in df.columns if col != 'PERMNO']
    return df[ordered]

def merge_all(data1,data2,data3):
    """Left-join three factor tables on the (``yyyyq``, ``PERMNO``) key pair.

    ``data1`` is the base table: every one of its rows is kept. Columns of
    ``data2`` and then ``data3`` are attached where the keys match and are
    NaN otherwise. None of the inputs is modified.

    In this notebook the inputs are the outputs of ``quarter_size``,
    ``quarter_turnover`` and ``quarter_illiq``.
    """
    join_keys = ['yyyyq', 'PERMNO']
    merged = data1
    for extra in (data2, data3):
        # Each merge returns a new frame, so the inputs stay untouched.
        merged = merged.merge(extra, how='left', on=join_keys)
    return merged

def float_value(x):
    """Convert ``x`` to ``float``, returning NaN when it cannot be converted.

    Parameters
    ----------
    x : object
        Any value, e.g. a number, a numeric string, or a non-numeric code.

    Returns
    -------
    float
        ``float(x)`` on success; ``numpy.nan`` when the conversion raises
        ``TypeError``, ``ValueError`` or ``OverflowError``.
    """
    try:
        return float(x)
    # Only conversion failures are mapped to NaN; a bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit during a long row-wise apply.
    except (TypeError, ValueError, OverflowError):
        return np.nan

#### Here, in dataset "CRSP_daily.csv" or "CRSP_daily500.csv", Unnamed 0 is the 0th column, PERMNO is the 1st column, and so on.

CUSIP stands for Committee on Uniform Securities Identification Procedures.

A CUSIP number identifies most financial instruments, including stocks of all registered U.S. and Canadian companies, commercial paper, and U.S. government and municipal bonds.

According to the CRSP official Data Description Guide, unexpected letter 'C' in data stands for 'n/a' (not applicable) data.

In [5]:
# Load the filtered daily data once; `df` is the input to the quarterly factor functions.
df = load_daily_data()
# Show the last rows as a quick sanity check of the loaded frame.
df.tail()

Unnamed: 0.1,Unnamed: 0,PERMNO,date,EXCHCD,TICKER,COMNAM,CUSIP,BIDLO,ASKHI,PRC,VOL,RET,SHROUT,OPENPRC,yyyyq,MKTCAP
2408308,40134478,93436,2021-12-27,3.0,TSLA,TESLA INC,88160R10,1070.71521,1117.0,1093.93994,23695249.0,0.025248,1004265.0,1073.67004,20214,1098606000.0
2408309,40134479,93436,2021-12-28,3.0,TSLA,TESLA INC,88160R10,1078.42004,1118.99988,1088.46997,20025526.0,-0.005,1004265.0,1109.48999,20214,1093112000.0
2408310,40134480,93436,2021-12-29,3.0,TSLA,TESLA INC,88160R10,1064.14001,1104.0,1086.18994,18699667.0,-0.002095,1004265.0,1098.64001,20214,1090823000.0
2408311,40134481,93436,2021-12-30,3.0,TSLA,TESLA INC,88160R10,1053.15002,1095.55005,1070.33997,15595484.0,-0.014592,1004265.0,1061.32996,20214,1074905000.0
2408312,40134482,93436,2021-12-31,3.0,TSLA,TESLA INC,88160R10,1054.58997,1081.99988,1056.78003,13530074.0,-0.012669,1004265.0,1073.44446,20214,1061287000.0


In [6]:
# Build each quarterly factor from the daily frame, then join them on (yyyyq, PERMNO).
size_by_quarter = quarter_size(data=df)
turnover_by_quarter = quarter_turnover(data=df)
illiq_by_quarter = quarter_illiq(data=df)

merge_result = merge_all(
    data1=size_by_quarter,
    data2=turnover_by_quarter,
    data3=illiq_by_quarter,
)
# Persist the combined factor table; the row index is written as the first CSV column.
merge_result.to_csv("Quarterly_Factor.csv")
merge_result.tail()

Unnamed: 0,yyyyq,PERMNO,SIZE,turnd,illiq
38031,20214,93089,36862360.0,4.996881,3.31032e-09
38032,20214,93096,54643460.0,6.583379,1.636203e-09
38033,20214,93132,58761900.0,6.923593,5.176648e-09
38034,20214,93246,22202630.0,11.667123,-5.163749e-10
38035,20214,93436,1061287000.0,26.133754,1.735567e-10


#### In dataset "stock_monthly_2005.csv", SICCD stands for Standard Industrial Classification (SIC) Code, which serves as a Header.

In [7]:
# Load the monthly returns, attach the momentum column, and keep only complete rows.
monthly_returns = load_month_data()
momentum_columns = ['PERMNO', 'date', 'mom']
monthly_momentum_factor = month_mom(data=monthly_returns)[momentum_columns].dropna()
# Persist the factor table; the row index is written as the first CSV column.
monthly_momentum_factor.to_csv("Monthly_Mom_Factor.csv")
monthly_momentum_factor.tail()

Unnamed: 0,PERMNO,date,mom
1371965,93436,202008,3.78618
1371966,93436,202009,4.939968
1371967,93436,202010,6.911852
1371968,93436,202011,5.501335
1371969,93436,202012,3.637967


### Explanations of functions:

In [10]:
%%capture
# * About swaplevel(i,j):
"""
If we don't swap levels, mom will be:
PERMNO         
10001   0               NaN
        1               NaN
        2               NaN
        3               NaN
        4               NaN
                     ...   
93436   1371965    3.786180
        1371966    4.939968
        1371967    6.911852
        1371968    5.501335
        1371969    3.637967
which cannot be aligned with df's (row number, PERMNO) index when added as a new column.
After swapping levels, mom becomes:
         PERMNO
0        10001          NaN
1        10001          NaN
2        10001          NaN
3        10001          NaN
4        10001          NaN
                     ...   
1371965  93436     3.786180
1371966  93436     4.939968
1371967  93436     6.911852
1371968  93436     5.501335
1371969  93436     3.637967
"""