#### Import the standard libraries:
 - pandas
 - numpy
 - datetime

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

#### Bring in simulated annual financial statement data (similar to Compustat Funda)

In [2]:
# Load the simulated annual fundamentals workbook into a DataFrame
# (an earlier draft also passed index=False to this call).
comp1 = pd.read_excel(io='fina.xlsx')
comp1.head(n=20)

Unnamed: 0,gvkey,cik,datadate,tic,conm,ni,sale,cogs,oancf,at,lt,seq
0,331687,66740,2012-12-31,MMM,3M Company,4444.0,29904.0,15685.0,,33876.0,15836.0,17575.0
1,331687,66740,2013-12-31,MMM,3M Company,4659.0,30871.0,16106.0,,33550.0,15602.0,17502.0
2,331687,66740,2014-12-31,MMM,3M Company,4956.0,31821.0,16447.0,,31269.0,18127.0,13109.0
3,331687,66740,2015-12-31,MMM,3M Company,4833.0,30274.0,15383.0,6420.0,32718.0,20971.0,11708.0
4,331687,66740,2016-12-31,MMM,3M Company,5050.0,30109.0,15040.0,6662.0,32906.0,22563.0,10298.0
5,331687,66740,2017-12-31,MMM,3M Company,4858.0,31657.0,16001.0,6240.0,37987.0,26365.0,11563.0
6,331687,66740,2018-12-31,MMM,3M Company,5349.0,32765.0,16682.0,6439.0,36500.0,26652.0,9796.0
7,331687,66740,2019-12-31,MMM,3M Company,4570.0,32136.0,17136.0,7070.0,44659.0,34533.0,10063.0
8,281586,91142,2016-12-31,AOS,A.O. Smith Corp,326.5,2685.9,1566.6,446.6,2891.0,1375.7,1515.3
9,281586,91142,2017-12-31,AOS,A.O. Smith Corp,296.5,2996.7,1758.0,326.4,3197.3,1548.5,1648.8


**Inspect the data.**
 - Are the variable types appropriate?
 - Which variables have missing values?
 - Do the values look right?

In [3]:
# Summarize each column's dtype and non-null count
comp1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3502 entries, 0 to 3501
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   gvkey     3502 non-null   int64         
 1   cik       3502 non-null   int64         
 2   datadate  3502 non-null   datetime64[ns]
 3   tic       3502 non-null   object        
 4   conm      3502 non-null   object        
 5   ni        3502 non-null   float64       
 6   sale      3502 non-null   float64       
 7   cogs      2322 non-null   float64       
 8   oancf     3122 non-null   float64       
 9   at        3475 non-null   float64       
 10  lt        3473 non-null   float64       
 11  seq       3498 non-null   float64       
dtypes: datetime64[ns](1), float64(7), int64(2), object(2)
memory usage: 328.4+ KB


Check the length of the identifiers by looking at the last 22 unique `gvkey` values: `pd.Series(comp1['gvkey'].unique()).tail(22)`

In [4]:
# Show the last 22 distinct gvkey values
pd.Series(comp1['gvkey'].unique()).tail(22)

459    558034
460     70671
461    243941
462    667968
463    998876
464    991369
465    564859
466    396129
467    205919
468    142177
469    848540
470    840203
471     67815
472    352027
473    857761
474    792043
475    287126
476    174091
477    231460
478    509122
479    640439
480    271912
dtype: int64

It looks like `gvkey` and `cik` were read in as integers, so any leading zeros were dropped and the values no longer have a fixed length.  Convert them to zero-padded strings.  

In [5]:
# Method 1: left-pad with zeros via a format spec so that every gvkey is 6 characters long
comp1['gvkey'] = comp1['gvkey'].map(lambda key: f'{key:0>6}')
pd.Series(comp1['gvkey'].unique()).iloc[-22:]

459    558034
460    070671
461    243941
462    667968
463    998876
464    991369
465    564859
466    396129
467    205919
468    142177
469    848540
470    840203
471    067815
472    352027
473    857761
474    792043
475    287126
476    174091
477    231460
478    509122
479    640439
480    271912
dtype: object

In [6]:
# Method 2: cast to string, then zero-fill on the left so that every cik is 10 characters long
comp1['cik'] = comp1['cik'].astype(str).str.zfill(10)
comp1.head(n=5)

Unnamed: 0,gvkey,cik,datadate,tic,conm,ni,sale,cogs,oancf,at,lt,seq
0,331687,66740,2012-12-31,MMM,3M Company,4444.0,29904.0,15685.0,,33876.0,15836.0,17575.0
1,331687,66740,2013-12-31,MMM,3M Company,4659.0,30871.0,16106.0,,33550.0,15602.0,17502.0
2,331687,66740,2014-12-31,MMM,3M Company,4956.0,31821.0,16447.0,,31269.0,18127.0,13109.0
3,331687,66740,2015-12-31,MMM,3M Company,4833.0,30274.0,15383.0,6420.0,32718.0,20971.0,11708.0
4,331687,66740,2016-12-31,MMM,3M Company,5050.0,30109.0,15040.0,6662.0,32906.0,22563.0,10298.0


In [7]:
# Re-check the dtypes now that gvkey and cik have been converted
comp1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3502 entries, 0 to 3501
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   gvkey     3502 non-null   object        
 1   cik       3502 non-null   object        
 2   datadate  3502 non-null   datetime64[ns]
 3   tic       3502 non-null   object        
 4   conm      3502 non-null   object        
 5   ni        3502 non-null   float64       
 6   sale      3502 non-null   float64       
 7   cogs      2322 non-null   float64       
 8   oancf     3122 non-null   float64       
 9   at        3475 non-null   float64       
 10  lt        3473 non-null   float64       
 11  seq       3498 non-null   float64       
dtypes: datetime64[ns](1), float64(7), object(4)
memory usage: 328.4+ KB


Remove unneeded variables.  For this exercise, we won't need:
- Company name (`conm`)
- Ticker (`tic`)


In [8]:
# Drop the descriptive columns that this exercise does not use.
# Rebinding `comp1` (instead of inplace=True) together with errors='ignore'
# keeps the cell safe to re-run: a second execution no longer raises KeyError.
comp1 = comp1.drop(columns=['conm', 'tic'], errors='ignore')
comp1.head()

Unnamed: 0,gvkey,cik,datadate,ni,sale,cogs,oancf,at,lt,seq
0,331687,66740,2012-12-31,4444.0,29904.0,15685.0,,33876.0,15836.0,17575.0
1,331687,66740,2013-12-31,4659.0,30871.0,16106.0,,33550.0,15602.0,17502.0
2,331687,66740,2014-12-31,4956.0,31821.0,16447.0,,31269.0,18127.0,13109.0
3,331687,66740,2015-12-31,4833.0,30274.0,15383.0,6420.0,32718.0,20971.0,11708.0
4,331687,66740,2016-12-31,5050.0,30109.0,15040.0,6662.0,32906.0,22563.0,10298.0


## Create variables / lag variables / indicator variables

#### Create the following variables:
 - $\text{Year}$ = year extracted from `datadate`
 - $\text{Sales Growth} = \dfrac{\text{Revenues}_t}{\text{Revenues}_{t-1}} - 1$
 - $\text{Total Accruals} = \dfrac{\text{Net Income}_t - \text{Cash Flows from Operations}_t}{\text{Total Assets}_{t-1}}$
 

In [9]:
# Pull the calendar year out of the datetime column with the .dt accessor
comp1 = comp1.assign(year=comp1['datadate'].dt.year)

In [10]:
# Preview the first rows, now including the year column
comp1.head()

Unnamed: 0,gvkey,cik,datadate,ni,sale,cogs,oancf,at,lt,seq,year
0,331687,66740,2012-12-31,4444.0,29904.0,15685.0,,33876.0,15836.0,17575.0,2012
1,331687,66740,2013-12-31,4659.0,30871.0,16106.0,,33550.0,15602.0,17502.0,2013
2,331687,66740,2014-12-31,4956.0,31821.0,16447.0,,31269.0,18127.0,13109.0,2014
3,331687,66740,2015-12-31,4833.0,30274.0,15383.0,6420.0,32718.0,20971.0,11708.0,2015
4,331687,66740,2016-12-31,5050.0,30109.0,15040.0,6662.0,32906.0,22563.0,10298.0,2016


   ### Lagged annual variables

##### The wrong way to lag annual variables

In [11]:
# Naive lag: shift the whole column down one row, ignoring firm boundaries,
# so the first row of a firm picks up the last value of the firm listed before it.
comp1['sale_lag1'] = comp1['sale'].shift(periods=1)
comp1.loc[:, ['gvkey', 'year', 'sale', 'sale_lag1']].head(14)

Unnamed: 0,gvkey,year,sale,sale_lag1
0,331687,2012,29904.0,
1,331687,2013,30871.0,29904.0
2,331687,2014,31821.0,30871.0
3,331687,2015,30274.0,31821.0
4,331687,2016,30109.0,30274.0
5,331687,2017,31657.0,30109.0
6,331687,2018,32765.0,31657.0
7,331687,2019,32136.0,32765.0
8,281586,2016,2685.9,32136.0
9,281586,2017,2996.7,2685.9


##### Here's a better way to do it, but still not perfect

In [12]:
# Shift within each firm so that a lag never crosses from one gvkey to the next
sales_by_firm = comp1.groupby('gvkey')['sale']
comp1['sale_lag1'] = sales_by_firm.shift(periods=1)

comp1.loc[:, ['gvkey', 'year', 'sale', 'sale_lag1']].head(20)

Unnamed: 0,gvkey,year,sale,sale_lag1
0,331687,2012,29904.0,
1,331687,2013,30871.0,29904.0
2,331687,2014,31821.0,30871.0
3,331687,2015,30274.0,31821.0
4,331687,2016,30109.0,30274.0
5,331687,2017,31657.0,30109.0
6,331687,2018,32765.0,31657.0
7,331687,2019,32136.0,32765.0
8,281586,2016,2685.9,
9,281586,2017,2996.7,2685.9


##### The right way to lag annual variables

In [13]:
# True only where the previous row of the same firm is exactly one year earlier
prev_year_is_adjacent = comp1.groupby('gvkey')['year'].diff().eq(1)

# Lag within each gvkey, then blank out lags that span a gap (e.g., 2019 follows 2015);
# .where() fills the rejected rows with NaN by default
comp1['sale_lag1'] = comp1.groupby('gvkey')['sale'].shift(periods=1).where(prev_year_is_adjacent)

comp1.loc[:, ['gvkey', 'year', 'sale', 'sale_lag1']].head(14)

Unnamed: 0,gvkey,year,sale,sale_lag1
0,331687,2012,29904.0,
1,331687,2013,30871.0,29904.0
2,331687,2014,31821.0,30871.0
3,331687,2015,30274.0,31821.0
4,331687,2016,30109.0,30274.0
5,331687,2017,31657.0,30109.0
6,331687,2018,32765.0,31657.0
7,331687,2019,32136.0,32765.0
8,281586,2016,2685.9,
9,281586,2017,2996.7,2685.9


In [14]:
# Gap-aware lag of total assets: shift within each firm, then keep the lagged value
# only where the previous row of that firm is exactly one year earlier
firms = comp1.groupby('gvkey')
comp1['at_lag1'] = firms['at'].shift(periods=1).where(firms['year'].diff().eq(1))

##### Now that we have lagged variables, we can compute annual sales growth and total accruals

In [15]:
# Sales growth: current revenue relative to prior-year revenue, minus one
comp1['salegrowth'] = comp1['sale'].div(comp1['sale_lag1']).sub(1)
# Total accruals: (net income - operating cash flow) scaled by lagged total assets
comp1['tacc'] = comp1['ni'].sub(comp1['oancf']).div(comp1['at_lag1'])

In [16]:
# Left commented out; if enabled, this would silence pandas' chained-assignment
# (SettingWithCopy) warning for the whole session.
#pd.set_option('mode.chained_assignment', None)

In [17]:
# Sanity-check the distributions, including the tails, of the two new variables
tail_percentiles = [0.01, 0.10, 0.90, 0.99]
comp1[['salegrowth', 'tacc']].describe(percentiles=tail_percentiles)

Unnamed: 0,salegrowth,tacc
count,2996.0,2671.0
mean,inf,
std,,
min,-1.0,-inf
1%,-0.48466,-0.249754
10%,-0.062345,-0.108928
50%,0.051914,-0.040681
90%,0.233313,0.000253
99%,1.0985,0.126673
max,inf,inf


#### Dealing with `inf` values

In [18]:
# Division by a zero lag produces +/-inf; treat those ratios as missing.
# Assign the result back to the frame: a column-level replace(..., inplace=True) acts on a
# temporary object and is not guaranteed to update `comp1` (it does not under pandas
# Copy-on-Write). `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
ratio_cols = ['salegrowth', 'tacc']
comp1[ratio_cols] = comp1[ratio_cols].replace([np.inf, -np.inf], np.nan)

comp1[ratio_cols].describe([.01, .1, .9, .99])

Unnamed: 0,salegrowth,tacc
count,2991.0,2669.0
mean,0.432648,-1399.334
std,12.642442,72136.08
min,-1.0,-3726718.0
1%,-0.485922,-0.2470187
10%,-0.062486,-0.1089069
50%,0.051871,-0.0406806
90%,0.230158,8.939909e-05
99%,0.88315,0.1250004
max,649.791153,0.3211311


## Merging

Let's bring in some new data and start from scratch.


In [19]:
# Re-read the fundamentals, this time zero-padding gvkey to 6 characters while loading
# (an earlier draft also passed index=False to this call).
pad_gvkey = '{:0>6}'.format
comp1 = pd.read_excel('fina.xlsx', converters={'gvkey': pad_gvkey})
comp1.head(n=20)

Unnamed: 0,gvkey,cik,datadate,tic,conm,ni,sale,cogs,oancf,at,lt,seq
0,331687,66740,2012-12-31,MMM,3M Company,4444.0,29904.0,15685.0,,33876.0,15836.0,17575.0
1,331687,66740,2013-12-31,MMM,3M Company,4659.0,30871.0,16106.0,,33550.0,15602.0,17502.0
2,331687,66740,2014-12-31,MMM,3M Company,4956.0,31821.0,16447.0,,31269.0,18127.0,13109.0
3,331687,66740,2015-12-31,MMM,3M Company,4833.0,30274.0,15383.0,6420.0,32718.0,20971.0,11708.0
4,331687,66740,2016-12-31,MMM,3M Company,5050.0,30109.0,15040.0,6662.0,32906.0,22563.0,10298.0
5,331687,66740,2017-12-31,MMM,3M Company,4858.0,31657.0,16001.0,6240.0,37987.0,26365.0,11563.0
6,331687,66740,2018-12-31,MMM,3M Company,5349.0,32765.0,16682.0,6439.0,36500.0,26652.0,9796.0
7,331687,66740,2019-12-31,MMM,3M Company,4570.0,32136.0,17136.0,7070.0,44659.0,34533.0,10063.0
8,281586,91142,2016-12-31,AOS,A.O. Smith Corp,326.5,2685.9,1566.6,446.6,2891.0,1375.7,1515.3
9,281586,91142,2017-12-31,AOS,A.O. Smith Corp,296.5,2996.7,1758.0,326.4,3197.3,1548.5,1648.8


Let's restrict the sample to a set of large tech firms in the USA.

In [20]:
# Tickers of the large US tech firms that define the sample
tech = ['AAPL','GOOGL','MSFT', 'IBM','INTC','CSCO','FB', 'ADBE', 'ORCL','CRM']
in_sample = comp1['tic'].isin(tech)
comp_tech = comp1.loc[in_sample]
# List the distinct company names to confirm which firms were matched
pd.DataFrame(comp_tech['conm'].unique())

Unnamed: 0,0
0,Adobe Inc.
1,Alphabet Inc. (Class A)
2,Apple Inc.
3,Cisco Systems
4,"Facebook, Inc."
5,Intel Corp.
6,International Business Machines
7,Microsoft Corp.
8,Oracle Corp.
9,Salesforce.com


In [21]:
# Check row count, dtypes and missing values in the tech subsample
comp_tech.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80 entries, 90 to 2830
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   gvkey     80 non-null     object        
 1   cik       80 non-null     int64         
 2   datadate  80 non-null     datetime64[ns]
 3   tic       80 non-null     object        
 4   conm      80 non-null     object        
 5   ni        80 non-null     float64       
 6   sale      80 non-null     float64       
 7   cogs      70 non-null     float64       
 8   oancf     67 non-null     float64       
 9   at        79 non-null     float64       
 10  lt        79 non-null     float64       
 11  seq       80 non-null     float64       
dtypes: datetime64[ns](1), float64(7), int64(1), object(3)
memory usage: 8.1+ KB


#### One-to-one merge

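Let's merge the tech company financial statement data (`comp_tech`) with a linktable (`linktable`) that includes a host of identifying variables.  Strictly speaking this is a many-to-one merge: `comp_tech` has several firm-years per `gvkey`, and each of them picks up the same identifiers from `linktable`.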

In [22]:
# Load the identifier link table (tic / cik / permno / gvkey)
# (an earlier draft also passed index=False to this call).
linktable = pd.read_excel(io='linktable.xlsx')
linktable.head(n=5)

Unnamed: 0,tic,cik,permno,gvkey
0,QRVO,1604778,3582,481822
1,ICE,1571949,60498,33871
2,PRGO,1585364,53227,613134
3,MDT,1613103,21333,473521
4,FTI,1681459,3885,720180


We need to clean up the identifiers to facilitate the merge.  They should be as follows:
 - CIK = 10 digit identifier from the SEC
 - GVKEY = 6-digit identifier from Compustat
 - PERMNO = 5-digit identifier from CRSP

In [23]:
# Zero-pad each identifier to its standard width and store it as a string
id_widths = {'cik': 10, 'gvkey': 6, 'permno': 5}
for id_col, width in id_widths.items():
    linktable[id_col] = linktable[id_col].map(lambda raw, w=width: f'{raw:0>{w}}')

Let's merge the two, with `comp_tech` on the left and `linktable` on the right.  The common identifier is `gvkey`.

In [24]:
# Attach permno to every firm-year by matching on gvkey; the inner join keeps only matched rows
link_cols = linktable[['gvkey', 'permno']]
comp_tech2 = comp_tech.merge(link_cols, on='gvkey', how='inner')
comp_tech2.head(n=10)

Unnamed: 0,gvkey,cik,datadate,tic,conm,ni,sale,cogs,oancf,at,lt,seq,permno
0,597882,796343,2012-11-30,ADBE,Adobe Inc.,832.775,4403.677,121.663,1499.58,9974.523,3309.341,6665.182,4368
1,597882,796343,2013-11-30,ADBE,Adobe Inc.,289.985,4055.24,138.154,1151.686,10380.298,3655.664,6724.634,4368
2,597882,796343,2014-11-30,ADBE,Adobe Inc.,268.395,4147.065,622.08,1287.482,10785.829,4009.924,6775.905,4368
3,597882,796343,2015-11-30,ADBE,Adobe Inc.,629.551,4795.511,90.035,1469.502,11726.472,4724.892,7001.58,4368
4,597882,796343,2016-11-30,ADBE,Adobe Inc.,1168.782,5854.43,68.917,2199.728,12707.114,5282.279,7424.835,4368
5,597882,796343,2017-11-30,ADBE,Adobe Inc.,1693.954,7301.505,57.082,2912.853,14535.556,6075.687,8459.869,4368
6,597882,796343,2018-11-30,ADBE,Adobe Inc.,2590.774,9030.008,1194.999,4029.304,18768.682,9406.568,9362.114,4368
7,597882,796343,2019-11-30,ADBE,Adobe Inc.,2951.458,11171.297,1672.72,4421.813,20762.4,10232.245,10530.155,4368
8,765942,1652044,2013-12-31,GOOGL,Alphabet Inc. (Class A),12733.0,55519.0,21993.0,18659.0,,,86977.0,69132
9,765942,1652044,2014-12-31,GOOGL,Alphabet Inc. (Class A),14136.0,66001.0,25691.0,22376.0,130426.0,26566.0,103860.0,69132


In [25]:
# Confirm the row count after the merge and that permno was added
comp_tech2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   gvkey     80 non-null     object        
 1   cik       80 non-null     int64         
 2   datadate  80 non-null     datetime64[ns]
 3   tic       80 non-null     object        
 4   conm      80 non-null     object        
 5   ni        80 non-null     float64       
 6   sale      80 non-null     float64       
 7   cogs      70 non-null     float64       
 8   oancf     67 non-null     float64       
 9   at        79 non-null     float64       
 10  lt        79 non-null     float64       
 11  seq       80 non-null     float64       
 12  permno    80 non-null     object        
dtypes: datetime64[ns](1), float64(7), int64(1), object(4)
memory usage: 8.3+ KB


### Another one-to-one merge

Let's bring in the fiscal-year-end stock price for each firm.  We'll do this by merging `comp_tech2` with the simulated CRSP daily file, `crspd`.

The common identifiers between the two datasets are: **permno** and **date**.

First, let's read in simulated CRSP daily stock return data. As before, make sure the permno has exactly 5 digits.

In [26]:
# Load the simulated CRSP daily file: parse `date` as datetime and zero-pad permno to 5 characters
# (an earlier draft also passed index=False to this call).
pad_permno = '{:0>5}'.format
crspd = pd.read_excel('dsf.xlsx', parse_dates=['date'], converters={'permno': pad_permno})
crspd.head(n=5)

Unnamed: 0,permno,date,vol,prc,ret
0,92040,2012-01-03,3380100.0,66.56,0.113376
1,92040,2012-01-04,3007400.0,67.11,0.008263
2,92040,2012-01-05,3116400.0,66.81,-0.00447
3,92040,2012-01-06,2839200.0,66.46,-0.005239
4,92040,2012-01-09,2796600.0,66.86,0.006019


In [27]:
# Check the size, dtypes and missing values of the daily file
crspd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 972261 entries, 0 to 972260
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   permno  972261 non-null  object        
 1   date    972261 non-null  datetime64[ns]
 2   vol     969679 non-null  float64       
 3   prc     972261 non-null  float64       
 4   ret     972261 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 37.1+ MB


What do you notice?  There are only about 80 observations in `comp_tech`, but there are nearly a million in `crspd`.  We'd better merge carefully, or we'll end up with a giant (or bad) dataset. 

In [28]:
# Left-join the exact-date price onto each firm-year (permno + fiscal year-end date);
# firm-years with no price record on that exact date keep NaT / NaN
daily_prices = crspd[['permno', 'date', 'prc']]
m1 = comp_tech2.merge(
    daily_prices,
    how='left',
    left_on=['permno', 'datadate'],
    right_on=['permno', 'date'],
)

m1.head()

Unnamed: 0,gvkey,cik,datadate,tic,conm,ni,sale,cogs,oancf,at,lt,seq,permno,date,prc
0,597882,796343,2012-11-30,ADBE,Adobe Inc.,832.775,4403.677,121.663,1499.58,9974.523,3309.341,6665.182,4368,2012-11-30,34.61
1,597882,796343,2013-11-30,ADBE,Adobe Inc.,289.985,4055.24,138.154,1151.686,10380.298,3655.664,6724.634,4368,NaT,
2,597882,796343,2014-11-30,ADBE,Adobe Inc.,268.395,4147.065,622.08,1287.482,10785.829,4009.924,6775.905,4368,NaT,
3,597882,796343,2015-11-30,ADBE,Adobe Inc.,629.551,4795.511,90.035,1469.502,11726.472,4724.892,7001.58,4368,2015-11-30,91.46
4,597882,796343,2016-11-30,ADBE,Adobe Inc.,1168.782,5854.43,68.917,2199.728,12707.114,5282.279,7424.835,4368,2016-11-30,102.81


In [29]:
# Compare the non-null count of prc with the total row count to see how many firm-years matched
m1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   gvkey     80 non-null     object        
 1   cik       80 non-null     int64         
 2   datadate  80 non-null     datetime64[ns]
 3   tic       80 non-null     object        
 4   conm      80 non-null     object        
 5   ni        80 non-null     float64       
 6   sale      80 non-null     float64       
 7   cogs      70 non-null     float64       
 8   oancf     67 non-null     float64       
 9   at        79 non-null     float64       
 10  lt        79 non-null     float64       
 11  seq       80 non-null     float64       
 12  permno    80 non-null     object        
 13  date      56 non-null     datetime64[ns]
 14  prc       56 non-null     float64       
dtypes: datetime64[ns](2), float64(8), int64(1), object(4)
memory usage: 9.5+ KB


###  Why are we losing data?

It's probably because our merge-on variables (permno, date) don't align for some reason.  

When this happens, you want to double check the identifiers in each dataset.  That is, we need to make sure that Python views `permno` and `date` equivalently in each dataset.  If not, the observations won't merge properly.

We know that the `permno` is measured correctly, because we formatted it in both datasets up above.  Thus, it's very likely that it's an issue with the dates.

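So, the first thing we need to do is change how we match on dates.  A fiscal year-end such as 2013-11-30 can fall on a non-trading day, in which case `crspd` has no row for that exact date.  Instead, we will match on year and month, using the last trading day of each month.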

In [30]:
# Derive calendar year and month from the trading date with the .dt accessor
trade_date = crspd['date']
crspd['year'] = trade_date.dt.year
crspd['month'] = trade_date.dt.month
crspd.head()


Unnamed: 0,permno,date,vol,prc,ret,year,month
0,92040,2012-01-03,3380100.0,66.56,0.113376,2012,1
1,92040,2012-01-04,3007400.0,67.11,0.008263,2012,1
2,92040,2012-01-05,3116400.0,66.81,-0.00447,2012,1
3,92040,2012-01-06,2839200.0,66.46,-0.005239,2012,1
4,92040,2012-01-09,2796600.0,66.86,0.006019,2012,1


In [31]:
# Sort by permno and date so that, within each permno-year-month group, the last row is
# the final trading day of the month; then keep that row.
# (Sorting only on permno/year/month left the within-month order to whatever order the
# file happened to have, so tail(1) was not guaranteed to be the month-end observation.)
crsp2 = crspd.sort_values(['permno', 'date']).groupby(['permno', 'year', 'month']).tail(1)
crsp2.head()

Unnamed: 0,permno,date,vol,prc,ret,year,month
87352,184,2012-01-31,2617500.0,84.66,0.000236,2012,1
87372,184,2012-02-29,2829300.0,92.41,-0.015134,2012,2
87394,184,2012-03-30,2388600.0,86.0,0.016548,2012,3
87414,184,2012-04-30,2185000.0,82.29,0.010809,2012,4
87436,184,2012-05-31,3532700.0,69.81,-0.001859,2012,5


In [32]:
# Month-end September rows for permno 93175
crsp2.query("permno == '93175' and month == 9")

Unnamed: 0,permno,date,vol,prc,ret,year,month
91503,93175,2012-09-28,133777700.0,82.86,-0.020799,2012,9
91753,93175,2013-09-30,65039100.0,60.62,-0.012543,2013,9
92005,93175,2014-09-30,55264100.0,91.7,0.006365,2014,9
92257,93175,2015-09-30,66473000.0,102.11,0.01129,2015,9
92510,93175,2016-09-30,36379100.0,106.9,0.007825,2016,9
92761,93175,2017-09-29,26299800.0,148.3,0.005492,2017,9
93012,93175,2018-09-28,22929400.0,220.5,0.003504,2018,9
93263,93175,2019-09-30,25977400.0,222.17,0.023542,2019,9


In [33]:
# Firm-years for the same permno, to compare fiscal year-end dates with the trading dates
comp_tech2.loc[comp_tech2['permno'].eq('93175')]

Unnamed: 0,gvkey,cik,datadate,tic,conm,ni,sale,cogs,oancf,at,lt,seq,permno
15,107764,320193,2012-09-30,AAPL,Apple Inc.,41733.0,156508.0,87846.0,50856.0,176064.0,57854.0,118210.0,93175
16,107764,320193,2013-09-30,AAPL,Apple Inc.,37037.0,170910.0,106606.0,53666.0,207000.0,83451.0,123549.0,93175
17,107764,320193,2014-09-30,AAPL,Apple Inc.,39510.0,182795.0,112258.0,,231839.0,120292.0,111547.0,93175
18,107764,320193,2015-09-30,AAPL,Apple Inc.,53394.0,233715.0,140089.0,81266.0,290479.0,171124.0,119355.0,93175
19,107764,320193,2016-09-30,AAPL,Apple Inc.,45687.0,215639.0,131376.0,65824.0,321686.0,193437.0,128249.0,93175
20,107764,320193,2017-09-30,AAPL,Apple Inc.,48351.0,229234.0,141048.0,63598.0,375319.0,241272.0,134047.0,93175
21,107764,320193,2018-09-30,AAPL,Apple Inc.,59531.0,265595.0,163756.0,77434.0,365725.0,258578.0,107147.0,93175
22,107764,320193,2019-09-30,AAPL,Apple Inc.,55256.0,260174.0,161782.0,69391.0,338516.0,248028.0,90488.0,93175


In [34]:
# Keep only the merge keys and the price. Selecting the columns to keep (rather than
# dropping in place) makes the cell idempotent: re-running it no longer raises KeyError.
crsp2 = crsp2[['permno', 'prc', 'year', 'month']]
crsp2.head()

Unnamed: 0,permno,prc,year,month
87352,184,84.66,2012,1
87372,184,92.41,2012,2
87394,184,86.0,2012,3
87414,184,82.29,2012,4
87436,184,69.81,2012,5


In [35]:
# Review the columns of the left-hand frame before adding the year/month merge keys
comp_tech2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   gvkey     80 non-null     object        
 1   cik       80 non-null     int64         
 2   datadate  80 non-null     datetime64[ns]
 3   tic       80 non-null     object        
 4   conm      80 non-null     object        
 5   ni        80 non-null     float64       
 6   sale      80 non-null     float64       
 7   cogs      70 non-null     float64       
 8   oancf     67 non-null     float64       
 9   at        79 non-null     float64       
 10  lt        79 non-null     float64       
 11  seq       80 non-null     float64       
 12  permno    80 non-null     object        
dtypes: datetime64[ns](1), float64(7), int64(1), object(4)
memory usage: 8.3+ KB


In [36]:
# Add year and month of the fiscal year-end so they can serve as merge keys
fiscal_year_end = comp_tech2['datadate']
comp_tech2['year'] = fiscal_year_end.dt.year
comp_tech2['month'] = fiscal_year_end.dt.month

In [37]:
# Preview the frame with the new year and month columns
comp_tech2.head()

Unnamed: 0,gvkey,cik,datadate,tic,conm,ni,sale,cogs,oancf,at,lt,seq,permno,year,month
0,597882,796343,2012-11-30,ADBE,Adobe Inc.,832.775,4403.677,121.663,1499.58,9974.523,3309.341,6665.182,4368,2012,11
1,597882,796343,2013-11-30,ADBE,Adobe Inc.,289.985,4055.24,138.154,1151.686,10380.298,3655.664,6724.634,4368,2013,11
2,597882,796343,2014-11-30,ADBE,Adobe Inc.,268.395,4147.065,622.08,1287.482,10785.829,4009.924,6775.905,4368,2014,11
3,597882,796343,2015-11-30,ADBE,Adobe Inc.,629.551,4795.511,90.035,1469.502,11726.472,4724.892,7001.58,4368,2015,11
4,597882,796343,2016-11-30,ADBE,Adobe Inc.,1168.782,5854.43,68.917,2199.728,12707.114,5282.279,7424.835,4368,2016,11


In [38]:
# Match each firm-year to the month-end price for the same permno, year and month.
# validate='m:1' makes pandas raise MergeError if `crsp2` ever holds more than one row per
# key, which would otherwise silently duplicate firm-years in the result.
m1 = pd.merge(comp_tech2, crsp2, how='left', on=['permno', 'month', 'year'], validate='m:1')

m1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   gvkey     80 non-null     object        
 1   cik       80 non-null     int64         
 2   datadate  80 non-null     datetime64[ns]
 3   tic       80 non-null     object        
 4   conm      80 non-null     object        
 5   ni        80 non-null     float64       
 6   sale      80 non-null     float64       
 7   cogs      70 non-null     float64       
 8   oancf     67 non-null     float64       
 9   at        79 non-null     float64       
 10  lt        79 non-null     float64       
 11  seq       80 non-null     float64       
 12  permno    80 non-null     object        
 13  year      80 non-null     int32         
 14  month     80 non-null     int32         
 15  prc       79 non-null     float64       
dtypes: datetime64[ns](1), float64(8), int32(2), int64(1), object(4)


In [39]:
# Show the firm-years that still have no matched price
m1.loc[m1['prc'].isna()]

Unnamed: 0,gvkey,cik,datadate,tic,conm,ni,sale,cogs,oancf,at,lt,seq,permno,year,month,prc
79,821276,1108524,2020-01-31,CRM,Salesforce.com,126.0,17098.0,4235.0,4331.0,55126.0,21241.0,33885.0,44205,2020,1,


In [40]:
# Give the merged price a descriptive name
m1 = m1.rename(columns={'prc': 'end_price'})


In [41]:
# Inspect the last 10 rows of the merged frame, including the renamed end_price column
m1.tail(10)

Unnamed: 0,gvkey,cik,datadate,tic,conm,ni,sale,cogs,oancf,at,lt,seq,permno,year,month,end_price
70,215379,1341439,2019-05-31,ORCL,Oracle Corp.,11083.0,39506.0,,14551.0,108709.0,86924.0,21785.0,98315,2019,5,49.71
71,821276,1108524,2012-01-31,CRM,Salesforce.com,-11.572,2266.539,488.886,591.507,4164.154,2498.053,1587.36,44205,2012,1,29.2
72,821276,1108524,2013-01-31,CRM,Salesforce.com,-270.445,3050.195,683.579,736.897,5528.956,3157.711,2317.633,44205,2013,1,43.03
73,821276,1108524,2014-01-31,CRM,Salesforce.com,-232.175,4071.003,968.428,875.469,9152.93,6087.715,3038.51,44205,2014,1,60.53
74,821276,1108524,2015-01-31,CRM,Salesforce.com,-262.688,5373.586,,1173.714,10692.982,6717.799,3975.183,44205,2015,1,56.45
75,821276,1108524,2016-01-31,CRM,Salesforce.com,-47.426,6667.216,,1612.585,12770.772,7767.903,5002.869,44205,2016,1,68.06
76,821276,1108524,2017-01-31,CRM,Salesforce.com,179.632,8391.984,2234.0,2162.198,17584.923,10084.796,7500.127,44205,2017,1,79.1
77,821276,1108524,2018-01-31,CRM,Salesforce.com,127.478,10540.0,2773.0,2737.965,21009.802,11617.439,9388.496,44205,2018,1,113.91
78,821276,1108524,2019-01-31,CRM,Salesforce.com,1110.0,13282.0,3451.0,3398.0,30737.0,15132.0,15605.0,44205,2019,1,151.97
79,821276,1108524,2020-01-31,CRM,Salesforce.com,126.0,17098.0,4235.0,4331.0,55126.0,21241.0,33885.0,44205,2020,1,
