### Set up:
If you don't already have it loaded from the previous video(s), you will need some additional code:


In [1]:
import pandas as pd
import numpy as np
import datetime as dt

#bring in data file: `fina`.
df = pd.read_csv('fina.csv', parse_dates=['datadate'], date_format='%Y%m%d')
df.head(20)

#create year variable
df['year'] = df['datadate'].dt.year

#preview the dataframe
df.head()


Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
0,2575,2012-12-31,12,MTD,1037646,Mettler Toledo,290.847,2341.528,811.204,327.704,2117.4,1290.181,827.219,2012
1,2575,2013-12-31,12,MTD,1037646,Mettler Toledo,306.094,2378.972,794.915,345.928,2152.819,1217.767,935.052,2013
2,2575,2014-12-31,12,MTD,1037646,Mettler Toledo,338.241,2485.983,809.537,418.912,2009.11,1289.515,719.595,2014
3,2575,2015-12-31,12,MTD,1037646,Mettler Toledo,352.82,2395.447,744.867,426.868,2018.485,1438.028,580.457,2015
4,2575,2016-12-31,12,MTD,1037646,Mettler Toledo,384.37,2508.257,767.753,443.078,2166.777,1731.834,434.943,2016


## Missing data

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3502 entries, 0 to 3501
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   gvkey     3502 non-null   int64         
 1   datadate  3502 non-null   datetime64[ns]
 2   fyr       3502 non-null   int64         
 3   tic       3502 non-null   object        
 4   cik       3502 non-null   int64         
 5   conm      3502 non-null   object        
 6   ni        3502 non-null   float64       
 7   sale      3502 non-null   float64       
 8   cogs      2322 non-null   float64       
 9   oancf     3122 non-null   float64       
 10  at        3475 non-null   float64       
 11  lt        3473 non-null   float64       
 12  seq       3498 non-null   float64       
 13  year      3502 non-null   int32         
dtypes: datetime64[ns](1), float64(7), int32(1), int64(3), object(2)
memory usage: 369.5+ KB


In [3]:
#`isnull` creates a boolean series for whether an observation is missing; `isna()` also works here
df['oancf'].isnull()

missing = df['oancf'].isnull()
df4 = df[missing]
df4

Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
27,9402,2014-06-30,6,MSFT,789019,Microsoft Corp.,22074.000,86833.000,26934.000,,172384.000,82600.000,89784.000,2014
28,9402,2015-06-30,6,MSFT,789019,Microsoft Corp.,12193.000,93580.000,33038.000,,176223.000,96140.000,80083.000,2015
34,9570,2013-12-31,12,INTC,50863,Intel Corp.,9620.000,52708.000,21187.000,,92358.000,34102.000,58256.000,2013
35,9570,2014-12-31,12,INTC,50863,Intel Corp.,11704.000,55870.000,20261.000,,91956.000,36091.000,55865.000,2014
36,9570,2015-12-31,12,INTC,50863,Intel Corp.,11420.000,55355.000,20676.000,,103065.000,41980.000,61085.000,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3364,966079,2015-09-30,9,TDG,1260221,TransDigm Group,447.212,2707.115,1257.270,,8427.050,9465.356,-1038.306,2015
3365,966079,2016-09-30,9,TDG,1260221,TransDigm Group,586.414,3171.411,1443.348,,10726.277,11377.767,-651.490,2016
3370,967124,2013-12-31,12,PNC,713676,PNC Financial Services,3953.000,16012.000,,,320296.000,276199.000,42408.000,2013
3371,967124,2014-12-31,12,PNC,713676,PNC Financial Services,4207.000,15375.000,,,345072.000,298998.000,44551.000,2014


In [4]:
# This is another way to check # of missing values
df.isnull().sum()

gvkey          0
datadate       0
fyr            0
tic            0
cik            0
conm           0
ni             0
sale           0
cogs        1180
oancf        380
at            27
lt            29
seq            4
year           0
dtype: int64

In [5]:
#`notnull` creates a boolean series for whether an observation is NOT  missing; `notna()` also works here
#df3 = df['oancf'].notnull()
notmissing = df['oancf'].notnull()
df4 = df[notmissing]
df4

Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
0,2575,2012-12-31,12,MTD,1037646,Mettler Toledo,290.847,2341.528,811.204,327.704,2117.400,1290.181,827.219,2012
1,2575,2013-12-31,12,MTD,1037646,Mettler Toledo,306.094,2378.972,794.915,345.928,2152.819,1217.767,935.052,2013
2,2575,2014-12-31,12,MTD,1037646,Mettler Toledo,338.241,2485.983,809.537,418.912,2009.110,1289.515,719.595,2014
3,2575,2015-12-31,12,MTD,1037646,Mettler Toledo,352.820,2395.447,744.867,426.868,2018.485,1438.028,580.457,2015
4,2575,2016-12-31,12,MTD,1037646,Mettler Toledo,384.370,2508.257,767.753,443.078,2166.777,1731.834,434.943,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497,998876,2015-12-31,12,WST,105770,West Pharmaceutical Services,95.600,1399.800,944.000,212.400,1695.100,671.200,1023.900,2015
3498,998876,2016-12-31,12,WST,105770,West Pharmaceutical Services,143.600,1509.100,1008.000,219.400,1716.700,599.200,1117.500,2016
3499,998876,2017-12-31,12,WST,105770,West Pharmaceutical Services,150.700,1599.100,1086.500,263.300,1862.800,582.900,1279.900,2017
3500,998876,2018-12-31,12,WST,105770,West Pharmaceutical Services,206.900,1717.400,1172.000,288.600,1978.900,582.600,1396.300,2018


In [6]:
# Drop instances where oancf is missing
df3 = df.dropna(subset=['oancf'])
df3
df3.isnull().sum()

gvkey          0
datadate       0
fyr            0
tic            0
cik            0
conm           0
ni             0
sale           0
cogs        1070
oancf          0
at            20
lt            22
seq            3
year           0
dtype: int64

In [7]:
# Drop any observations with missing values
df3 = df.dropna()
df3

Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
0,2575,2012-12-31,12,MTD,1037646,Mettler Toledo,290.847,2341.528,811.204,327.704,2117.400,1290.181,827.219,2012
1,2575,2013-12-31,12,MTD,1037646,Mettler Toledo,306.094,2378.972,794.915,345.928,2152.819,1217.767,935.052,2013
2,2575,2014-12-31,12,MTD,1037646,Mettler Toledo,338.241,2485.983,809.537,418.912,2009.110,1289.515,719.595,2014
3,2575,2015-12-31,12,MTD,1037646,Mettler Toledo,352.820,2395.447,744.867,426.868,2018.485,1438.028,580.457,2015
4,2575,2016-12-31,12,MTD,1037646,Mettler Toledo,384.370,2508.257,767.753,443.078,2166.777,1731.834,434.943,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497,998876,2015-12-31,12,WST,105770,West Pharmaceutical Services,95.600,1399.800,944.000,212.400,1695.100,671.200,1023.900,2015
3498,998876,2016-12-31,12,WST,105770,West Pharmaceutical Services,143.600,1509.100,1008.000,219.400,1716.700,599.200,1117.500,2016
3499,998876,2017-12-31,12,WST,105770,West Pharmaceutical Services,150.700,1599.100,1086.500,263.300,1862.800,582.900,1279.900,2017
3500,998876,2018-12-31,12,WST,105770,West Pharmaceutical Services,206.900,1717.400,1172.000,288.600,1978.900,582.600,1396.300,2018


In [8]:
# .fillna() method - backfills with specified value
df3 = df.copy()
df3['oancf'] = df3['oancf'].fillna(0)
df3

Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
0,2575,2012-12-31,12,MTD,1037646,Mettler Toledo,290.847,2341.528,811.204,327.704,2117.400,1290.181,827.219,2012
1,2575,2013-12-31,12,MTD,1037646,Mettler Toledo,306.094,2378.972,794.915,345.928,2152.819,1217.767,935.052,2013
2,2575,2014-12-31,12,MTD,1037646,Mettler Toledo,338.241,2485.983,809.537,418.912,2009.110,1289.515,719.595,2014
3,2575,2015-12-31,12,MTD,1037646,Mettler Toledo,352.820,2395.447,744.867,426.868,2018.485,1438.028,580.457,2015
4,2575,2016-12-31,12,MTD,1037646,Mettler Toledo,384.370,2508.257,767.753,443.078,2166.777,1731.834,434.943,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497,998876,2015-12-31,12,WST,105770,West Pharmaceutical Services,95.600,1399.800,944.000,212.400,1695.100,671.200,1023.900,2015
3498,998876,2016-12-31,12,WST,105770,West Pharmaceutical Services,143.600,1509.100,1008.000,219.400,1716.700,599.200,1117.500,2016
3499,998876,2017-12-31,12,WST,105770,West Pharmaceutical Services,150.700,1599.100,1086.500,263.300,1862.800,582.900,1279.900,2017
3500,998876,2018-12-31,12,WST,105770,West Pharmaceutical Services,206.900,1717.400,1172.000,288.600,1978.900,582.600,1396.300,2018


In [9]:
# Average of oancf
mean_cf = df3['oancf'].mean()
mean_cf

np.float64(3149.4543812107368)

In [10]:
df3 = df.copy()
df3['oancf'] = df3['oancf'].fillna(mean_cf)
df3

Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
0,2575,2012-12-31,12,MTD,1037646,Mettler Toledo,290.847,2341.528,811.204,327.704,2117.400,1290.181,827.219,2012
1,2575,2013-12-31,12,MTD,1037646,Mettler Toledo,306.094,2378.972,794.915,345.928,2152.819,1217.767,935.052,2013
2,2575,2014-12-31,12,MTD,1037646,Mettler Toledo,338.241,2485.983,809.537,418.912,2009.110,1289.515,719.595,2014
3,2575,2015-12-31,12,MTD,1037646,Mettler Toledo,352.820,2395.447,744.867,426.868,2018.485,1438.028,580.457,2015
4,2575,2016-12-31,12,MTD,1037646,Mettler Toledo,384.370,2508.257,767.753,443.078,2166.777,1731.834,434.943,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497,998876,2015-12-31,12,WST,105770,West Pharmaceutical Services,95.600,1399.800,944.000,212.400,1695.100,671.200,1023.900,2015
3498,998876,2016-12-31,12,WST,105770,West Pharmaceutical Services,143.600,1509.100,1008.000,219.400,1716.700,599.200,1117.500,2016
3499,998876,2017-12-31,12,WST,105770,West Pharmaceutical Services,150.700,1599.100,1086.500,263.300,1862.800,582.900,1279.900,2017
3500,998876,2018-12-31,12,WST,105770,West Pharmaceutical Services,206.900,1717.400,1172.000,288.600,1978.900,582.600,1396.300,2018


In [11]:
# .drop() is used to drop rows or columns
df5 = df.drop(['cogs','oancf'], axis=1)
df5.head()

Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,at,lt,seq,year
0,2575,2012-12-31,12,MTD,1037646,Mettler Toledo,290.847,2341.528,2117.4,1290.181,827.219,2012
1,2575,2013-12-31,12,MTD,1037646,Mettler Toledo,306.094,2378.972,2152.819,1217.767,935.052,2013
2,2575,2014-12-31,12,MTD,1037646,Mettler Toledo,338.241,2485.983,2009.11,1289.515,719.595,2014
3,2575,2015-12-31,12,MTD,1037646,Mettler Toledo,352.82,2395.447,2018.485,1438.028,580.457,2015
4,2575,2016-12-31,12,MTD,1037646,Mettler Toledo,384.37,2508.257,2166.777,1731.834,434.943,2016


In [12]:
df.isnull().sum()

gvkey          0
datadate       0
fyr            0
tic            0
cik            0
conm           0
ni             0
sale           0
cogs        1180
oancf        380
at            27
lt            29
seq            4
year           0
dtype: int64