### Set up:
If you don't already have the data loaded from the previous video(s), run the following setup code first:


In [1]:
import pandas as pd
import numpy as np
import datetime as dt

# Load the `fina` data file, parsing the YYYYMMDD `datadate` column as datetimes.
df = pd.read_csv(
    'fina.csv',
    parse_dates=['datadate'],
    date_format='%Y%m%d',
)

# Add a `year` column holding the calendar year of each `datadate`.
df = df.assign(year=df['datadate'].dt.year)

# Show the first five rows as a quick sanity check.
df.head()


Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
0,2575,2012-12-31,12,MTD,1037646,Mettler Toledo,290.847,2341.528,811.204,327.704,2117.4,1290.181,827.219,2012
1,2575,2013-12-31,12,MTD,1037646,Mettler Toledo,306.094,2378.972,794.915,345.928,2152.819,1217.767,935.052,2013
2,2575,2014-12-31,12,MTD,1037646,Mettler Toledo,338.241,2485.983,809.537,418.912,2009.11,1289.515,719.595,2014
3,2575,2015-12-31,12,MTD,1037646,Mettler Toledo,352.82,2395.447,744.867,426.868,2018.485,1438.028,580.457,2015
4,2575,2016-12-31,12,MTD,1037646,Mettler Toledo,384.37,2508.257,767.753,443.078,2166.777,1731.834,434.943,2016


### Summary statistics and aggregations

In [2]:
# Number of non-missing `tic` entries (count() ignores NaN)
df.loc[:, 'tic'].count()

np.int64(3502)

In [3]:
# Frequency table: number of rows per company name, most frequent first
df['conm'].value_counts(ascending=False)

conm
Leidos Holdings       10
Best Buy Co. Inc.     10
Gap Inc.               9
L Brands Inc.          9
Walmart                9
                      ..
SVB Financial          2
Corteva                2
Dow Inc.               2
Comerica Inc.          2
The Mosaic Company     1
Name: count, Length: 481, dtype: int64

In [4]:
# Number of distinct `tic` values (missing values are not counted)
df['tic'].nunique(dropna=True)

481

In [5]:
# Total of `at` across all rows (NaN entries are skipped)
df['at'].sum(skipna=True)

np.float64(217257628.372755)

In [6]:
# 99th percentile of `at`
df['at'].quantile(q=0.99)

np.float64(832124.0599999998)

In [7]:
# Summary statistics for `at` and `lt`, one row per statistic
(
    df[['at', 'lt']]
    .agg(['count', 'mean', 'median', 'std', 'min', 'max'])
)

Unnamed: 0,at,lt
count,3475.0,3473.0
mean,62520.18,48291.11
median,16866.0,10458.0
std,212172.6,187951.4
min,0.0,0.0
max,2687379.0,2426049.0


In [8]:
# describe() is a one-call alternative that also reports the quartiles
df.loc[:, ['at', 'lt', 'seq']].describe()

Unnamed: 0,at,lt,seq
count,3475.0,3473.0,3498.0
mean,62520.18,48291.11,14010.242553
std,212172.6,187951.4,31290.631652
min,0.0,0.0,-13244.0
25%,6876.15,4044.8,2198.14675
50%,16866.0,10458.0,5372.9145
75%,40518.0,28055.6,12079.75
max,2687379.0,2426049.0,424791.0


### Groupby

In [9]:
# Median of `at` for each ticker
df.groupby(by='tic')['at'].median()

tic
A         8996.500
AAL      51274.000
AAP       8224.799
AAPL    306082.500
ABBV     56201.000
           ...    
XYL       5685.000
YUM       6776.500
ZBH      25301.600
ZBRA      4485.500
ZTS       7781.000
Name: at, Length: 481, dtype: float64

In [10]:
# Median of `at` for each year
(
    df
    .groupby('year')['at']
    .median()
)

year
2012    13206.0000
2013    14784.7000
2014    15405.5000
2015    15729.0000
2016    15777.5460
2017    18173.6500
2018    19156.2895
2019    19791.0000
2020    16453.0000
Name: at, dtype: float64

In [11]:
# Largest `at`, `lt` and `seq` observed in each year
df.groupby(by='year')[['at', 'lt', 'seq']].max()

Unnamed: 0_level_0,at,lt,seq
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012,2359141.0,2155072.0,236956.0
2013,2415689.0,2204511.0,232685.0
2014,2573126.0,2341061.0,243471.0
2015,2351698.0,2104125.0,256205.0
2016,2490972.0,2236782.0,283001.0
2017,2533600.0,2277907.0,348296.0
2018,2622532.0,2366017.0,348703.0
2019,2687379.0,2426049.0,424791.0
2020,236495.0,161826.0,74669.0


In [12]:
# Number of non-missing `at` observations for each ticker
df.groupby(by='tic')['at'].count()

tic
A       8
AAL     7
AAP     8
AAPL    8
ABBV    8
       ..
XYL     8
YUM     8
ZBH     4
ZBRA    8
ZTS     8
Name: at, Length: 481, dtype: int64

#### Useful aggregation methods in pandas
- `count()`	Total number of (non-missing) items
- `first()`, `last()`	First and last item
- `mean()`, `median()`	Mean and median
- `min()`, `max()`	Minimum and maximum
- `std()`, `var()`	Standard deviation and variance
- `prod()`	Product of all items
- `sum()`	Sum of all items

In [13]:
# groupby(...).mean() returns a Series indexed by year;
# reset_index() turns that index into a column, giving a DataFrame
g1 = (
    df.groupby('year')['at']
    .mean()
    .reset_index()
)
g1

Unnamed: 0,year,at
0,2012,57566.745309
1,2013,59085.052503
2,2014,59977.060626
3,2015,58570.242856
4,2016,62257.904096
5,2017,64607.939739
6,2018,66884.026026
7,2019,70380.645148
8,2020,35556.263937


In [14]:
# Alternative: as_index=False keeps `year` as a regular column,
# so the result is already a DataFrame
g1 = df.groupby(by='year', as_index=False)['at'].mean()
g1

Unnamed: 0,year,at
0,2012,57566.745309
1,2013,59085.052503
2,2014,59977.060626
3,2015,58570.242856
4,2016,62257.904096
5,2017,64607.939739
6,2018,66884.026026
7,2019,70380.645148
8,2020,35556.263937


### Duplicates

In [15]:
# Keep a single observation per (gvkey, year) pair.
# drop_duplicates() with no arguments only removes rows that are identical in
# every column, so the key columns must be passed explicitly via `subset`.
# After sorting, the first row of each (gvkey, year) pair is the one retained.
g2 = (
    df.sort_values(['gvkey', 'year'])
    .drop_duplicates(subset=['gvkey', 'year'], keep='first')
)
g2

Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
0,2575,2012-12-31,12,MTD,1037646,Mettler Toledo,290.847,2341.528,811.204,327.704,2117.400,1290.181,827.219,2012
1,2575,2013-12-31,12,MTD,1037646,Mettler Toledo,306.094,2378.972,794.915,345.928,2152.819,1217.767,935.052,2013
2,2575,2014-12-31,12,MTD,1037646,Mettler Toledo,338.241,2485.983,809.537,418.912,2009.110,1289.515,719.595,2014
3,2575,2015-12-31,12,MTD,1037646,Mettler Toledo,352.820,2395.447,744.867,426.868,2018.485,1438.028,580.457,2015
4,2575,2016-12-31,12,MTD,1037646,Mettler Toledo,384.370,2508.257,767.753,443.078,2166.777,1731.834,434.943,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497,998876,2015-12-31,12,WST,105770,West Pharmaceutical Services,95.600,1399.800,944.000,212.400,1695.100,671.200,1023.900,2015
3498,998876,2016-12-31,12,WST,105770,West Pharmaceutical Services,143.600,1509.100,1008.000,219.400,1716.700,599.200,1117.500,2016
3499,998876,2017-12-31,12,WST,105770,West Pharmaceutical Services,150.700,1599.100,1086.500,263.300,1862.800,582.900,1279.900,2017
3500,998876,2018-12-31,12,WST,105770,West Pharmaceutical Services,206.900,1717.400,1172.000,288.600,1978.900,582.600,1396.300,2018
