### Set up:
If you don't already have the data loaded from the previous video(s), run the following setup code first:


In [1]:
import pandas as pd
import numpy as np
import datetime as dt

# Load the `fina` CSV file; `datadate` is parsed as a date using the YYYYMMDD format.
df = pd.read_csv(
    'fina.csv',
    date_format='%Y%m%d',
    parse_dates=['datadate'],
)

# Add a `year` column holding the year component of `datadate`.
df['year'] = df['datadate'].dt.year

# Display the first rows of the dataframe as a quick sanity check.
df.head()


Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
0,2575,2012-12-31,12,MTD,1037646,Mettler Toledo,290.847,2341.528,811.204,327.704,2117.4,1290.181,827.219,2012
1,2575,2013-12-31,12,MTD,1037646,Mettler Toledo,306.094,2378.972,794.915,345.928,2152.819,1217.767,935.052,2013
2,2575,2014-12-31,12,MTD,1037646,Mettler Toledo,338.241,2485.983,809.537,418.912,2009.11,1289.515,719.595,2014
3,2575,2015-12-31,12,MTD,1037646,Mettler Toledo,352.82,2395.447,744.867,426.868,2018.485,1438.028,580.457,2015
4,2575,2016-12-31,12,MTD,1037646,Mettler Toledo,384.37,2508.257,767.753,443.078,2166.777,1731.834,434.943,2016


### Stacking data

Also known as "appending" data.

In [2]:
# Keep the first 1700 rows (positions 0 through 1699)
y1 = df.iloc[:1700]
y1

Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
0,2575,2012-12-31,12,MTD,1037646,Mettler Toledo,290.847,2341.528,811.204,327.704,2117.400,1290.181,827.219,2012
1,2575,2013-12-31,12,MTD,1037646,Mettler Toledo,306.094,2378.972,794.915,345.928,2152.819,1217.767,935.052,2013
2,2575,2014-12-31,12,MTD,1037646,Mettler Toledo,338.241,2485.983,809.537,418.912,2009.110,1289.515,719.595,2014
3,2575,2015-12-31,12,MTD,1037646,Mettler Toledo,352.820,2395.447,744.867,426.868,2018.485,1438.028,580.457,2015
4,2575,2016-12-31,12,MTD,1037646,Mettler Toledo,384.370,2508.257,767.753,443.078,2166.777,1731.834,434.943,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,503098,2014-12-31,12,AAL,6201,American Airlines Group,2882.000,42650.000,,3080.000,43771.000,41750.000,2021.000,2014
1696,503098,2015-12-31,12,AAL,6201,American Airlines Group,7610.000,40990.000,,6249.000,48415.000,42780.000,5635.000,2015
1697,503098,2016-12-31,12,AAL,6201,American Airlines Group,2676.000,40180.000,,6524.000,51274.000,47489.000,3785.000,2016
1698,503098,2017-12-31,12,AAL,6201,American Airlines Group,1919.000,42207.000,,4744.000,51396.000,47470.000,3926.000,2017


In [3]:
# Keep everything from the 1701st row (position 1700) through the last row
z1 = df.iloc[1700:]
z1

Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
1700,503098,2019-12-31,12,AAL,6201,American Airlines Group,1686.000,45768.000,,3815.000,59995.000,60113.000,-118.000,2019
1701,503503,2012-12-31,12,DPZ,1286681,Domino's Pizza,112.392,1678.439,1177.101,176.320,478.197,1813.720,-1335.523,2012
1702,503503,2013-12-31,12,DPZ,1286681,Domino's Pizza,142.985,1802.223,1253.249,193.989,525.255,1815.457,-1290.202,2013
1703,503503,2014-12-31,12,DPZ,1286681,Domino's Pizza,162.587,1993.833,1399.067,,619.280,1838.745,-1219.465,2014
1704,503503,2015-12-31,12,DPZ,1286681,Domino's Pizza,192.789,2216.528,1533.397,,799.845,2600.096,-1800.251,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497,998876,2015-12-31,12,WST,105770,West Pharmaceutical Services,95.600,1399.800,944.000,212.400,1695.100,671.200,1023.900,2015
3498,998876,2016-12-31,12,WST,105770,West Pharmaceutical Services,143.600,1509.100,1008.000,219.400,1716.700,599.200,1117.500,2016
3499,998876,2017-12-31,12,WST,105770,West Pharmaceutical Services,150.700,1599.100,1086.500,263.300,1862.800,582.900,1279.900,2017
3500,998876,2018-12-31,12,WST,105770,West Pharmaceutical Services,206.900,1717.400,1172.000,288.600,1978.900,582.600,1396.300,2018


In [4]:
# Append the rows of z1 below the rows of y1.
# ignore_index=True discards the original row labels and renumbers from 0.
stacked = pd.concat(
    [y1, z1],
    axis='index',
    ignore_index=True,
)
stacked

Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
0,2575,2012-12-31,12,MTD,1037646,Mettler Toledo,290.847,2341.528,811.204,327.704,2117.400,1290.181,827.219,2012
1,2575,2013-12-31,12,MTD,1037646,Mettler Toledo,306.094,2378.972,794.915,345.928,2152.819,1217.767,935.052,2013
2,2575,2014-12-31,12,MTD,1037646,Mettler Toledo,338.241,2485.983,809.537,418.912,2009.110,1289.515,719.595,2014
3,2575,2015-12-31,12,MTD,1037646,Mettler Toledo,352.820,2395.447,744.867,426.868,2018.485,1438.028,580.457,2015
4,2575,2016-12-31,12,MTD,1037646,Mettler Toledo,384.370,2508.257,767.753,443.078,2166.777,1731.834,434.943,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497,998876,2015-12-31,12,WST,105770,West Pharmaceutical Services,95.600,1399.800,944.000,212.400,1695.100,671.200,1023.900,2015
3498,998876,2016-12-31,12,WST,105770,West Pharmaceutical Services,143.600,1509.100,1008.000,219.400,1716.700,599.200,1117.500,2016
3499,998876,2017-12-31,12,WST,105770,West Pharmaceutical Services,150.700,1599.100,1086.500,263.300,1862.800,582.900,1279.900,2017
3500,998876,2018-12-31,12,WST,105770,West Pharmaceutical Services,206.900,1717.400,1172.000,288.600,1978.900,582.600,1396.300,2018


In [5]:
# Sort rows by `gvkey`, then by `datadate` within each `gvkey` (both ascending).
# The sorted frame is reassigned rather than sorted with inplace=True:
# in-place sorting offers no performance benefit and prevents method chaining.
stacked = stacked.sort_values(by=['gvkey', 'datadate'])
stacked

Unnamed: 0,gvkey,datadate,fyr,tic,cik,conm,ni,sale,cogs,oancf,at,lt,seq,year
0,2575,2012-12-31,12,MTD,1037646,Mettler Toledo,290.847,2341.528,811.204,327.704,2117.400,1290.181,827.219,2012
1,2575,2013-12-31,12,MTD,1037646,Mettler Toledo,306.094,2378.972,794.915,345.928,2152.819,1217.767,935.052,2013
2,2575,2014-12-31,12,MTD,1037646,Mettler Toledo,338.241,2485.983,809.537,418.912,2009.110,1289.515,719.595,2014
3,2575,2015-12-31,12,MTD,1037646,Mettler Toledo,352.820,2395.447,744.867,426.868,2018.485,1438.028,580.457,2015
4,2575,2016-12-31,12,MTD,1037646,Mettler Toledo,384.370,2508.257,767.753,443.078,2166.777,1731.834,434.943,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497,998876,2015-12-31,12,WST,105770,West Pharmaceutical Services,95.600,1399.800,944.000,212.400,1695.100,671.200,1023.900,2015
3498,998876,2016-12-31,12,WST,105770,West Pharmaceutical Services,143.600,1509.100,1008.000,219.400,1716.700,599.200,1117.500,2016
3499,998876,2017-12-31,12,WST,105770,West Pharmaceutical Services,150.700,1599.100,1086.500,263.300,1862.800,582.900,1279.900,2017
3500,998876,2018-12-31,12,WST,105770,West Pharmaceutical Services,206.900,1717.400,1172.000,288.600,1978.900,582.600,1396.300,2018


### Merging data

In [6]:
# Select the key/descriptor columns together with sale, cogs, ni and oancf
y1 = df.loc[:, ['gvkey', 'tic', 'conm', 'datadate', 'year', 'sale', 'cogs', 'ni', 'oancf']]
y1.tail()

Unnamed: 0,gvkey,tic,conm,datadate,year,sale,cogs,ni,oancf
3497,998876,WST,West Pharmaceutical Services,2015-12-31,2015,1399.8,944.0,95.6,212.4
3498,998876,WST,West Pharmaceutical Services,2016-12-31,2016,1509.1,1008.0,143.6,219.4
3499,998876,WST,West Pharmaceutical Services,2017-12-31,2017,1599.1,1086.5,150.7,263.3
3500,998876,WST,West Pharmaceutical Services,2018-12-31,2018,1717.4,1172.0,206.9,288.6
3501,998876,WST,West Pharmaceutical Services,2019-12-31,2019,1839.9,1234.2,241.7,367.2


In [7]:
# Select the key/descriptor columns together with at, lt and seq
z1 = df.loc[:, ['gvkey', 'tic', 'conm', 'datadate', 'year', 'at', 'lt', 'seq']]
z1.tail()

Unnamed: 0,gvkey,tic,conm,datadate,year,at,lt,seq
3497,998876,WST,West Pharmaceutical Services,2015-12-31,2015,1695.1,671.2,1023.9
3498,998876,WST,West Pharmaceutical Services,2016-12-31,2016,1716.7,599.2,1117.5
3499,998876,WST,West Pharmaceutical Services,2017-12-31,2017,1862.8,582.9,1279.9
3500,998876,WST,West Pharmaceutical Services,2018-12-31,2018,1978.9,582.6,1396.3
3501,998876,WST,West Pharmaceutical Services,2019-12-31,2019,2341.4,768.2,1573.2


In [8]:
# Inner-join y1 with z1, matching rows on gvkey and datadate.
# Columns present in both frames but not used as keys (tic, conm, year)
# appear twice in the result, suffixed with _x (from y1) and _y (from z1).
merged = y1.merge(z1, on=['gvkey', 'datadate'], how='inner')
merged

Unnamed: 0,gvkey,tic_x,conm_x,datadate,year_x,sale,cogs,ni,oancf,tic_y,conm_y,year_y,at,lt,seq
0,2575,MTD,Mettler Toledo,2012-12-31,2012,2341.528,811.204,290.847,327.704,MTD,Mettler Toledo,2012,2117.400,1290.181,827.219
1,2575,MTD,Mettler Toledo,2013-12-31,2013,2378.972,794.915,306.094,345.928,MTD,Mettler Toledo,2013,2152.819,1217.767,935.052
2,2575,MTD,Mettler Toledo,2014-12-31,2014,2485.983,809.537,338.241,418.912,MTD,Mettler Toledo,2014,2009.110,1289.515,719.595
3,2575,MTD,Mettler Toledo,2015-12-31,2015,2395.447,744.867,352.820,426.868,MTD,Mettler Toledo,2015,2018.485,1438.028,580.457
4,2575,MTD,Mettler Toledo,2016-12-31,2016,2508.257,767.753,384.370,443.078,MTD,Mettler Toledo,2016,2166.777,1731.834,434.943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497,998876,WST,West Pharmaceutical Services,2015-12-31,2015,1399.800,944.000,95.600,212.400,WST,West Pharmaceutical Services,2015,1695.100,671.200,1023.900
3498,998876,WST,West Pharmaceutical Services,2016-12-31,2016,1509.100,1008.000,143.600,219.400,WST,West Pharmaceutical Services,2016,1716.700,599.200,1117.500
3499,998876,WST,West Pharmaceutical Services,2017-12-31,2017,1599.100,1086.500,150.700,263.300,WST,West Pharmaceutical Services,2017,1862.800,582.900,1279.900
3500,998876,WST,West Pharmaceutical Services,2018-12-31,2018,1717.400,1172.000,206.900,288.600,WST,West Pharmaceutical Services,2018,1978.900,582.600,1396.300


In [9]:
# Prevent the duplicated tic_x / tic_y (etc.) columns: bring over from z1
# only the join keys plus the columns that y1 does not already contain.
merged = y1.merge(
    z1.loc[:, ['gvkey', 'datadate', 'at', 'lt', 'seq']],
    on=['gvkey', 'datadate'],
    how='inner',
)
merged

Unnamed: 0,gvkey,tic,conm,datadate,year,sale,cogs,ni,oancf,at,lt,seq
0,2575,MTD,Mettler Toledo,2012-12-31,2012,2341.528,811.204,290.847,327.704,2117.400,1290.181,827.219
1,2575,MTD,Mettler Toledo,2013-12-31,2013,2378.972,794.915,306.094,345.928,2152.819,1217.767,935.052
2,2575,MTD,Mettler Toledo,2014-12-31,2014,2485.983,809.537,338.241,418.912,2009.110,1289.515,719.595
3,2575,MTD,Mettler Toledo,2015-12-31,2015,2395.447,744.867,352.820,426.868,2018.485,1438.028,580.457
4,2575,MTD,Mettler Toledo,2016-12-31,2016,2508.257,767.753,384.370,443.078,2166.777,1731.834,434.943
...,...,...,...,...,...,...,...,...,...,...,...,...
3497,998876,WST,West Pharmaceutical Services,2015-12-31,2015,1399.800,944.000,95.600,212.400,1695.100,671.200,1023.900
3498,998876,WST,West Pharmaceutical Services,2016-12-31,2016,1509.100,1008.000,143.600,219.400,1716.700,599.200,1117.500
3499,998876,WST,West Pharmaceutical Services,2017-12-31,2017,1599.100,1086.500,150.700,263.300,1862.800,582.900,1279.900
3500,998876,WST,West Pharmaceutical Services,2018-12-31,2018,1717.400,1172.000,206.900,288.600,1978.900,582.600,1396.300
