# BipartiteData example

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each cell executes,
# so edits to a local copy of the package take effect without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# Add the directory three levels up to the module search path so that pytwoway can be imported from it.
# Do not run this cell in normal use - presumably it is only needed when working from a source checkout
# instead of a pip-installed pytwoway (TODO confirm).
import sys
sys.path.append('../../..')

In [3]:
# Import the pytwoway package 
# (Make sure you have installed it using pip install pytwoway)
import pytwoway as tw

## Simulate some data

The package contains functions to simulate data. We use them here to keep things simple.

In [4]:
# Simulate a dataset for the example and select the four columns used in the rest of the notebook.
# Each call to sim_network() draws new data (compare the outputs of the later cells that re-simulate),
# so the displayed values will not reproduce exactly from run to run.
sim_columns = ['wid', 'fid', 'comp', 'year']
sim_data = tw.SimTwoWay().sim_network()[sim_columns]
display(sim_data)

Unnamed: 0,wid,fid,comp,year
0,0,68,0.980166,1
1,0,19,-0.879698,2
2,0,19,-1.004502,3
3,0,19,-0.824520,4
4,0,19,0.500447,5
...,...,...,...,...
49995,9999,26,-1.785344,1
49996,9999,143,1.936228,2
49997,9999,143,-0.024048,3
49998,9999,76,1.253754,4


# BipartiteData

BipartiteData is a superclass with 3 subclasses:
- BipartiteLong
- BipartiteLongCollapsed
- BipartiteEventStudy

The user interfaces with BipartiteData, which converts between the 3 subclasses.

In [5]:
# Our data is in long form (each row gives a single observation).
# Wrap the simulated dataframe in a BipartiteData object, then clean it before doing anything else.
bd = tw.BipartiteData(sim_data, formatting='long') # 'long' is the default formatting; most data is long
bd.clean_data() # Should always clean data right after initializing

In [6]:
# Let's see how our data looks after cleaning (the table is accessed through bd.data)
display(bd.data)

Unnamed: 0,wid,fid,comp,year
0,0,68,0.980166,1
1,0,19,-0.879698,2
2,0,19,-1.004502,3
3,0,19,-0.824520,4
4,0,19,0.500447,5
...,...,...,...,...
49995,9999,26,-1.785344,1
49996,9999,143,1.936228,2
49997,9999,143,-0.024048,3
49998,9999,76,1.253754,4


## Converting formats

In [7]:
# While our original data is long, we might want it to be in event study form
# (each row gives two consecutive observations).
# long_to_es() updates bd in place: bd.data now has paired columns (f1i/f2i, y1/y2, year_1/year_2) plus a column m.
# NOTE(review): m looks like a stayer/mover flag (0 = stayer, 1 = mover) - confirm against the pytwoway docs.
bd.long_to_es()
display(bd.data)

Unnamed: 0,wid,f1i,f2i,y1,y2,year_1,year_2,m
0,12,116,116,0.027789,0.027789,1,1,0
1,12,116,116,0.785875,0.785875,2,2,0
2,12,116,116,0.740557,0.740557,3,3,0
3,12,116,116,0.742051,0.742051,4,4,0
4,12,116,116,0.927807,0.927807,5,5,0
...,...,...,...,...,...,...,...,...
40614,9998,45,45,-0.479161,0.323008,4,5,1
40615,9999,26,143,-1.785344,1.936228,1,2,1
40616,9999,143,143,1.936228,-0.024048,2,3,1
40617,9999,143,76,-0.024048,1.253754,3,4,1


In [8]:
# We can also use event study data to retrieve cross section data. The returned table has an extra 'cs' column:
#   - cs=1 gives y1 as y1, for both stayers and movers
#   - cs=0 gives y2 as y1, for movers only
# This allows (almost) all income data to be accessed from the y1 column.
# Note that for movers, the last observation for each worker is not available without manipulation,
# as it is shifted to the y2 column.
# Also note that the y1 column contains duplicates for all mover income, except for the first period.
display(bd.get_cs())

Unnamed: 0,wid,f1i,f2i,y1,y2,year_1,year_2,m,cs
0,12,116,116,0.027789,0.027789,1,1,0,1
1,12,116,116,0.785875,0.785875,2,2,0,1
2,12,116,116,0.740557,0.740557,3,3,0,1
3,12,116,116,0.742051,0.742051,4,4,0,1
4,12,116,116,0.927807,0.927807,5,5,0,1
...,...,...,...,...,...,...,...,...,...
78138,9998,45,45,0.323008,-0.479161,5,4,1,0
78139,9999,143,26,1.936228,-1.785344,2,1,1,0
78140,9999,143,143,-0.024048,1.936228,3,2,1,0
78141,9999,76,143,1.253754,-0.024048,4,3,1,0


In [9]:
# Maybe we want to convert back into long form.
# In the output below the original long rows are recovered, and the m column from the event study form is kept.
bd.es_to_long()
display(bd.data)

Unnamed: 0,wid,fid,comp,year,m
0,0,68,0.980166,1,1
1,0,19,-0.879698,2,1
2,0,19,-1.004502,3,1
3,0,19,-0.824520,4,1
4,0,19,0.500447,5,1
...,...,...,...,...,...
49995,9999,26,-1.785344,1,1
49996,9999,143,1.936228,2,1
49997,9999,143,-0.024048,3,1
49998,9999,76,1.253754,4,1


In [10]:
# Now suppose we want to collapse by employment spells (so any consecutive observations with the same
# worker in the same firm are collapsed into 1 observation).
# In the output below, each spell records its first and last year (year_start, year_end), weight counts the
# observations merged into the spell, and comp is their mean
# (e.g. worker 0 at firm 19: 4 observations with mean -0.552068).
bd.long_to_collapsed_long()
display(bd.data)

Unnamed: 0,wid,fid,comp,year_start,year_end,weight,m
0,0,68,0.980166,1,1,1,1
1,0,19,-0.552068,2,5,4,1
2,1,136,0.891230,1,1,1,1
3,1,149,0.389932,2,2,1,1
4,1,73,3.009841,3,3,1,1
...,...,...,...,...,...,...,...
29983,9998,45,-0.078076,4,5,2,1
29984,9999,26,-1.785344,1,1,1,1
29985,9999,143,0.956090,2,3,2,1
29986,9999,76,1.253754,4,4,1,1


In [11]:
# We can then check out the event study using collapsed data.
# For movers, each row now pairs two consecutive spells; the spell years and weights appear as
# year_start_1/year_end_1, year_start_2/year_end_2 and w1/w2.
bd.collapsed_long_to_es()
display(bd.data)

Unnamed: 0,wid,f1i,f2i,y1,y2,year_start_1,year_end_1,year_start_2,year_end_2,w1,w2,m
0,12,116,116,0.644816,0.644816,1,5,1,5,5,5,0
1,37,58,58,0.193902,0.193902,1,5,1,5,5,5,0
2,55,178,178,2.306565,2.306565,1,5,1,5,5,5,0
3,86,88,88,0.192757,0.192757,1,5,1,5,5,5,0
4,122,92,92,0.966849,0.966849,1,5,1,5,5,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20602,9998,177,53,0.603786,-0.621124,1,1,2,3,1,2,1
20603,9998,53,45,-0.621124,-0.078076,2,3,4,5,2,2,1
20604,9999,26,143,-1.785344,0.956090,1,1,2,3,1,2,1
20605,9999,143,76,0.956090,1.253754,2,3,4,4,2,1,1


In [12]:
# We can then go back to collapsed long.
# The rows shown below match the collapsed long data from before the event study conversion.
bd.es_to_collapsed_long()
display(bd.data)

Unnamed: 0,wid,fid,comp,year_start,year_end,weight,m
0,0,68,0.980166,1,1,1,1
1,0,19,-0.552068,2,5,4,1
2,1,136,0.891230,1,1,1,1
3,1,149,0.389932,2,2,1,1
4,1,73,3.009841,3,3,1,1
...,...,...,...,...,...,...,...
29983,9998,45,-0.078076,4,5,2,1
29984,9999,26,-1.785344,1,1,1,1
29985,9999,143,0.956090,2,3,2,1
29986,9999,76,1.253754,4,4,1,1


## Clustering

In [13]:
# Starting over: build a fresh BipartiteData object from the same simulated data, in long form and not yet clustered
bd = tw.BipartiteData(sim_data, formatting='long')
bd.clean_data()
display(bd.data)

Unnamed: 0,wid,fid,comp,year
0,0,68,0.980166,1
1,0,19,-0.879698,2
2,0,19,-1.004502,3
3,0,19,-0.824520,4
4,0,19,0.500447,5
...,...,...,...,...
49995,9999,26,-1.785344,1
49996,9999,143,1.936228,2
49997,9999,143,-0.024048,3
49998,9999,76,1.253754,4


In [14]:
# We can cluster from any format and clusters stay when reformatting.
# Calling cluster() with no arguments uses its default settings; in the output below the cluster
# assignment shows up as the new column j (a column m is added as well).
bd.cluster()
display(bd.data)

Unnamed: 0,wid,fid,comp,year,j,m
0,0,68,0.980166,1,9,1
1,0,19,-0.879698,2,6,1
2,0,19,-1.004502,3,6,1
3,0,19,-0.824520,4,6,1
4,0,19,0.500447,5,6,1
...,...,...,...,...,...,...
49995,9999,26,-1.785344,1,6,1
49996,9999,143,1.936228,2,7,1
49997,9999,143,-0.024048,3,7,1
49998,9999,76,1.253754,4,9,1


In [15]:
# Convert the clustered data to event study form: the cluster column j is carried along as j1 and j2
bd.long_to_es()
display(bd.data)

Unnamed: 0,wid,f1i,f2i,y1,y2,year_1,year_2,j1,j2,m
0,12,116,116,0.027789,0.027789,1,1,0,0,0
1,12,116,116,0.785875,0.785875,2,2,0,0,0
2,12,116,116,0.740557,0.740557,3,3,0,0,0
3,12,116,116,0.742051,0.742051,4,4,0,0,0
4,12,116,116,0.927807,0.927807,5,5,0,0,0
...,...,...,...,...,...,...,...,...,...,...
40614,9998,45,45,-0.479161,0.323008,4,5,1,1,1
40615,9999,26,143,-1.785344,1.936228,1,2,6,7,1
40616,9999,143,143,1.936228,-0.024048,2,3,7,7,1
40617,9999,143,76,-0.024048,1.253754,3,4,7,9,1


In [16]:
# Go back to long form and then collapse by employment spell: the cluster column j is still present
bd.es_to_long()
bd.long_to_collapsed_long()
display(bd.data)

Unnamed: 0,wid,fid,comp,year_start,year_end,weight,j,m
0,0,68,0.980166,1,1,1,9,1
1,0,19,-0.552068,2,5,4,6,1
2,1,136,0.891230,1,1,1,7,1
3,1,149,0.389932,2,2,1,4,1
4,1,73,3.009841,3,3,1,9,1
...,...,...,...,...,...,...,...,...
29983,9998,45,-0.078076,4,5,2,1,1
29984,9999,26,-1.785344,1,1,1,6,1
29985,9999,143,0.956090,2,3,2,7,1
29986,9999,76,1.253754,4,4,1,9,1


In [17]:
# Event study form of the collapsed data: the clusters again appear as j1 and j2
bd.collapsed_long_to_es()
display(bd.data)

Unnamed: 0,wid,f1i,f2i,y1,y2,year_start_1,year_end_1,year_start_2,year_end_2,w1,w2,j1,j2,m
0,12,116,116,0.644816,0.644816,1,5,1,5,5,5,0,0,0
1,37,58,58,0.193902,0.193902,1,5,1,5,5,5,1,1,0
2,55,178,178,2.306565,2.306565,1,5,1,5,5,5,2,2,0
3,86,88,88,0.192757,0.192757,1,5,1,5,5,5,5,5,0
4,122,92,92,0.966849,0.966849,1,5,1,5,5,5,5,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20602,9998,177,53,0.603786,-0.621124,1,1,2,3,1,2,2,1,1
20603,9998,53,45,-0.621124,-0.078076,2,3,4,5,2,2,1,1,1
20604,9999,26,143,-1.785344,0.956090,1,1,2,3,1,2,6,7,1
20605,9999,143,76,0.956090,1.253754,2,3,4,4,2,1,7,9,1


## Clustering options

In [18]:
# Clustering can be restricted to a specific year when the data is in long form
year_cluster_params = {'year': 1}

bd = tw.BipartiteData(sim_data, formatting='long')
bd.clean_data()
bd.cluster(year_cluster_params)
display(bd.data)

Unnamed: 0,wid,fid,comp,year,j,m
0,0,68,0.980166,1,7,1
1,0,19,-0.879698,2,8,1
2,0,19,-1.004502,3,8,1
3,0,19,-0.824520,4,8,1
4,0,19,0.500447,5,8,1
...,...,...,...,...,...,...
49995,9999,26,-1.785344,1,8,1
49996,9999,143,1.936228,2,5,1
49997,9999,143,-0.024048,3,5,1
49998,9999,76,1.253754,4,6,1


In [19]:
# Clustering can be based on only the stayers (or only the movers), whatever form the data is in.
# Note the clusters become floats because not all firms are clustered, so some rows have NaNs.
stayers_cluster_params = {'stayers_movers': 'stayers'}

bd = tw.BipartiteData(sim_data, formatting='long')
bd.clean_data()
bd.cluster(stayers_cluster_params)
display(bd.data)

Unnamed: 0,wid,fid,comp,year,j,m
0,0,68,0.980166,1,9.0,1
1,0,19,-0.879698,2,0.0,1
2,0,19,-1.004502,3,0.0,1
3,0,19,-0.824520,4,0.0,1
4,0,19,0.500447,5,0.0,1
...,...,...,...,...,...,...
49995,9999,26,-1.785344,1,9.0,1
49996,9999,143,1.936228,2,6.0,1
49997,9999,143,-0.024048,3,6.0,1
49998,9999,76,1.253754,4,6.0,1


In [20]:
# Same stayers-only clustering, but now dropping the firms that do not get clustered.
# Set 'dropna': True to do this (note that this fully resets firm and worker ids, since they must be
# contiguous for the FE/CRE/BLM estimators to work).
stayers_dropna_params = dict(stayers_movers='stayers', dropna=True)

bd = tw.BipartiteData(sim_data, formatting='long')
bd.clean_data()
bd.cluster(stayers_dropna_params)
display(bd.data)

Unnamed: 0,wid,fid,comp,year,j,m
0,0,68,0.980166,1,1,1
1,0,19,-0.879698,2,6,1
2,0,19,-1.004502,3,6,1
3,0,19,-0.824520,4,6,1
4,0,19,0.500447,5,6,1
...,...,...,...,...,...,...
48167,9964,26,-1.785344,1,1,1
48168,9964,143,1.936228,2,7,1
48169,9964,143,-0.024048,3,7,1
48170,9964,76,1.253754,4,7,1


In [21]:
# Clustering can likewise be based on only the movers, whatever form the data is in
movers_cluster_params = {'stayers_movers': 'movers'}

bd = tw.BipartiteData(sim_data, formatting='long')
bd.clean_data()
bd.cluster(movers_cluster_params)
display(bd.data)

Unnamed: 0,wid,fid,comp,year,j,m
0,0,68,0.980166,1,9,1
1,0,19,-0.879698,2,2,1
2,0,19,-1.004502,3,2,1
3,0,19,-0.824520,4,2,1
4,0,19,0.500447,5,2,1
...,...,...,...,...,...,...
49995,9999,26,-1.785344,1,2,1
49996,9999,143,1.936228,2,5,1
49997,9999,143,-0.024048,3,5,1
49998,9999,76,1.253754,4,9,1


## Some extra features

In [22]:
# Displaying the BipartiteData object itself (rather than bd.data) gives some summary statistics.
# NOTE(review): the firm column is renamed to 'f1i' and mapped back through col_dict here; that step
# appears unnecessary for the summary itself and mirrors the column-inference example - confirm it is intended.
col_dict = {'fid': 'f1i'}
sim_data = (
    tw.SimTwoWay()
    .sim_network()[['wid', 'fid', 'comp', 'year']]
    .rename(columns={'fid': 'f1i'})
)
bd = tw.BipartiteData(sim_data, col_dict=col_dict, formatting='long')
bd.clean_data()
display(bd)

format: long
number of workers: 10000
number of firms: 196
number of observations: 50000
mean wage: 0.39897621450125
max wage: 6.148264501194731
min wage: -5.327666340483833
collapsed by spell: False
connected: True
contiguous firm and worker ids: True
no nans: True
no duplicates: True

In [23]:
# BipartiteData does some nice column inference.
# Suppose some of our columns are mislabeled (here the firm id column is called 'f1i' instead of 'fid'):
# create a col_dict to clarify which columns to correct. In the output below the column is shown as 'fid' again.
col_dict = {'fid': 'f1i'}
sim_data = (
    tw.SimTwoWay()
    .sim_network()[['wid', 'fid', 'comp', 'year']]
    .rename(columns={'fid': 'f1i'})
)
bd = tw.BipartiteData(sim_data, col_dict=col_dict, formatting='long')
bd.clean_data()
display(bd.data)

Unnamed: 0,wid,fid,comp,year
0,0,108,0.291487,1
1,0,108,0.984671,2
2,0,163,0.465247,3
3,0,117,-0.149557,4
4,0,161,0.243091,5
...,...,...,...,...
49995,9999,36,-1.319178,1
49996,9999,36,-2.152725,2
49997,9999,33,0.154655,3
49998,9999,33,-1.828224,4


In [24]:
# Suppose the data already comes with clusters (a column named 'j'):
# the class determines this automatically and keeps the column
sim_data = (
    tw.SimTwoWay()
    .sim_network()[['wid', 'fid', 'comp', 'year']]
    .assign(j=1)
)
bd = tw.BipartiteData(sim_data, formatting='long')
bd.clean_data()
display(bd.data)

Unnamed: 0,wid,fid,comp,year,j
0,0,94,0.603753,1,1
1,0,94,-0.220369,2,1
2,0,94,0.544065,3,1
3,0,94,0.236325,4,1
4,0,176,2.434463,5,1
...,...,...,...,...,...
49995,9999,178,0.457938,1,1
49996,9999,178,0.675930,2,1
49997,9999,178,2.320511,3,1
49998,9999,178,2.068722,4,1


In [25]:
# Now suppose the cluster column is mislabeled ('j1' instead of 'j' in long data):
# the class drops this column, as the output below shows
sim_data = (
    tw.SimTwoWay()
    .sim_network()[['wid', 'fid', 'comp', 'year']]
    .assign(j1=1)
)
bd = tw.BipartiteData(sim_data, formatting='long')
bd.clean_data()
display(bd.data)

Unnamed: 0,wid,fid,comp,year
0,0,0,-1.981071,1
1,0,0,-1.310500,2
2,0,31,0.823785,3
3,0,31,-1.011489,4
4,0,31,0.284867,5
...,...,...,...,...
49995,9999,114,1.598984,1
49996,9999,114,0.353019,2
49997,9999,102,1.847748,3
49998,9999,102,2.751690,4


In [26]:
# We can make sure our columns are included by specifying the corrected labels in col_dict.
# Note that if the variable of interest requires 2 columns (e.g. weight 1 and weight 2 for event study data),
# BOTH columns are required - if only one column is included, it is automatically dropped.
col_dict = {'j': 'j1'}
sim_data = (
    tw.SimTwoWay()
    .sim_network()[['wid', 'fid', 'comp', 'year']]
    .assign(j1=1)
)
bd = tw.BipartiteData(sim_data, col_dict=col_dict, formatting='long')
bd.clean_data()
display(bd.data)

Unnamed: 0,wid,fid,comp,year,j
0,0,103,-0.401762,1,1
1,0,103,-0.681114,2,1
2,0,103,-1.369171,3,1
3,0,103,0.115274,4,1
4,0,8,-1.954387,5,1
...,...,...,...,...,...
49995,9999,189,1.648261,1,1
49996,9999,189,2.552486,2,1
49997,9999,189,2.047466,3,1
49998,9999,189,1.454298,4,1


In [27]:
# A col_dict entry can also claim a column label that would normally be inferred as something else;
# inference is then skipped for that column.
col_dict = {'fid': 'j'}
sim_data = (
    tw.SimTwoWay()
    .sim_network()[['wid', 'fid', 'comp', 'year']]
    .rename(columns={'fid': 'j'})
)
# Even though the data has a column named j, it is not inferred as the cluster values,
# because col_dict specifies that it is the 'fid' column
bd = tw.BipartiteData(sim_data, col_dict=col_dict, formatting='long')
bd.clean_data()
display(bd.data)

Unnamed: 0,wid,fid,comp,year
0,0,130,0.359232,1
1,0,130,-0.023366,2
2,0,104,1.932692,3
3,0,35,0.554162,4
4,0,35,-0.443619,5
...,...,...,...,...
49995,9999,152,2.398704,1
49996,9999,152,1.894624,2
49997,9999,152,2.421391,3
49998,9999,152,1.602466,4
