# BipartitePandas example

In [1]:
# Add pytwoway to system path by appending the directory three levels up, so that
# the `import pytwoway` below can resolve from there.
# Readers should not run this cell.
# NOTE(review): presumably it is only needed when working from a source checkout
# rather than a pip install - confirm.
import sys
sys.path.append('../../..')

In [2]:
# Import the pytwoway package 
# (Make sure you have installed it using pip install pytwoway)
import pytwoway as tw

## Simulate some data

The package contains functions to simulate data. We use this here to keep things simple.

In [3]:
# For the example, we simulate data and keep only the four columns used below
# (wid, fid, comp, year - presumably worker id, firm id, compensation and year).
# The later cells that call sim_network() again show different values, so expect
# the exact numbers to differ between runs.
sim_data = tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
display(sim_data)

Unnamed: 0,wid,fid,comp,year
0,0,113,0.445394,1
1,0,131,2.093489,2
2,0,131,0.186537,3
3,0,131,1.925646,4
4,0,185,1.984892,5
...,...,...,...,...
49995,9999,35,-1.777569,1
49996,9999,35,0.378292,2
49997,9999,35,-2.925798,3
49998,9999,74,0.145590,4


## BipartitePandas

BipartitePandas includes 4 classes:
- BipartiteLong
- BipartiteLongCollapsed
- BipartiteEventStudy
- BipartiteEventStudyCollapsed

For long data, each row gives a single observation. For collapsed long data, each row gives a single observation where employment spells at the same firm are collapsed into that observation. For event study data, each row gives the firms and salaries for a single employee over two consecutive years (this is created from BipartiteLong). For collapsed event study data, each row gives the firms and salaries for a single employee over two consecutive employment spells (this is created from BipartiteLongCollapsed).

In [4]:
# Our data is in long form (each row gives a single observation),
# so we wrap the simulated data in a BipartiteLong
bd = tw.BipartiteLong(sim_data)
display(bd)
# Note the type: the object is a pytwoway.bipartite_pandas.BipartiteLong
print(type(bd))

Unnamed: 0,wid,fid,comp,year
0,0,113,0.445394,1
1,0,131,2.093489,2
2,0,131,0.186537,3
3,0,131,1.925646,4
4,0,185,1.984892,5
...,...,...,...,...
49995,9999,35,-1.777569,1
49996,9999,35,0.378292,2
49997,9999,35,-2.925798,3
49998,9999,74,0.145590,4


<class 'pytwoway.bipartite_pandas.BipartiteLong'>


In [5]:
# Report the number of workers, firms, and clusters, one value per line
# (the cluster count is None because the data is not clustered)
print(bd.n_workers(), bd.n_firms(), bd.n_clusters(), sep='\n')

10000
196
None


In [6]:
# Whenever we want to use the data, we need to clean it
# The call is the last expression of the cell, so its return value (the frame) is rendered below
bd.clean_data()

Unnamed: 0,wid,fid,comp,year
0,0,113,0.445394,1
1,0,131,2.093489,2
2,0,131,0.186537,3
3,0,131,1.925646,4
4,0,185,1.984892,5
...,...,...,...,...
49995,9999,35,-1.777569,1
49996,9999,35,0.378292,2
49997,9999,35,-2.925798,3
49998,9999,74,0.145590,4


## Converting formats

In [7]:
# While our original data is long, we might want it to be in event study form (each row gives two consecutive observations)
# get_es() returns the converted object, so we rebind bd to it
# In the output, rows with m=0 repeat the same observation on both sides (y1 == y2, year_1 == year_2)
bd = bd.get_es()
display(bd)
# Note the type: the object is now a BipartiteEventStudy
print(type(bd))

Unnamed: 0,wid,f1i,f2i,y1,y2,year_1,year_2,m
0,23,22,22,-3.407073,-3.407073,1,1,0
1,23,22,22,-1.316057,-1.316057,2,2,0
2,23,22,22,0.714426,0.714426,3,3,0
3,23,22,22,-1.892804,-1.892804,4,4,0
4,23,22,22,-2.257470,-2.257470,5,5,0
...,...,...,...,...,...,...,...,...
40633,9998,100,100,0.878803,-0.491500,4,5,1
40634,9999,35,35,-1.777569,0.378292,1,2,1
40635,9999,35,35,0.378292,-2.925798,2,3,1
40636,9999,35,74,-2.925798,0.145590,3,4,1


<class 'pytwoway.bipartite_pandas.BipartiteEventStudy'>


In [8]:
# We can also use event study data to retrieve cross section data
# (cs=1 gives y1 as y1 for both stayers and movers; cs=0 gives y2 as y1 for only movers -
# this allows (almost) all income data to be accessed from the y1 column.
# Note that for movers, the last observation for each worker is not available without
# manipulation as it is shifted to the y2 column.
# Also note that the y1 column contains duplicates for all mover income, except for the first period.)
display(bd.get_cs())

Unnamed: 0,wid,f1i,f2i,y1,y2,year_1,year_2,m,cs
0,23,22,22,-3.407073,-3.407073,1,1,0,1
1,23,22,22,-1.316057,-1.316057,2,2,0,1
2,23,22,22,0.714426,0.714426,3,3,0,1
3,23,22,22,-1.892804,-1.892804,4,4,0,1
4,23,22,22,-2.257470,-2.257470,5,5,0,1
...,...,...,...,...,...,...,...,...,...
78081,9998,100,100,-0.491500,0.878803,5,4,1,0
78082,9999,35,35,0.378292,-1.777569,2,1,1,0
78083,9999,35,35,-2.925798,0.378292,3,2,1,0
78084,9999,74,35,0.145590,-2.925798,4,3,1,0


In [9]:
# Maybe we want to convert back into long form
# The output has the same wid/fid/comp/year rows as the earlier long-form display, plus an m column
bd = bd.get_long()
display(bd)
# Note the type: we are back to BipartiteLong
print(type(bd))

Unnamed: 0,wid,fid,comp,year,m
0,0,113,0.445394,1,1
1,0,131,2.093489,2,1
2,0,131,0.186537,3,1
3,0,131,1.925646,4,1
4,0,185,1.984892,5,1
...,...,...,...,...,...
49995,9999,35,-1.777569,1,1
49996,9999,35,0.378292,2,1
49997,9999,35,-2.925798,3,1
49998,9999,74,0.145590,4,1


<class 'pytwoway.bipartite_pandas.BipartiteLong'>


In [10]:
# Now suppose we want to collapse by employment spells (so any consecutive observations with the same worker in the same firm are collapsed into 1 observation)
# Example from the output: worker 0's three years at firm 131 become a single row with
# year_start=2, year_end=4, weight=3, and comp equal to the mean of the three yearly values
bd = bd.get_collapsed_long()
display(bd)
# Note the type: the object is now a BipartiteLongCollapsed
print(type(bd))

Unnamed: 0,wid,fid,comp,year_start,year_end,weight,m
0,0,113,0.445394,1,1,1,1
1,0,131,1.401891,2,4,3,1
2,0,185,1.984892,5,5,1,1
3,1,72,-0.340153,1,2,2,1
4,1,113,2.497030,3,3,1,1
...,...,...,...,...,...,...,...
29811,9998,178,0.431457,1,1,1,1
29812,9998,126,0.003084,2,2,1,1
29813,9998,100,0.580192,3,5,3,1
29814,9999,35,-1.441692,1,3,3,1


<class 'pytwoway.bipartite_pandas.BipartiteLongCollapsed'>


In [11]:
# We can then check out the event study using collapsed data
# Each side of a row now carries a spell: year_start_1/year_end_1 and weight w1 for the first,
# year_start_2/year_end_2 and weight w2 for the second
bd = bd.get_es()
display(bd)
# Note the type: the object is now a BipartiteEventStudyCollapsed
print(type(bd))

Unnamed: 0,wid,f1i,f2i,y1,y2,year_start_1,year_end_1,year_start_2,year_end_2,w1,w2,m
0,23,22,22,-1.631796,-1.631796,1,5,1,5,5,5,0
1,27,49,49,0.704600,0.704600,1,5,1,5,5,5,0
2,41,70,70,0.305096,0.305096,1,5,1,5,5,5,0
3,44,162,162,2.157609,2.157609,1,5,1,5,5,5,0
4,59,163,163,2.611192,2.611192,1,5,1,5,5,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20449,9996,187,166,2.584873,2.358368,4,4,5,5,1,1,1
20450,9997,51,64,-0.688550,-2.225336,1,4,5,5,4,1,1
20451,9998,178,126,0.431457,0.003084,1,1,2,2,1,1,1
20452,9998,126,100,0.003084,0.580192,2,2,3,5,1,3,1


<class 'pytwoway.bipartite_pandas.BipartiteEventStudyCollapsed'>


In [12]:
# We can then go back to collapsed long
# The output matches the collapsed long frame displayed two cells earlier
bd = bd.get_collapsed_long()
display(bd)
# Note the type: we are back to BipartiteLongCollapsed
print(type(bd))

Unnamed: 0,wid,fid,comp,year_start,year_end,weight,m
0,0,113,0.445394,1,1,1,1
1,0,131,1.401891,2,4,3,1
2,0,185,1.984892,5,5,1,1
3,1,72,-0.340153,1,2,2,1
4,1,113,2.497030,3,3,1,1
...,...,...,...,...,...,...,...
29811,9998,178,0.431457,1,1,1,1
29812,9998,126,0.003084,2,2,1,1
29813,9998,100,0.580192,3,5,3,1
29814,9999,35,-1.441692,1,3,3,1


<class 'pytwoway.bipartite_pandas.BipartiteLongCollapsed'>


## Clustering

In [13]:
# Starting over: rebuild the long-form object from the simulated data and clean it
# The result of clean_data() is assigned back to bd, like every other transformation
# in this notebook (get_es, get_long, cluster, ...), so the cell does not depend on
# clean_data() modifying bd in place
bd = tw.BipartiteLong(sim_data)
bd = bd.clean_data()
display(bd)

Unnamed: 0,wid,fid,comp,year
0,0,113,0.445394,1
1,0,131,2.093489,2
2,0,131,0.186537,3
3,0,131,1.925646,4
4,0,185,1.984892,5
...,...,...,...,...
49995,9999,35,-1.777569,1
49996,9999,35,0.378292,2
49997,9999,35,-2.925798,3
49998,9999,74,0.145590,4


In [14]:
# We can cluster from any format and clusters stay when reformatting
# Clustering adds a j column; in the output, rows of the same firm share the same j value
bd = bd.cluster()
display(bd)

Unnamed: 0,wid,fid,comp,year,j
0,0,113,0.445394,1,5
1,0,131,2.093489,2,0
2,0,131,0.186537,3,0
3,0,131,1.925646,4,0
4,0,185,1.984892,5,2
...,...,...,...,...,...
49995,9999,35,-1.777569,1,1
49996,9999,35,0.378292,2,1
49997,9999,35,-2.925798,3,1
49998,9999,74,0.145590,4,6


In [15]:
# Converting the clustered data to event study form keeps the clusters:
# the output has j1 and j2 columns alongside f1i and f2i
bd = bd.get_es()
display(bd)

Unnamed: 0,wid,f1i,f2i,y1,y2,year_1,year_2,j1,j2,m
0,23,22,22,-3.407073,-3.407073,1,1,7,7,0
1,23,22,22,-1.316057,-1.316057,2,2,7,7,0
2,23,22,22,0.714426,0.714426,3,3,7,7,0
3,23,22,22,-1.892804,-1.892804,4,4,7,7,0
4,23,22,22,-2.257470,-2.257470,5,5,7,7,0
...,...,...,...,...,...,...,...,...,...,...
40633,9998,100,100,0.878803,-0.491500,4,5,3,3,1
40634,9999,35,35,-1.777569,0.378292,1,2,1,1,1
40635,9999,35,35,0.378292,-2.925798,2,3,1,1,1
40636,9999,35,74,-2.925798,0.145590,3,4,1,6,1


In [16]:
# Going back to long form and then collapsing employment spells also keeps the cluster column j
bd = bd.get_long()
bd = bd.get_collapsed_long()
display(bd)

Unnamed: 0,wid,fid,comp,year_start,year_end,weight,j,m
0,0,113,0.445394,1,1,1,5,1
1,0,131,1.401891,2,4,3,0,1
2,0,185,1.984892,5,5,1,2,1
3,1,72,-0.340153,1,2,2,6,1
4,1,113,2.497030,3,3,1,5,1
...,...,...,...,...,...,...,...,...
29811,9998,178,0.431457,1,1,1,2,1
29812,9998,126,0.003084,2,2,1,0,1
29813,9998,100,0.580192,3,5,3,3,1
29814,9999,35,-1.441692,1,3,3,1,1


In [17]:
# The collapsed event study form carries the clusters as j1 and j2 as well
bd = bd.get_es()
display(bd)

Unnamed: 0,wid,f1i,f2i,y1,y2,year_start_1,year_end_1,year_start_2,year_end_2,w1,w2,j1,j2,m
0,23,22,22,-1.631796,-1.631796,1,5,1,5,5,5,7,7,0
1,27,49,49,0.704600,0.704600,1,5,1,5,5,5,9,9,0
2,41,70,70,0.305096,0.305096,1,5,1,5,5,5,9,9,0
3,44,162,162,2.157609,2.157609,1,5,1,5,5,5,8,8,0
4,59,163,163,2.611192,2.611192,1,5,1,5,5,5,8,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20449,9996,187,166,2.584873,2.358368,4,4,5,5,1,1,2,8,1
20450,9997,51,64,-0.688550,-2.225336,1,4,5,5,4,1,1,6,1
20451,9998,178,126,0.431457,0.003084,1,1,2,2,1,1,2,0,1
20452,9998,126,100,0.003084,0.580192,2,2,3,5,1,3,0,3,1


## Clustering options

In [18]:
# We can cluster on a specific year in long form
# The cleaned frame is assigned back to bd (as with cluster() below), so the cell
# does not rely on clean_data() modifying bd in place
bd = tw.BipartiteLong(sim_data)
bd = bd.clean_data()
bd = bd.cluster({'year': 1})
display(bd)

Unnamed: 0,wid,fid,comp,year,j
0,0,113,0.445394,1,4
1,0,131,2.093489,2,1
2,0,131,0.186537,3,1
3,0,131,1.925646,4,1
4,0,185,1.984892,5,2
...,...,...,...,...,...
49995,9999,35,-1.777569,1,3
49996,9999,35,0.378292,2,3
49997,9999,35,-2.925798,3,3
49998,9999,74,0.145590,4,9


In [19]:
# We can cluster on movers or stayers in any form - here we cluster on stayers only
# Note that not all firms are clustered, so some rows have NaNs
# The cleaned frame is assigned back to bd (as with cluster() below), so the cell
# does not rely on clean_data() modifying bd in place
bd = tw.BipartiteLong(sim_data)
bd = bd.clean_data()
bd = bd.cluster({'stayers_movers': 'stayers'})
display(bd)

Unnamed: 0,wid,fid,comp,year,j,m
0,0,113,0.445394,1,0,1
1,0,131,2.093489,2,6,1
2,0,131,0.186537,3,6,1
3,0,131,1.925646,4,6,1
4,0,185,1.984892,5,1,1
...,...,...,...,...,...,...
49995,9999,35,-1.777569,1,7,1
49996,9999,35,0.378292,2,7,1
49997,9999,35,-2.925798,3,7,1
49998,9999,74,0.145590,4,0,1


In [20]:
# We can cluster on movers or stayers in any form - here on stayers again, this time dropping unclustered firms
# Set 'dropna': True if you want to drop firms that don't get clustered (note that this fully resets firm and worker ids, since they must be contiguous for the FE/CRE/BLM estimators to work)
# The cleaned frame is assigned back to bd (as with cluster() below), so the cell
# does not rely on clean_data() modifying bd in place
bd = tw.BipartiteLong(sim_data)
bd = bd.clean_data()
bd = bd.cluster({'stayers_movers': 'stayers', 'dropna': True})
display(bd)

Unnamed: 0,wid,fid,comp,year,j,m
0,0,113,0.445394,1,7,1
1,0,131,2.093489,2,7,1
2,0,131,0.186537,3,7,1
3,0,131,1.925646,4,7,1
4,0,185,1.984892,5,1,1
...,...,...,...,...,...,...
48286,9999,35,-1.777569,1,4,1
48287,9999,35,0.378292,2,4,1
48288,9999,35,-2.925798,3,4,1
48289,9999,74,0.145590,4,0,1


In [21]:
# We can cluster on movers or stayers in any form - here we cluster on movers only
# The cleaned frame is assigned back to bd (as with cluster() below), so the cell
# does not rely on clean_data() modifying bd in place
bd = tw.BipartiteLong(sim_data)
bd = bd.clean_data()
bd = bd.cluster({'stayers_movers': 'movers'})
display(bd)

Unnamed: 0,wid,fid,comp,year,j,m
0,0,113,0.445394,1,0,1
1,0,131,2.093489,2,6,1
2,0,131,0.186537,3,6,1
3,0,131,1.925646,4,6,1
4,0,185,1.984892,5,2,1
...,...,...,...,...,...,...
49995,9999,35,-1.777569,1,1,1
49996,9999,35,0.378292,2,1,1
49997,9999,35,-2.925798,3,1,1
49998,9999,74,0.145590,4,3,1


## Some extra features

In [22]:
# We can get some summary statistics by running .summary()
# The first call reports on the freshly constructed data, the second on the cleaned data
# (compare the 'connected', 'contiguous ...' and 'no ...' lines of the two reports)
# The cleaned frame is assigned back to bd so the second report does not rely on
# clean_data() modifying bd in place
sim_data = tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
bd = tw.BipartiteLong(sim_data)
bd.summary()
bd = bd.clean_data()
bd.summary()

format: BipartiteLong
number of workers: 10000
number of firms: 195
number of observations: 50000
mean wage: 0.4013159142403359
max wage: 5.580774910105806
min wage: -4.978342649482517
connected: False
contiguous firm ids: False
contiguous worker ids: False
contiguous cluster ids (None if not clustered): None
correct column names and types: False
no nans: False
no duplicates: False

format: BipartiteLong
number of workers: 10000
number of firms: 195
number of observations: 50000
mean wage: 0.4013159142403359
max wage: 5.580774910105806
min wage: -4.978342649482517
connected: True
contiguous firm ids: True
contiguous worker ids: True
contiguous cluster ids (None if not clustered): None
correct column names and types: True
no nans: True
no duplicates: True



In [23]:
# The BipartitePandas classes (here BipartiteLong) do some nice column inference
# Suppose some of our columns are mislabeled - create a col_dict to clarify which columns to correct
# Here the firm id column is (mis)named 'f1i'; col_dict maps the expected name 'fid' to it,
# and the displayed result has the column under the name 'fid'
# The cleaned frame is assigned back to bd so the cell does not rely on clean_data()
# modifying bd in place
sim_data = tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
sim_data = sim_data.rename({'fid': 'f1i'}, axis=1)
bd = tw.BipartiteLong(sim_data, col_dict={'fid': 'f1i'})
bd = bd.clean_data()
display(bd)

Unnamed: 0,wid,fid,comp,year
0,0,100,1.386291,1
1,0,78,-1.804047,2
2,0,92,-0.925588,3
3,0,92,-1.173242,4
4,0,30,0.004835,5
...,...,...,...,...
49995,9999,125,0.843797,1
49996,9999,52,0.698797,2
49997,9999,39,-0.403950,3
49998,9999,39,-1.076814,4


In [24]:
# Suppose we have already clustered - the class determines this automatically
# A column named 'j' is kept as-is, as the output shows
# The cleaned frame is assigned back to bd so the cell does not rely on clean_data()
# modifying bd in place
sim_data = tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
sim_data['j'] = 1
bd = tw.BipartiteLong(sim_data)
bd = bd.clean_data()
display(bd)

Unnamed: 0,wid,fid,comp,year,j
0,0,12,-3.455616,1,1
1,0,12,-3.075479,2,1
2,0,23,-1.737707,3,1
3,0,37,-1.496512,4,1
4,0,37,0.525453,5,1
...,...,...,...,...,...
49995,9999,125,0.111549,1,1
49996,9999,125,0.227130,2,1
49997,9999,125,0.952326,3,1
49998,9999,54,-0.412935,4,1


In [25]:
# Now suppose we mislabeled our clusters - the class drops this column
# The cluster column is named 'j1' here and no col_dict is given; the output no longer contains it
# (presumably because long-form data expects the cluster column to be named 'j' - confirm)
sim_data = tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
sim_data['j1'] = 1
bd = tw.BipartiteLong(sim_data)
bd = bd.clean_data()
display(bd)

Unnamed: 0,wid,fid,comp,year
0,0,56,-0.085441,1
1,0,56,1.996593,2
2,0,115,-0.107398,3
3,0,115,3.389542,4
4,0,157,1.586202,5
...,...,...,...,...
49995,9999,46,3.564147,1
49996,9999,46,-1.277925,2
49997,9999,46,1.071586,3
49998,9999,168,2.631237,4


In [26]:
# We can make sure our columns are included by specifying the corrected labels in col_dict (note that if the variable of interest requires 2 columns, e.g. weight 1 and weight 2 for event study data, BOTH columns are required - if only one column is included, it is automatically dropped.)
# Here col_dict maps the expected name 'j' to our column 'j1', and the output keeps it as 'j'
# The cleaned frame is assigned back to bd so the cell does not rely on clean_data()
# modifying bd in place
sim_data = tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
sim_data['j1'] = 1
bd = tw.BipartiteLong(sim_data, col_dict={'j': 'j1'})
bd = bd.clean_data()
display(bd)

Unnamed: 0,wid,fid,comp,year,j
0,0,111,0.527545,1,1
1,0,78,-0.937719,2,1
2,0,78,-0.645656,3,1
3,0,118,0.526164,4,1
4,0,163,3.130551,5,1
...,...,...,...,...,...
49995,9999,64,0.247145,1,1
49996,9999,18,-0.776968,2,1
49997,9999,100,0.959637,3,1
49998,9999,100,1.470325,4,1


In [27]:
# We may also specify a col_dict that overwrites a column label that would normally be inferred, and inference is skipped
sim_data = tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
sim_data = sim_data.rename({'fid': 'j'}, axis=1)
bd = tw.BipartiteLong(sim_data, col_dict={'fid': 'j'})
# Note that even though we have a column named j, because we specify it is the 'fid' column it is not inferred as the cluster values
# The cleaned frame is assigned back to bd so the cell does not rely on clean_data()
# modifying bd in place
bd = bd.clean_data()
display(bd)

Unnamed: 0,wid,fid,comp,year
0,0,47,0.980005,1
1,0,47,0.550097,2
2,0,47,1.020063,3
3,0,75,0.654739,4
4,0,75,-0.047614,5
...,...,...,...,...
49995,9999,95,-0.468151,1
49996,9999,95,0.838850,2
49997,9999,95,0.661866,3
49998,9999,95,2.374607,4


In [28]:
# We can drop and rename optional columns (but not required columns)
# This cell renames the optional cluster column 'j' to 'j1'; dropping is shown in the following cells
# The cleaned frame is assigned back to bd so the cell does not rely on clean_data()
# modifying bd in place
sim_data = tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
sim_data['j'] = 1
bd = tw.BipartiteLong(sim_data)
bd = bd.clean_data()
# rename() is called for its effect on bd: the display below shows the column as 'j1'
bd.rename({'j': 'j1'})
display(bd)

Unnamed: 0,wid,fid,comp,year,j1
0,0,142,1.187834,1,1
1,0,162,3.377437,2,1
2,0,162,1.334530,3,1
3,0,162,4.904663,4,1
4,0,162,3.694761,5,1
...,...,...,...,...,...
49995,9999,60,-0.708935,1,1
49996,9999,60,-0.379562,2,1
49997,9999,60,-0.799783,3,1
49998,9999,60,0.261037,4,1


In [29]:
# Dropping an optional column ('j1' is the cluster column renamed in the previous cell)
# The call is the last expression of the cell, so the returned frame is rendered below without 'j1'
bd.drop('j1')

Unnamed: 0,wid,fid,comp,year
0,0,142,1.187834,1
1,0,162,3.377437,2
2,0,162,1.334530,3
3,0,162,4.904663,4
4,0,162,3.694761,5
...,...,...,...,...
49995,9999,60,-0.708935,1
49996,9999,60,-0.379562,2
49997,9999,60,-0.799783,3
49998,9999,60,0.261037,4


In [30]:
# Dropping a required column is prevented: the frame rendered below still contains 'wid'
bd.drop('wid')



Unnamed: 0,wid,fid,comp,year
0,0,142,1.187834,1
1,0,162,3.377437,2
2,0,162,1.334530,3
3,0,162,4.904663,4
4,0,162,3.694761,5
...,...,...,...,...
49995,9999,60,-0.708935,1
49996,9999,60,-0.379562,2
49997,9999,60,-0.799783,3
49998,9999,60,0.261037,4
