# BipartiteData example

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Add a parent directory (three levels up) to the module search path so that
# `import pytwoway` can resolve to a local source checkout.
# NOTE(review): the original note says "do not run this" - presumably this cell is
# only meant for developers working inside the repository and should be skipped
# when pytwoway has been installed with pip. TODO confirm.
import sys
sys.path.append('../../..')

In [3]:
# Import the pytwoway package 
# (Make sure you have installed it using pip install pytwoway)
import pytwoway as tw

## Simulate some data

The package contains functions to simulate data. We use them here to keep things simple.

In [4]:
# For the example, we simulate a worker-firm panel instead of loading real data
sim_data = tw.SimTwoWay().sim_network()
# Keep only the columns used below: worker id, firm id, compensation, year
sim_data = sim_data[['wid', 'fid', 'comp', 'year']]
display(sim_data)

Unnamed: 0,wid,fid,comp,year
0,0,132,2.233641,1
1,0,167,2.116843,2
2,0,93,1.144747,3
3,0,93,-1.093063,4
4,0,93,4.273720,5
...,...,...,...,...
49995,9999,18,-1.179408,1
49996,9999,18,-2.350623,2
49997,9999,18,-1.999968,3
49998,9999,65,-0.818786,4


# BipartiteData

BipartiteData is a superclass with 3 subclasses:
- BipartiteLong
- BipartiteLongCollapsed
- BipartiteEventStudy

The user interacts with BipartiteData, which converts between the 3 subclasses.

In [5]:
# Wrap the simulated data in a BipartiteData object.
# Our data is in long form (each row gives a single observation).
bd = tw.BipartiteData(sim_data, formatting='long') # 'long' is the default formatting; most data is long
bd.clean_data() # Should always clean data right after initializing

In [6]:
# Let's see how our data looks after cleaning (the frame is exposed as bd.data)
display(bd.data)

Unnamed: 0,comp,year,fid,wid
0,2.233641,1,132,0
1,2.116843,2,167,0
2,1.144747,3,93,0
3,-1.093063,4,93,0
4,4.273720,5,93,0
...,...,...,...,...
49995,-1.179408,1,18,9999
49996,-2.350623,2,18,9999
49997,-1.999968,3,18,9999
49998,-0.818786,4,65,9999


## Converting formats

In [7]:
# While our original data is long, we might want it to be in event study form,
# where each row gives two consecutive observations
# (y1/y2, f1i/f2i and year_1/year_2 in the output below).
# The conversion is applied to bd itself: afterwards bd.data holds the event study frame.
bd.long_to_es()
display(bd.data)

Unnamed: 0,wid,y1,y2,f1i,f2i,year_1,year_2,m
0,5,0.307568,0.307568,96,96,1,1,0
1,5,1.007378,1.007378,96,96,2,2,0
2,5,2.349264,2.349264,96,96,3,3,0
3,5,0.737270,0.737270,96,96,4,4,0
4,5,0.637346,0.637346,96,96,5,5,0
...,...,...,...,...,...,...,...,...
40665,9998,-1.635754,-3.720507,0,0,4,5,1
40666,9999,-1.179408,-2.350623,18,18,1,2,1
40667,9999,-2.350623,-1.999968,18,18,2,3,1
40668,9999,-1.999968,-0.818786,18,65,3,4,1


In [8]:
# We can also use event study data to retrieve cross section data.
# The returned frame has a `cs` column:
#   - cs=1 gives y1 as y1, for both stayers and movers;
#   - cs=0 gives y2 as y1, for movers only.
# This allows (almost) all income data to be accessed from the y1 column.
# Note that for movers, the last observation for each worker is not available
# without manipulation, as it is shifted to the y2 column.
# Also note that the y1 column contains duplicates for all mover income,
# except for the first period.
# The result is displayed directly here rather than read from bd.data.
display(bd.get_cs())

Unnamed: 0,wid,f1i,f2i,y1,y2,year_1,year_2,m,cs
0,5,96,96,0.307568,0.307568,1,1,0,1
1,5,96,96,1.007378,1.007378,2,2,0,1
2,5,96,96,2.349264,2.349264,3,3,0,1
3,5,96,96,0.737270,0.737270,4,4,0,1
4,5,96,96,0.637346,0.637346,5,5,0,1
...,...,...,...,...,...,...,...,...,...
77985,9998,0,0,-3.720507,-1.635754,5,4,1,0
77986,9999,18,18,-2.350623,-1.179408,2,1,1,0
77987,9999,18,18,-1.999968,-2.350623,3,2,1,0
77988,9999,65,18,-0.818786,-1.999968,4,3,1,0


In [9]:
# Maybe we want to convert back into long form.
# In the output below the long frame now also carries the `m` column.
bd.es_to_long()
display(bd.data)

Unnamed: 0,wid,comp,fid,year,m
0,0,2.233641,132,1,1
1,0,2.116843,167,2,1
2,0,1.144747,93,3,1
3,0,-1.093063,93,4,1
4,0,4.273720,93,5,1
...,...,...,...,...,...
49995,9999,-1.179408,18,1,1
49996,9999,-2.350623,18,2,1
49997,9999,-1.999968,18,3,1
49998,9999,-0.818786,65,4,1


In [10]:
# Now suppose we want to collapse by employment spells, so that any consecutive
# observations with the same worker in the same firm are collapsed into 1 observation.
# In the output below each spell has year_start/year_end and a weight column:
# e.g. worker 0's three years at firm 93 become a single row with weight 3,
# whose comp equals the average of the three yearly values.
bd.long_to_collapsed_long()
display(bd.data)

Unnamed: 0,wid,comp,fid,year_start,year_end,weight,m
0,0,2.233641,132,1,1,1,1
1,0,2.116843,167,2,2,1,1
2,0,1.441801,93,3,5,3,1
3,1,-0.522226,39,1,2,2,1
4,1,-1.786568,11,3,4,2,1
...,...,...,...,...,...,...,...
29572,9998,0.012756,1,1,1,1,1
29573,9998,-1.611240,22,2,2,1,1
29574,9998,-2.984213,0,3,5,3,1
29575,9999,-1.843333,18,1,3,3,1


In [11]:
# We can then check out the event study using collapsed data.
# Each row now pairs spell-level values (year_start_*/year_end_* for both sides);
# the w1/w2 columns appear to hold the corresponding spell weights - TODO confirm.
bd.collapsed_long_to_es()
display(bd.data)

Unnamed: 0,wid,y1,y2,f1i,f2i,year_start_1,year_start_2,year_end_1,year_end_2,m,w1,w2
0,5,1.007765,1.007765,96,96,1,1,5,5,0,5,5
1,7,-1.391752,-1.391752,16,16,1,1,5,5,0,5,5
2,20,1.691682,1.691682,172,172,1,1,5,5,0,5,5
3,23,-0.715762,-0.715762,31,31,1,1,5,5,0,5,5
4,60,1.423370,1.423370,159,159,1,1,5,5,0,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...
20242,9996,-0.775103,-0.163332,81,91,3,4,3,5,1,1,2
20243,9997,-1.432805,-2.310656,27,15,1,5,4,5,1,4,1
20244,9998,0.012756,-1.611240,1,22,1,2,1,2,1,1,1
20245,9998,-1.611240,-2.984213,22,0,2,3,2,5,1,1,3


In [12]:
# We can then go back from event study form to collapsed long form
# (one row per worker-firm spell, as in the output below)
bd.es_to_collapsed_long()
display(bd.data)

Unnamed: 0,wid,comp,fid,year_start,year_end,m,weight
0,0,2.233641,132,1,1,1,1
1,0,2.116843,167,2,2,1,1
2,0,1.441801,93,3,5,1,3
3,1,-0.522226,39,1,2,1,2
4,1,-1.786568,11,3,4,1,2
...,...,...,...,...,...,...,...
29572,9998,0.012756,1,1,1,1,1
29573,9998,-1.611240,22,2,2,1,1
29574,9998,-2.984213,0,3,5,1,3
29575,9999,-1.843333,18,1,3,1,3


## Clustering

In [13]:
# Starting over: rebuild bd from the original long-form simulated data
# so the clustering examples begin from a freshly cleaned object
bd = tw.BipartiteData(sim_data, formatting='long')
bd.clean_data() # Clean right after initializing
display(bd.data)

Unnamed: 0,comp,year,fid,wid
0,2.233641,1,132,0
1,2.116843,2,167,0
2,1.144747,3,93,0
3,-1.093063,4,93,0
4,4.273720,5,93,0
...,...,...,...,...
49995,-1.179408,1,18,9999
49996,-2.350623,2,18,9999
49997,-1.999968,3,18,9999
49998,-0.818786,4,65,9999


In [14]:
# We can cluster from any format and clusters stay when reformatting.
# With no arguments, cluster() uses its default options; in the output below
# the cluster labels appear in the new `j` column.
bd.cluster()
display(bd.data)

Unnamed: 0,comp,year,fid,wid,m,j
0,2.233641,1,132,0,1,7
1,2.116843,2,167,0,1,1
2,1.144747,3,93,0,1,0
3,-1.093063,4,93,0,1,0
4,4.273720,5,93,0,1,0
...,...,...,...,...,...,...
49995,-1.179408,1,18,9999,1,9
49996,-2.350623,2,18,9999,1,9
49997,-1.999968,3,18,9999,1,9
49998,-0.818786,4,65,9999,1,6


In [15]:
# Convert the clustered long data to event study form.
# The cluster labels are kept: they show up as j1 and j2 in the output below.
bd.long_to_es()
display(bd.data)

Unnamed: 0,wid,y1,y2,f1i,f2i,year_1,year_2,m,j1,j2
0,5,0.307568,0.307568,96,96,1,1,0,0,0
1,5,1.007378,1.007378,96,96,2,2,0,0,0
2,5,2.349264,2.349264,96,96,3,3,0,0,0
3,5,0.737270,0.737270,96,96,4,4,0,0,0
4,5,0.637346,0.637346,96,96,5,5,0,0,0
...,...,...,...,...,...,...,...,...,...,...
40665,9998,-1.635754,-3.720507,0,0,4,5,1,2,2
40666,9999,-1.179408,-2.350623,18,18,1,2,1,9,9
40667,9999,-2.350623,-1.999968,18,18,2,3,1,9,9
40668,9999,-1.999968,-0.818786,18,65,3,4,1,9,6


In [16]:
# Go back to long form first, then collapse by employment spell.
# The cluster column `j` is still present after both conversions (see output below).
bd.es_to_long()
bd.long_to_collapsed_long()
display(bd.data)

Unnamed: 0,wid,comp,fid,year_start,year_end,weight,m,j
0,0,2.233641,132,1,1,1,1,7
1,0,2.116843,167,2,2,1,1,1
2,0,1.441801,93,3,5,3,1,0
3,1,-0.522226,39,1,2,2,1,3
4,1,-1.786568,11,3,4,2,1,2
...,...,...,...,...,...,...,...,...
29572,9998,0.012756,1,1,1,1,1,2
29573,9998,-1.611240,22,2,2,1,1,9
29574,9998,-2.984213,0,3,5,3,1,2
29575,9999,-1.843333,18,1,3,3,1,9


In [17]:
# Convert the clustered, collapsed data to event study form.
# The cluster labels again appear as j1 and j2 in the output below.
bd.collapsed_long_to_es()
display(bd.data)

Unnamed: 0,wid,y1,y2,f1i,f2i,year_start_1,year_start_2,year_end_1,year_end_2,m,w1,w2,j1,j2
0,5,1.007765,1.007765,96,96,1,1,5,5,0,5,5,0,0
1,7,-1.391752,-1.391752,16,16,1,1,5,5,0,5,5,9,9
2,20,1.691682,1.691682,172,172,1,1,5,5,0,5,5,4,4
3,23,-0.715762,-0.715762,31,31,1,1,5,5,0,5,5,3,3
4,60,1.423370,1.423370,159,159,1,1,5,5,0,5,5,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20242,9996,-0.775103,-0.163332,81,91,3,4,3,5,1,1,2,6,0
20243,9997,-1.432805,-2.310656,27,15,1,5,4,5,1,4,1,3,9
20244,9998,0.012756,-1.611240,1,22,1,2,1,2,1,1,1,2,9
20245,9998,-1.611240,-2.984213,22,0,2,3,2,5,1,1,3,9,2


## Clustering options

In [18]:
# We can cluster on a specific year in long form.
# Start from a freshly cleaned object, then pass the option as a dict.
bd = tw.BipartiteData(sim_data, formatting='long')
bd.clean_data()
bd.cluster({'year': 1}) # Cluster on year 1
display(bd.data)

Unnamed: 0,comp,year,fid,wid,m,j
0,2.233641,1,132,0,1,3
1,2.116843,2,167,0,1,1
2,1.144747,3,93,0,1,8
3,-1.093063,4,93,0,1,8
4,4.273720,5,93,0,1,8
...,...,...,...,...,...,...
49995,-1.179408,1,18,9999,1,2
49996,-2.350623,2,18,9999,1,2
49997,-1.999968,3,18,9999,1,2
49998,-0.818786,4,65,9999,1,0


In [19]:
# We can cluster on movers or stayers in any form; here we cluster on stayers.
# Note the clusters become floats because not all firms are clustered,
# so some rows have NaNs (the `j` column is displayed as 7.0, 0.0, ... below).
bd = tw.BipartiteData(sim_data, formatting='long')
bd.clean_data()
bd.cluster({'stayers_movers': 'stayers'})
display(bd.data)

Unnamed: 0,comp,year,fid,wid,m,j
0,2.233641,1,132,0,1,7.0
1,2.116843,2,167,0,1,0.0
2,1.144747,3,93,0,1,9.0
3,-1.093063,4,93,0,1,9.0
4,4.273720,5,93,0,1,9.0
...,...,...,...,...,...,...
49995,-1.179408,1,18,9999,1,5.0
49996,-2.350623,2,18,9999,1,5.0
49997,-1.999968,3,18,9999,1,5.0
49998,-0.818786,4,65,9999,1,2.0


In [20]:
# We can cluster on movers or stayers in any form; here we cluster on movers.
# In the output below the `j` column is displayed as integers.
bd = tw.BipartiteData(sim_data, formatting='long')
bd.clean_data()
bd.cluster({'stayers_movers': 'movers'})
display(bd.data)

Unnamed: 0,comp,year,fid,wid,m,j
0,2.233641,1,132,0,1,4
1,2.116843,2,167,0,1,0
2,1.144747,3,93,0,1,8
3,-1.093063,4,93,0,1,8
4,4.273720,5,93,0,1,8
...,...,...,...,...,...,...
49995,-1.179408,1,18,9999,1,2
49996,-2.350623,2,18,9999,1,2
49997,-1.999968,3,18,9999,1,2
49998,-0.818786,4,65,9999,1,1


## Some extra features

In [21]:
# BipartiteData does some nice column imputation.
# Suppose some of our columns are mislabeled - create a col_dict to clarify which
# columns to correct. Each entry maps the expected column name to the label
# actually used in the data (here the 'fid' column is stored as 'f1i').
# A separate variable name is used so the shared `sim_data` frame defined earlier
# is not overwritten, which keeps the earlier cells re-runnable.
sim_data_f1i = (
    tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
    .rename(columns={'fid': 'f1i'})
)
bd = tw.BipartiteData(sim_data_f1i, col_dict={'fid': 'f1i'}, formatting='long')
bd.clean_data()
# The output below shows the column under its corrected name, 'fid'
display(bd.data)

Unnamed: 0,comp,year,fid,wid
0,3.962083,1,185,0
1,1.997126,2,185,0
2,2.185055,3,187,0
3,0.371443,4,187,0
4,0.958993,5,187,0
...,...,...,...,...
49995,2.262165,1,178,9999
49996,0.772176,2,133,9999
49997,2.015632,3,71,9999
49998,0.365363,4,71,9999


In [22]:
# Suppose we have already clustered - the class determines this automatically.
# We fake existing clusters by adding a constant 'j' column to fresh simulated data.
# assign() builds a new frame, and a separate variable name is used so the shared
# `sim_data` frame defined earlier is not overwritten.
sim_data_clustered = (
    tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
    .assign(j=1)
)
bd = tw.BipartiteData(sim_data_clustered, formatting='long')
bd.clean_data()
# The 'j' column is kept in the output below
display(bd.data)

Unnamed: 0,comp,year,j,fid,wid
0,1.101025,1,1,115,0
1,0.437201,2,1,70,0
2,0.761899,3,1,127,0
3,-0.841149,4,1,127,0
4,2.916458,5,1,169,0
...,...,...,...,...,...
49995,-3.416694,1,1,9,9999
49996,-0.959004,2,1,31,9999
49997,0.022645,3,1,31,9999
49998,-2.729511,4,1,31,9999


In [23]:
# Now suppose we mislabeled our clusters - the class drops this column.
# Here the cluster column is called 'j1' instead of 'j' and no col_dict is given.
# assign() builds a new frame, and a separate variable name is used so the shared
# `sim_data` frame defined earlier is not overwritten.
sim_data_j1 = (
    tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
    .assign(j1=1)
)
bd = tw.BipartiteData(sim_data_j1, formatting='long')
bd.clean_data()
# 'j1' no longer appears in the output below
display(bd.data)

Unnamed: 0,comp,year,fid,wid
0,-0.639681,1,59,0
1,-0.758475,2,59,0
2,0.895386,3,59,0
3,-1.035843,4,59,0
4,-2.208539,5,31,0
...,...,...,...,...
49995,2.582202,1,174,9999
49996,0.842186,2,192,9999
49997,2.628380,3,192,9999
49998,1.566230,4,192,9999


In [24]:
# We can make sure our columns are included by specifying the corrected labels in col_dict.
# Note that if the variable of interest requires 2 columns (e.g. weight 1 and weight 2
# for event study data), BOTH columns are required - if only one column is included,
# it is automatically dropped.
# assign() builds a new frame, and a separate variable name is used so the shared
# `sim_data` frame defined earlier is not overwritten.
sim_data_j1 = (
    tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
    .assign(j1=1)
)
bd = tw.BipartiteData(sim_data_j1, col_dict={'j': 'j1'}, formatting='long')
bd.clean_data()
# The cluster column is kept and shown as 'j' in the output below
display(bd.data)

Unnamed: 0,comp,year,j,fid,wid
0,2.003465,1,1,179,0
1,1.452602,2,1,104,0
2,1.631202,3,1,47,0
3,0.029973,4,1,47,0
4,-0.524351,5,1,47,0
...,...,...,...,...,...
49995,1.268498,1,1,92,9999
49996,1.160713,2,1,92,9999
49997,-3.080717,3,1,145,9999
49998,0.601908,4,1,145,9999


In [25]:
# We may also specify a col_dict that overwrites a column label that would normally
# be imputed, in which case imputation is skipped.
# A separate variable name is used so the shared `sim_data` frame defined earlier
# is not overwritten with a frame whose 'fid' column has been renamed.
sim_data_fid_as_j = (
    tw.SimTwoWay().sim_network()[['wid', 'fid', 'comp', 'year']]
    .rename(columns={'fid': 'j'})
)
bd = tw.BipartiteData(sim_data_fid_as_j, col_dict={'fid': 'j'}, formatting='long')
# Note that even though we have a column named j, because we specify it is the 'fid' column it is not imputed as the cluster values
bd.clean_data()
display(bd.data)

Unnamed: 0,comp,year,fid,wid
0,0.402571,1,146,0
1,0.747450,2,146,0
2,0.248654,3,64,0
3,-0.229086,4,64,0
4,-0.232683,5,88,0
...,...,...,...,...
49995,-0.952633,1,28,9999
49996,-0.006647,2,28,9999
49997,-1.572864,3,38,9999
49998,-0.419463,4,38,9999
