# BipartitePandas example

In [1]:
# Development-only setup: add local checkouts of BipartitePandas / PyTwoWay to the
# import path. Do not run this cell if the packages were installed with pip.
import sys
# Relative entry: three directories above the directory the notebook is run from
sys.path.append('../../..')
# NOTE(review): absolute path on one developer's machine - it will not exist elsewhere
sys.path.append('/Users/adamalexanderoppenheimer/Desktop/pytwoway')

In [2]:
# Import the BipartitePandas package 
# (Make sure you have installed it using pip install bipartitepandas)
import bipartitepandas as bpd

In [3]:
# For the example, we simulate bipartite labor data using the package PyTwoWay
# We keep only the worker id (i), firm id (j), wage (y), and year (t) columns
# NOTE(review): no random seed is set in this notebook, so the simulated values presumably differ between runs
import pytwoway as tw
sim_data = tw.SimTwoWay().sim_network()[['i', 'j', 'y', 't']]
display(sim_data)

Unnamed: 0,i,j,y,t
0,0,34,0.085399,1
1,0,51,0.272650,2
2,0,51,-0.297119,3
3,0,44,-1.042264,4
4,0,44,0.314497,5
...,...,...,...,...
49995,9999,129,0.397837,1
49996,9999,146,0.182625,2
49997,9999,146,0.979173,3
49998,9999,149,0.382159,4


## BipartitePandas

BipartitePandas includes 4 classes:
- BipartiteLong
- BipartiteLongCollapsed
- BipartiteEventStudy
- BipartiteEventStudyCollapsed

For long data, each row gives a single observation. For collapsed long data, each row gives a single employment spell: consecutive observations for a worker at the same firm are collapsed into one row. For event study data, each row gives the firms and compensation for a single worker over two consecutive periods (this format is created from BipartiteLong). For collapsed event study data, each row gives the firms and compensation for a single worker over two consecutive employment spells (this format is created from BipartiteLongCollapsed).

In [4]:
# Our data is in long form (each row gives a single observation),
# so we wrap the simulated dataframe in a BipartiteLong object
bdf = bpd.BipartiteLong(sim_data)
display(bdf)
# Note the type: bdf is now a BipartiteLong, as the printed class shows
print(type(bdf))

Unnamed: 0,i,j,y,t
0,0,34,0.085399,1
1,0,51,0.272650,2
2,0,51,-0.297119,3
3,0,44,-1.042264,4
4,0,44,0.314497,5
...,...,...,...,...
49995,9999,129,0.397837,1
49996,9999,146,0.182625,2
49997,9999,146,0.979173,3
49998,9999,149,0.382159,4


<class 'bipartitepandas.bipartitelong.BipartiteLong'>


In [5]:
# Report the number of workers, firms, and clusters, one value per line
# (the cluster count is None because the data is not clustered)
print(bdf.n_workers(), bdf.n_firms(), bdf.n_clusters(), sep='\n')

10000
195
None


In [6]:
# Whenever we want to use the data, we need to clean it first
# clean_data() returns the cleaned object, so we rebind bdf to the result
bdf = bdf.clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,34,0.085399,1
1,0,51,0.272650,2
2,0,51,-0.297119,3
3,0,44,-1.042264,4
4,0,44,0.314497,5
...,...,...,...,...
49995,9999,129,0.397837,1
49996,9999,146,0.182625,2
49997,9999,146,0.979173,3
49998,9999,149,0.382159,4


## Converting formats

In [7]:
# While our original data is long, we might want it to be in event study form (each row gives two consecutive observations)
# In the output, the j/y/t columns are split into period-1 and period-2 versions (j1/j2, y1/y2, t1/t2)
# and an m column is added (presumably 1 for movers and 0 for stayers - confirm against the package docs)
bdf = bdf.get_es()
display(bdf)
# Note the type
print(type(bdf))

Unnamed: 0,i,j1,j2,y1,y2,t1,t2,m
0,6,12,12,-2.278795,-2.278795,1,1,0
1,6,12,12,-2.207067,-2.207067,2,2,0
2,6,12,12,-2.283169,-2.283169,3,3,0
3,6,12,12,-1.276438,-1.276438,4,4,0
4,6,12,12,-1.499829,-1.499829,5,5,0
...,...,...,...,...,...,...,...,...
40678,9998,66,66,-0.116671,0.600787,4,5,1
40679,9999,129,146,0.397837,0.182625,1,2,1
40680,9999,146,146,0.182625,0.979173,2,3,1
40681,9999,146,149,0.979173,0.382159,3,4,1


<class 'bipartitepandas.bipartiteeventstudy.BipartiteEventStudy'>


In [8]:
# We can also use event study data to retrieve cross section data:
#   - cs=1 gives y1 as y1 for both stayers and movers
#   - cs=0 gives y2 as y1 for only movers
# This allows (almost) all income data to be accessed from the y1 column.
# Note that for movers, the last observation for each worker is not available without manipulation, as it is shifted to the y2 column.
# Also note that the y1 column contains duplicates for all mover incomes, except for the first period.
display(bdf.get_cs())

Unnamed: 0,i,j1,j2,y1,y2,t1,t2,m,cs
0,6,12,12,-2.278795,-2.278795,1,1,0,1
1,6,12,12,-2.207067,-2.207067,2,2,0,1
2,6,12,12,-2.283169,-2.283169,3,3,0,1
3,6,12,12,-1.276438,-1.276438,4,4,0,1
4,6,12,12,-1.499829,-1.499829,5,5,0,1
...,...,...,...,...,...,...,...,...,...
77946,9998,66,66,0.600787,-0.116671,5,4,1,0
77947,9999,146,129,0.182625,0.397837,2,1,1,0
77948,9999,146,146,0.979173,0.182625,3,2,1,0
77949,9999,149,146,0.382159,0.979173,4,3,1,0


In [9]:
# Maybe we want to convert back into long form
# (the output keeps the m column that first appeared in the event study format)
bdf = bdf.get_long()
display(bdf)
# Note the type
print(type(bdf))

Unnamed: 0,i,j,y,t,m
0,0,34,0.085399,1,1
1,0,51,0.272650,2,1
2,0,51,-0.297119,3,1
3,0,44,-1.042264,4,1
4,0,44,0.314497,5,1
...,...,...,...,...,...
49995,9999,129,0.397837,1,1
49996,9999,146,0.182625,2,1
49997,9999,146,0.979173,3,1
49998,9999,149,0.382159,4,1


<class 'bipartitepandas.bipartitelong.BipartiteLong'>


In [10]:
# Now suppose we want to collapse by employment spells (so any consecutive observations with the same worker in the same firm are collapsed into 1 observation)
# In the output, t1/t2 are the first and last period of each spell, y matches the mean of the
# collapsed observations, and w matches the number of observations that were collapsed
bdf = bdf.get_collapsed_long()
display(bdf)
# Note the type
print(type(bdf))

Unnamed: 0,i,j,y,t1,t2,w,m
0,0,34,0.085399,1,1,1,1
1,0,51,-0.012235,2,3,2,1
2,0,44,-0.363884,4,5,2,1
3,1,191,1.329782,1,1,1,1
4,1,149,1.446614,2,5,4,1
...,...,...,...,...,...,...,...
29845,9998,66,0.538122,2,5,4,1
29846,9999,129,0.397837,1,1,1,1
29847,9999,146,0.580899,2,3,2,1
29848,9999,149,0.382159,4,4,1,1


<class 'bipartitepandas.bipartitelongcollapsed.BipartiteLongCollapsed'>


In [11]:
# We can then check out the event study using collapsed data
# Because bdf is collapsed, get_es() returns the collapsed event study class (see the printed type)
# In the output, t11/t12 and t21/t22 are the start and end periods of the first and second spell,
# and w1/w2 are the corresponding weights
bdf = bdf.get_es()
display(bdf)
# Note the type
print(type(bdf))

Unnamed: 0,i,j1,j2,y1,y2,t11,t12,t21,t22,w1,w2,m
0,6,12,12,-1.909060,-1.909060,1,5,1,5,5.0,5,0
1,48,84,84,-0.705389,-0.705389,1,5,1,5,5.0,5,0
2,57,142,142,1.312749,1.312749,1,5,1,5,5.0,5,0
3,61,0,0,-0.878919,-0.878919,1,5,1,5,5.0,5,0
4,62,74,74,0.111575,0.111575,1,5,1,5,5.0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20528,9997,24,27,-1.149961,-1.371562,1,4,5,5,4.0,1,1
20529,9998,81,66,0.305160,0.538122,1,1,2,5,1.0,4,1
20530,9999,129,146,0.397837,0.580899,1,1,2,3,1.0,2,1
20531,9999,146,149,0.580899,0.382159,2,3,4,4,2.0,1,1


<class 'bipartitepandas.bipartiteeventstudycollapsed.BipartiteEventStudyCollapsed'>


In [12]:
# We can then go back to collapsed long
# (get_long() on a collapsed event study gives BipartiteLongCollapsed, as the printed type shows)
bdf = bdf.get_long()
display(bdf)
# Note the type
print(type(bdf))

Unnamed: 0,i,j,y,t1,t2,w,m
0,0,34,0.085399,1,1,1.0,1
1,0,51,-0.012235,2,3,2.0,1
2,0,44,-0.363884,4,5,2.0,1
3,1,191,1.329782,1,1,1.0,1
4,1,149,1.446614,2,5,4.0,1
...,...,...,...,...,...,...,...
29845,9998,66,0.538122,2,5,4.0,1
29846,9999,129,0.397837,1,1,1.0,1
29847,9999,146,0.580899,2,3,2.0,1
29848,9999,149,0.382159,4,4,1.0,1


<class 'bipartitepandas.bipartitelongcollapsed.BipartiteLongCollapsed'>


## Clustering

In [13]:
# Starting over: rebuild a cleaned long-format object from the simulated data
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,34,0.085399,1
1,0,51,0.272650,2
2,0,51,-0.297119,3
3,0,44,-1.042264,4
4,0,44,0.314497,5
...,...,...,...,...
49995,9999,129,0.397837,1
49996,9999,146,0.182625,2
49997,9999,146,0.979173,3
49998,9999,149,0.382159,4


In [14]:
# We can cluster from any format and clusters stay when reformatting
# cluster() returns the data with an added g column giving the cluster of each row's firm (see output)
bdf = bdf.cluster()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,34,0.085399,1,3
1,0,51,0.272650,2,1
2,0,51,-0.297119,3,1
3,0,44,-1.042264,4,1
4,0,44,0.314497,5,1
...,...,...,...,...,...
49995,9999,129,0.397837,1,6
49996,9999,146,0.182625,2,0
49997,9999,146,0.979173,3,0
49998,9999,149,0.382159,4,0


In [15]:
# Converting to event study format keeps the clusters, split into g1 and g2 (see output)
bdf = bdf.get_es()
display(bdf)

Unnamed: 0,i,j1,j2,y1,y2,t1,t2,g1,g2,m
0,6,12,12,-2.278795,-2.278795,1,1,9,9,0
1,6,12,12,-2.207067,-2.207067,2,2,9,9,0
2,6,12,12,-2.283169,-2.283169,3,3,9,9,0
3,6,12,12,-1.276438,-1.276438,4,4,9,9,0
4,6,12,12,-1.499829,-1.499829,5,5,9,9,0
...,...,...,...,...,...,...,...,...,...,...
40678,9998,66,66,-0.116671,0.600787,4,5,2,2,1
40679,9999,129,146,0.397837,0.182625,1,2,6,0,1
40680,9999,146,146,0.182625,0.979173,2,3,0,0,1
40681,9999,146,149,0.979173,0.382159,3,4,0,0,1


In [16]:
# Go back to long form and collapse by employment spell in one step;
# the clusters are carried along as the g column
bdf = bdf.get_long().get_collapsed_long()
display(bdf)

Unnamed: 0,i,j,y,t1,t2,w,g,m
0,0,34,0.085399,1,1,1,3,1
1,0,51,-0.012235,2,3,2,1,1
2,0,44,-0.363884,4,5,2,1,1
3,1,191,1.329782,1,1,1,4,1
4,1,149,1.446614,2,5,4,0,1
...,...,...,...,...,...,...,...,...
29845,9998,66,0.538122,2,5,4,2,1
29846,9999,129,0.397837,1,1,1,6,1
29847,9999,146,0.580899,2,3,2,0,1
29848,9999,149,0.382159,4,4,1,0,1


In [17]:
# The collapsed event study format also keeps the clusters, as g1 and g2 (see output)
bdf = bdf.get_es()
display(bdf)

Unnamed: 0,i,j1,j2,y1,y2,t11,t12,t21,t22,w1,w2,g1,g2,m
0,6,12,12,-1.909060,-1.909060,1,5,1,5,5.0,5,9,9,0
1,48,84,84,-0.705389,-0.705389,1,5,1,5,5.0,5,2,2,0
2,57,142,142,1.312749,1.312749,1,5,1,5,5.0,5,6,6,0
3,61,0,0,-0.878919,-0.878919,1,5,1,5,5.0,5,5,5,0
4,62,74,74,0.111575,0.111575,1,5,1,5,5.0,5,7,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20528,9997,24,27,-1.149961,-1.371562,1,4,5,5,4.0,1,9,3,1
20529,9998,81,66,0.305160,0.538122,1,1,2,5,1.0,4,2,2,1
20530,9999,129,146,0.397837,0.580899,1,1,2,3,1.0,2,6,0,1
20531,9999,146,149,0.580899,0.382159,2,3,4,4,2.0,1,0,0,1


In [18]:
# Drop the cluster columns: passing the general name 'g' removes both g1 and g2 (see output)
bdf.drop('g')

Unnamed: 0,i,j1,j2,y1,y2,t11,t12,t21,t22,w1,w2,m
0,6,12,12,-1.909060,-1.909060,1,5,1,5,5.0,5,0
1,48,84,84,-0.705389,-0.705389,1,5,1,5,5.0,5,0
2,57,142,142,1.312749,1.312749,1,5,1,5,5.0,5,0
3,61,0,0,-0.878919,-0.878919,1,5,1,5,5.0,5,0
4,62,74,74,0.111575,0.111575,1,5,1,5,5.0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20528,9997,24,27,-1.149961,-1.371562,1,4,5,5,4.0,1,1
20529,9998,81,66,0.305160,0.538122,1,1,2,5,1.0,4,1
20530,9999,129,146,0.397837,0.580899,1,1,2,3,1.0,2,1
20531,9999,146,149,0.580899,0.382159,2,3,4,4,2.0,1,1


## Clustering options

In [19]:
# With non-collapsed data, clustering can be restricted to a specific year
bdf = (
    bpd.BipartiteLong(sim_data)
    .clean_data()
    .cluster({'year': 1})
)
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,34,0.085399,1,0
1,0,51,0.272650,2,9
2,0,51,-0.297119,3,9
3,0,44,-1.042264,4,9
4,0,44,0.314497,5,9
...,...,...,...,...,...
49995,9999,129,0.397837,1,1
49996,9999,146,0.182625,2,6
49997,9999,146,0.979173,3,6
49998,9999,149,0.382159,4,6


In [20]:
# Clustering can be based on only stayers (or only movers), in any format
# Note that not all firms are clustered, so some rows have NaNs
bdf = (
    bpd.BipartiteLong(sim_data)
    .clean_data()
    .cluster({'stayers_movers': 'stayers'})
)
display(bdf)

Unnamed: 0,i,j,y,t,g,m
0,0,34,0.085399,1,1,1
1,0,51,0.272650,2,1,1
2,0,51,-0.297119,3,1,1
3,0,44,-1.042264,4,6,1
4,0,44,0.314497,5,6,1
...,...,...,...,...,...,...
49995,9999,129,0.397837,1,2,1
49996,9999,146,0.182625,2,2,1
49997,9999,146,0.979173,3,2,1
49998,9999,149,0.382159,4,5,1


In [21]:
# Same stayers-only clustering, but dropping firms that do not get clustered
# Set 'dropna': True to do this (note that this fully resets firm and worker ids, since they must be contiguous for the FE/CRE/BLM estimators to work)
bdf = (
    bpd.BipartiteLong(sim_data)
    .clean_data()
    .cluster({'stayers_movers': 'stayers', 'dropna': True})
)
display(bdf)

Unnamed: 0,i,j,y,t,g,m
0,0,34,0.085399,1,4,1
1,0,51,0.272650,2,4,1
2,0,51,-0.297119,3,4,1
3,0,44,-1.042264,4,9,1
4,0,44,0.314497,5,9,1
...,...,...,...,...,...,...
47972,9999,129,0.397837,1,1,1
47973,9999,146,0.182625,2,1,1
47974,9999,146,0.979173,3,1,1
47975,9999,149,0.382159,4,5,1


In [22]:
# Clustering based on only movers works the same way
bdf = (
    bpd.BipartiteLong(sim_data)
    .clean_data()
    .cluster({'stayers_movers': 'movers'})
)
display(bdf)

Unnamed: 0,i,j,y,t,g,m
0,0,34,0.085399,1,1,1
1,0,51,0.272650,2,8,1
2,0,51,-0.297119,3,8,1
3,0,44,-1.042264,4,8,1
4,0,44,0.314497,5,8,1
...,...,...,...,...,...,...
49995,9999,129,0.397837,1,0,1
49996,9999,146,0.182625,2,6,1
49997,9999,146,0.979173,3,6,1
49998,9999,149,0.382159,4,6,1


## Some extra features

In [23]:
# We can get some summary statistics by running .summary()
sim_data = tw.SimTwoWay().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data)
# First summary: before cleaning, the checks in the output (connected, contiguous i/j ids, no nans, ...) are reported as False
bdf.summary()
bdf = bdf.clean_data()
# Second summary: after clean_data(), the same checks are reported as True
bdf.summary()

format: BipartiteLong
number of workers: 10000
number of firms: 195
number of observations: 50000
mean wage: 0.3893386011808661
max wage: 5.6953531568707865
min wage: -5.072150043735375
connected: False
contiguous i ids (None if not included): False
contiguous j ids (None if not included): False
contiguous g ids (None if not included): None
correct column names and types: False
no nans: False
no duplicates: False
i-t (worker-year) observations unique (None if t column(s) not included): False

format: BipartiteLong
number of workers: 10000
number of firms: 195
number of observations: 50000
mean wage: 0.3893386011808661
max wage: 5.6953531568707865
min wage: -5.072150043735375
connected: True
contiguous i ids (None if not included): True
contiguous j ids (None if not included): True
contiguous g ids (None if not included): None
correct column names and types: True
no nans: True
no duplicates: True
i-t (worker-year) observations unique (None if t column(s) not included): True



In [24]:
# If you set include_id_reference_dict=True, BipartitePandas saves dataframes linking original column ids to contiguous column ids
try:
    display(bdf.id_reference_dict['i'])
except Exception:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and SystemExit still propagate
    print('Notice this fails, as include_id_reference_dict has not been set to True')
bdf = bpd.BipartiteLong(sim_data, include_id_reference_dict=True)
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i']) # Notice it's empty as the ids were originally contiguous
# Keep only workers with ids below 200, then clean again so the ids are made contiguous
bdf = bdf[bdf['i'] < 200]
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i'])
# Now drop the workers with ids 0-99 and clean once more
bdf = bdf[bdf['i'] > 99]
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i']) # Now the adjusted ids are approximately the original ids minus 100, as we dropped the workers with ids 0-99 (but because some workers might no longer be in the largest connected set of firms, more than 100 workers can be dropped)
display(bdf.id_reference_dict['j']) # Notice firm ids adjusted from both steps

Notice this fails, as include_id_reference_dict has not been set to True


Unnamed: 0,original_ids,adjusted_ids_1
0,100,0
1,101,1
2,103,2
3,104,3
4,105,4
...,...,...
87,195,87
88,196,88
89,197,89
90,198,90


Unnamed: 0,original_ids,adjusted_ids_1,adjusted_ids_2
0,0,0,0
1,1,1,1
2,2,2,
3,3,3,2
4,4,4,3
...,...,...,...
181,190,181,137
182,191,182,138
183,192,183,139
184,193,184,140


In [25]:
# BipartitePandas does some nice column inference
# If a column is mislabeled (here the worker id column is called 'i1'),
# pass a col_dict that maps the expected name to the actual label
sim_data = (
    tw.SimTwoWay().sim_network()[['i', 'j', 'y', 't']]
    .rename(columns={'i': 'i1'})
)
bdf = bpd.BipartiteLong(sim_data, col_dict={'i': 'i1'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,15,0.793047,1
1,0,99,-2.602358,2
2,0,21,-2.860965,3
3,0,21,-2.464436,4
4,0,21,-3.208799,5
...,...,...,...,...
49995,9999,54,-0.113751,1
49996,9999,19,0.095793,2
49997,9999,19,-0.855997,3
49998,9999,120,-0.239286,4


In [26]:
# If the data already has a cluster column named 'g', the class picks it up automatically
sim_data = tw.SimTwoWay().sim_network()[['i', 'j', 'y', 't']]
sim_data['g'] = 1
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,21,-1.754394,1,0
1,0,13,-0.127063,2,0
2,0,19,-1.565290,3,0
3,0,50,-0.663289,4,0
4,0,50,-1.115333,5,0
...,...,...,...,...,...
49995,9999,95,-0.144884,1,0
49996,9999,155,0.435689,2,0
49997,9999,106,0.213675,3,0
49998,9999,106,1.334241,4,0


In [27]:
# If the cluster column is mislabeled (here 'g1' in long data), the class drops it
sim_data = tw.SimTwoWay().sim_network()[['i', 'j', 'y', 't']]
sim_data['g1'] = 1
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,104,1.252429,1
1,0,104,0.595987,2
2,0,104,0.613011,3
3,0,104,0.844392,4
4,0,104,-0.077763,5
...,...,...,...,...
49995,9999,72,-1.722204,1
49996,9999,121,0.824762,2
49997,9999,121,-2.282874,3
49998,9999,149,0.605612,4


In [28]:
# To keep such a column, map the expected label to the actual one in col_dict
# (note that if the variable of interest requires 2 columns, e.g. weight 1 and weight 2 for event study data, BOTH columns are required - if only one column is included, it is automatically dropped.)
sim_data = tw.SimTwoWay().sim_network()[['i', 'j', 'y', 't']]
sim_data['g1'] = 1
bdf = bpd.BipartiteLong(sim_data, col_dict={'g': 'g1'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,2,-1.237287,1,0
1,0,11,-1.864289,2,0
2,0,49,-0.371723,3,0
3,0,67,-0.429302,4,0
4,0,67,-0.468308,5,0
...,...,...,...,...,...
49995,9999,23,-2.008774,1,0
49996,9999,18,-0.961762,2,0
49997,9999,18,-0.877630,3,0
49998,9999,18,-0.144286,4,0


In [29]:
# A col_dict entry takes precedence over inference for the label it names
sim_data = (
    tw.SimTwoWay().sim_network()[['i', 'j', 'y', 't']]
    .rename(columns={'i': 'g'})
)
# Even though there is a column named g, we declare it to be the 'i' column,
# so it is not inferred as the cluster values
bdf = bpd.BipartiteLong(sim_data, col_dict={'i': 'g'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,73,0.421496,1
1,0,73,0.338802,2
2,0,73,0.626823,3
3,0,88,0.760244,4
4,0,88,1.601247,5
...,...,...,...,...
49995,9999,69,-0.788912,1
49996,9999,2,-3.008116,2
49997,9999,2,0.535798,3
49998,9999,2,-1.091005,4


In [30]:
# Dropping an optional column
# gen_m() adds the m column (bdf is not reassigned, yet the displayed frame includes m)
bdf.gen_m()
display(bdf)
# m is optional, so the frame shown for drop('m') no longer contains it
bdf.drop('m')

Unnamed: 0,i,j,y,t,m
0,0,73,0.421496,1,1
1,0,73,0.338802,2,1
2,0,73,0.626823,3,1
3,0,88,0.760244,4,1
4,0,88,1.601247,5,1
...,...,...,...,...,...
49995,9999,69,-0.788912,1,1
49996,9999,2,-3.008116,2,1
49997,9999,2,0.535798,3,1
49998,9999,2,-1.091005,4,1


Unnamed: 0,i,j,y,t
0,0,73,0.421496,1
1,0,73,0.338802,2
2,0,73,0.626823,3
3,0,88,0.760244,4
4,0,88,1.601247,5
...,...,...,...,...
49995,9999,69,-0.788912,1
49996,9999,2,-3.008116,2
49997,9999,2,0.535798,3
49998,9999,2,-1.091005,4


In [31]:
# Dropping a required column is prevented:
# the frame shown in the output still contains the i column
bdf.drop('i')



Unnamed: 0,i,j,y,t
0,0,73,0.421496,1
1,0,73,0.338802,2
2,0,73,0.626823,3
3,0,88,0.760244,4
4,0,88,1.601247,5
...,...,...,...,...
49995,9999,69,-0.788912,1
49996,9999,2,-3.008116,2
49997,9999,2,0.535798,3
49998,9999,2,-1.091005,4
