# Extra features

In [1]:
# Add BipartitePandas to system path, do not run this
# import sys
# sys.path.append('../../..')

In [2]:
# Import the BipartitePandas package 
# (Make sure you have installed it using pip install bipartitepandas)
import bipartitepandas as bpd

In [3]:
# Simulate bipartite labor data to use as the running example, keeping only
# the worker id (i), firm id (j), wage (y) and period (t) columns
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Wrap the raw frame in a BipartiteLong object and clean it in one step
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,143,0.831914,1
1,0,143,1.874040,2
2,0,143,0.715957,3
3,0,68,-2.700606,4
4,0,81,0.272300,5
...,...,...,...,...
49995,9999,101,0.007044,1
49996,9999,63,0.401529,2
49997,9999,62,1.534063,3
49998,9999,65,-1.001604,4


### Summary statistics

In [4]:
# We can get some summary statistics by running .summary()
# Note: sim_data is regenerated here, so this cell works on a fresh simulated draw
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data)
# First summary: taken before clean_data() (the status flags in the output below read False)
bdf.summary()
bdf = bdf.clean_data()
# Second summary: taken after clean_data() (the same status flags now read True)
bdf.summary()

format: BipartiteLong
number of workers: 10000
number of firms: 194
number of observations: 50000
mean wage: 0.40380924850687144
max wage: 5.85239140272283
min wage: -5.021604177226817
connected: False
contiguous i ids (None if not included): False
contiguous j ids (None if not included): False
contiguous g ids (None if not included): None
correct column names and types: False
no nans: False
no duplicates: False
i-t (worker-year) observations unique (None if t column(s) not included): False

format: BipartiteLong
number of workers: 10000
number of firms: 194
number of observations: 50000
mean wage: 0.40380924850687144
max wage: 5.85239140272283
min wage: -5.021604177226817
connected: True
contiguous i ids (None if not included): True
contiguous j ids (None if not included): True
contiguous g ids (None if not included): None
correct column names and types: True
no nans: True
no duplicates: True
i-t (worker-year) observations unique (None if t column(s) not included): True



In [5]:
# The number of workers, firms, and clusters can be retrieved directly
# (the cluster count is None because the data is not clustered)
print(bdf.n_workers(), bdf.n_firms(), bdf.n_clusters(), sep='\n')

10000
194
None


### Original worker and firm ids

In [6]:
# If you set include_id_reference_dict=True, BipartitePandas saves dataframes linking original column ids to contiguous column ids
try:
    display(bdf.id_reference_dict['i'])
except Exception:
    # Catch Exception rather than using a bare except, so that KeyboardInterrupt and SystemExit are not swallowed
    print('Notice this fails, as include_id_reference_dict has not been set to True')
# Rebuild the object with id tracking enabled (sim_data is not regenerated in this cell; it is reused from the most recently run cell above)
bdf = bpd.BipartiteLong(sim_data, include_id_reference_dict=True)
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i']) # Notice it's empty as the ids were originally contiguous
# Keep only worker ids below 200, then clean again so the remaining ids are made contiguous
bdf = bdf[bdf['i'] < 200]
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i'])
# Drop worker ids 0-99, then clean once more
bdf = bdf[bdf['i'] > 99]
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i']) # Adjusted ids are now approximately the previous ids minus 100, as we dropped worker ids 0-99 (more can be dropped, because some workers might no longer be in the largest connected set of firms)
display(bdf.id_reference_dict['j']) # Notice firm ids adjusted from both steps
# Finally, we can return a Pandas DataFrame with the original columns merged in
display(bdf.original_ids())

Notice this fails, as include_id_reference_dict has not been set to True


Unnamed: 0,original_ids,adjusted_ids_1
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
...,...,...
194,195,194
195,196,195
196,197,196
197,198,197


Unnamed: 0,original_ids,adjusted_ids_1,adjusted_ids_2
0,0,0,
1,1,1,
2,2,2,
3,3,3,
4,4,4,
...,...,...,...
194,195,194,88
195,196,195,89
196,197,196,90
197,198,197,91


Unnamed: 0,original_ids,adjusted_ids_1,adjusted_ids_2
0,0,0,
1,1,1,0
2,2,2,1
3,3,3,2
4,4,4,
...,...,...,...
182,189,182,135
183,190,183,136
184,191,184,137
185,192,185,138


Unnamed: 0,i,j,y,t,original_i,original_j
0,0,9,-0.437325,1,100,13
1,0,9,-1.016313,2,100,13
2,0,9,0.893949,3,100,13
3,0,7,-0.570367,4,100,11
4,0,15,-2.143699,5,100,21
...,...,...,...,...,...,...
460,92,69,0.511365,1,199,92
461,92,92,0.813950,2,199,126
462,92,92,0.300599,3,199,126
463,92,92,0.990898,4,199,126


### Filling in missing years as unemployed

In [7]:
# For long data, BipartitePandas can fill in missing years as unemployed (j=-1)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data)
# Keep only periods 1 and 5 (i.e. drop periods 2-4) so there are gaps to fill
bdf = bdf[bdf['t'].isin([1, 5])]
bdf = bdf.clean_data()
# Can specify fill_j=x instead of -1 and fill_y=y instead of NaN
display(bdf.fill_periods())

Unnamed: 0,i,j,y,t
0,0,161,1.730230,1
1,0,-1,,2
2,0,-1,,3
3,0,-1,,4
4,0,190,1.514419,5
...,...,...,...,...
49995,9999,131,1.341158,1
49996,9999,-1,,2
49997,9999,-1,,3
49998,9999,-1,,4


### Getting extended event studies

In [8]:
# Clustered long data can be used to generate event studies with more than 2 periods
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Clean, then cluster (the default included columns of the event study are g and y)
bdf = bpd.BipartiteLong(sim_data).clean_data().cluster()
es_extended = bdf.get_es_extended(
    periods_pre=3,
    periods_post=2,  # periods_post includes the first period after the transition
)
display(es_extended)

Unnamed: 0,i,t,g_l3,g_l2,g_l1,g_f1,g_f2,y_l3,y_l2,y_l1,y_f1,y_f2
0,0,4,5,5,2,5,1,-1.319589,-0.174907,-0.268492,-0.222823,-0.967423
1,1,4,7,5,5,1,1,3.112369,0.707684,-1.179753,0.115367,1.798845
2,4,4,5,7,2,7,5,-0.343093,3.086335,1.551410,-1.890970,1.593227
3,5,4,6,6,6,3,3,-0.891242,0.135581,-1.018234,1.376120,0.533601
4,7,4,4,4,4,4,1,2.811751,2.295401,1.396154,2.319758,0.501592
...,...,...,...,...,...,...,...,...,...,...,...,...
4957,9985,4,5,5,7,2,7,0.090348,0.681990,1.323747,0.626204,0.199305
4958,9988,4,7,2,2,4,5,-0.738250,2.015954,1.348512,2.239793,0.995149
4959,9991,4,3,3,3,3,3,1.237640,1.103824,1.007448,0.543575,1.256424
4960,9992,4,4,1,1,5,5,0.858764,0.232722,-1.258985,1.512707,-0.000295


In [9]:
# The columns to include can be chosen explicitly (by default they are g and y)
es_extended = bdf.get_es_extended(
    periods_pre=3,
    periods_post=2,
    include=['j', 'y'],
)
display(es_extended)

Unnamed: 0,i,t,j_l3,j_l2,j_l1,j_f1,j_f2,y_l3,y_l2,y_l1,y_f1,y_f2
0,0,4,73,73,160,100,116,-1.319589,-0.174907,-0.268492,-0.222823,-0.967423
1,1,4,138,70,70,107,107,3.112369,0.707684,-1.179753,0.115367,1.798845
2,4,4,67,129,147,130,101,-0.343093,3.086335,1.551410,-1.890970,1.593227
3,5,4,26,26,26,48,48,-0.891242,0.135581,-1.018234,1.376120,0.533601
4,7,4,171,171,171,184,104,2.811751,2.295401,1.396154,2.319758,0.501592
...,...,...,...,...,...,...,...,...,...,...,...,...
4957,9985,4,82,82,130,153,137,0.090348,0.681990,1.323747,0.626204,0.199305
4958,9988,4,142,153,153,180,68,-0.738250,2.015954,1.348512,2.239793,0.995149
4959,9991,4,61,61,53,41,41,1.237640,1.103824,1.007448,0.543575,1.256424
4960,9992,4,185,117,117,75,75,0.858764,0.232722,-1.258985,1.512707,-0.000295


In [10]:
# stable_pre / stable_post take column(s) that must stay constant before / after
# the transition; only workers satisfying this are kept
es_extended = bdf.get_es_extended(
    periods_pre=3,
    periods_post=2,
    stable_pre='j',
    stable_post='j',
    include=['j', 'y'],
)
display(es_extended)

Unnamed: 0,i,t,j_l3,j_l2,j_l1,j_f1,j_f2,y_l3,y_l2,y_l1,y_f1,y_f2
0,5,4,26,26,26,48,48,-0.891242,0.135581,-1.018234,1.376120,0.533601
1,11,4,90,90,90,55,55,-1.188273,-1.556692,-0.243082,-1.114554,-0.825119
2,20,4,7,7,7,19,19,-0.760350,-0.307045,-1.849397,-0.480912,-0.711828
3,21,4,165,165,165,136,136,1.077572,1.676825,1.654868,1.285087,-0.124872
4,22,4,160,160,160,129,129,1.192284,2.082066,1.773477,-0.602750,2.492439
...,...,...,...,...,...,...,...,...,...,...,...,...
672,9931,4,40,40,40,19,19,0.868774,-0.654855,-1.015332,-0.619288,-0.956393
673,9947,4,109,109,109,110,110,2.927761,-0.251957,-0.253679,3.107526,2.628622
674,9952,4,53,53,53,156,156,0.278879,-1.857849,-0.501891,0.513648,1.731994
675,9975,4,117,117,117,48,48,-0.313294,1.096537,1.609323,-1.585562,0.525544


In [11]:
# The column(s) given for stable_pre or stable_post do not have to be among the
# included columns (here stability is required on g, while only j and y are returned)
es_extended = bdf.get_es_extended(
    periods_pre=3,
    periods_post=2,
    stable_pre='g',
    stable_post='g',
    include=['j', 'y'],
)
display(es_extended)

Unnamed: 0,i,t,j_l3,j_l2,j_l1,j_f1,j_f2,y_l3,y_l2,y_l1,y_f1,y_f2
0,5,4,26,26,26,48,48,-0.891242,0.135581,-1.018234,1.376120,0.533601
1,10,4,114,114,113,123,123,1.885514,1.330839,0.799175,0.384274,0.397947
2,11,4,90,90,90,55,55,-1.188273,-1.556692,-0.243082,-1.114554,-0.825119
3,12,4,106,106,106,117,112,1.802951,1.348905,2.319669,1.358730,1.521606
4,20,4,7,7,7,19,19,-0.760350,-0.307045,-1.849397,-0.480912,-0.711828
...,...,...,...,...,...,...,...,...,...,...,...,...
1151,9956,4,61,49,49,106,119,-2.440256,-0.525262,0.314338,0.011021,0.789005
1152,9975,4,117,117,117,48,48,-0.313294,1.096537,1.609323,-1.585562,0.525544
1153,9980,4,34,34,34,5,9,-0.116958,0.210124,0.745134,-2.156556,-1.264819
1154,9982,4,179,179,179,173,173,3.382625,1.140685,6.041010,2.953020,2.001907


In [12]:
# The column used to define a transition can also be redefined (here: g instead of the default)
es_extended = bdf.get_es_extended(
    periods_pre=3,
    periods_post=2,
    stable_pre='g',
    stable_post='j',
    include=['j', 'g', 'y'],
    transition_col='g',
)
display(es_extended)
# Show the rows where the firm differs between the two earliest pre-transition periods
display(es_extended.loc[es_extended['j_l3'] != es_extended['j_l2']])

Unnamed: 0,i,t,j_l3,j_l2,j_l1,j_f1,j_f2,g_l3,g_l2,g_l1,g_f1,g_f2,y_l3,y_l2,y_l1,y_f1,y_f2
0,5,4,26,26,26,48,48,6,6,6,3,3,-0.891242,0.135581,-1.018234,1.376120,0.533601
1,11,4,90,90,90,55,55,5,5,5,3,3,-1.188273,-1.556692,-0.243082,-1.114554,-0.825119
2,20,4,7,7,7,19,19,0,0,0,6,6,-0.760350,-0.307045,-1.849397,-0.480912,-0.711828
3,21,4,165,165,165,136,136,2,2,2,7,7,1.077572,1.676825,1.654868,1.285087,-0.124872
4,22,4,160,160,160,129,129,2,2,2,7,7,1.192284,2.082066,1.773477,-0.602750,2.492439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
773,9881,4,167,167,167,30,30,2,2,2,3,3,0.972363,1.811118,1.746846,-0.601241,-1.296357
774,9886,4,34,34,34,39,39,3,3,3,6,6,-0.855795,-0.476592,0.311436,-0.349774,1.146934
775,9925,4,137,137,137,78,78,7,7,7,5,5,-0.006472,0.319333,-0.369671,-0.123170,-0.960522
776,9952,4,53,53,53,156,156,3,3,3,2,2,0.278879,-1.857849,-0.501891,0.513648,1.731994


Unnamed: 0,i,t,j_l3,j_l2,j_l1,j_f1,j_f2,g_l3,g_l2,g_l1,g_f1,g_f2,y_l3,y_l2,y_l1,y_f1,y_f2
7,99,4,11,3,3,13,13,0,0,0,6,6,-1.779357,-1.499258,-0.747031,-0.334472,0.198113
8,121,4,109,106,114,129,129,1,1,1,7,7,2.085544,2.005758,2.421565,1.894027,3.343613
9,124,4,8,7,7,43,43,0,0,0,3,3,-0.092789,-2.621944,-3.537007,-0.191808,0.706635
25,302,4,81,77,77,33,33,5,5,5,3,3,-0.631665,-1.384186,-1.919992,-1.944860,-1.027920
33,465,4,77,101,78,115,115,5,5,5,1,1,-1.420949,-0.795264,-0.391693,1.593954,0.157045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,9533,4,180,189,189,89,89,4,4,4,5,5,1.817817,1.934500,1.931564,-0.711794,0.769824
754,9616,4,91,99,99,146,146,5,5,5,7,7,0.486870,0.994644,-1.378687,0.467880,2.143138
759,9700,4,63,64,64,110,110,5,5,5,1,1,-0.751869,-0.241331,0.241297,-1.101039,-0.692780
763,9760,4,60,61,61,40,40,3,3,3,6,6,-0.611731,-0.246435,-0.204811,-0.048489,0.700177


### Uncollapsing data (converting from collapsed long to long)

In [13]:
# Collapsed long data can be un-collapsed (variables are assumed constant over spells)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data).clean_data()
# Long -> collapsed long
bdf = bdf.get_collapsed_long()
display(bdf)
# Collapsed long -> long
bdf = bdf.uncollapse()
display(bdf)

Unnamed: 0,i,j,y,t1,t2,w,m
0,0,158,1.399034,1,1,1,1
1,0,70,0.909846,2,3,2,1
2,0,152,1.871571,4,5,2,1
3,1,63,-1.396622,1,1,1,1
4,1,126,1.154723,2,5,4,1
...,...,...,...,...,...,...,...
29805,9998,32,0.270743,4,4,1,1
29806,9998,30,-0.948227,5,5,1,1
29807,9999,86,0.636042,1,1,1,1
29808,9999,87,0.166891,2,3,2,1


Unnamed: 0,i,j,y,t,w,m
0,0,158,1.399034,1,1.0,1
1,0,70,0.909846,2,2.0,1
2,0,70,0.909846,3,2.0,1
3,0,152,1.871571,4,2.0,1
4,0,152,1.871571,5,2.0,1
...,...,...,...,...,...,...
49995,9999,86,0.636042,1,1.0,1
49996,9999,87,0.166891,2,2.0,1
49997,9999,87,0.166891,3,2.0,1
49998,9999,74,0.549215,4,2.0,1


### Column inference

In [14]:
# BipartitePandas does some nice column inference
# If some columns are mislabeled, pass a col_dict to state which column holds which variable
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Deliberately mislabel the worker id column
sim_data = sim_data.rename(columns={'i': 'i1'})
bdf = bpd.BipartiteLong(sim_data, col_dict={'i': 'i1'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,84,0.045776,1
1,0,84,0.547004,2
2,0,84,-0.991601,3
3,0,84,2.286988,4
4,0,84,-0.208582,5
...,...,...,...,...
49995,9999,130,-0.862824,1
49996,9999,130,-0.831983,2
49997,9999,130,-0.518348,3
49998,9999,89,1.248998,4


In [15]:
# Suppose we have already clustered - the class determines this automatically
# The g column is added with .assign() rather than by item assignment on a
# column subset, which can trigger pandas' SettingWithCopyWarning
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].assign(g=1)
bdf = bpd.BipartiteLong(sim_data)
bdf = bdf.clean_data()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,106,-0.381943,1,0
1,0,106,0.176967,2,0
2,0,106,0.870707,3,0
3,0,106,0.478489,4,0
4,0,106,-0.574835,5,0
...,...,...,...,...,...
49995,9999,162,1.329164,1,0
49996,9999,82,0.681668,2,0
49997,9999,72,0.433670,3,0
49998,9999,72,0.269498,4,0


In [16]:
# Now suppose we mislabeled our clusters - the class drops this column
# The g1 column is added with .assign() rather than by item assignment on a
# column subset, which can trigger pandas' SettingWithCopyWarning
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].assign(g1=1)
bdf = bpd.BipartiteLong(sim_data)
bdf = bdf.clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,22,-2.601893,1
1,0,22,-0.127736,2
2,0,29,-2.141948,3
3,0,29,-1.054840,4
4,0,29,-1.372042,5
...,...,...,...,...
49995,9999,122,1.195890,1
49996,9999,67,-1.252743,2
49997,9999,25,-0.172797,3
49998,9999,44,0.739482,4


In [17]:
# We can make sure our columns are included by specifying the corrected labels in col_dict
# (note that if the variable of interest requires 2 columns, e.g. weight 1 and weight 2 for
# event study data, BOTH columns are required - if only one column is included, it is
# automatically dropped.)
# The g1 column is added with .assign() rather than by item assignment on a
# column subset, which can trigger pandas' SettingWithCopyWarning
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].assign(g1=1)
bdf = bpd.BipartiteLong(sim_data, col_dict={'g': 'g1'})
bdf = bdf.clean_data()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,168,0.814595,1,0
1,0,168,0.662443,2,0
2,0,168,0.347532,3,0
3,0,168,2.484297,4,0
4,0,137,2.897988,5,0
...,...,...,...,...,...
49995,9999,4,-1.536425,1,0
49996,9999,154,1.455651,2,0
49997,9999,42,-0.152831,3,0
49998,9999,42,-1.943490,4,0


In [18]:
# A col_dict entry can also claim a column label that would normally be inferred;
# inference is then skipped for that label
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Store the worker ids under the name 'g'
sim_data = sim_data.rename(columns={'i': 'g'})
# Even though there is a column named g, it is declared to be the 'i' column,
# so it is not inferred as the cluster values
bdf = bpd.BipartiteLong(sim_data, col_dict={'i': 'g'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,40,0.368689,1
1,0,40,0.387865,2
2,0,77,0.429439,3
3,0,77,-0.211502,4
4,0,77,1.001490,5
...,...,...,...,...
49995,9999,105,0.061235,1
49996,9999,133,-0.870774,2
49997,9999,133,-0.365559,3
49998,9999,133,0.743916,4


In [19]:
# Dropping an optional column
# gen_m() is called without reassigning bdf; in the recorded output the 'm' column then appears in bdf
bdf.gen_m()
display(bdf)
# The result of drop('m') is the cell's last expression, so it is what gets rendered (without 'm')
# NOTE(review): this relies on gen_m()/drop() modifying bdf in place by default - confirm for the installed BipartitePandas version
bdf.drop('m')

Unnamed: 0,i,j,y,t,m
0,0,40,0.368689,1,1
1,0,40,0.387865,2,1
2,0,77,0.429439,3,1
3,0,77,-0.211502,4,1
4,0,77,1.001490,5,1
...,...,...,...,...,...
49995,9999,105,0.061235,1,1
49996,9999,133,-0.870774,2,1
49997,9999,133,-0.365559,3,1
49998,9999,133,0.743916,4,1


Unnamed: 0,i,j,y,t
0,0,40,0.368689,1
1,0,40,0.387865,2
2,0,77,0.429439,3
3,0,77,-0.211502,4
4,0,77,1.001490,5
...,...,...,...,...
49995,9999,105,0.061235,1
49996,9999,133,-0.870774,2
49997,9999,133,-0.365559,3
49998,9999,133,0.743916,4


In [20]:
# Dropping a required column is prevented
# ('i' is still present in the frame rendered as this cell's output)
bdf.drop('i')



Unnamed: 0,i,j,y,t
0,0,40,0.368689,1
1,0,40,0.387865,2
2,0,77,0.429439,3
3,0,77,-0.211502,4
4,0,77,1.001490,5
...,...,...,...,...
49995,9999,105,0.061235,1
49996,9999,133,-0.870774,2
49997,9999,133,-0.365559,3
49998,9999,133,0.743916,4
