# Extra features

In [1]:
# Add BipartitePandas to system path, do not run this
# import sys
# sys.path.append('../../..')

In [2]:
# Import the BipartitePandas package 
# (Make sure you have installed it using pip install bipartitepandas)
import bipartitepandas as bpd

In [3]:
# Simulate bipartite labor data to serve as the example dataset
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Wrap the simulated frame as long-format bipartite data and clean it in one chained step
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,95,-0.475259,1
1,0,157,2.036083,2
2,0,157,0.662234,3
3,0,157,4.144825,4
4,0,177,1.571732,5
...,...,...,...,...
49995,9999,173,3.390882,1
49996,9999,173,2.714062,2
49997,9999,173,2.291981,3
49998,9999,173,2.192866,4


### Summary statistics

In [4]:
# We can get some summary statistics by running .summary()
# It is called twice here - once before and once after cleaning - so the two reports can be compared
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data)
# Report for the data before .clean_data() has been run
bdf.summary()
bdf = bdf.clean_data()
# Report for the cleaned data
bdf.summary()

format: BipartiteLong
number of workers: 10000
number of firms: 196
number of observations: 50000
mean wage: 0.4106874263360842
max wage: 5.59122351830557
min wage: -6.6439909931722205
connected: False
contiguous i ids (None if not included): False
contiguous j ids (None if not included): False
contiguous g ids (None if not included): None
correct column names and types: False
no nans: False
no duplicates: False
i-t (worker-year) observations unique (None if t column(s) not included): False

format: BipartiteLong
number of workers: 10000
number of firms: 196
number of observations: 50000
mean wage: 0.4106874263360842
max wage: 5.59122351830557
min wage: -6.6439909931722205
connected: True
contiguous i ids (None if not included): True
contiguous j ids (None if not included): True
contiguous g ids (None if not included): None
correct column names and types: True
no nans: True
no duplicates: True
i-t (worker-year) observations unique (None if t column(s) not included): True



In [5]:
# The number of workers, firms, and clusters are each available through a dedicated method
# (the cluster count is None because the data is not clustered)
for count in (bdf.n_workers, bdf.n_firms, bdf.n_clusters):
    print(count())

10000
196
None


### Original worker and firm ids

In [6]:
# If you set include_id_reference_dict=True, BipartitePandas saves dataframes linking original column ids to contiguous column ids
try:
    display(bdf.id_reference_dict['i'])
except (AttributeError, KeyError, TypeError):
    # Only a failed lookup is expected here; a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated errors
    print('Notice this fails, as include_id_reference_dict has not been set to True')
bdf = bpd.BipartiteLong(sim_data, include_id_reference_dict=True)
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i']) # Notice it's empty as the ids were originally contiguous
bdf = bdf[bdf['i'] < 200]
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i'])
bdf = bdf[bdf['i'] > 99]
bdf = bdf.clean_data()
# Now the adjusted ids are roughly the original ids minus 100, as we dropped the first 100 workers
# (more than 100 can be dropped, because some workers might no longer be in the largest connected set of firms)
display(bdf.id_reference_dict['i'])
display(bdf.id_reference_dict['j']) # Notice firm ids adjusted from both steps
# Finally, we can return a Pandas DataFrame with the original columns merged in
display(bdf.original_ids())

Notice this fails, as include_id_reference_dict has not been set to True


Unnamed: 0,original_ids,adjusted_ids_1
0,100,0
1,101,1
2,102,2
3,103,3
4,104,4
...,...,...
92,195,92
93,196,93
94,197,94
95,198,95


Unnamed: 0,original_ids,adjusted_ids_1,adjusted_ids_2
0,0,0,0
1,1,1,1
2,2,2,2
3,3,3,
4,4,4,3
...,...,...,...
186,191,186,143
187,192,187,144
188,193,188,
189,194,189,145


Unnamed: 0,i,j,y,t,original_i,original_j
0,0,77,-0.367219,1,100,97
1,0,77,0.129278,2,100,97
2,0,77,0.487851,3,100,97
3,0,77,0.587456,4,100,97
4,0,77,-0.600196,5,100,97
...,...,...,...,...,...,...
480,96,10,-1.857470,1,199,11
481,96,20,-2.253385,2,199,22
482,96,20,-1.531509,3,199,22
483,96,20,-1.925547,4,199,22


### Filling in missing years as unemployed

In [7]:
# For long data, missing years can be filled in as unemployment (j=-1)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data)
# Keep only periods 1 and 5, i.e. drop periods 2-4
endpoint_periods = (bdf['t'] == 1) | (bdf['t'] == 5)
bdf = bdf[endpoint_periods].clean_data()
# fill_j=x can be given instead of -1, and fill_y=y instead of NaN
display(bdf.fill_periods())

Unnamed: 0,i,j,y,t
0,0,75,0.190688,1
1,0,-1,,2
2,0,-1,,3
3,0,-1,,4
4,0,108,-0.327032,5
...,...,...,...,...
49995,9999,2,-1.422114,1
49996,9999,-1,,2
49997,9999,-1,,3
49998,9999,-1,,4


### Getting extended event studies

In [8]:
# Clustered long data can be used to generate event studies with more than 2 periods
# (note the result only contains the i, t, g, and y columns)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data).clean_data().cluster()
# periods_post includes the first period after the transition
es_extended = bdf.get_es_extended(
    periods_pre=3,
    periods_post=2,
    stable_pre=False,
    stable_post=False,
)
display(es_extended)

Unnamed: 0,i,t,g_l3,g_l2,g_l1,g_f1,g_f2,y_l3,y_l2,y_l1,y_f1,y_f2
0,3,4,3,3,3,3,3,2.389689,2.860047,2.610123,2.824718,3.188280
1,5,4,6,6,3,3,0,1.687508,2.222085,2.273496,1.039830,0.579249
2,6,4,2,0,0,7,7,-1.073846,-1.785203,-2.658163,-2.498565,-2.896267
3,7,4,0,0,0,7,7,-1.451531,0.016457,0.536526,-2.219636,1.128918
4,8,4,3,3,3,1,1,3.145236,0.295461,-0.540854,1.660430,1.257174
...,...,...,...,...,...,...,...,...,...,...,...,...
4978,9994,4,0,3,1,0,0,-0.833096,1.738988,1.685855,0.259959,-0.000176
4979,9995,4,0,0,0,0,1,-0.570217,1.573532,-0.401494,-2.171693,0.837444
4980,9997,4,7,0,3,0,0,0.560460,-1.741946,1.618080,0.828687,-0.898448
4981,9998,4,1,7,1,6,5,1.810997,1.469065,1.421835,1.256657,-1.404912


In [9]:
# Setting stable_pre or stable_post keeps only workers who stayed at the same firm before/after the transition
es_extended = bdf.get_es_extended(
    periods_pre=3,
    periods_post=2,
    stable_pre=False,
    stable_post=True,
)
display(es_extended)

Unnamed: 0,i,t,g_l3,g_l2,g_l1,g_f1,g_f2,y_l3,y_l2,y_l1,y_f1,y_f2
0,3,4,3,3,3,3,3,2.389689,2.860047,2.610123,2.824718,3.188280
1,7,4,0,0,0,7,7,-1.451531,0.016457,0.536526,-2.219636,1.128918
2,14,4,7,7,7,2,2,0.251200,0.950570,-0.661024,-3.091429,-2.239547
3,15,4,0,1,1,3,3,2.183230,2.348490,1.786574,2.426871,1.873137
4,17,4,2,2,2,4,4,-1.088123,-0.264196,-1.710529,-1.997116,0.298100
...,...,...,...,...,...,...,...,...,...,...,...,...
2537,9990,4,3,6,6,6,6,5.357987,1.121260,1.358064,2.647050,0.313891
2538,9992,4,1,7,7,4,4,1.518030,-0.986159,-0.276542,-0.177890,0.395933
2539,9994,4,0,3,1,0,0,-0.833096,1.738988,1.685855,0.259959,-0.000176
2540,9997,4,7,0,3,0,0,0.560460,-1.741946,1.618080,0.828687,-0.898448


### Uncollapsing data (converting from collapsed long to long)

In [10]:
# Collapsed long data can be un-collapsed back to long format (variables are assumed constant over spells)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Build, clean, and collapse in a single chain
bdf = bpd.BipartiteLong(sim_data).clean_data().get_collapsed_long()
display(bdf)
# Expand the collapsed spells back into long format
bdf = bdf.uncollapse()
display(bdf)

Unnamed: 0,i,j,y,t1,t2,w,m
0,0,78,0.009346,1,1,1,1
1,0,44,0.381649,2,2,1,1
2,0,46,0.017770,3,3,1,1
3,0,37,-0.738595,4,4,1,1
4,0,77,-0.428766,5,5,1,1
...,...,...,...,...,...,...,...
29927,9998,38,-2.019297,3,4,2,1
29928,9998,93,0.532911,5,5,1,1
29929,9999,54,1.078404,1,1,1,1
29930,9999,62,-1.230441,2,3,2,1


Unnamed: 0,i,j,y,t,w,m
0,0,78,0.009346,1,1.0,1
1,0,44,0.381649,2,1.0,1
2,0,46,0.017770,3,1.0,1
3,0,37,-0.738595,4,1.0,1
4,0,77,-0.428766,5,1.0,1
...,...,...,...,...,...,...
49995,9999,54,1.078404,1,1.0,1
49996,9999,62,-1.230441,2,2.0,1
49997,9999,62,-1.230441,3,2.0,1
49998,9999,3,-2.744810,4,2.0,1


### Column inference

In [11]:
# BipartitePandas performs column inference
# When some columns are mislabeled, a col_dict clarifies which columns to correct
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].rename(columns={'i': 'i1'})
bdf = bpd.BipartiteLong(sim_data, col_dict={'i': 'i1'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,126,0.273470,1
1,0,17,-2.168754,2
2,0,17,-0.114282,3
3,0,63,-0.826436,4
4,0,7,0.188220,5
...,...,...,...,...
49995,9999,174,0.859657,1
49996,9999,174,1.621943,2
49997,9999,128,0.793700,3
49998,9999,128,-0.856568,4


In [12]:
# If the data is already clustered (a 'g' column is present), the class determines this automatically
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].assign(g=1)
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,32,-0.735045,1,0
1,0,32,-0.651975,2,0
2,0,32,-2.005339,3,0
3,0,32,-1.170584,4,0
4,0,16,-1.312386,5,0
...,...,...,...,...,...
49995,9999,53,-0.248344,1,0
49996,9999,53,-1.131706,2,0
49997,9999,93,-0.244001,3,0
49998,9999,93,-1.136047,4,0


In [13]:
# If the cluster column is mislabeled (here as 'g1'), the class drops it
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].assign(g1=1)
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,111,-1.021839,1
1,0,111,1.083939,2
2,0,111,-0.456949,3
3,0,129,1.168908,4
4,0,129,-0.659914,5
...,...,...,...,...
49995,9999,18,0.069395,1
49996,9999,18,-1.895396,2
49997,9999,42,-0.966151,3
49998,9999,3,-1.377394,4


In [14]:
# Specifying the corrected labels in col_dict makes sure our columns are included
# (note that if the variable of interest requires 2 columns, e.g. weight 1 and weight 2 for event study data,
# BOTH columns are required - if only one column is included, it is automatically dropped.)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].assign(g1=1)
bdf = bpd.BipartiteLong(sim_data, col_dict={'g': 'g1'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,42,-0.552423,1,0
1,0,42,0.943231,2,0
2,0,42,-1.555695,3,0
3,0,42,-0.619463,4,0
4,0,88,-1.768071,5,0
...,...,...,...,...,...
49995,9999,102,0.300628,1,0
49996,9999,102,-1.552802,2,0
49997,9999,107,-1.702500,3,0
49998,9999,49,-0.932932,4,0


In [15]:
# A col_dict may also overwrite a column label that would normally be inferred, in which case inference is skipped
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].rename(columns={'i': 'g'})
# Even though there is a column named g, it is specified as the 'i' column, so it is not inferred as the cluster values
bdf = bpd.BipartiteLong(sim_data, col_dict={'i': 'g'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,15,-1.028431,1
1,0,15,-2.520617,2
2,0,15,-0.845811,3
3,0,1,-2.703370,4
4,0,1,-2.016674,5
...,...,...,...,...
49995,9999,110,0.295253,1
49996,9999,87,0.533566,2
49997,9999,17,-1.485227,3
49998,9999,58,0.940265,4


In [16]:
# Dropping an optional column
# After gen_m(), the displayed frame includes an 'm' column (first output of this cell)
bdf.gen_m()
display(bdf)
# The cell's final expression renders the frame without 'm' (second output of this cell)
bdf.drop('m')

Unnamed: 0,i,j,y,t,m
0,0,15,-1.028431,1,1
1,0,15,-2.520617,2,1
2,0,15,-0.845811,3,1
3,0,1,-2.703370,4,1
4,0,1,-2.016674,5,1
...,...,...,...,...,...
49995,9999,110,0.295253,1,1
49996,9999,87,0.533566,2,1
49997,9999,17,-1.485227,3,1
49998,9999,58,0.940265,4,1


Unnamed: 0,i,j,y,t
0,0,15,-1.028431,1
1,0,15,-2.520617,2
2,0,15,-0.845811,3
3,0,1,-2.703370,4
4,0,1,-2.016674,5
...,...,...,...,...
49995,9999,110,0.295253,1
49996,9999,87,0.533566,2
49997,9999,17,-1.485227,3
49998,9999,58,0.940265,4


In [17]:
# Dropping a required column is prevented
# The frame rendered as this cell's output still contains the 'i' column
bdf.drop('i')



Unnamed: 0,i,j,y,t
0,0,15,-1.028431,1
1,0,15,-2.520617,2
2,0,15,-0.845811,3
3,0,1,-2.703370,4
4,0,1,-2.016674,5
...,...,...,...,...
49995,9999,110,0.295253,1
49996,9999,87,0.533566,2
49997,9999,17,-1.485227,3
49998,9999,58,0.940265,4
