# Extra features

In [1]:
# Development only: add a local BipartitePandas checkout to the system path.
# Do not run this cell; the lines are left commented out on purpose.
# import sys
# sys.path.append('../../..')

In [2]:
# Import the BipartitePandas package
# (Make sure you have installed it using pip install bipartitepandas)
import bipartitepandas as bpd
import pandas as pd

In [3]:
# Example data: a simulated bipartite labor network, restricted to the
# i, j, y and t columns, then wrapped as long-format data and cleaned
example_columns = ['i', 'j', 'y', 't']
sim_data = bpd.SimBipartite().sim_network()[example_columns]
bdf = bpd.BipartiteLong(sim_data).clean()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,165,1.879419,1
1,0,189,2.874304,2
2,0,132,0.325859,3
3,0,179,0.665738,4
4,0,179,2.368762,5
...,...,...,...,...
49995,9999,105,0.363534,1
49996,9999,48,1.699112,2
49997,9999,48,0.821230,3
49998,9999,92,-0.054306,4


### Summary statistics

In [4]:
# .summary() prints summary statistics for the data
simulator = bpd.SimBipartite()
sim_data = simulator.sim_network()[['i', 'j', 'y', 't']]

# Summary of the data as constructed, before any cleaning
bdf = bpd.BipartiteLong(sim_data)
bdf.summary()

# Summary of the same data once it has been cleaned
bdf = bdf.clean()
bdf.summary()

format: BipartiteLong
number of workers: 10000
number of firms: 197
number of observations: 50000
mean wage: 0.421232629815004
max wage: 6.050973857770082
min wage: -4.718041854179123
connected (None if not ignoring connected set): False
contiguous i ids (None if not included): False
contiguous j ids (None if not included): False
contiguous g ids (None if not included): None
correct column names and types: False
no nans: False
no duplicates: False
i-t (worker-year) observations unique (None if t column(s) not included): False

format: BipartiteLong
number of workers: 10000
number of firms: 197
number of observations: 50000
mean wage: 0.421232629815004
max wage: 6.050973857770082
min wage: -4.718041854179123
connected (None if not ignoring connected set): connected
contiguous i ids (None if not included): True
contiguous j ids (None if not included): True
contiguous g ids (None if not included): None
correct column names and types: True
no nans: True
no duplicates: True
i-t (worker-year) observations unique (None if t column(s) not included): True

In [5]:
# The number of workers, firms, and clusters can be retrieved directly
# (the cluster count is None because the data is not clustered)
for count_method in (bdf.n_workers, bdf.n_firms, bdf.n_clusters):
    print(count_method())

10000
197
None


### Connected set options

In [6]:
# Keep workers with id below 200, then compute the largest connected set of firms
bdf = bpd.BipartiteLong(sim_data)
low_id_workers = bdf['i'] < 200
bdf = bdf[low_id_workers].clean({'connectedness': 'connected'})
display(bdf)

Unnamed: 0,i,j,y,t
0,0,112,0.648245,1
1,0,112,1.082891,2
2,0,144,1.545957,3
3,0,114,-0.317596,4
4,0,126,0.651996,5
...,...,...,...,...
995,199,163,1.343188,1
996,199,184,1.745711,2
997,199,184,2.235194,3
998,199,184,3.568609,4


In [7]:
# Keep workers with id below 200, then compute the largest biconnected set of firms
bdf = bpd.BipartiteLong(sim_data)
low_id_workers = bdf['i'] < 200
bdf = bdf[low_id_workers].clean({'connectedness': 'biconnected'})
display(bdf)

Unnamed: 0,i,j,y,t
0,0,101,0.648245,1
1,0,101,1.082891,2
2,0,130,1.545957,3
3,0,103,-0.317596,4
4,0,114,0.651996,5
...,...,...,...,...
968,198,148,1.343188,1
969,198,167,1.745711,2
970,198,167,2.235194,3
971,198,167,3.568609,4


In [8]:
# Keep workers with id below 200, and skip the connectedness step when cleaning
bdf = bpd.BipartiteLong(sim_data)
low_id_workers = bdf['i'] < 200
bdf = bdf[low_id_workers].clean({'connectedness': None})
display(bdf)

Unnamed: 0,i,j,y,t
0,0,112,0.648245,1
1,0,112,1.082891,2
2,0,144,1.545957,3
3,0,114,-0.317596,4
4,0,126,0.651996,5
...,...,...,...,...
995,199,163,1.343188,1
996,199,184,1.745711,2
997,199,184,2.235194,3
998,199,184,3.568609,4


### Original worker and firm ids

In [9]:
# If you set include_id_reference_dict=True, BipartitePandas saves dataframes linking original column ids to contiguous column ids
try:
    display(bdf.id_reference_dict['i'])
except Exception:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and SystemExit still propagate
    print('Notice this fails, as include_id_reference_dict has not been set to True')
bdf = bpd.BipartiteLong(sim_data, include_id_reference_dict=True)
bdf = bdf.clean()
display(bdf.id_reference_dict['i']) # Notice it's empty as the ids were originally contiguous
bdf = bdf[bdf['i'] < 200]
bdf = bdf.clean()
display(bdf.id_reference_dict['i'])
bdf = bdf[bdf['i'] > 99]
bdf = bdf.clean()
display(bdf.id_reference_dict['i']) # Now the adjusted ids are approximately the original ids minus 100, as we dropped the first 100 workers (but because some workers might no longer be in the largest connected set of firms, more than 100 workers can be dropped)
display(bdf.id_reference_dict['j']) # Notice firm ids adjusted from both steps
# Finally, we can return a Pandas DataFrame with the original columns merged in
display(bdf.original_ids())

Notice this fails, as include_id_reference_dict has not been set to True


Unnamed: 0,original_ids,adjusted_ids_1
0,100,0
1,101,1
2,102,2
3,103,3
4,104,4
...,...,...
93,195,93
94,196,94
95,197,95
96,198,96


Unnamed: 0,original_ids,adjusted_ids_1,adjusted_ids_2
0,0,0,0
1,1,1,
2,2,2,1
3,3,3,2
4,4,4,3
...,...,...,...
183,192,183,
184,193,184,148
185,194,185,149
186,195,186,150


Unnamed: 0,i,j,y,t,original_i,original_j
0,0,72,-0.527867,1,100,101
1,0,105,1.214800,2,100,140
2,0,57,-0.815256,3,100,83
3,0,57,2.096110,4,100,83
4,0,61,-0.279476,5,100,88
...,...,...,...,...,...,...
485,97,131,1.343188,1,199,172
486,97,148,1.745711,2,199,193
487,97,148,2.235194,3,199,193
488,97,148,3.568609,4,199,193


### Filling in missing years as unemployed

In [10]:
# For long data, BipartitePandas can fill in missing years as unemployed (j=-1)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data)
observed_periods = bdf['t'].isin([1, 5]) # Keep only periods 1 and 5, i.e. drop periods 2-4
bdf = bdf[observed_periods].clean()
filled = bdf.fill_periods() # Can specify fill_j=x instead of -1 and fill_y=y instead of NaN
display(filled)

Unnamed: 0,i,j,y,t
0,0,62,0.536547,1
1,0,-1,,2
2,0,-1,,3
3,0,-1,,4
4,0,136,0.832333,5
...,...,...,...,...
49995,9999,168,2.948802,1
49996,9999,-1,,2
49997,9999,-1,,3
49998,9999,-1,,4


### Uncollapsing data (converting from collapsed long to long)

In [11]:
# Collapsed long data can be un-collapsed back to long format
# (variables are assumed constant over spells)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data).clean()

# Collapse to one row per spell and show the result
bdf = bdf.collapse()
display(bdf)

# Expand back to one row per period and show the result
bdf = bdf.uncollapse()
display(bdf)

Unnamed: 0,i,j,y,t1,t2,w,m
0,0,9,-1.399933,1,1,1,1
1,0,68,-0.506151,2,5,4,1
2,1,98,0.484150,1,4,4,1
3,1,81,3.904754,5,5,1,1
4,2,196,0.802172,1,2,2,1
...,...,...,...,...,...,...,...
29814,9997,0,-1.735127,1,5,5,0
29815,9998,160,0.202144,1,4,4,1
29816,9998,132,2.614698,5,5,1,1
29817,9999,81,-0.299633,1,2,2,1


Unnamed: 0,i,j,y,t,w,m
0,0,9,-1.399933,1,1.0,1
1,0,68,-0.506151,2,4.0,1
2,0,68,-0.506151,3,4.0,1
3,0,68,-0.506151,4,4.0,1
4,0,68,-0.506151,5,4.0,1
...,...,...,...,...,...,...
49995,9999,81,-0.299633,1,2.0,1
49996,9999,81,-0.299633,2,2.0,1
49997,9999,14,-2.367281,3,3.0,1
49998,9999,14,-2.367281,4,3.0,1


### Handling i-t (worker-year) duplicates (note perfect duplicates are always dropped)

In [12]:
# Keep max paying job
# Create an i-t duplicate on the raw simulated data: copy the first observation and double its y value
duplicate_row = sim_data.iloc[[0]].copy() # Double brackets keep a one-row DataFrame, so column dtypes are preserved and no cast back to int is needed
duplicate_row['y'] = 2 * duplicate_row['y']
# Use pd.concat, as DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0
bdf = bpd.BipartiteLong(pd.concat([sim_data, duplicate_row]))
bdf = bdf.clean({'i_t_how': 'max'})
display(bdf)

Unnamed: 0,i,j,y,t
0,0,9,-1.399933,1
1,0,68,-0.196659,2
2,0,68,-0.459690,3
3,0,68,-0.066983,4
4,0,68,-1.301274,5
...,...,...,...,...
49995,9999,81,-0.699513,1
49996,9999,81,0.100247,2
49997,9999,14,-2.096191,3
49998,9999,14,-2.663811,4


In [13]:
# Keep max sum over i-j-t (worker-firm-year) duplicates
# Create an i-t duplicate on the raw simulated data: copy the first observation and double its y value
duplicate_row = sim_data.iloc[[0]].copy() # Double brackets keep a one-row DataFrame, so column dtypes are preserved and no cast back to int is needed
duplicate_row['y'] = 2 * duplicate_row['y']
# Use pd.concat, as DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0
bdf = bpd.BipartiteLong(pd.concat([sim_data, duplicate_row]))
bdf = bdf.clean({'i_t_how': 'sum'})
display(bdf)

Unnamed: 0,i,j,y,t
0,0,9,-4.199799,1
1,0,68,-0.196659,2
2,0,68,-0.459690,3
3,0,68,-0.066983,4
4,0,68,-1.301274,5
...,...,...,...,...
49995,9999,81,-0.699513,1
49996,9999,81,0.100247,2
49997,9999,14,-2.096191,3
49998,9999,14,-2.663811,4


In [14]:
# Keep max mean over i-j-t (worker-firm-year) duplicates
# Create an i-t duplicate on the raw simulated data: copy the first observation and double its y value
duplicate_row = sim_data.iloc[[0]].copy() # Double brackets keep a one-row DataFrame, so column dtypes are preserved and no cast back to int is needed
duplicate_row['y'] = 2 * duplicate_row['y']
# Use pd.concat, as DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0
bdf = bpd.BipartiteLong(pd.concat([sim_data, duplicate_row]))
bdf = bdf.clean({'i_t_how': 'mean'})
display(bdf)

Unnamed: 0,i,j,y,t
0,0,9,-2.099900,1
1,0,68,-0.196659,2
2,0,68,-0.459690,3
3,0,68,-0.066983,4
4,0,68,-1.301274,5
...,...,...,...,...
49995,9999,81,-0.699513,1
49996,9999,81,0.100247,2
49997,9999,14,-2.096191,3
49998,9999,14,-2.663811,4


### Column inference

In [15]:
# BipartitePandas does some nice column inference
# When a column is mislabeled (here 'i' is stored as 'i1'), pass a col_dict that says which label to correct
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].rename(columns={'i': 'i1'})
bdf = bpd.BipartiteLong(sim_data, col_dict={'i': 'i1'}).clean()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,160,1.545692,1
1,0,160,-0.206587,2
2,0,38,0.173688,3
3,0,132,1.067294,4
4,0,132,0.975155,5
...,...,...,...,...
49995,9999,58,-1.140622,1
49996,9999,183,3.063293,2
49997,9999,163,2.235301,3
49998,9999,152,1.554662,4


In [16]:
# If the data is already clustered (a 'g' column is present), the class determines this automatically
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].assign(g=1)
bdf = bpd.BipartiteLong(sim_data).clean()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,34,0.461506,1,0
1,0,34,-0.345892,2,0
2,0,34,-0.927259,3,0
3,0,10,-1.568678,4,0
4,0,10,-1.475200,5,0
...,...,...,...,...,...
49995,9999,138,1.384965,1,0
49996,9999,136,1.023290,2,0
49997,9999,136,1.992347,3,0
49998,9999,136,4.169362,4,0


In [17]:
# If the cluster column is mislabeled (here 'g1' instead of 'g'), the class drops this column
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].assign(g1=1)
bdf = bpd.BipartiteLong(sim_data).clean()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,140,2.033386,1
1,0,147,0.667737,2
2,0,144,0.706213,3
3,0,196,2.228223,4
4,0,162,0.320569,5
...,...,...,...,...
49995,9999,155,3.092776,1
49996,9999,155,3.361496,2
49997,9999,61,0.501577,3
49998,9999,163,1.892761,4


In [18]:
# Specifying the corrected labels in col_dict makes sure our columns are included
# (note that if the variable of interest requires 2 columns, e.g. weight 1 and weight 2 for event study data,
# BOTH columns are required - if only one column is included, it is automatically dropped.)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].assign(g1=1)
bdf = bpd.BipartiteLong(sim_data, col_dict={'g': 'g1'}).clean()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,88,0.932728,1,0
1,0,106,0.446870,2,0
2,0,77,-0.801808,3,0
3,0,77,1.072784,4,0
4,0,77,-0.495686,5,0
...,...,...,...,...,...
49995,9999,14,-1.786902,1,0
49996,9999,14,-0.966727,2,0
49997,9999,14,-1.777515,3,0
49998,9999,83,0.317224,4,0


In [19]:
# A col_dict may also overwrite a column label that would normally be inferred, in which case inference is skipped
# Here the 'i' values are stored in a column named g; because col_dict specifies it is the 'i' column,
# it is not inferred as the cluster values
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].rename(columns={'i': 'g'})
bdf = bpd.BipartiteLong(sim_data, col_dict={'i': 'g'}).clean()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,95,0.525415,1
1,0,95,-1.114370,2
2,0,95,-0.255893,3
3,0,134,-0.299719,4
4,0,97,0.864466,5
...,...,...,...,...
49995,9999,124,0.230510,1
49996,9999,137,3.862834,2
49997,9999,137,1.089809,3
49998,9999,137,2.911541,4


In [20]:
# Dropping an optional column
# gen_m() is called without assignment; the first output below shows an 'm' column on bdf afterwards
bdf.gen_m()
display(bdf)
# The value returned by drop('m') is the cell's second output, which no longer has the 'm' column
# NOTE(review): the next cell's output also lacks 'm', so drop presumably modifies bdf in place - confirm
bdf.drop('m')

Unnamed: 0,i,j,y,t,m
0,0,95,0.525415,1,1
1,0,95,-1.114370,2,1
2,0,95,-0.255893,3,1
3,0,134,-0.299719,4,1
4,0,97,0.864466,5,1
...,...,...,...,...,...
49995,9999,124,0.230510,1,1
49996,9999,137,3.862834,2,1
49997,9999,137,1.089809,3,1
49998,9999,137,2.911541,4,1


Unnamed: 0,i,j,y,t
0,0,95,0.525415,1
1,0,95,-1.114370,2
2,0,95,-0.255893,3
3,0,134,-0.299719,4
4,0,97,0.864466,5
...,...,...,...,...
49995,9999,124,0.230510,1
49996,9999,137,3.862834,2
49997,9999,137,1.089809,3
49998,9999,137,2.911541,4


In [21]:
# Dropping a required column is prevented
# The output below still contains the 'i' column after this call
bdf.drop('i')



Unnamed: 0,i,j,y,t
0,0,95,0.525415,1
1,0,95,-1.114370,2
2,0,95,-0.255893,3
3,0,134,-0.299719,4
4,0,97,0.864466,5
...,...,...,...,...
49995,9999,124,0.230510,1
49996,9999,137,3.862834,2
49997,9999,137,1.089809,3
49998,9999,137,2.911541,4
