# Extra features

In [1]:
# Add BipartitePandas to the system path (do not run this)
# import sys
# sys.path.append('../../..')

In [2]:
# Import the BipartitePandas package 
# (Make sure you have installed it using pip install bipartitepandas)
import bipartitepandas as bpd

In [3]:
# Build the example dataset: simulated bipartite labor data, restricted to the i, j, y and t columns
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Wrap the simulated frame in a BipartiteLong object and clean it in a single chained step
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,187,1.118187,1
1,0,144,0.825055,2
2,0,91,-0.578380,3
3,0,124,2.276800,4
4,0,124,0.696323,5
...,...,...,...,...
49995,9999,55,0.202448,1
49996,9999,55,-1.168406,2
49997,9999,38,-0.613073,3
49998,9999,24,-1.193380,4


### Summary statistics

In [4]:
# We can get some summary statistics by running .summary()
# A fresh simulated dataset is drawn here; sim_data is reused by a later cell
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data)
# First summary: taken before cleaning, so the validity checks (connected, contiguous ids, no nans, ...) are reported as False
bdf.summary()
bdf = bdf.clean_data()
# Second summary: taken after .clean_data(), so the same checks are now reported as True
bdf.summary()

format: BipartiteLong
number of workers: 10000
number of firms: 194
number of observations: 50000
mean wage: 0.4070214078339329
max wage: 6.152193605725564
min wage: -5.738938593755481
connected: False
contiguous i ids (None if not included): False
contiguous j ids (None if not included): False
contiguous g ids (None if not included): None
correct column names and types: False
no nans: False
no duplicates: False
i-t (worker-year) observations unique (None if t column(s) not included): False

format: BipartiteLong
number of workers: 10000
number of firms: 194
number of observations: 50000
mean wage: 0.4070214078339329
max wage: 6.152193605725564
min wage: -5.738938593755481
connected: True
contiguous i ids (None if not included): True
contiguous j ids (None if not included): True
contiguous g ids (None if not included): None
correct column names and types: True
no nans: True
no duplicates: True
i-t (worker-year) observations unique (None if t column(s) not included): True



In [5]:
# Worker, firm and cluster counts are available directly from the object
# (the cluster count is None here because the data is not clustered)
print(bdf.n_workers(), bdf.n_firms(), bdf.n_clusters(), sep='\n')

10000
194
None


### Original worker and firm ids

In [6]:
# If you set include_id_reference_dict=True, BipartitePandas saves dataframes linking original column ids to contiguous column ids
try:
    display(bdf.id_reference_dict['i'])
except Exception:
    # Catch Exception (not a bare except) so that KeyboardInterrupt and SystemExit are not swallowed
    print('Notice this fails, as include_id_reference_dict has not been set to True')
bdf = bpd.BipartiteLong(sim_data, include_id_reference_dict=True)
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i']) # Notice it's empty as the ids were originally contiguous
# Keep only workers with ids below 200, then clean again so the ids are re-adjusted
bdf = bdf[bdf['i'] < 200]
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i'])
# Drop workers with ids 0-99, then clean again
bdf = bdf[bdf['i'] > 99]
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i']) # Now the adjusted ids are approximately the previous ids minus 100, as we dropped the first 100 workers (but because some workers might no longer be in the largest connected set of firms, more than 100 can be dropped)
display(bdf.id_reference_dict['j']) # Notice firm ids adjusted from both steps
# Finally, we can return a Pandas DataFrame with the original columns merged in
display(bdf.original_ids())

Notice this fails, as include_id_reference_dict has not been set to True


Unnamed: 0,original_ids,adjusted_ids_1
0,100,0
1,101,1
2,102,2
3,104,3
4,105,4
...,...,...
89,195,89
90,196,90
91,197,91
92,198,92


Unnamed: 0,original_ids,adjusted_ids_1,adjusted_ids_2
0,0,0,
1,3,1,0
2,4,2,1
3,5,3,2
4,6,4,
...,...,...,...
184,189,184,143
185,190,185,144
186,191,186,
187,192,187,145


Unnamed: 0,i,j,y,t,original_i,original_j
0,0,3,-3.014089,1,100,7
1,0,25,-0.965636,2,100,41
2,0,25,0.532228,3,100,41
3,0,32,-0.455499,4,100,49
4,0,22,-0.728214,5,100,37
...,...,...,...,...,...,...
465,93,44,0.107045,1,199,63
466,93,44,0.065594,2,199,63
467,93,25,0.362479,3,199,41
468,93,29,0.397594,4,199,46


### Filling in missing years as unemployed

In [7]:
# For long data, BipartitePandas can insert rows for missing years, marking them as unemployed (j=-1)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data)
# Keep only periods 1 and 5, i.e. drop periods 2-4
keep_period = bdf['t'].isin([1, 5])
bdf = bdf[keep_period].clean_data()
# fill_j=x can be specified instead of -1, and fill_y=y instead of NaN
display(bdf.fill_periods())

Unnamed: 0,i,j,y,t
0,0,65,0.168842,1
1,0,-1,,2
2,0,-1,,3
3,0,-1,,4
4,0,144,1.932735,5
...,...,...,...,...
49995,9999,143,1.179381,1
49996,9999,-1,,2
49997,9999,-1,,3
49998,9999,-1,,4


### Uncollapsing data (converting from collapsed long to long)

In [8]:
# Collapsed long data can be expanded back to long format (variables are assumed constant over spells)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Clean the long data and collapse it, all in one chain
bdf = bpd.BipartiteLong(sim_data).clean_data().get_collapsed_long()
display(bdf)
# Reverse the collapse
bdf = bdf.uncollapse()
display(bdf)

Unnamed: 0,i,j,y,t1,t2,w,m
0,0,178,2.934633,1,1,1,1
1,0,182,3.489114,2,3,2,1
2,0,151,1.502607,4,5,2,1
3,1,116,0.689134,1,2,2,1
4,1,28,-0.687698,3,5,3,1
...,...,...,...,...,...,...,...
29823,9997,130,0.481400,3,4,2,1
29824,9997,101,0.642999,5,5,1,1
29825,9998,6,-1.344365,1,2,2,1
29826,9998,52,-1.480374,3,5,3,1


Unnamed: 0,i,j,y,t,w,m
0,0,178,2.934633,1,1.0,1
1,0,182,3.489114,2,2.0,1
2,0,182,3.489114,3,2.0,1
3,0,151,1.502607,4,2.0,1
4,0,151,1.502607,5,2.0,1
...,...,...,...,...,...,...
49995,9999,4,-1.440963,1,5.0,0
49996,9999,4,-1.440963,2,5.0,0
49997,9999,4,-1.440963,3,5.0,0
49998,9999,4,-1.440963,4,5.0,0


### Column inference

In [9]:
# BipartitePandas can infer which columns are which
# When a column is mislabeled, pass a col_dict mapping the expected name to the actual label
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Deliberately mislabel the 'i' column as 'i1'
sim_data = sim_data.rename(columns={'i': 'i1'})
bdf = bpd.BipartiteLong(sim_data, col_dict={'i': 'i1'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,7,-1.004132,1
1,0,7,-1.036686,2
2,0,7,-2.021918,3
3,0,91,1.559148,4
4,0,79,0.109276,5
...,...,...,...,...
49995,9999,13,-1.030326,1
49996,9999,13,-0.817959,2
49997,9999,13,-1.670953,3
49998,9999,101,-0.125289,4


In [10]:
# If the data already has a cluster column named 'g', the class picks it up automatically
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Put every observation in the same cluster
sim_data['g'] = 1
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,194,3.488161,1,0
1,0,194,0.784953,2,0
2,0,63,-1.144378,3,0
3,0,63,1.149469,4,0
4,0,65,0.598780,5,0
...,...,...,...,...,...
49995,9999,23,0.656531,1,0
49996,9999,78,0.384348,2,0
49997,9999,170,2.854079,3,0
49998,9999,178,1.010523,4,0


In [11]:
# If the cluster column is mislabeled (here 'g1' instead of 'g'), the class drops it
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
sim_data['g1'] = 1
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,115,-1.432553,1
1,0,122,-0.274992,2
2,0,122,1.094538,3
3,0,122,-0.560156,4
4,0,122,0.622826,5
...,...,...,...,...
49995,9999,66,0.496160,1
49996,9999,182,2.107007,2
49997,9999,160,1.269078,3
49998,9999,160,2.048033,4


In [12]:
# To keep a mislabeled column, give its actual label in col_dict.
# Note: if the variable of interest requires 2 columns (e.g. weight 1 and weight 2 for event study data),
# BOTH columns are required - if only one column is included, it is automatically dropped.
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
sim_data['g1'] = 1
bdf = bpd.BipartiteLong(sim_data, col_dict={'g': 'g1'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,91,0.003615,1,0
1,0,71,0.897908,2,0
2,0,71,-0.330591,3,0
3,0,90,0.894953,4,0
4,0,140,3.019737,5,0
...,...,...,...,...,...
49995,9999,185,2.068286,1,0
49996,9999,185,4.050833,2,0
49997,9999,114,1.329334,3,0
49998,9999,114,0.403040,4,0


In [13]:
# A col_dict entry can also claim a label that would normally be inferred as something else; inference is then skipped
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Rename the 'i' column to 'g'
sim_data = sim_data.rename(columns={'i': 'g'})
# Although a column named g exists, col_dict declares it to be the 'i' column, so it is not inferred as the cluster values
bdf = bpd.BipartiteLong(sim_data, col_dict={'i': 'g'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,60,-0.762876,1
1,0,76,-0.944334,2
2,0,76,-0.511395,3
3,0,71,1.385352,4
4,0,49,-0.126687,5
...,...,...,...,...
49995,9999,127,1.787158,1
49996,9999,127,0.313189,2
49997,9999,168,2.470771,3
49998,9999,119,1.666305,4


In [14]:
# Dropping an optional column
# gen_m() is called without assignment; the display below shows bdf now has an 'm' column
bdf.gen_m()
display(bdf)
# The value returned by drop('m') is shown as the cell output, without the 'm' column
# NOTE(review): the result is not assigned - confirm whether drop() also modifies bdf in place
bdf.drop('m')

Unnamed: 0,i,j,y,t,m
0,0,60,-0.762876,1,1
1,0,76,-0.944334,2,1
2,0,76,-0.511395,3,1
3,0,71,1.385352,4,1
4,0,49,-0.126687,5,1
...,...,...,...,...,...
49995,9999,127,1.787158,1,1
49996,9999,127,0.313189,2,1
49997,9999,168,2.470771,3,1
49998,9999,119,1.666305,4,1


Unnamed: 0,i,j,y,t
0,0,60,-0.762876,1
1,0,76,-0.944334,2
2,0,76,-0.511395,3
3,0,71,1.385352,4
4,0,49,-0.126687,5
...,...,...,...,...
49995,9999,127,1.787158,1
49996,9999,127,0.313189,2
49997,9999,168,2.470771,3
49998,9999,119,1.666305,4


In [15]:
# Dropping a required column is prevented
# The frame shown as the cell output still contains the 'i' column
bdf.drop('i')



Unnamed: 0,i,j,y,t
0,0,60,-0.762876,1
1,0,76,-0.944334,2
2,0,76,-0.511395,3
3,0,71,1.385352,4
4,0,49,-0.126687,5
...,...,...,...,...
49995,9999,127,1.787158,1
49996,9999,127,0.313189,2
49997,9999,168,2.470771,3
49998,9999,119,1.666305,4
