# Extra features

In [1]:
# Add BipartitePandas to system path (do not run this)
# import sys
# sys.path.append('../../..')

In [2]:
# Import the BipartitePandas package 
# (Make sure you have installed it using pip install bipartitepandas)
import bipartitepandas as bpd

In [3]:
# Simulate bipartite labor data to use as the running example
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Wrap the simulated frame as long-format data and clean it in one chain
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,100,0.850212,1
1,0,185,3.109079,2
2,0,185,1.150882,3
3,0,185,2.288035,4
4,0,185,2.180009,5
...,...,...,...,...
49995,9999,83,0.567736,1
49996,9999,119,0.761244,2
49997,9999,75,2.361000,3
49998,9999,75,0.027062,4


### Summary statistics

In [4]:
# Report the number of workers, firms, and clusters, one value per line
# (the cluster count is None because the data is not clustered)
print(bdf.n_workers(), bdf.n_firms(), bdf.n_clusters(), sep='\n')

10000
194
None


In [5]:
# .summary() prints summary statistics; run it before and after cleaning to compare
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
raw_bdf = bpd.BipartiteLong(sim_data)
# Summary of the data as constructed, before any cleaning
raw_bdf.summary()
# Summary of the same data after cleaning
bdf = raw_bdf.clean_data()
bdf.summary()

format: BipartiteLong
number of workers: 10000
number of firms: 194
number of observations: 50000
mean wage: 0.3940152205683422
max wage: 5.776081530258098
min wage: -5.890310737959243
connected: False
contiguous i ids (None if not included): False
contiguous j ids (None if not included): False
contiguous g ids (None if not included): None
correct column names and types: False
no nans: False
no duplicates: False
i-t (worker-year) observations unique (None if t column(s) not included): False

format: BipartiteLong
number of workers: 10000
number of firms: 194
number of observations: 50000
mean wage: 0.3940152205683422
max wage: 5.776081530258098
min wage: -5.890310737959243
connected: True
contiguous i ids (None if not included): True
contiguous j ids (None if not included): True
contiguous g ids (None if not included): None
correct column names and types: True
no nans: True
no duplicates: True
i-t (worker-year) observations unique (None if t column(s) not included): True



### Original worker and firm ids

In [6]:
# If you set include_id_reference_dict=True, BipartitePandas saves dataframes linking original column ids to contiguous column ids
try:
    display(bdf.id_reference_dict['i'])
except (AttributeError, LookupError, TypeError):
    # Only a failed lookup of the reference dataframe is expected here; anything else
    # (including KeyboardInterrupt, which a bare `except:` would swallow) propagates
    print('Notice this fails, as include_id_reference_dict has not been set to True')
bdf = bpd.BipartiteLong(sim_data, include_id_reference_dict=True)
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i']) # Notice it's empty as the ids were originally contiguous
# Keep only worker ids below 200, then re-clean so the reference dataframe records the adjustment
bdf = bdf[bdf['i'] < 200]
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i'])
# Drop worker ids 0-99, then re-clean so a second adjustment is recorded
bdf = bdf[bdf['i'] > 99]
bdf = bdf.clean_data()
display(bdf.id_reference_dict['i']) # Now ids are shifted down by approximately 100, as we dropped the first 100 ids (more than 100 can be dropped, because some workers might no longer be in the largest connected set of firms)
display(bdf.id_reference_dict['j']) # Notice firm ids adjusted from both steps
# Finally, we can return a Pandas DataFrame with the original columns merged in
display(bdf.original_ids())

Notice this fails, as include_id_reference_dict has not been set to True


Unnamed: 0,original_ids,adjusted_ids_1
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
...,...,...
194,195,194
195,196,195
196,197,196
197,198,197


Unnamed: 0,original_ids,adjusted_ids_1,adjusted_ids_2
0,0,0,
1,1,1,
2,2,2,
3,3,3,
4,4,4,
...,...,...,...
194,195,194,91
195,196,195,92
196,197,196,93
197,198,197,


Unnamed: 0,original_ids,adjusted_ids_1,adjusted_ids_2
0,0,0,0
1,1,1,1
2,2,2,2
3,3,3,3
4,4,4,
...,...,...,...
183,189,183,139
184,190,184,140
185,191,185,
186,192,186,


Unnamed: 0,i,j,y,t,original_i,original_j
0,0,54,-0.637943,1,101,72
1,0,54,-0.032348,2,101,72
2,0,129,1.214961,3,101,173
3,0,129,0.747303,4,101,173
4,0,95,0.292926,5,101,127
...,...,...,...,...,...,...
470,94,92,0.243164,1,199,123
471,94,92,1.217721,2,199,123
472,94,130,0.655586,3,199,174
473,94,130,2.070455,4,199,174


### Filling in missing years as unemployed

In [7]:
# For long data, BipartitePandas can fill in missing years as unemployed (j=-1)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data)
# Keep only periods 1 and 5, i.e. drop periods 2-4
is_period_1 = bdf['t'] == 1
is_period_5 = bdf['t'] == 5
bdf = bdf[is_period_1 | is_period_5].clean_data()
# fill_periods() also accepts fill_j=x (instead of -1) and fill_y=y (instead of NaN)
display(bdf.fill_periods())

Unnamed: 0,i,j,y,t
0,0,35,-0.937087,1
1,0,-1,,2
2,0,-1,,3
3,0,-1,,4
4,0,0,-1.727093,5
...,...,...,...,...
49995,9999,154,2.694752,1
49996,9999,-1,,2
49997,9999,-1,,3
49998,9999,-1,,4


### Uncollapsing data (converting from collapsed long to long)

In [8]:
# Collapsed long data can be un-collapsed back to long format
# (variables are assumed constant over spells)
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
bdf = bpd.BipartiteLong(sim_data).clean_data()
# Collapse first, so there is something to un-collapse
bdf = bdf.get_collapsed_long()
display(bdf)
# Expand the collapsed spells back into long format
bdf = bdf.uncollapse()
display(bdf)

Unnamed: 0,i,j,y,t1,t2,w,m
0,0,15,-1.586632,1,1,1,1
1,0,31,-1.221474,2,5,4,1
2,1,147,0.514602,1,3,3,1
3,1,104,0.078592,4,5,2,1
4,2,39,-0.710781,1,5,5,0
...,...,...,...,...,...,...,...
29785,9998,164,0.735405,1,3,3,1
29786,9998,191,3.219480,4,5,2,1
29787,9999,149,2.117283,1,3,3,1
29788,9999,65,1.209873,4,4,1,1


Unnamed: 0,i,j,y,t,w,m
0,0,15,-1.586632,1,1.0,1
1,0,31,-1.221474,2,4.0,1
2,0,31,-1.221474,3,4.0,1
3,0,31,-1.221474,4,4.0,1
4,0,31,-1.221474,5,4.0,1
...,...,...,...,...,...,...
49995,9999,149,2.117283,1,3.0,1
49996,9999,149,2.117283,2,3.0,1
49997,9999,149,2.117283,3,3.0,1
49998,9999,65,1.209873,4,1.0,1


### Column inference

In [9]:
# BipartitePandas does some nice column inference
# Here the worker id column is mislabeled as 'i1'; col_dict tells the class which column to use for 'i'
sim_data = (
    bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
    .rename(columns={'i': 'i1'})
)
bdf = bpd.BipartiteLong(sim_data, col_dict={'i': 'i1'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,73,-0.926906,1
1,0,73,-1.614707,2
2,0,23,0.526607,3
3,0,33,-1.338169,4
4,0,70,-0.662472,5
...,...,...,...,...
49995,9999,42,-0.259712,1
49996,9999,85,-0.272997,2
49997,9999,42,-1.173498,3
49998,9999,32,-2.484496,4


In [10]:
# If the data already has a cluster column named 'g', the class picks it up automatically
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].assign(g=1)
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,49,1.333236,1,0
1,0,49,1.041921,2,0
2,0,55,0.157938,3,0
3,0,55,0.892844,4,0
4,0,66,1.074220,5,0
...,...,...,...,...,...
49995,9999,79,0.474852,1,0
49996,9999,79,0.679068,2,0
49997,9999,79,-0.605855,3,0
49998,9999,167,3.994244,4,0


In [11]:
# If the cluster column is mislabeled (here 'g1'), the class drops it
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].assign(g1=1)
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,129,0.106212,1
1,0,129,2.421254,2
2,0,184,1.669022,3
3,0,184,2.527571,4
4,0,184,2.750344,5
...,...,...,...,...
49995,9999,128,0.594863,1
49996,9999,128,-0.556530,2
49997,9999,128,2.884363,3
49998,9999,182,2.830062,4


In [12]:
# Specifying the corrected labels in col_dict makes sure our columns are included.
# Note: if the variable of interest requires 2 columns (e.g. weight 1 and weight 2 for event study data),
# BOTH columns are required - if only one column is included, it is automatically dropped.
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']].assign(g1=1)
bdf = bpd.BipartiteLong(sim_data, col_dict={'g': 'g1'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,28,0.813411,1,0
1,0,64,0.309910,2,0
2,0,129,1.340066,3,0
3,0,143,0.616610,4,0
4,0,192,2.104819,5,0
...,...,...,...,...,...
49995,9999,96,-0.880188,1,0
49996,9999,41,0.151882,2,0
49997,9999,41,0.211678,3,0
49998,9999,41,-1.287437,4,0


In [13]:
# col_dict can also overwrite a column label that would normally be inferred, in which case inference is skipped
sim_data = (
    bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
    .rename(columns={'i': 'g'})
)
# Even though a column is named 'g', we declare it to be the 'i' column,
# so it is not inferred as the cluster values
bdf = bpd.BipartiteLong(sim_data, col_dict={'i': 'g'}).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,174,1.244612,1
1,0,174,2.199444,2
2,0,174,1.960504,3
3,0,174,1.397325,4
4,0,174,0.932462,5
...,...,...,...,...
49995,9999,120,1.499970,1
49996,9999,163,2.126022,2
49997,9999,50,1.051911,3
49998,9999,166,3.338495,4


In [14]:
# Dropping an optional column
# Reuses the bdf built in the previous cell.
# gen_m() is called for its effect on bdf: the display below shows a new 'm' column
bdf.gen_m()
display(bdf)
# drop('m') is the cell's last expression, so its result is rendered as the output (without 'm')
bdf.drop('m')

Unnamed: 0,i,j,y,t,m
0,0,174,1.244612,1,0
1,0,174,2.199444,2,0
2,0,174,1.960504,3,0
3,0,174,1.397325,4,0
4,0,174,0.932462,5,0
...,...,...,...,...,...
49995,9999,120,1.499970,1,1
49996,9999,163,2.126022,2,1
49997,9999,50,1.051911,3,1
49998,9999,166,3.338495,4,1


Unnamed: 0,i,j,y,t
0,0,174,1.244612,1
1,0,174,2.199444,2
2,0,174,1.960504,3
3,0,174,1.397325,4
4,0,174,0.932462,5
...,...,...,...,...
49995,9999,120,1.499970,1
49996,9999,163,2.126022,2
49997,9999,50,1.051911,3
49998,9999,166,3.338495,4


In [15]:
# Dropping a required column is prevented
# The result is rendered as the cell's output and still contains the 'i' column
bdf.drop('i')



Unnamed: 0,i,j,y,t
0,0,174,1.244612,1
1,0,174,2.199444,2
2,0,174,1.960504,3
3,0,174,1.397325,4
4,0,174,0.932462,5
...,...,...,...,...
49995,9999,120,1.499970,1
49996,9999,163,2.126022,2
49997,9999,50,1.051911,3
49998,9999,166,3.338495,4
