# Simple example

In [1]:
# Add BipartitePandas to system path, do not run this
# import sys
# sys.path.append('../../..')

In [2]:
# Import the BipartitePandas package 
# (Make sure you have installed it using pip install bipartitepandas)
import bipartitepandas as bpd

In [3]:
# For the example, we simulate bipartite labor data and keep only the four columns used below:
# i (worker id), j (firm id), y (wage), and t (time period)
# NOTE(review): no random seed is set here, so the simulated values presumably differ between runs - confirm, and consider seeding
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
display(sim_data)

Unnamed: 0,i,j,y,t
0,0,120,1.531425,1
1,0,108,3.285988,2
2,0,108,1.684814,3
3,0,30,1.448084,4
4,0,30,1.554086,5
...,...,...,...,...
49995,9999,53,-0.133057,1
49996,9999,53,-1.128230,2
49997,9999,53,0.874702,3
49998,9999,47,-1.695015,4


## Formats

BipartitePandas includes 4 format classes:
- BipartiteLong
- BipartiteLongCollapsed
- BipartiteEventStudy
- BipartiteEventStudyCollapsed

For long data, each row gives a single observation. For collapsed long data, consecutive observations of a worker at the same firm (an employment spell) are collapsed into a single row. For event study data, each row gives the firms and compensation for a single worker over two consecutive periods (this is created from BipartiteLong). For collapsed event study data, each row gives the firms and compensation for a single worker over two consecutive employment spells (this is created from BipartiteLongCollapsed).

In [4]:
# Our data is in long form (each row gives a single observation), so we wrap it in BipartiteLong
bdf = bpd.BipartiteLong(sim_data)
display(bdf)
# Note the type: the data is now held in a BipartiteLong object
print(type(bdf))

Unnamed: 0,i,j,y,t
0,0,120,1.531425,1
1,0,108,3.285988,2
2,0,108,1.684814,3
3,0,30,1.448084,4
4,0,30,1.554086,5
...,...,...,...,...
49995,9999,53,-0.133057,1
49996,9999,53,-1.128230,2
49997,9999,53,0.874702,3
49998,9999,47,-1.695015,4


<class 'bipartitepandas.bipartitelong.BipartiteLong'>


In [5]:
# We can check out some summary statistics with summary()
# The counts and wage statistics describe the data as it is, while the validity checks at the end
# (connected, contiguous ids, correct column names and types, no nans, no duplicates, unique i-t observations)
# default to False before the data is cleaned (or None when the relevant column, such as g, is not included)
bdf.summary()

format: BipartiteLong
number of workers: 10000
number of firms: 197
number of observations: 50000
mean wage: 0.4147539041781768
max wage: 6.269628313998815
min wage: -4.952170255652092
connected: False
contiguous i ids (None if not included): False
contiguous j ids (None if not included): False
contiguous g ids (None if not included): None
correct column names and types: False
no nans: False
no duplicates: False
i-t (worker-year) observations unique (None if t column(s) not included): False



In [6]:
# The number of workers, firms, and clusters are each available through a dedicated method
# (the cluster count is None here because the data has not been clustered)
# A single print with a newline separator puts each count on its own line
print(bdf.n_workers(), bdf.n_firms(), bdf.n_clusters(), sep='\n')

10000
197
None


In [7]:
# Whenever we want to use the data, we need to clean it first
# clean() returns the cleaned data, so we reassign the result to bdf
bdf = bdf.clean()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,120,1.531425,1
1,0,108,3.285988,2
2,0,108,1.684814,3
3,0,30,1.448084,4
4,0,30,1.554086,5
...,...,...,...,...
49995,9999,53,-0.133057,1
49996,9999,53,-1.128230,2
49997,9999,53,0.874702,3
49998,9999,47,-1.695015,4


## Converting formats

In [8]:
# While our original data is long, we might want to collapse by employment spells
# (so any consecutive observations with the same worker in the same firm are collapsed into 1 observation)
# In the rows shown below, t1 and t2 give the first and last period of each spell, y equals the mean of the
# spell's wages, and w equals the number of observations collapsed into the row
# NOTE(review): m appears to flag movers (1) versus workers who stay at a single firm (0) - confirm
bdf = bdf.to_collapsed_long()
display(bdf)
# Note the type: the data is now held in a BipartiteLongCollapsed object
print(type(bdf))

Unnamed: 0,i,j,y,t1,t2,w,m
0,0,120,1.531425,1,1,1,1
1,0,108,2.485401,2,3,2,1
2,0,30,1.501085,4,5,2,1
3,1,115,-0.363222,1,2,2,1
4,1,79,-1.451195,3,3,1,1
...,...,...,...,...,...,...,...
29768,9997,33,0.311619,4,5,2,1
29769,9998,50,-0.828744,1,5,5,0
29770,9999,53,-0.128862,1,3,3,1
29771,9999,47,-1.695015,4,4,1,1


<class 'bipartitepandas.bipartitelongcollapsed.BipartiteLongCollapsed'>


In [9]:
# Now suppose we want our data to be in collapsed event study form
# (each row gives two consecutive observations from the collapsed long data)
# Because bdf is currently collapsed long, to_eventstudy() gives the collapsed event study format
bdf = bdf.to_eventstudy()
display(bdf)
# Note the type: the data is now held in a BipartiteEventStudyCollapsed object
print(type(bdf))

Unnamed: 0,i,j1,j2,y1,y2,t11,t12,t21,t22,w1,w2,m
0,5,142,142,2.030896,2.030896,1,5,1,5,5.0,5,0
1,41,109,109,1.467780,1.467780,1,5,1,5,5.0,5,0
2,61,118,118,0.887261,0.887261,1,5,1,5,5.0,5,0
3,97,192,192,1.618996,1.618996,1,5,1,5,5.0,5,0
4,121,82,82,0.453086,0.453086,1,5,1,5,5.0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20432,9996,126,36,-0.154245,0.360923,3,3,4,5,1.0,2,1
20433,9997,73,186,0.428521,1.921746,1,1,2,3,1.0,2,1
20434,9997,186,33,1.921746,0.311619,2,3,4,5,2.0,2,1
20435,9999,53,47,-0.128862,-1.695015,1,3,4,4,3.0,1,1


<class 'bipartitepandas.bipartiteeventstudycollapsed.BipartiteEventStudyCollapsed'>


In [10]:
# We can also use event study data to retrieve cross section data with get_cs()
# (cs=1 gives y1 as y1 for both stayers and movers; cs=0 gives y2 as y1 for only movers -
# this allows (almost) all income data to be accessed from the y1 column).
# Note that for movers, the last observation for each worker is not available without manipulation,
# as it is shifted to the y2 column.
# Also note that the y1 column contains duplicates for all mover incomes, except for the first period.
display(bdf.get_cs())

Unnamed: 0,i,j1,j2,y1,y2,t11,t12,t21,t22,w1,w2,m,cs
0,5,142,142,2.030896,2.030896,1,5,1,5,5.0,5.0,0,1
1,41,109,109,1.467780,1.467780,1,5,1,5,5.0,5.0,0,1
2,61,118,118,0.887261,0.887261,1,5,1,5,5.0,5.0,0,1
3,97,192,192,1.618996,1.618996,1,5,1,5,5.0,5.0,0,1
4,121,82,82,0.453086,0.453086,1,5,1,5,5.0,5.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40205,9996,36,126,0.360923,-0.154245,4,5,3,3,2.0,1.0,1,0
40206,9997,186,73,1.921746,0.428521,2,3,1,1,2.0,1.0,1,0
40207,9997,33,186,0.311619,1.921746,4,5,2,3,2.0,2.0,1,0
40208,9999,47,53,-1.695015,-0.128862,4,4,1,3,1.0,3.0,1,0


## Clustering

In [11]:
# Refresh the data: rebuild bdf from the original simulated long data and clean it again
# (the previous section left bdf in collapsed event study form)
bdf = bpd.BipartiteLong(sim_data)
bdf = bdf.clean()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,120,1.531425,1
1,0,108,3.285988,2
2,0,108,1.684814,3
3,0,30,1.448084,4
4,0,30,1.554086,5
...,...,...,...,...
49995,9999,53,-0.133057,1
49996,9999,53,-1.128230,2
49997,9999,53,0.874702,3
49998,9999,47,-1.695015,4


In [12]:
# Cluster by running the cluster() method (this works from any format and clusters stay when data is reformatted)
# The result gains a g column giving the cluster assigned to each row
# NOTE(review): in the rows shown, observations at the same firm (j) share the same g, so clusters appear to be assigned per firm - confirm
bdf = bdf.cluster()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,120,1.531425,1,3
1,0,108,3.285988,2,3
2,0,108,1.684814,3,3
3,0,30,1.448084,4,1
4,0,30,1.554086,5,1
...,...,...,...,...,...
49995,9999,53,-0.133057,1,5
49996,9999,53,-1.128230,2,5
49997,9999,53,0.874702,3,5
49998,9999,47,-1.695015,4,1
