# Clustering examples

In [1]:
# Add a local copy of BipartitePandas to the system path (development only; do not run this if the package is installed with pip)
# import sys
# sys.path.append('../../..')

In [2]:
# Import the BipartitePandas package 
# (Make sure you have installed it using pip install bipartitepandas)
import bipartitepandas as bpd

In [3]:
# For the example, we simulate bipartite labor data, keeping only the
# i, j, y and t columns
sim_data = bpd.SimBipartite().sim_network()[['i', 'j', 'y', 't']]
# Wrap the simulated data in a BipartiteLong frame and clean it in one chain
bdf = bpd.BipartiteLong(sim_data).clean_data()
display(bdf)

Unnamed: 0,i,j,y,t
0,0,60,-2.163817,1
1,0,60,-0.791688,2
2,0,26,0.529849,3
3,0,26,0.192700,4
4,0,34,-0.603499,5
...,...,...,...,...
49995,9999,87,-1.266920,1
49996,9999,153,-1.210249,2
49997,9999,142,0.582017,3
49998,9999,143,-1.809820,4


## Computing measures and selecting how to group on them

In [4]:
# Measures come from the bpd.measures module; here we use firm-level cdfs
# of income
measures = bpd.measures.cdfs()
# Groupings come from the bpd.grouping module; here we group using kmeans
grouping = bpd.grouping.kmeans()
# Cluster with the chosen measures and grouping (the output gains a g column)
bdf = bdf.cluster(grouping=grouping, measures=measures)
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,60,-2.163817,1,1
1,0,60,-0.791688,2,1
2,0,26,0.529849,3,5
3,0,26,0.192700,4,5
4,0,34,-0.603499,5,5
...,...,...,...,...,...
49995,9999,87,-1.266920,1,2
49996,9999,153,-1.210249,2,7
49997,9999,142,0.582017,3,4
49998,9999,143,-1.809820,4,4


In [5]:
# Alternative measures and groupings exist
# Measure: firm-level mean income
measures = bpd.measures.moments(measures='mean')
# Grouping: income quantiles computed over all firms (quantile grouping is
# only valid when a single measure is used)
grouping = bpd.grouping.quantiles()
# Cluster with the chosen measure and grouping
bdf = bdf.cluster(grouping=grouping, measures=measures)
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,60,-2.163817,1,1
1,0,60,-0.791688,2,1
2,0,26,0.529849,3,0
3,0,26,0.192700,4,0
4,0,34,-0.603499,5,0
...,...,...,...,...,...
49995,9999,87,-1.266920,1,2
49996,9999,153,-1.210249,2,3
49997,9999,142,0.582017,3,2
49998,9999,143,-1.809820,4,2


In [6]:
# We can use multiple measures
measures = [bpd.measures.cdfs(), bpd.measures.moments(measures=['mean', 'var'])]
grouping = bpd.grouping.kmeans() # Group using kmeans (quantiles are only valid with 1 measure, and we have several here)
# Now cluster using our measures and grouping
bdf = bdf.cluster(measures=measures, grouping=grouping)
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,60,-2.163817,1,4
1,0,60,-0.791688,2,4
2,0,26,0.529849,3,2
3,0,26,0.192700,4,2
4,0,34,-0.603499,5,2
...,...,...,...,...,...
49995,9999,87,-1.266920,1,5
49996,9999,153,-1.210249,2,1
49997,9999,142,0.582017,3,7
49998,9999,143,-1.809820,4,7


## Clustering and reformatting

In [7]:
# We can cluster from any format and clusters stay when reformatting
# Here cluster() is called with no arguments, so the library's default measures and grouping are used
bdf = bdf.cluster()
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,60,-2.163817,1,1
1,0,60,-0.791688,2,1
2,0,26,0.529849,3,3
3,0,26,0.192700,4,3
4,0,34,-0.603499,5,3
...,...,...,...,...,...
49995,9999,87,-1.266920,1,7
49996,9999,153,-1.210249,2,5
49997,9999,142,0.582017,3,2
49998,9999,143,-1.809820,4,2


In [8]:
# Reformat with get_es() (presumably "event study" format - confirm against the library docs)
# The displayed result has paired columns (j1/j2, y1/y2, t1/t2) and the clusters are kept as g1/g2
bdf = bdf.get_es()
display(bdf)

Unnamed: 0,i,j1,j2,y1,y2,t1,t2,g1,g2,m
0,14,31,31,0.935641,0.935641,1,1,1,1,0
1,14,31,31,-0.584035,-0.584035,2,2,1,1,0
2,14,31,31,-0.753048,-0.753048,3,3,1,1,0
3,14,31,31,-0.987248,-0.987248,4,4,1,1,0
4,14,31,31,-0.708413,-0.708413,5,5,1,1,0
...,...,...,...,...,...,...,...,...,...,...
40642,9998,39,15,0.798486,0.642239,4,5,1,3,1
40643,9999,87,153,-1.266920,-1.210249,1,2,7,5,1
40644,9999,153,142,-1.210249,0.582017,2,3,5,2,1
40645,9999,142,143,0.582017,-1.809820,3,4,2,2,1


In [9]:
# Go back to long format and collapse it in a single chain; the clusters
# are kept in the g column
bdf = bdf.get_long().get_collapsed_long()
display(bdf)

Unnamed: 0,i,j,y,t1,t2,w,g,m
0,0,60,-1.477753,1,2,2,1,1
1,0,26,0.361275,3,4,2,3,1
2,0,34,-0.603499,5,5,1,3,1
3,1,161,1.803059,1,2,2,5,1
4,1,61,0.688224,3,3,1,1,1
...,...,...,...,...,...,...,...,...
29753,9999,87,-1.266920,1,1,1,7,1
29754,9999,153,-1.210249,2,2,1,5,1
29755,9999,142,0.582017,3,3,1,2,1
29756,9999,143,-1.809820,4,4,1,2,1


In [10]:
# Reformat the collapsed long data with get_es(); the clusters are again kept as g1/g2
bdf = bdf.get_es()
display(bdf)

Unnamed: 0,i,j1,j2,y1,y2,t11,t12,t21,t22,w1,w2,g1,g2,m
0,14,31,31,-0.419421,-0.419421,1,5,1,5,5.0,5,1,1,0
1,66,82,82,0.234035,0.234035,1,5,1,5,5.0,5,4,4,0
2,75,66,66,-0.358665,-0.358665,1,5,1,5,5.0,5,4,4,0
3,78,108,108,-0.594697,-0.594697,1,5,1,5,5.0,5,7,7,0
4,99,34,34,-1.102178,-1.102178,1,5,1,5,5.0,5,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20400,9998,39,15,0.798486,0.642239,4,4,5,5,1.0,1,1,3,1
20401,9999,87,153,-1.266920,-1.210249,1,1,2,2,1.0,1,7,5,1
20402,9999,153,142,-1.210249,0.582017,2,2,3,3,1.0,1,5,2,1
20403,9999,142,143,0.582017,-1.809820,3,3,4,4,1.0,1,2,2,1


In [11]:
# Remove the cluster columns (in this format, passing 'g' drops both g1 and g2)
# Assign the result back so the removal does not depend on drop() acting in
# place, and display it explicitly like the other cells in this notebook
bdf = bdf.drop('g')
display(bdf)

Unnamed: 0,i,j1,j2,y1,y2,t11,t12,t21,t22,w1,w2,m
0,14,31,31,-0.419421,-0.419421,1,5,1,5,5.0,5,0
1,66,82,82,0.234035,0.234035,1,5,1,5,5.0,5,0
2,75,66,66,-0.358665,-0.358665,1,5,1,5,5.0,5,0
3,78,108,108,-0.594697,-0.594697,1,5,1,5,5.0,5,0
4,99,34,34,-1.102178,-1.102178,1,5,1,5,5.0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20400,9998,39,15,0.798486,0.642239,4,4,5,5,1.0,1,1
20401,9999,87,153,-1.266920,-1.210249,1,1,2,2,1.0,1,1
20402,9999,153,142,-1.210249,0.582017,2,2,3,3,1.0,1,1
20403,9999,142,143,0.582017,-1.809820,3,3,4,4,1.0,1,1


In [12]:
# Cluster again (no arguments, so library defaults) directly on the current get_es() format; the g1/g2 columns reappear
bdf = bdf.cluster()
display(bdf)

Unnamed: 0,i,j1,j2,y1,y2,t11,t12,t21,t22,w1,w2,g1,g2,m
0,14,31,31,-0.419421,-0.419421,1,5,1,5,5.0,5,3,3,0
1,66,82,82,0.234035,0.234035,1,5,1,5,5.0,5,5,5,0
2,75,66,66,-0.358665,-0.358665,1,5,1,5,5.0,5,0,0,0
3,78,108,108,-0.594697,-0.594697,1,5,1,5,5.0,5,2,2,0
4,99,34,34,-1.102178,-1.102178,1,5,1,5,5.0,5,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20400,9998,39,15,0.798486,0.642239,4,4,5,5,1.0,1,3,3,1
20401,9999,87,153,-1.266920,-1.210249,1,1,2,2,1.0,1,2,4,1
20402,9999,153,142,-1.210249,0.582017,2,2,3,3,1.0,1,4,7,1
20403,9999,142,143,0.582017,-1.809820,3,3,4,4,1.0,1,7,7,1


## Additional clustering options

In [13]:
# With non-collapsed data, clustering can be restricted to a specific year
# Start again from the simulated data, cleaned in one chain
bdf = bpd.BipartiteLong(sim_data).clean_data()
bdf = bdf.cluster(t=1)
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,60,-2.163817,1,6
1,0,60,-0.791688,2,6
2,0,26,0.529849,3,0
3,0,26,0.192700,4,0
4,0,34,-0.603499,5,6
...,...,...,...,...,...
49995,9999,87,-1.266920,1,2
49996,9999,153,-1.210249,2,3
49997,9999,142,0.582017,3,1
49998,9999,143,-1.809820,4,1


In [14]:
# Clustering can be restricted to movers or to stayers, in any format
# Not every firm gets clustered this way, so some rows have NaNs
bdf = bpd.BipartiteLong(sim_data).clean_data()
bdf = bdf.cluster(stayers_movers='stayers')
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,60,-2.163817,1,0
1,0,60,-0.791688,2,0
2,0,26,0.529849,3,5
3,0,26,0.192700,4,5
4,0,34,-0.603499,5,1
...,...,...,...,...,...
49995,9999,87,-1.266920,1,4
49996,9999,153,-1.210249,2,6
49997,9999,142,0.582017,3,4
49998,9999,143,-1.809820,4,6


In [15]:
# Pass dropna=True to drop firms that don't get clustered
# Note that this fully resets firm and worker ids, since they must be
# contiguous for the FE/CRE/BLM estimators to work
bdf = bpd.BipartiteLong(sim_data).clean_data()
bdf = bdf.cluster(dropna=True, stayers_movers='stayers')
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,60,-2.163817,1,0
1,0,60,-0.791688,2,0
2,0,26,0.529849,3,6
3,0,26,0.192700,4,6
4,0,34,-0.603499,5,2
...,...,...,...,...,...
48307,9999,87,-1.266920,1,7
48308,9999,153,-1.210249,2,1
48309,9999,142,0.582017,3,7
48310,9999,143,-1.809820,4,1


In [16]:
# Clustering on movers can give different results than clustering on stayers
bdf = bpd.BipartiteLong(sim_data).clean_data()
bdf = bdf.cluster(dropna=True, stayers_movers='movers')
display(bdf)

Unnamed: 0,i,j,y,t,g
0,0,60,-2.163817,1,0
1,0,60,-0.791688,2,0
2,0,26,0.529849,3,4
3,0,26,0.192700,4,4
4,0,34,-0.603499,5,4
...,...,...,...,...,...
49995,9999,87,-1.266920,1,7
49996,9999,153,-1.210249,2,1
49997,9999,142,0.582017,3,5
49998,9999,143,-1.809820,4,5
