# PyTwoWay example

In [1]:
# Adds a local copy of pytwoway to the system path (not needed if installed via pip); do not run this
# import sys
# sys.path.append('../../..')

In [2]:
# Import the pytwoway package 
# (Make sure you have installed it using pip install pytwoway)
import pytwoway as tw
from bipartitepandas import SimBipartite

## Simulate some data

The package includes functions for simulating data, which we use here to keep things simple. If you have your own data, load it as a pandas DataFrame and use it as the input instead.

The data must contain the following required columns:

 - `i`: worker identifier
 - `j`: firm identifier
 - `y`: compensation
 - `t`: time

In [3]:
# Example data: build the simulator, then draw a simulated worker-firm network from it
# NOTE(review): no seed is set in this cell, so the simulated data presumably differ between runs — confirm
network_simulator = SimBipartite()
sim_data = network_simulator.sim_network()
display(sim_data)

Unnamed: 0,i,t,k,alpha,psi,spell,freq,j,move,y
0,0,1,6,0.000000,0.348756,1,3,122,False,1.324643
1,0,2,6,0.000000,0.348756,1,3,122,False,0.094003
2,0,3,6,0.000000,0.348756,1,3,122,False,0.619061
3,0,4,6,0.000000,0.348756,2,1,109,True,-0.353818
4,0,5,5,0.000000,0.114185,3,1,95,True,-0.915972
...,...,...,...,...,...,...,...,...,...,...
49995,9999,1,4,-0.430727,-0.114185,1,3,69,False,-2.295000
49996,9999,2,4,-0.430727,-0.114185,1,3,69,False,0.164724
49997,9999,3,4,-0.430727,-0.114185,1,3,69,False,-1.264234
49998,9999,4,7,-0.430727,0.604585,2,1,136,True,-0.320706


## Create a TwoWay object using your data

In [4]:
# Column dictionary: tells TwoWay which column in the data plays which role,
# so that columns end up named correctly.
# (Renaming the columns in the data by hand works as well.)
col_name_dict = {
    'i': 'i',  # column name of the worker identifier
    'j': 'j',  # column name of the firm identifier
    'y': 'y',  # column name of the compensation variable
    't': 't',  # column name of the time variable
}

# Build the TwoWay object; it does all the heavy lifting for the estimators
tw_net = tw.TwoWay(
    data=sim_data,
    formatting='long',
    col_dict=col_name_dict,
)

## Now we can run the FE estimator

In [5]:
## Optional parameters for the FE estimator ##
fe_params = {
    # Number of cores to use
    'ncore': 1,
    # Batch size to send in parallel
    'batch': 1,
    # Number of draws to use in the approximation for leverages
    'ndraw_pii': 50,
    # File to load precomputed leverages from
    'levfile': '',
    # Number of draws to use in the approximation for traces
    'ndraw_tr': 5,
    # If True, compute the h2 correction
    'h2': False,
    # Output file where results are saved
    'out': 'res_fe.json',
    # If True, return only basic statistics
    'statsonly': False,
    # Which Q matrix to consider; options include
    # 'cov(alpha, psi)' and 'cov(psi_t, psi_{t+1})'
    'Q': 'cov(alpha, psi)',
}

# Run the fixed effect decomposition with the user-supplied FE options
tw_net.fit_fe(
    user_fe=fe_params,
)

100%|██████████| 5/5 [00:00<00:00, 231.19it/s]


## We can also run the CRE estimator

In [6]:
## Optional parameters for the CRE estimator ##
cre_params = {
    # Number of cores to use
    'ncore': 1,
    # Number of draws to use in the approximation for traces
    'ndraw_tr': 5,
    # Number of draws to use in the approximation for leverages
    'ndp': 50,
    # Output file where results are saved
    'out': 'res_cre.json',
    # If True, compute the posterior variance
    'posterior': False,
    # If True, set between variation to 0 (pure RE)
    'wo_btw': False,
}
# Parameters specific to the KMeans algorithm. Read more at
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# NOTE(review): scikit-learn 1.0 removed 'precompute_distances' and 'n_jobs', and
# 1.3 removed algorithm='auto' — confirm the installed versions still accept these keys
KMeans_params = {
    'n_clusters': 10,
    'init': 'k-means++',
    'n_init': 500,
    'max_iter': 300,
    'tol': 0.0001,
    'precompute_distances': 'deprecated',
    'verbose': 0,
    'random_state': None,
    'copy_x': True,
    'n_jobs': 'deprecated',
    'algorithm': 'auto',
}
# Options controlling how firms are clustered
cluster_params = {
    # How many values to use to approximate the cdf when clustering
    'cdf_resolution': 10,
    # How to group the cdfs when clustering:
    #   'quantile_all'        - get quantiles from the entire set of data,
    #                           then have firm-level values between 0 and 1
    #   'quantile_firm_small' - get quantiles at the firm level and have
    #                           values be compensations, if small data
    #   'quantile_firm_large' - get quantiles at the firm level and have
    #                           values be compensations, if large data; note
    #                           that this is up to 50 times slower than
    #                           'quantile_firm_small' and should only be used
    #                           if the dataset is too large to copy into a
    #                           dictionary
    'grouping': 'quantile_all',
    # Which workers to cluster on:
    #   None      - cluster on the entire dataset
    #   'stayers' - cluster on only stayers
    #   'movers'  - cluster on only movers
    'stayers_movers': None,
    # Which period to cluster on (only valid for non-collapsed data):
    #   None - cluster on the entire dataset
    #   int  - period in the data to consider
    't': None,
    # If True, weight firm clusters by firm size (if a weight column is
    # included, firm weight is computed using this column; otherwise, each
    # observation has weight 1)
    'weighted': True,
    # If True, drop observations where firms aren't clustered;
    # if False, keep all observations
    'dropna': False,
    # Options for the KMeans algorithm
    'user_KMeans': KMeans_params,
}

# Run the CRE estimator with the user-supplied CRE and clustering options
tw_net.fit_cre(
    user_cre=cre_params,
    user_cluster=cluster_params,
)

## Finally, we can investigate the results

In [7]:
# Show the FE summary first, then the CRE summary
fe_summary = tw_net.summary_fe()
display(fe_summary)
cre_summary = tw_net.summary_cre()
display(cre_summary)

{'var_fe': 0.5982717199250484,
 'cov_fe': 0.16661408133027297,
 'var_ho': 0.5938896845212178,
 'cov_ho': 0.16738561210485078,
 'var_y': 1.9579456470100542}

{'var_y': 1.9579456470100542,
 'var_bw': 0.5875425925447757,
 'cov_bw': 0.1726795021993911,
 'var_tot': 0.5875706667091098,
 'cov_tot': 0.173072058799131}