# pytwoway example

In [1]:
# Add pytwoway to system path, do not run this
# import sys
# sys.path.append('../../..')

In [2]:
# Import the pytwoway package 
# (Make sure you have installed it using pip install pytwoway)
import pytwoway as tw

## Simulate some data

The package contains functions to simulate data, which we use here to keep things simple. If you have your own data, load it as a pandas DataFrame and use that as the input instead.

The data must contain the following columns (the simulated data displayed below includes them, along with some additional columns):

 - `wid`: the worker identifier
 - `fid`: the firm identifier
 - `year`: the time period
 - `comp`: the outcome variable, in our case compensation

In [3]:
# For the example, we simulate data instead of loading a real dataset.
# Both SimTwoWay() and sim_network() are called without arguments here.
# NOTE(review): no random seed is set in this cell, so the simulated values
# presumably change from run to run — confirm whether SimTwoWay accepts a seed.
sim_data = tw.SimTwoWay().sim_network()
# Render the simulated data as the cell output
display(sim_data)

Unnamed: 0,wid,year,k,alpha,psi,spell,freq,fid,move,comp
0,1,1,5,0.430727,0.114185,1,3,84,False,1.920632
1,1,2,5,0.430727,0.114185,1,3,84,False,0.527406
2,1,3,5,0.430727,0.114185,1,3,84,False,-0.714207
3,1,4,6,0.430727,0.348756,2,2,115,True,0.375877
4,1,5,6,0.430727,0.348756,2,2,115,False,0.843413
...,...,...,...,...,...,...,...,...,...,...
49995,10000,1,8,0.000000,0.908458,1,1,150,False,1.267188
49996,10000,2,7,0.000000,0.604585,2,4,137,True,0.672805
49997,10000,3,7,0.000000,0.604585,2,4,137,False,-0.623615
49998,10000,4,7,0.000000,0.604585,2,4,137,False,0.291969


## Create a TwoWay object using your data

In [4]:
# Column dictionary handed to TwoWay so that the columns are named correctly.
# (Alternatively, you can rename the columns of your data yourself.)
# NOTE(review): presumably each key is the name pytwoway expects and each value
# is the matching column name in your data — confirm against the pytwoway docs.
col_name_dict = {
    # Column name for the firm identifier
    'fid': 'fid',
    # Column name for the worker identifier
    'wid': 'wid',
    # Column name for the year
    'year': 'year',
    # Column name for the outcome variable
    'comp': 'comp',
}

# Build the TwoWay object that will do all the heavy lifting
tw_net = tw.TwoWay(
    data=sim_data,
    formatting='long',
    col_dict=col_name_dict,
)

## Now we can run the FE estimator

In [5]:
## Optional Parameters ##
fe_params = {
    # Number of cores to use
    'ncore': 1,
    # Batch size to send in parallel
    'batch': 1,
    # Number of draws used to approximate the leverages
    'ndraw_pii': 50,
    # Number of draws used to approximate the traces
    'ndraw_tr': 5,
    # Whether to also compute the non-approximated estimates
    'check': False,
    # Whether to compute the heteroskedastic estimates
    'hetero': False,
    # Filepath for the FE results
    'out': 'res_fe.json',
    # Filepath where the smallest-eigenvalue results are saved
    # NOTE(review): False presumably skips that computation — confirm
    'con': False,
    # Logfile to write log output to
    'logfile': '',
    # File from which to load precomputed leverages
    'levfile': '',
    # Whether to save data statistics only
    'statsonly': False,
}

# Run the fixed effect decomposition with the parameters above
fe_res = tw_net.fit_fe(user_fe=fe_params)

100%|██████████| 5/5 [00:00<00:00, 249.65it/s]


## We can also run the CRE estimator

In [6]:
## Optional Parameters ##

# Settings specific to the KMeans algorithm; read more at
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
KMeans_params = {
    'n_clusters': 10,
    'init': 'k-means++',
    'n_init': 500,
    'max_iter': 300,
    'tol': 0.0001,
    # NOTE(review): the 'deprecated' values below look like scikit-learn's
    # placeholder for arguments that were being phased out; recent scikit-learn
    # releases may no longer accept 'precompute_distances' and 'n_jobs' —
    # confirm against the installed version.
    'precompute_distances': 'deprecated',
    'verbose': 0,
    'random_state': None,
    'copy_x': True,
    'n_jobs': 'deprecated',
    'algorithm': 'auto',
}

# Settings for the clustering step
cluster_params = {
    # How many values to use to approximate the cdf when clustering
    'cdf_resolution': 10,
    # How to group the cdfs when clustering:
    #   'quantile_all'        - take quantiles from the entire set of data,
    #                           then have firm-level values between 0 and 1
    #   'quantile_firm_small' - take quantiles at the firm level and have
    #                           values be compensations, for small data
    #   'quantile_firm_large' - take quantiles at the firm level and have
    #                           values be compensations, for large data;
    #                           up to 50 times slower than
    #                           'quantile_firm_small', so only use it if the
    #                           dataset is too large to copy into a dictionary
    'grouping': 'quantile_all',
    # None clusters on the entire dataset;
    # an int gives the year of data to consider when clustering
    'year': None,
    # KMeans settings defined above
    'user_KMeans': KMeans_params,
}

# Settings for the CRE estimator itself
cre_params = {
    # Number of cores to use
    'ncore': 1,
    # Number of draws used to approximate the traces
    'ndraw_tr': 5,
    # Number of draws used to approximate the leverages
    'ndp': 50,
    # Filepath for the CRE results
    'out': 'res_cre.json',
    # Whether to compute the posterior variance
    'posterior': False,
    # Sets between variation to 0 (pure RE) when computing CRE
    'wobtw': False,
}

# Fit the CRE model using the estimator and clustering settings
cre_res = tw_net.fit_cre(
    user_cre=cre_params,
    user_cluster=cluster_params,
)

## Finally, we can investigate the results

In [7]:
# Show the FE results first, then the CRE results
display(fe_res, cre_res)

{'cores': '1',
 'ndp': '50',
 'ndt': '5',
 'n_firms': '196',
 'n_workers': '10000',
 'n_movers': '9344',
 'n_stayers': '656',
 'mover_quantiles': '[157.0, 181.0, 187.92894736842106, 195.0, 200.7450980392157, 206.0, 210.0, 214.0, 217.0, 228.0, 255.0]',
 'size_quantiles': '[161.0, 185.50404312668465, 191.0, 198.0, 204.0, 209.0, 213.0, 217.0, 221.76975169300226, 231.61555075593952, 261.0]',
 'between_firm_var': '0.9891734496234688',
 'var_y': '1.9550009807689568',
 'solver_time': '0.003079912000000018',
 'tot_var': '1.9599014840913969',
 'eps_var_ho': '0.6962512205591249',
 'eps_var_fe': '0.5205946703586726',
 'tr_var_ho': '0.00863868934254388',
 'tr_cov_ho': '-0.004786864415815065',
 'var_fe': '0.6109407805741677',
 'cov_fe': '0.16480214211415423',
 'var_ho': '0.6049260825753904',
 'cov_ho': '0.1681350023063165',
 'total_time': '0.07845926284790039'}

{'cores': '1',
 'ndt': '5',
 'n_firms': '196',
 'n_workers': '10000',
 'n_movers': '9344',
 'n_stayers': '656',
 'y1s_y1s': '0.02229858598031033',
 'y1s_y1s_count': '620',
 'y1s_var': '0.43221433361635203',
 'y1s_var_count': '656',
 'y1m_var': '0.9638887986375038',
 'y1m_var_count': '19877',
 'y2m_var': '0.9733870015979674',
 'y2m_var_count': '19877',
 'y1s_y1m1': '-0.0006583412937040781',
 'y1s_y1m1_count': '656',
 'y1s_y2m1': '-0.00024645058225545305',
 'y1s_y2m1_count': '656',
 'y1m1_y1m1': '-0.0013848178450396778',
 'y1m1_y1m1_count': '19877',
 'y2m1_y1m1': '-0.00042964278133828876',
 'y2m1_y1m1_count': '19877',
 'y2m1_y2m1': '-0.00019569228326585966',
 'y2m1_y2m1_count': '19877',
 'y1s_y1m2': '0.0028913676531741777',
 'y1s_y1m2_count': '656',
 'y1s_y2m2': '0.00041021309905162924',
 'y1s_y2m2_count': '656',
 'y1m2_y1m2': '0.0009235715713498977',
 'y1m2_y1m2_count': '19877',
 'y2m2_y1m2': '-0.00047052000676044045',
 'y2m2_y1m2_count': '19877',
 'y2m2_y2m2': '-0.0024230242553870823',