In [1]:
import pandas as pd
import numpy as np

# Task 2

I download data from IPUMS-USA, from year 2001 onwards. I use the following variables:
- Demographics: `SEX`, `AGE`, `RACE`, `MARST`, `NCHILD`, `EDUC`
- Geographic: `STATEFIP`, `COUNTYFIP`, `METAREA`
- `TRANTIME`
- Work variables: `OCC2010`, `IND1990`, `EMPSTAT`, `LABFORCE`, `UHRSWORK` (as close as I can get to `UHRSWORK1`), `INCWAGE` (as close as I can get to `EARNWEEK`)

In [2]:
# Load the census extract; the CSV is expected in the notebook's working directory
data = pd.read_csv(filepath_or_buffer='census_data.csv')

## Data cleaning 

Here I flag the NA codes of the variables used in the next step as missing, so that the affected rows can be dropped later. Specifically, I set to missing:
- `0` for `TRANTIME`
- `1` for `EDUC`

I also recode the `MARST` variable. Specifically:
- `1, 2` are recoded to `married`
- `3, 4, 5, 6` are recoded to `single`

In [3]:
# Codes that stand for "not available", keyed by variable name
NA_values = {
    'TRANTIME': [0],
    # NOTE(review): 1 looks like the IPUMS-CPS missing code for EDUC; in an
    # IPUMS-USA extract it may be a real schooling level -- confirm against the codebook.
    'EDUC': [1],
}

# Blank out the NA codes. Series.mask swaps the flagged entries for NaN and
# upcasts an integer column to float where needed; assigning None into an
# integer column through .loc relies on an implicit upcast that newer pandas
# versions deprecate.
for variable, nas in NA_values.items():
    data[variable] = data[variable].mask(data[variable].isin(nas))

# Key for replacing MARST values: codes 1-2 -> 'married', codes 3-6 -> 'single'
marst_replace = {
    1: 'married',
    2: 'married',
    3: 'single',
    4: 'single',
    5: 'single',
    6: 'single',
}

# Recode MARST; any value that is not a key of marst_replace is left as it is
data['MARST'] = data['MARST'].replace(marst_replace)

## Commuting time

First I restrict the sample to `LABFORCE == 2` (in the labor force) and `AGE` between 25 and 55. I also drop all entries with NA values in any of `MARST`, `SEX`, `EDUC` or `TRANTIME`.

Then I compute a pivot table with one row per combination of `SEX` and `MARST` and one column per value of `EDUC`, where each cell is the mean of `TRANTIME` weighted by the `PERWT` column.

In [4]:
# Keep people in the labor force (LABFORCE == 2) aged 25 to 55 inclusive, then
# discard rows that are missing any variable needed for the commute-time table
data = (
    data[(data['LABFORCE'] == 2) & (data['AGE'] >= 25) & (data['AGE'] <= 55)]
    .copy()
    .dropna(subset=['MARST', 'SEX', 'EDUC', 'TRANTIME'])
)

In [5]:
# Default column names used by weighted_mean when none are passed explicitly
weight_col = 'PERWT'
vales_col = 'TRANTIME'

# Main function
def weighted_mean(data, values=None, weights=None):
    """Return the weighted mean of one column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame (for example one groupby chunk) containing both columns.
    values : str, optional
        Name of the column to average. Defaults to the module-level
        ``vales_col``.
    weights : str, optional
        Name of the column holding the weights. Defaults to the module-level
        ``weight_col``.

    Returns
    -------
    The value of ``np.average`` over the ``values`` column, weighted by the
    ``weights`` column.
    """
    # Fall back to the globals at call time (not at definition time), so that
    # reassigning them in a later cell still takes effect, as it did before.
    if values is None:
        values = vales_col
    if weights is None:
        weights = weight_col
    return np.average(data[values], weights=data[weights])

In [6]:
# Weighted mean commute for every (SEX, MARST, EDUC) cell; the innermost
# grouping level (EDUC) is then pivoted out into the columns
table = (
    data
    .groupby(['SEX', 'MARST', 'EDUC'])
    .apply(weighted_mean)
    .unstack(level=-1)
)

In [7]:
# Show result: one row per (SEX, MARST) group, one column per EDUC value
table

Unnamed: 0_level_0,EDUC,0.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,10.0,11.0
SEX,MARST,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,married,29.686423,28.668993,29.119111,28.560696,28.24872,28.199053,28.379987,28.673927,29.539546,28.509616
1,single,29.029995,28.882502,27.855402,27.338305,26.827252,26.637944,26.108959,26.732929,26.985904,26.321075
2,married,25.317746,23.425783,22.382938,21.578335,21.530282,22.33215,23.511762,24.279355,25.340071,26.00419
2,single,27.875502,25.482029,23.533603,22.867492,23.268023,23.976442,24.881952,25.491811,26.693739,26.783194


In [8]:
# Relabel the four (SEX, MARST) rows as 1-4, in the order they appear in the
# table (1 = SEX 1 / married, 2 = SEX 1 / single, 3 = SEX 2 / married,
# 4 = SEX 2 / single), drop the 'EDUC' label from the column axis, and export.
table.index = list(range(1, 5))
table.columns = table.columns.rename(None)
table.to_csv(path_or_buf='Mean commute times.csv')