# COMBINE DATA

In [1]:
import polars as pl
import polars.selectors as cs
from glob import glob
from toolz import pipe
from functools import reduce
from operator import add, mul
import re
from columns import activity_columns, respondent_columns, roster_columns, who_columns



## CONDUCT UNIONS

- In general the unioning in all file types is done in the same process...
    - create paths
    - create a dictionary of original column names
    - create dictionary to assist with renaming columns

    - create an example of what is considered a "correct table schema"
    - pull the int columns from the "correct table"
    - pull the string columns from the "correct table"
    - combine the int and string columns and map to their typeh

    - conduct renaming of columns and handle casting of data types, while also pulling in each table in one fell swoop.
    - Find the common columns between tables
    - remove columns that are not common to each table prior to unioning
    - conduct the union

### UNION ACTIVITY FILES

In [2]:
# create paths
(act_paths :=
 glob('./data/atusact*.dat') + glob('./data/*/atusact*.dat')
)

['./data/atusact_2003.dat',
 './data/atusact_2004.dat',
 './data/atusact_2005.dat',
 './data/atusact_2006.dat',
 './data/atusact_2007.dat',
 './data/atusact_2008.dat',
 './data/atusact_2009.dat',
 './data/atusact_2010.dat',
 './data/atusact_2011.dat',
 './data/atusact_2012.dat',
 './data/atusact_2013.dat',
 './data/atusact_2014.dat',
 './data/atusact_2015.dat',
 './data/atusact_2017.dat',
 './data/atusact_2018.dat',
 './data/atusact_2019.dat',
 './data/atusact_2020.dat',
 './data/atusact_2021.dat',
 './data/atusact_2022.dat',
 './data/atusact_2023.dat',
 './data/atusact_2024.dat',
 './data/atusact_2016/atusact_2016.dat']

In [3]:
#create a dictionary of original column names
(original_columns :=
 {p:{col: col
     for col in 
     pl.read_csv(p).columns
    }
  for p in act_paths
 }
)

{'./data/atusact_2003.dat': {'TUCASEID': 'TUCASEID',
  'TUACTDUR24': 'TUACTDUR24',
  'TRTCC_LN': 'TRTCC_LN',
  'TRTCOC_LN': 'TRTCOC_LN',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TUTIER1CODE': 'TUTIER1CODE',
  'TUTIER2CODE': 'TUTIER2CODE',
  'TUTIER3CODE': 'TUTIER3CODE',
  'TUSTARTTIM': 'TUSTARTTIM',
  'TUSTOPTIME': 'TUSTOPTIME',
  'TUACTDUR': 'TUACTDUR',
  'TUCC8': 'TUCC8',
  'TUCUMDUR': 'TUCUMDUR',
  'TEWHERE': 'TEWHERE',
  'TXWHERE': 'TXWHERE',
  'TR_03CC57': 'TR_03CC57',
  'TUCUMDUR24': 'TUCUMDUR24'},
 './data/atusact_2004.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TUTIER1CODE': 'TUTIER1CODE',
  'TUTIER2CODE': 'TUTIER2CODE',
  'TUTIER3CODE': 'TUTIER3CODE',
  'TUSTARTTIM': 'TUSTARTTIM',
  'TUSTOPTIME': 'TUSTOPTIME',
  'TUACTDUR': 'TUACTDUR',
  'TUCUMDUR': 'TUCUMDUR',
  'TUCC5': 'TUCC5',
  'TUCC7': 'TUCC7',
  'TUCC8': 'TUCC8',
  'TEWHERE': 'TEWHERE',
  'TXWHERE': 'TXWHERE',
  'TUCC5B': 'TUCC5B',
  'TUACTDUR24': 'TUACTDUR24',
  'TUCUMDUR24': 'TUCUMDUR24',
  'TRTCC_

In [4]:
#create dictionary to assist with renaming columns
(fixed_columns :=
 {**original_columns, 
  **{
      path:{col:col.upper() for col in original_columns.get(path,[])}
     for path in act_paths
     },  
}
)

{'./data/atusact_2003.dat': {'TUCASEID': 'TUCASEID',
  'TUACTDUR24': 'TUACTDUR24',
  'TRTCC_LN': 'TRTCC_LN',
  'TRTCOC_LN': 'TRTCOC_LN',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TUTIER1CODE': 'TUTIER1CODE',
  'TUTIER2CODE': 'TUTIER2CODE',
  'TUTIER3CODE': 'TUTIER3CODE',
  'TUSTARTTIM': 'TUSTARTTIM',
  'TUSTOPTIME': 'TUSTOPTIME',
  'TUACTDUR': 'TUACTDUR',
  'TUCC8': 'TUCC8',
  'TUCUMDUR': 'TUCUMDUR',
  'TEWHERE': 'TEWHERE',
  'TXWHERE': 'TXWHERE',
  'TR_03CC57': 'TR_03CC57',
  'TUCUMDUR24': 'TUCUMDUR24'},
 './data/atusact_2004.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TUTIER1CODE': 'TUTIER1CODE',
  'TUTIER2CODE': 'TUTIER2CODE',
  'TUTIER3CODE': 'TUTIER3CODE',
  'TUSTARTTIM': 'TUSTARTTIM',
  'TUSTOPTIME': 'TUSTOPTIME',
  'TUACTDUR': 'TUACTDUR',
  'TUCUMDUR': 'TUCUMDUR',
  'TUCC5': 'TUCC5',
  'TUCC7': 'TUCC7',
  'TUCC8': 'TUCC8',
  'TEWHERE': 'TEWHERE',
  'TXWHERE': 'TXWHERE',
  'TUCC5B': 'TUCC5B',
  'TUACTDUR24': 'TUACTDUR24',
  'TUCUMDUR24': 'TUCUMDUR24',
  'TRTCC_

In [5]:
# create an example of what is considered a "correct table schema"
(example_correct_table := pl.read_csv(act_paths[0]).head()
)

TUCASEID,TUACTDUR24,TRTCC_LN,TRTCOC_LN,TUACTIVITY_N,TUTIER1CODE,TUTIER2CODE,TUTIER3CODE,TUSTARTTIM,TUSTOPTIME,TUACTDUR,TUCC8,TUCUMDUR,TEWHERE,TXWHERE,TR_03CC57,TUCUMDUR24
i64,i64,i64,i64,i64,i64,i64,i64,str,str,i64,i64,i64,i64,i64,i64,i64
20030100013280,60,-1,-1,1,13,1,24,"""04:00:00""","""05:00:00""",60,97,60,9,0,-1,60
20030100013280,30,-1,-1,2,1,2,1,"""05:00:00""","""05:30:00""",30,0,90,-1,0,-1,90
20030100013280,600,-1,-1,3,1,1,1,"""05:30:00""","""15:30:00""",600,0,690,-1,0,-1,690
20030100013280,150,-1,-1,4,12,3,3,"""15:30:00""","""18:00:00""",150,0,840,1,0,-1,840
20030100013280,5,-1,-1,5,11,1,1,"""18:00:00""","""18:05:00""",5,0,845,1,0,-1,845


In [6]:
# pull the int columns from the "correct table"
(int_columns := example_correct_table.select(cs.integer()).columns
)

['TUCASEID',
 'TUACTDUR24',
 'TRTCC_LN',
 'TRTCOC_LN',
 'TUACTIVITY_N',
 'TUTIER1CODE',
 'TUTIER2CODE',
 'TUTIER3CODE',
 'TUACTDUR',
 'TUCC8',
 'TUCUMDUR',
 'TEWHERE',
 'TXWHERE',
 'TR_03CC57',
 'TUCUMDUR24']

In [7]:
# pull the string columns from the "correct table"
(str_columns := example_correct_table.select(cs.string()).columns
)

['TUSTARTTIM', 'TUSTOPTIME']

In [8]:
#combine the int and string columns and map to their type
(col_and_types := {c: pl.String() for c in str_columns
} |    # Merge operator
{c:pl.Int64() for c in int_columns
 if c in activity_columns #pl.read_csv(p,n_rows=100).columns  # only cast columns that we need
        
}
 )

{'TUSTARTTIM': String,
 'TUSTOPTIME': String,
 'TUCASEID': Int64,
 'TUACTDUR24': Int64,
 'TUACTIVITY_N': Int64,
 'TUTIER1CODE': Int64,
 'TUTIER2CODE': Int64,
 'TUTIER3CODE': Int64}

In [9]:
# conduct renaming of columns and handle casting of data types. IT also pulls in each table.
(dfs := [pl.read_csv(p)
       .rename(col_rename)
       .select([
            pl.col(c).cast(t)
            for c, t in col_and_types.items()
            if c in activity_columns  # only cast columns that exist
        ]) for p, col_rename in fixed_columns.items()]
)

[shape: (412_611, 6)
 ┌────────────────┬────────────┬──────────────┬─────────────┬─────────────┬─────────────┐
 │ TUCASEID       ┆ TUACTDUR24 ┆ TUACTIVITY_N ┆ TUTIER1CODE ┆ TUTIER2CODE ┆ TUTIER3CODE │
 │ ---            ┆ ---        ┆ ---          ┆ ---         ┆ ---         ┆ ---         │
 │ i64            ┆ i64        ┆ i64          ┆ i64         ┆ i64         ┆ i64         │
 ╞════════════════╪════════════╪══════════════╪═════════════╪═════════════╪═════════════╡
 │ 20030100013280 ┆ 60         ┆ 1            ┆ 13          ┆ 1           ┆ 24          │
 │ 20030100013280 ┆ 30         ┆ 2            ┆ 1           ┆ 2           ┆ 1           │
 │ 20030100013280 ┆ 600        ┆ 3            ┆ 1           ┆ 1           ┆ 1           │
 │ 20030100013280 ┆ 150        ┆ 4            ┆ 12          ┆ 3           ┆ 3           │
 │ 20030100013280 ┆ 5          ┆ 5            ┆ 11          ┆ 1           ┆ 1           │
 │ …              ┆ …          ┆ …            ┆ …           ┆ …           ┆ …  

In [10]:
# Find the common columns between tables
(common_cols := set(dfs[0].columns))

{'TUACTDUR24',
 'TUACTIVITY_N',
 'TUCASEID',
 'TUTIER1CODE',
 'TUTIER2CODE',
 'TUTIER3CODE'}

In [11]:
# remove columns that are not common to each table prior to unioning

for df in dfs[1:]:
    common_cols &= set(df.columns)

In [12]:
# conduct the union, and filter to smallest size.
(act_combined := pl.concat([df.select(sorted(common_cols)) for df in dfs]).filter(pl.col('TUTIER1CODE') == 2))


TUACTDUR24,TUACTIVITY_N,TUCASEID,TUTIER1CODE,TUTIER2CODE,TUTIER3CODE
i64,i64,i64,i64,i64,i64
15,7,20030100013352,2,1,4
180,10,20030100013352,2,2,1
60,14,20030100013352,2,2,3
20,2,20030100013848,2,2,1
15,4,20030100013848,2,2,3
…,…,…,…,…,…
40,14,20161212162426,2,1,1
10,22,20161212162426,2,1,4
3,23,20161212162426,2,2,1
5,5,20161212162456,2,2,1


In [13]:
# creates a filter that will be used to filter when unioning the who  files
(activity_filter := act_combined.select("TUACTIVITY_N","TUCASEID").unique())

TUACTIVITY_N,TUCASEID
i64,i64
7,20161008162032
4,20191009191790
12,20051211050972
2,20031211031774
21,20100604101555
…,…
4,20140605142419
5,20150302151000
15,20141009140600
23,20190403191922


### UNION WHO FILES

In [14]:
# create paths
(who_paths :=
 glob('./data/atuswho*.dat') + glob('./data/*/atuswho*.dat')
)

['./data/atuswho_2004.dat',
 './data/atuswho_2005.dat',
 './data/atuswho_2006.dat',
 './data/atuswho_2007.dat',
 './data/atuswho_2008.dat',
 './data/atuswho_2009.dat',
 './data/atuswho_2010.dat',
 './data/atuswho_2011.dat',
 './data/atuswho_2012.dat',
 './data/atuswho_2013.dat',
 './data/atuswho_2014.dat',
 './data/atuswho_2015.dat',
 './data/atuswho_2017.dat',
 './data/atuswho_2018.dat',
 './data/atuswho_2019.dat',
 './data/atuswho_2020.dat',
 './data/atuswho_2021.dat',
 './data/atuswho_2022.dat',
 './data/atuswho_2023.dat',
 './data/atuswho_2024.dat',
 './data/atuswho_2003/atuswho_2003.dat',
 './data/atuswho_2016/atuswho_2016.dat']

In [15]:
#create a dictionary of original column names
(original_columns :=
 {p:{col: col
     for col in 
     pl.read_csv(p).columns
    }
  for p in who_paths
 }
)


{'./data/atuswho_2004.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TUWHO_CODE': 'TUWHO_CODE',
  'TULINENO': 'TULINENO',
  'TRWHONA': 'TRWHONA'},
 './data/atuswho_2005.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TRWHONA': 'TRWHONA',
  'TULINENO': 'TULINENO',
  'TUWHO_CODE': 'TUWHO_CODE'},
 './data/atuswho_2006.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TRWHONA': 'TRWHONA',
  'TULINENO': 'TULINENO',
  'TUWHO_CODE': 'TUWHO_CODE'},
 './data/atuswho_2007.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TRWHONA': 'TRWHONA',
  'TULINENO': 'TULINENO',
  'TUWHO_CODE': 'TUWHO_CODE'},
 './data/atuswho_2008.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TRWHONA': 'TRWHONA',
  'TULINENO': 'TULINENO',
  'TUWHO_CODE': 'TUWHO_CODE'},
 './data/atuswho_2009.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TRWHONA': 'TRWHONA',
  'TULINENO': 'TULINENO',
  'TUWHO_CODE': 'TUWHO_

In [16]:
#create dictionary to assist with renaming columns
(fixed_columns :=
 {**original_columns, 
  **{
      path:{col:col.upper() for col in original_columns.get(path,[])}
     for path in who_paths
     },  
}
)

{'./data/atuswho_2004.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TUWHO_CODE': 'TUWHO_CODE',
  'TULINENO': 'TULINENO',
  'TRWHONA': 'TRWHONA'},
 './data/atuswho_2005.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TRWHONA': 'TRWHONA',
  'TULINENO': 'TULINENO',
  'TUWHO_CODE': 'TUWHO_CODE'},
 './data/atuswho_2006.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TRWHONA': 'TRWHONA',
  'TULINENO': 'TULINENO',
  'TUWHO_CODE': 'TUWHO_CODE'},
 './data/atuswho_2007.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TRWHONA': 'TRWHONA',
  'TULINENO': 'TULINENO',
  'TUWHO_CODE': 'TUWHO_CODE'},
 './data/atuswho_2008.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TRWHONA': 'TRWHONA',
  'TULINENO': 'TULINENO',
  'TUWHO_CODE': 'TUWHO_CODE'},
 './data/atuswho_2009.dat': {'TUCASEID': 'TUCASEID',
  'TUACTIVITY_N': 'TUACTIVITY_N',
  'TRWHONA': 'TRWHONA',
  'TULINENO': 'TULINENO',
  'TUWHO_CODE': 'TUWHO_

In [17]:
# create an example of what is considered a "correct table schema"
(example_correct_table := pl.read_csv(who_paths[0]).head()
)

TUCASEID,TUACTIVITY_N,TUWHO_CODE,TULINENO,TRWHONA
i64,i64,i64,i64,i64
20040101040003,1,-1,-1,1
20040101040003,2,-1,-1,1
20040101040003,3,18,1,0
20040101040003,4,18,1,0
20040101040003,5,18,1,0


In [18]:
# pull the int columns from the "correct table"
(int_columns := example_correct_table.select(cs.integer()).columns
)

['TUCASEID', 'TUACTIVITY_N', 'TUWHO_CODE', 'TULINENO', 'TRWHONA']

In [19]:
# pull the string columns from the "correct table"
(str_columns := example_correct_table.select(cs.string()).columns
)

[]

In [20]:
#combine the int and string columns and map to their type
(col_and_types := {c: pl.String() for c in str_columns
} |    # Merge operator
{c:pl.Int64() for c in int_columns
 if c in who_columns   # only cast columns that we need
        
}
 )

{'TUCASEID': Int64, 'TUACTIVITY_N': Int64, 'TULINENO': Int64}

In [21]:
# conduct renaming of columns and handle casting of data types.IT also pulls in each table.
(dfs := [pl.read_csv(p)
       .rename(col_rename)
       .select([
            pl.col(c).cast(t)
            for c, t in col_and_types.items()
            if c in who_columns  # only cast columns that exist
        ]) for p, col_rename in fixed_columns.items()])

[shape: (354_485, 3)
 ┌────────────────┬──────────────┬──────────┐
 │ TUCASEID       ┆ TUACTIVITY_N ┆ TULINENO │
 │ ---            ┆ ---          ┆ ---      │
 │ i64            ┆ i64          ┆ i64      │
 ╞════════════════╪══════════════╪══════════╡
 │ 20040101040003 ┆ 1            ┆ -1       │
 │ 20040101040003 ┆ 2            ┆ -1       │
 │ 20040101040003 ┆ 3            ┆ 1        │
 │ 20040101040003 ┆ 4            ┆ 1        │
 │ 20040101040003 ┆ 5            ┆ 1        │
 │ …              ┆ …            ┆ …        │
 │ 20041212042565 ┆ 12           ┆ 3        │
 │ 20041212042565 ┆ 13           ┆ 2        │
 │ 20041212042565 ┆ 13           ┆ -1       │
 │ 20041212042565 ┆ 14           ┆ 2        │
 │ 20041212042565 ┆ 15           ┆ -1       │
 └────────────────┴──────────────┴──────────┘,
 shape: (342_235, 3)
 ┌────────────────┬──────────────┬──────────┐
 │ TUCASEID       ┆ TUACTIVITY_N ┆ TULINENO │
 │ ---            ┆ ---          ┆ ---      │
 │ i64            ┆ i64          ┆ i6

In [22]:
# Find the common columns between tables
(common_cols := set(dfs[0].columns))

{'TUACTIVITY_N', 'TUCASEID', 'TULINENO'}

In [23]:
# remove columns that are not common to each table prior to unioning
for df in dfs[1:]: 
    common_cols &= set(df.columns)

In [24]:
# conduct the union IT'S  NOT IN THE FINAL FORM YET
(who_combiner := pl.concat([df.select(sorted(common_cols)) for df in dfs])
 )



TUACTIVITY_N,TUCASEID,TULINENO
i64,i64,i64
1,20040101040003,-1
2,20040101040003,-1
3,20040101040003,1
4,20040101040003,1
5,20040101040003,1
…,…,…
9,20161212162509,2
9,20161212162509,4
10,20161212162509,2
10,20161212162509,4


In [25]:
# filtering out any entries that will not be used in the activity file 
#NOW IT'S IN THE FINAL FORM
(who_combined := who_combiner
    .join(activity_filter
           , on = ["TUCASEID", "TUACTIVITY_N"]
          , how = 'inner')
)
# note the extra filtration is why our numbers don't match up. I wanted to embrace your best practices of shrinking the dataset as small as possible.

TUACTIVITY_N,TUCASEID,TULINENO
i64,i64,i64
6,20040101040003,1
4,20040101040060,3
6,20040101040060,1
11,20040101040060,3
11,20040101040060,-1
…,…,…
22,20161212162426,1
23,20161212162426,2
23,20161212162426,3
5,20161212162456,1


### UNION & FILTER ROSTER FILES
Roster file filtered for only ages 25-64 and households with a married couple.

In [26]:
# create paths
(rost_paths :=
 glob('./data/atusrost*.dat') + glob('./data/*/atusrost*.dat')
)

['./data/atusrost_2003.dat',
 './data/atusrost_2004.dat',
 './data/atusrost_2005.dat',
 './data/atusrost_2006.dat',
 './data/atusrost_2007.dat',
 './data/atusrost_2008.dat',
 './data/atusrost_2009.dat',
 './data/atusrost_2010.dat',
 './data/atusrost_2011.dat',
 './data/atusrost_2012.dat',
 './data/atusrost_2013.dat',
 './data/atusrost_2014.dat',
 './data/atusrost_2015.dat',
 './data/atusrost_2017.dat',
 './data/atusrost_2018.dat',
 './data/atusrost_2019.dat',
 './data/atusrost_2020.dat',
 './data/atusrost_2021.dat',
 './data/atusrost_2022.dat',
 './data/atusrost_2023.dat',
 './data/atusrost_2024.dat',
 './data/atusrost_2016/atusrost_2016.dat']

In [27]:
#create a dictionary of original column names
(original_columns :=
 {p:{col: col
     for col in 
     pl.read_csv(p).columns
    }
  for p in rost_paths
 }
)

{'./data/atusrost_2003.dat': {'TUCASEID': 'TUCASEID',
  'TULINENO': 'TULINENO',
  'TURRP': 'TURRP',
  'TESEX': 'TESEX',
  'TERRP': 'TERRP',
  'TEAGE': 'TEAGE',
  'TXSEX': 'TXSEX',
  'TXRRP': 'TXRRP',
  'TXAGE': 'TXAGE'},
 './data/atusrost_2004.dat': {'TUCASEID': 'TUCASEID',
  'TULINENO': 'TULINENO',
  'TURRP': 'TURRP',
  'TESEX': 'TESEX',
  'TERRP': 'TERRP',
  'TEAGE': 'TEAGE',
  'TXSEX': 'TXSEX',
  'TXRRP': 'TXRRP',
  'TXAGE': 'TXAGE'},
 './data/atusrost_2005.dat': {'TUCASEID': 'TUCASEID',
  'TULINENO': 'TULINENO',
  'TEAGE': 'TEAGE',
  'TERRP': 'TERRP',
  'TESEX': 'TESEX',
  'TXAGE': 'TXAGE',
  'TXRRP': 'TXRRP',
  'TXSEX': 'TXSEX'},
 './data/atusrost_2006.dat': {'TUCASEID': 'TUCASEID',
  'TULINENO': 'TULINENO',
  'TEAGE': 'TEAGE',
  'TERRP': 'TERRP',
  'TESEX': 'TESEX',
  'TXAGE': 'TXAGE',
  'TXRRP': 'TXRRP',
  'TXSEX': 'TXSEX'},
 './data/atusrost_2007.dat': {'TUCASEID': 'TUCASEID',
  'TULINENO': 'TULINENO',
  'TEAGE': 'TEAGE',
  'TERRP': 'TERRP',
  'TESEX': 'TESEX',
  'TXAGE': 'TXAG

In [28]:
#create dictionary to assist with renaming columns
(fixed_columns :=
 {**original_columns, 
  **{
      path:{col:col.upper() for col in original_columns.get(path,[])}
     for path in rost_paths
     },  
}
)

{'./data/atusrost_2003.dat': {'TUCASEID': 'TUCASEID',
  'TULINENO': 'TULINENO',
  'TURRP': 'TURRP',
  'TESEX': 'TESEX',
  'TERRP': 'TERRP',
  'TEAGE': 'TEAGE',
  'TXSEX': 'TXSEX',
  'TXRRP': 'TXRRP',
  'TXAGE': 'TXAGE'},
 './data/atusrost_2004.dat': {'TUCASEID': 'TUCASEID',
  'TULINENO': 'TULINENO',
  'TURRP': 'TURRP',
  'TESEX': 'TESEX',
  'TERRP': 'TERRP',
  'TEAGE': 'TEAGE',
  'TXSEX': 'TXSEX',
  'TXRRP': 'TXRRP',
  'TXAGE': 'TXAGE'},
 './data/atusrost_2005.dat': {'TUCASEID': 'TUCASEID',
  'TULINENO': 'TULINENO',
  'TEAGE': 'TEAGE',
  'TERRP': 'TERRP',
  'TESEX': 'TESEX',
  'TXAGE': 'TXAGE',
  'TXRRP': 'TXRRP',
  'TXSEX': 'TXSEX'},
 './data/atusrost_2006.dat': {'TUCASEID': 'TUCASEID',
  'TULINENO': 'TULINENO',
  'TEAGE': 'TEAGE',
  'TERRP': 'TERRP',
  'TESEX': 'TESEX',
  'TXAGE': 'TXAGE',
  'TXRRP': 'TXRRP',
  'TXSEX': 'TXSEX'},
 './data/atusrost_2007.dat': {'TUCASEID': 'TUCASEID',
  'TULINENO': 'TULINENO',
  'TEAGE': 'TEAGE',
  'TERRP': 'TERRP',
  'TESEX': 'TESEX',
  'TXAGE': 'TXAG

In [29]:
# create an example of what is considered a "correct table schema"
(example_correct_table := pl.read_csv(rost_paths[0]).head()
)

TUCASEID,TULINENO,TURRP,TESEX,TERRP,TEAGE,TXSEX,TXRRP,TXAGE
i64,i64,i64,i64,i64,i64,i64,i64,i64
20030100013280,1,18,1,18,60,0,0,0
20030100013280,2,20,2,20,72,0,0,0
20030100013280,3,22,2,22,37,0,0,0
20030100013344,1,18,2,18,41,0,0,0
20030100013344,2,20,1,20,42,0,0,0


In [30]:
# pull the int columns from the "correct table"
(int_columns := example_correct_table.select(cs.integer()).columns
)

['TUCASEID',
 'TULINENO',
 'TURRP',
 'TESEX',
 'TERRP',
 'TEAGE',
 'TXSEX',
 'TXRRP',
 'TXAGE']

In [31]:
# pull the string columns from the "correct table"
(str_columns := example_correct_table.select(cs.string()).columns
)

[]

In [32]:
#combine the int and string columns and map to their type
(col_and_types := {c: pl.String() for c in str_columns
} |    # Merge operator
{c:pl.Int64() for c in int_columns
 if c in roster_columns   # only cast columns that we need     
}
 )

{'TUCASEID': Int64,
 'TULINENO': Int64,
 'TESEX': Int64,
 'TERRP': Int64,
 'TEAGE': Int64}

In [33]:
# conduct renaming of columns and handle casting of data types.IT also pulls in each table.

(dfs := [pl.read_csv(p)
       .rename(col_rename)
       .select([
            pl.col(c).cast(t)
            for c, t in col_and_types.items()
            if c in roster_columns # only cast columns that exist
        ]) for p, col_rename in fixed_columns.items()]
)

[shape: (58_911, 5)
 ┌────────────────┬──────────┬───────┬───────┬───────┐
 │ TUCASEID       ┆ TULINENO ┆ TESEX ┆ TERRP ┆ TEAGE │
 │ ---            ┆ ---      ┆ ---   ┆ ---   ┆ ---   │
 │ i64            ┆ i64      ┆ i64   ┆ i64   ┆ i64   │
 ╞════════════════╪══════════╪═══════╪═══════╪═══════╡
 │ 20030100013280 ┆ 1        ┆ 1     ┆ 18    ┆ 60    │
 │ 20030100013280 ┆ 2        ┆ 2     ┆ 20    ┆ 72    │
 │ 20030100013280 ┆ 3        ┆ 2     ┆ 22    ┆ 37    │
 │ 20030100013344 ┆ 1        ┆ 2     ┆ 18    ┆ 41    │
 │ 20030100013344 ┆ 2        ┆ 1     ┆ 20    ┆ 42    │
 │ …              ┆ …        ┆ …     ┆ …     ┆ …     │
 │ 20031212033636 ┆ 2        ┆ 2     ┆ 20    ┆ 47    │
 │ 20031212033636 ┆ 3        ┆ 2     ┆ 22    ┆ 12    │
 │ 20031212033636 ┆ 4        ┆ 1     ┆ 22    ┆ 13    │
 │ 20031212033642 ┆ 1        ┆ 2     ┆ 18    ┆ 54    │
 │ 20031212033642 ┆ 2        ┆ 1     ┆ 20    ┆ 56    │
 └────────────────┴──────────┴───────┴───────┴───────┘,
 shape: (39_401, 5)
 ┌────────────────┬─────

In [34]:
# find common columns
(common_cols := set(dfs[0].columns))

{'TEAGE', 'TERRP', 'TESEX', 'TUCASEID', 'TULINENO'}

In [35]:
# remove columns that are not common to each table prior to unioning
for df in dfs[1:]:
    common_cols &= set(df.columns)

In [36]:
# conduct the initial union
(rost_combiner := pl.concat([df.select(sorted(common_cols)) for df in dfs]))


TEAGE,TERRP,TESEX,TUCASEID,TULINENO
i64,i64,i64,i64,i64
60,18,1,20030100013280,1
72,20,2,20030100013280,2
37,22,2,20030100013280,3
41,18,2,20030100013344,1
42,20,1,20030100013344,2
…,…,…,…,…
80,19,2,20161212162456,1
16,18,2,20161212162509,1
37,24,1,20161212162509,2
38,24,2,20161212162509,3


#### filter the roster file after unioning

In [37]:
# filter the table and create the final format
(rost_combined:=(rost_combiner # select only the married people that are over 18.
 .filter((pl.col('TEAGE').is_between(24,64, closed='right')   ) & (pl.col('TERRP') <= 20))
 .with_columns(ones = 1,housemates =  pl.col('TUCASEID').count().over('TUCASEID'))
 .filter(pl.col('housemates') == 2)
 .drop('TEAGE', 'ones', 'housemates', 'TERRP')
              )
)
#If you dock points for having a row count 16 off from what you got, I WILL cry.

TESEX,TUCASEID,TULINENO
i64,i64,i64
2,20030100013344,1
1,20030100013344,2
2,20030100013352,1
1,20030100013352,2
2,20030100013848,1
…,…,…
2,20161212161953,2
1,20161212161997,1
2,20161212161997,2
2,20161212162217,1


### UNION RESPONDENT FILES
 - in the respondent files there was some issues that BLS noted  about the TUFINLWGT column that will require some extra work for a few reasons.
     - TUFINLWGT does not exist in all years.
     -  in 2003-2006 the data in that column was not accurate, so we need to handle that specially.
    

In [38]:
# create paths

(resp_paths :=
 glob('./data/atusresp*.dat') + glob('./data/*/atusresp*.dat')
)

['./data/atusresp_2003.dat',
 './data/atusresp_2004.dat',
 './data/atusresp_2005.dat',
 './data/atusresp_2006.dat',
 './data/atusresp_2007.dat',
 './data/atusresp_2008.dat',
 './data/atusresp_2009.dat',
 './data/atusresp_2010.dat',
 './data/atusresp_2011.dat',
 './data/atusresp_2012.dat',
 './data/atusresp_2013.dat',
 './data/atusresp_2014.dat',
 './data/atusresp_2015.dat',
 './data/atusresp_2017.dat',
 './data/atusresp_2018.dat',
 './data/atusresp_2019.dat',
 './data/atusresp_2020.dat',
 './data/atusresp_2021.dat',
 './data/atusresp_2022.dat',
 './data/atusresp_2023.dat',
 './data/atusresp_2024.dat',
 './data/atusresp_2016/atusresp_2016.dat']

In [39]:
#create a dictionary of original column names
(original_columns :=
 {p:{col: col
     for col in 
     pl.read_csv(p,n_rows = 1,infer_schema_length=10000).columns
    }
  for p in resp_paths
 }
)

{'./data/atusresp_2003.dat': {'TUYEAR': 'TUYEAR',
  'TUMONTH': 'TUMONTH',
  'TUCASEID': 'TUCASEID',
  'TULINENO': 'TULINENO',
  'TUDIARYDATE': 'TUDIARYDATE',
  'TUDIARYDAY': 'TUDIARYDAY',
  'TESPEMPNOT': 'TESPEMPNOT',
  'TRSPFTPT': 'TRSPFTPT',
  'TRWERNAL': 'TRWERNAL',
  'TUIO1MFG': 'TUIO1MFG',
  'TUDIS1': 'TUDIS1',
  'TRYHHCHILD': 'TRYHHCHILD',
  'TRMJIND1': 'TRMJIND1',
  'TRDTOCC1': 'TRDTOCC1',
  'TXSPEMPNOT': 'TXSPEMPNOT',
  'TXRET1': 'TXRET1',
  'TRIMIND1': 'TRIMIND1',
  'TRDTIND1': 'TRDTIND1',
  'TRMJOCGR': 'TRMJOCGR',
  'TXSPUHRS': 'TXSPUHRS',
  'TRSPPRES': 'TRSPPRES',
  'TUABSOT': 'TUABSOT',
  'TUBUS': 'TUBUS',
  'TUBUS1': 'TUBUS1',
  'TUBUS2OT': 'TUBUS2OT',
  'TUBUSL1': 'TUBUSL1',
  'TUBUSL2': 'TUBUSL2',
  'TUBUSL3': 'TUBUSL3',
  'TUBUSL4': 'TUBUSL4',
  'TUDIS': 'TUDIS',
  'TULAY': 'TULAY',
  'TUFWK': 'TUFWK',
  'TURETOT': 'TURETOT',
  'TUSPABS': 'TUSPABS',
  'TUSPUSFT': 'TUSPUSFT',
  'TUCC2': 'TUCC2',
  'TUCC4': 'TUCC4',
  'TUCC9': 'TUCC9',
  'TULAY6M': 'TULAY6M',
  'TULAYAVR'

#### Handle the issues inside of the weight column

In [40]:
#create dictionary to assist with renaming columns
(fixed_columns := {
    **original_columns, 
    **{
        path: {
            col: "weight" 
            for col in original_columns.get(path, [])
            if col in ['TU06FWGT', 'TU20FWGT']
            or (col == 'TUFINLWGT' and not any(str(year) in path for year in range(2003, 2006)))
        }
        for path in resp_paths
    }
}
)

{'./data/atusresp_2003.dat': {'TU06FWGT': 'weight'},
 './data/atusresp_2004.dat': {'TU06FWGT': 'weight'},
 './data/atusresp_2005.dat': {'TU06FWGT': 'weight'},
 './data/atusresp_2006.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2007.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2008.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2009.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2010.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2011.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2012.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2013.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2014.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2015.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2017.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2018.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2019.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2020.dat': {'TU20FWGT': 'weight'},
 './data/atusresp_2021.dat': {'TUFINLWGT': 'weight'},
 './data/atusresp_2022.dat': {'T

In [41]:
# create an example of what is considered a "correct table schema"
(example_correct_table := pl.read_csv(resp_paths[0],n_rows = 1000,infer_schema_length=10000).head()
)

TUYEAR,TUMONTH,TUCASEID,TULINENO,TUDIARYDATE,TUDIARYDAY,TESPEMPNOT,TRSPFTPT,TRWERNAL,TUIO1MFG,TUDIS1,TRYHHCHILD,TRMJIND1,TRDTOCC1,TXSPEMPNOT,TXRET1,TRIMIND1,TRDTIND1,TRMJOCGR,TXSPUHRS,TRSPPRES,TUABSOT,TUBUS,TUBUS1,TUBUS2OT,TUBUSL1,TUBUSL2,TUBUSL3,TUBUSL4,TUDIS,TULAY,TUFWK,TURETOT,TUSPABS,TUSPUSFT,TUCC2,TUCC4,…,TRTFRIEND,TRTOHHCHILD,TRTONHHCHILD,TRTNOCHILD,TRTCHILD,TELFS,TXLFS,TRHERNAL,TUSPWK,TUIODP1,TUIODP2,TUIODP3,TTHR,TXIO1COW,TXIO1ICD,TXERNRT,TUFINLWGT,TU06FWGT,TUBNCHWGT,TRTCC,TRTCOC,TEERNHRY,TRERNWA,TRERNHLY,TRERNUPD,TESPUHRS,TRDPFTPT,TERET1,TUDIS2,TRHHCHILD,TRNHHCHILD,TRCHILDNUM,TROHHCHILD,TRTCCTOT,TXERNHRY,TUBWGT,TU04FWGT
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,i64,i64,i64,i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64
2003,1,20030100013280,1,20030103,6,2,-1,1,-1,-1,-1,10,8,0,-1,15,40,1,-1,1,1,2,-1,-1,-1,-1,-1,-1,-1,-1,2,-1,3,-1,"""-1""","""-1""",…,0,0,0,0,0,2,-1,1,2,1,2,1,0,0,0,1,3958100.0,8155500.0,95348.462599,0,0,1,66000,2200.0,1,-1,2,-1,-1,2,2,0,2,0,41,7848.584109,3779800.0
2003,1,20030100013344,1,20030104,7,1,1,1,-1,-1,0,4,16,0,-1,5,16,3,0,1,-1,1,-1,-1,-1,2,-1,-1,-1,-1,1,-1,-1,-1,"""07:00:00""","""21:00:00""",…,0,760,0,530,760,1,-1,-1,1,1,2,1,0,0,0,11,1720500.0,1735300.0,98728.09459,170,0,2,20000,-1.0,1,50,2,-1,-1,1,2,2,1,170,41,7421.117608,1644000.0
2003,1,20030100013352,1,20030104,7,2,-1,0,-1,-1,-1,10,15,0,-1,16,43,2,-1,1,1,2,-1,-1,-1,-1,-1,-1,-1,-1,2,-1,2,-1,"""-1""","""-1""",…,265,0,0,0,0,2,-1,0,2,1,2,1,0,0,0,0,3674600.0,3830500.0,210864.817792,0,0,1,20000,1250.0,0,-1,2,-1,-1,2,2,0,2,0,0,13326.156827,3575900.0
2003,1,20030100013848,1,20030102,5,1,1,-1,-1,-1,9,-1,-1,0,-1,-1,-1,-1,0,1,2,2,-1,-1,-1,-1,-1,-1,-1,2,2,-1,-1,-1,"""09:00:00""","""01:00:00""",…,0,70,0,0,70,4,-1,-1,1,-1,-1,-1,0,-1,-1,-1,5209300.0,6622000.0,125488.784672,715,0,-1,-1,-1.0,-1,40,-1,-1,-1,1,2,2,1,715,-1,7595.906825,5038000.0
2003,1,20030100014165,1,20030109,5,2,-1,-1,-1,-1,14,10,10,0,-1,16,42,1,-1,1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,1,-1,2,-1,"""-1""","""-1""",…,0,280,0,85,280,1,-1,-1,2,1,2,1,0,0,0,-1,2588900.0,3068400.0,62364.61682,0,0,-1,-1,-1.0,-1,-1,1,-1,-1,1,2,2,1,0,-1,4900.934095,2514700.0


In [42]:
# pull the int columns from the "correct table"
(int_columns := example_correct_table.select(cs.integer()).columns
)

['TUYEAR',
 'TUMONTH',
 'TUCASEID',
 'TULINENO',
 'TUDIARYDATE',
 'TUDIARYDAY',
 'TESPEMPNOT',
 'TRSPFTPT',
 'TRWERNAL',
 'TUIO1MFG',
 'TUDIS1',
 'TRYHHCHILD',
 'TRMJIND1',
 'TRDTOCC1',
 'TXSPEMPNOT',
 'TXRET1',
 'TRIMIND1',
 'TRDTIND1',
 'TRMJOCGR',
 'TXSPUHRS',
 'TRSPPRES',
 'TUABSOT',
 'TUBUS',
 'TUBUS1',
 'TUBUS2OT',
 'TUBUSL1',
 'TUBUSL2',
 'TUBUSL3',
 'TUBUSL4',
 'TUDIS',
 'TULAY',
 'TUFWK',
 'TURETOT',
 'TUSPABS',
 'TUSPUSFT',
 'TUCC9',
 'TULAY6M',
 'TULAYAVR',
 'TULAYDT',
 'TULK',
 'TULKAVR',
 'TULKM2',
 'TULKM3',
 'TULKM4',
 'TULKM5',
 'TULKM6',
 'TULKDK1',
 'TULKDK2',
 'TULKDK3',
 'TULKDK4',
 'TULKDK5',
 'TULKDK6',
 'TULKPS1',
 'TULKPS2',
 'TULKPS3',
 'TULKPS4',
 'TULKPS5',
 'TULKPS6',
 'TUERNH1C',
 'TUERN2',
 'TRMJOCC1',
 'TXTCC',
 'TXTCOC',
 'TXABSRSN',
 'TXMJOT',
 'TXHRFTPT',
 'TXHRUSL1',
 'TXHRUSL2',
 'TXHRUSLT',
 'TXIO1OCD',
 'TXLAYAVL',
 'TXLAYLK',
 'TXLKAVL',
 'TXLKM1',
 'TXERNH1O',
 'TXERNHRO',
 'TXERNPER',
 'TXERNUOT',
 'TXERNWKP',
 'TXERN',
 'TXERNH2',
 'TXSCHENR',


In [43]:
# pull the string columns from the "correct table"
(str_columns := example_correct_table.select(cs.string()).columns
)

['TUCC2', 'TUCC4']

In [44]:
#combine the int and string columns and map to their type
(col_and_types := {c: pl.String() for c in str_columns
} |    # Merge operator
{c:pl.Int64() for c in int_columns
 if c in respondent_columns  # only cast columns that we need      
}
 )


{'TUCC2': String,
 'TUCC4': String,
 'TUYEAR': Int64,
 'TUCASEID': Int64,
 'TULINENO': Int64}

In [45]:
# set the type for the  new weight column.
col_and_types['weight'] = pl.Int64()

In [46]:
# conduct renaming of columns and handle casting of data types. IT also pulls in each table.

(dfs := [pl.read_csv(p,infer_schema_length=10000)
       .rename(col_rename)
       .select([
            pl.col(c).cast(t)
            for c, t in col_and_types.items()
            if c in respondent_columns  # only cast columns that exist
        ]) for p, col_rename in fixed_columns.items()])

[shape: (20_720, 4)
 ┌────────┬────────────────┬──────────┬─────────┐
 │ TUYEAR ┆ TUCASEID       ┆ TULINENO ┆ weight  │
 │ ---    ┆ ---            ┆ ---      ┆ ---     │
 │ i64    ┆ i64            ┆ i64      ┆ i64     │
 ╞════════╪════════════════╪══════════╪═════════╡
 │ 2003   ┆ 20030100013280 ┆ 1        ┆ 8155462 │
 │ 2003   ┆ 20030100013344 ┆ 1        ┆ 1735322 │
 │ 2003   ┆ 20030100013352 ┆ 1        ┆ 3830527 │
 │ 2003   ┆ 20030100013848 ┆ 1        ┆ 6622022 │
 │ 2003   ┆ 20030100014165 ┆ 1        ┆ 3068387 │
 │ …      ┆ …              ┆ …        ┆ …       │
 │ 2003   ┆ 20031212033591 ┆ 1        ┆ 1731625 │
 │ 2003   ┆ 20031212033597 ┆ 1        ┆ 3575899 │
 │ 2003   ┆ 20031212033621 ┆ 1        ┆ 6004256 │
 │ 2003   ┆ 20031212033636 ┆ 1        ┆ 1402276 │
 │ 2003   ┆ 20031212033642 ┆ 1        ┆ 6374127 │
 └────────┴────────────────┴──────────┴─────────┘,
 shape: (13_973, 4)
 ┌────────┬────────────────┬──────────┬──────────┐
 │ TUYEAR ┆ TUCASEID       ┆ TULINENO ┆ weight   │
 │ --- 

In [47]:
# Find the common columns between tables
(common_cols := set(dfs[0].columns))

{'TUCASEID', 'TULINENO', 'TUYEAR', 'weight'}

In [48]:
# remove columns that are not common to each table prior to unioning
for df in dfs[1:]:
    common_cols &= set(df.columns)

In [49]:
# conduct the union
(resp_combined := pl.concat([df.select(sorted(common_cols)) for df in dfs]))

TUCASEID,TULINENO,TUYEAR,weight
i64,i64,i64,i64
20030100013280,1,2003,8155462
20030100013344,1,2003,1735322
20030100013352,1,2003,3830527
20030100013848,1,2003,6622022
20030100014165,1,2003,3068387
…,…,…,…
20161212162307,1,2016,12958833
20161212162357,1,2016,4899612
20161212162426,1,2016,15977916
20161212162456,1,2016,6280644


## CONDUCT JOINS

In [50]:
(joined := (rost_combined # joined is the generic data that will be used in both charts.
          .join(resp_combined
                ,on = ["TUCASEID"]
                ,how ="left")
          .join(who_combined 
                  ,on = ["TUCASEID", "TULINENO"]
                  ,how = "left")
          .join(act_combined 
                 ,on = ["TUCASEID", "TUACTIVITY_N"]
                 ,how ="left")
).drop('TUTIER1CODE')
)


TESEX,TUCASEID,TULINENO,TULINENO_right,TUYEAR,weight,TUACTIVITY_N,TUACTDUR24,TUTIER2CODE,TUTIER3CODE
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
2,20030100013344,1,1,2003,1735322,,,,
1,20030100013344,2,1,2003,1735322,,,,
2,20030100013352,1,1,2003,3830527,7,15,1,4
2,20030100013352,1,1,2003,3830527,10,180,2,1
1,20030100013352,2,1,2003,3830527,14,60,2,3
…,…,…,…,…,…,…,…,…,…
2,20161212162217,1,1,2016,16822110,23,15,9,4
2,20161212162217,1,1,2016,16822110,24,30,9,4
2,20161212162217,1,1,2016,16822110,25,15,9,2
2,20161212162217,1,1,2016,16822110,28,15,1,2


In [51]:
# next we will add a few columns that will be needed in both  of our charts in  the next step of the workbook and handle a few renamings.
( final_data := joined
 .with_columns(TUACTDUR24=pl.col('TUACTDUR24').fill_null(0)  
       ,TESEX = pl.when(pl.col('TESEX')==1) #gender recode.
             .then(pl.lit("Male"))
             .when(pl.col('TESEX')==2)
             .then(pl.lit('Female'))
       ,activity_type = pl.when((pl.col('TUTIER2CODE')==1) & (pl.col('TUTIER3CODE')==1)) # activity types recode.
              .then(pl.lit("Interior_Cleaning"))
              .when((pl.col('TUTIER2CODE')==1) & (pl.col('TUTIER3CODE')==2))
              .then(pl.lit('Laundry'))
              .when((pl.col('TUTIER2CODE')==2) & (pl.col('TUTIER3CODE').is_in([1,2,3])))
              .then(pl.lit('Food_Prep_And_Cleanup'))
       ,year = pl.when(pl.col('TUYEAR').is_not_null()) # using to fill any null years.
               .then(pl.col('TUYEAR'))
               .when(pl.col('TUYEAR').is_null())
               .then(pl.col('TUCASEID').cast(pl.Utf8).str.slice(0,4).cast(pl.Int64))
              )
 .rename({'TESEX':'gender','TUACTDUR24':'avg_weighted_duration'})
.drop('TUYEAR')
)

gender,TUCASEID,TULINENO,TULINENO_right,weight,TUACTIVITY_N,avg_weighted_duration,TUTIER2CODE,TUTIER3CODE,activity_type,year
str,i64,i64,i64,i64,i64,i64,i64,i64,str,i64
"""Female""",20030100013344,1,1,1735322,,0,,,,2003
"""Male""",20030100013344,2,1,1735322,,0,,,,2003
"""Female""",20030100013352,1,1,3830527,7,15,1,4,,2003
"""Female""",20030100013352,1,1,3830527,10,180,2,1,"""Food_Prep_And_Cleanup""",2003
"""Male""",20030100013352,2,1,3830527,14,60,2,3,"""Food_Prep_And_Cleanup""",2003
…,…,…,…,…,…,…,…,…,…,…
"""Female""",20161212162217,1,1,16822110,23,15,9,4,,2016
"""Female""",20161212162217,1,1,16822110,24,30,9,4,,2016
"""Female""",20161212162217,1,1,16822110,25,15,9,2,,2016
"""Female""",20161212162217,1,1,16822110,28,15,1,2,"""Laundry""",2016


## EXPORT DATA

In [52]:
(final_data.write_csv('./data/union_joined.csv'))