In [77]:
from pathlib import Path

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [78]:
# Directory that holds the yearly Stata files (read as ``pequiv<year>.dta``).
DATA_PATH = Path("data") / "CNEF_PSID"

# Minimum total household hours required to stay in the sample.
# NOTE(review): presumably 10% of 2,080 h (40 h/week x 52 weeks) -- confirm.
TEN_PERCENT_OF_ANNUAL_FULLTIME_HOURS = 208

# Label of ``relationship_to_hh_head`` that is used to select rows; the inner
# padding is part of the label and must be kept exactly as is.
IS_HEAD = "head           1"

# Variable code (without year suffix) -> readable column name.
# The individual identifier "x11101ll" carries no year suffix and is therefore
# not listed here; `yearly_variables` adds it separately.
variables = dict(
    x11102="household",
    i11102="income",
    d11105="relationship_to_hh_head",
    d11101="age",
    d11106="num_persons",
    d11107="num_children",
    d11109="education",
    e11101="hours",
)

def yearly_variables(year):
    """Build the year-specific rename map and column list.

    Every code in ``variables`` gets ``year`` appended; the individual
    identifier ``x11101ll`` is added last without a suffix.

    Parameters
    ----------
    year
        Value appended to each variable code via string formatting.

    Returns
    -------
    tuple of (dict, list)
        The mapping from file column name to readable name, and the list of
        file column names in the same order.
    """
    rename_map = {}
    column_names = []
    for code, label in variables.items():
        column = f"{code}{year}"
        rename_map[column] = label
        column_names.append(column)

    # The individual identifier is the same in every year, so no suffix.
    rename_map["x11101ll"] = "individual"
    column_names.append("x11101ll")
    return rename_map, column_names

# See the CNEF codebook for the meaning of the variable codes in `variables`: https://www.cnefdata.org/documentation/codebooks

## Load Data

In [79]:
def _load_data_given_year(year):
    """Read one yearly file and add derived columns.

    Parameters
    ----------
    year
        Year suffix used both in the file name ``pequiv<year>.dta`` and in the
        variable codes of that file.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``(household, individual)`` and sorted by index. In
        addition to the renamed variables it holds ``year``, ``num_adults``
        (``num_persons - num_children``) and ``total_hours`` (sum of ``hours``
        over all rows of the same household).
    """
    rename_map, stata_columns = yearly_variables(year)
    raw = pd.read_stata(DATA_PATH / f"pequiv{year}.dta", columns=stata_columns)

    persons = (
        raw.rename(columns=rename_map)
        .set_index(["household", "individual"])
        .dropna(how="all")
        .assign(
            year=year,
            num_adults=lambda frame: frame.num_persons - frame.num_children,
        )
        .convert_dtypes()
        # Keep only ``household`` in the index so the household totals below
        # align on it when they are assigned.
        .reset_index(level=1)
    )

    household_hours = persons.groupby(by="household")["hours"].sum()
    persons = persons.assign(total_hours=household_hours)

    return persons.set_index("individual", append=True).sort_index()

In [80]:
def _subset_data(df):
    """Restrict a yearly frame to the analysis sample.

    Rows are kept only if ``relationship_to_hh_head`` equals ``IS_HEAD``, age
    is in ``[25, 56)``, the household has exactly two adults and its total
    hours reach ``TEN_PERCENT_OF_ANNUAL_FULLTIME_HOURS``. The columns that were
    only needed for filtering are removed afterwards, as is every row with a
    missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``individual`` index level and the columns used below.

    Returns
    -------
    pandas.DataFrame
        Filtered frame indexed by ``(year, household)`` and sorted by index;
        the ``individual`` level is discarded.
    """
    # ``@NAME`` inside a query string refers to a Python variable.
    sample_filters = (
        "relationship_to_hh_head == @IS_HEAD",
        "25 <= age < 56",
        "num_adults == 2",
        "total_hours >= @TEN_PERCENT_OF_ANNUAL_FULLTIME_HOURS",
    )
    filter_only_columns = [
        "relationship_to_hh_head",
        "num_persons",
        "num_children",
        "num_adults",
        "total_hours",
    ]

    sample = df.reset_index(level="individual", drop=True)
    for condition in sample_filters:
        sample = sample.query(condition)

    sample = sample.drop(filter_only_columns, axis=1)
    # Append ``year`` to the index, then swap so it becomes the outer level.
    sample = sample.set_index("year", append=True).swaplevel().sort_index()
    return sample.dropna()

In [81]:
def load_data(years=range(80, 98)):
    """Load, subset and merge the yearly data sets.

    Parameters
    ----------
    years : iterable, optional
        Year suffixes to load, in the form used by the file names and variable
        codes. Defaults to ``range(80, 98)``, i.e. the years 1980 to 1997.

    Returns
    -------
    pandas.DataFrame
        The per-year subsets stacked in the order given by ``years``.

    Raises
    ------
    ValueError
        If ``years`` is empty.
    """
    # Materialise once so that one-shot iterables can be checked and reused.
    years = list(years)
    if not years:
        raise ValueError("`years` must contain at least one year.")

    yearly_frames = [
        _subset_data(_load_data_given_year(year)) for year in years
    ]
    return pd.concat(yearly_frames)

In [82]:
# Build the pooled sample for all years handled by `load_data`; every row has
# passed the filters in `_subset_data`.
df = load_data()