# TODO:


1. Find errors in data management
2. Implement correct fixed effects iv regression
3. Implement plotting code

## Macro 2: Problem Sheet 1

### Exercise 1

> This notebook is structured as follows. First I define all necessary functions in a sort of abstract fashion. Then at the very end I call all functions to produce numerical results and plots.

In [60]:
from pathlib import Path

import numpy as np
import pandas as pd

from statsmodels.formula.api import ols

import seaborn as sns
import matplotlib.pyplot as plt

### Constants

In [61]:
DATA_PATH = Path("data/CNEF_PSID")
TEN_PERCENT_OF_ANNUAL_FULLTIME_HOURS = 208

### Functions

In [62]:
def get_yearly_variables(year):
    """Returns list and dict of relevant columns given year.
    
    Convert variables names to year specific names. For example instead
    of 'x11102' writes 'x1110285' for the year '85. Additionally add
    variable name for individual id.
    
    For details on variable names and more information seek the
    codebook: https://www.cnefdata.org/documentation/codebooks
    
    
    Args:
        year (int): Year. (Write 85 for 1985.)
        
    Returns:
        as_list (list): List of variable names.
        as_dict (dict): Dict of variable name translation to human
            readable form.
    
    """
    variables = {
        "x11102": "household",
        "i11102": "income",
        "d11105": "relationship_to_head",
        "d11101": "age",
        "d11109": "education",
        "e11101": "hours",
        "e11102": "work",
    }
    as_dict = {f"{key}{year}": value for key, value in variables.items()}
    as_dict = dict(as_dict, **{"x11101ll": "individual"})
    as_list = [f"{key}{year}" for key in variables.keys()]
    as_list.append("x11101ll")
    return as_dict, as_list

In [63]:
def load_data_given_year(year):
    """Load data and assign new columns given year.
    
    This already does some steps described in part 2 of exercise 1.
    
    Explanation of steps:
    
    1. Load data file given year
    2. Rename columns
    3. Set household and individual ID as (multi-)index
    4. Drop all rows that have NaN on ALL columns
    5. Assign / transform four variables:
        1. year (the year)
        3. relationship_to_head (remove whitespace)
        4. work (remove whitespace)
    6. Drop columns that are not needed anymore
    7. Convert data types
    
    Args:
        year (int): Year. (Write 85 for 1985.)
        
    Returns:
        df (pd.DataFrame): Data frame with columss
    
    """
    cols_mapper, cols = get_yearly_variables(year)

    df = pd.read_stata(DATA_PATH / f"pequiv{year}.dta", columns=cols)

    df = df.rename(columns=cols_mapper)
    df = df.set_index(["household", "individual"])
    df = df.dropna(how="all")
    df = df.assign(
        **{
            "year": year,
            "relationship_to_head": df.relationship_to_head.str.split(" ").apply(
                lambda s: s[0]
            ),
            "work": df.work.str[-1],
        }
    )
    df = df.convert_dtypes()
    return df

In [64]:
def clean_data(df):
    """Clean data frame.
    
    This does most steps described in part 2 of exercise 1.
    
    Args:
        df (pd.DataFrame): Frame produced by :func:`load_data_given_year`.
        
    Returns:
        df (pd.DataFrame): Cleaned data frame.
    
    """
    df = df.query("relationship_to_head in ['head', 'partner']")
    df = df.assign(
        **{
            "is_single": df.relationship_to_head.groupby("household").transform(
                lambda x: set(x) != {"head", "partner"}
            )
        }
    )
    df = df.query("is_single == False")

    total = df.groupby(by="household")[["hours", "income"]].sum()

    df = df.query("relationship_to_head == 'head'")
    df = df.reset_index(level="individual", drop=True)
    df = df.assign(**{"income": total.income, "hours": total.hours})

    df = df.query("25 <= age < 56")
    df = df.query("hours >= @TEN_PERCENT_OF_ANNUAL_FULLTIME_HOURS")
    df = df.query("income > 0")

    df = df.assign(**{"income": np.log(df.income)})

    df = df.drop(["relationship_to_head", "is_single", "hours"], axis=1)
    df = df.astype(
        {
            "income": float,
            "education": "category",
            "age": "category",
            "year": "category",
            "work": "category",
        }
    )
    df = df.set_index("age", append=True)
    df = df.dropna(how="any")
    return df

In [71]:
def load_and_clean_data():
    """Load, clean and merge data for years 1980 to 1997.
    
    Since loading and cleaning the data is time consuming there is
    a check if the clean data is already available.
    
    Returns:
        df (pd.DataFrame): Cleaned and merged data frame with index ['household', 'year']
            and columns 'income', 'age', 'education' and 'work'. Column 'income' is float
            while all other columns are category.
    
    """
    clean_data_path = DATA_PATH / "clean_data.csv"
    if clean_data_path.exists():
        df = pd.read_csv(clean_data_path)
    else:
        dfs = []
        for year in range(80, 98):
            df = load_data_given_year(year)
            df = clean_data(df)
            dfs.append(df)
        df = pd.concat(dfs).sort_index().reset_index()
    return df

In [82]:
def fit_dummy_regression(df):
    """Fit dummy regression on data in df.
    
    In the formula object C() tells statsmodels to use the variable as
    categorical variable.
    
    """
    model = ols(
        "income ~ C(year) + C(age) + C(education) + C(work)", data=df
    )
    model = model.fit()
    return model

In [87]:
def add_residuals_to_df(df, model):
    """Add column residuals to data frame.
    
    Args:
        df (pd.DataFrame): Data from :func:`load_and_clean_data()`.
        model (statsmodels.model): Model fitted in :func:`fit_dummy_regression`.
        
    Returns:
        df (pd.DataFrame): As initial df but with column residuals from model.
    
    """
    df = df.assign(**{"residuals": model.resid})
    return df

In [115]:
def estimate_quantities_approach_4(df):
    """Estimate quantities using approach in part 4.
    
    
    Returns:
        quantities (pd.DataFrame): Estimates of rho, sigma_eps^2 and sigma_mu_tau^2.
        var (pd.DataFrame): Sample variances.

    """
    var = df.query("age in [25, 40, 55]")[["age", "residuals"]].groupby("age").var()

    rho = ((var.loc[55][0] - var.loc[40][0]) / (var.loc[40][0] - var.loc[25][0]))
    rho = rho ** (1 / 30)
    gamma = rho ** 2 * (1 - rho ** 30) / (1 - rho ** 2)

    sigma_eps =  (var.loc[40][0] - var.loc[25][0]) / gamma
    sigma_mu_tau = var.loc[25][0] - sigma_eps

    quantities = pd.DataFrame(
        [rho, sigma_eps, sigma_mu_tau],
        columns=["value"],
        index=["$\rho$", "$\sigma_{\epsilon}^2$", "$\sigma_{\mu\tau}^2$"],
    )
    return quantities, var

In [261]:
def estimate_quantities_approach_5(df, var):
    """Estimate quantities using approach in part 5.
    
    """
    # compute (sample) covariances
    combinations = [(40, 39), (40, 38), (40, 37)]
    cov = pd.DataFrame(index=pd.MultiIndex.from_tuples(combinations))
    for comb in combinations:

        _df = df[["household", "age", "residuals"]].query("age in @comb")
        idx = _df.groupby("household")["age"].transform(
            lambda x:  set(x) == set(comb) and len(x) == 2
        )
        _df = _df.loc[idx, ].set_index(["household", "age"])

        _cov = _df.unstack(level="age").cov().values[0, 1]
        cov.loc[comb, "cov"] = _cov

    # compute quantities
    rho = ((cov.loc[(40, 37),] - cov.loc[(40, 38)]) / (
        cov.loc[(40, 38),] - cov.loc[(40, 39)]
    ))[0]
    
    sigma_eps = (cov.loc[(40, 37), ] - cov.loc[(40, 37), ])[0] * (1 - rho ** 2) / (rho * (rho - 1) * (rho ** 29 + 1)) 
    sigma_mu = cov.loc[(40, 39), ][0] - sigma_eps * rho * (1 - rho ** 30) / (1 - rho ** 2)
    sigma_tau = var.loc[25][0] - sigma_mu - sigma_eps
    
    quantities = pd.DataFrame(
        [rho, sigma_eps, sigma_mu, sigma_tau],
        columns=["value"],
        index=["$\rho$", "$\sigma_{\epsilon}^2$", "$\sigma_{\mu}^2$", "$\sigma_{\tau}^2$"],
    )
    return quantities

In [265]:
def estimate_rho_approach_iv(df):
    """TODO!!!"""
    
    # construct instrument
    #for household, age in df.index:
        # try:
        #     instrument = df.loc[(household, age - 2), "residual"] - df.loc[(household, age - 2), "residual"]
        # except:
        #    instrument = np.nan
        
    #    try:
    #        lagged_residual = df.loc[(household, age-1), "residual"]
    #    except:
    #        lagged_residual = np.nan
            
    #    df.loc[(household, age), "instrument"] = instrument
    #    df.loc[(household, age), "lagged_residual"] = lagged_residual
    
    # df = df.dropna(how="any")
    
    # from statsmodels.formula.api import mixedlm
    # https://www.statsmodels.org/stable/mixed_linear.html
    # model = mixedlm("residual ~ lagged_residual", data=df, groups=df.index.get_level_values("household"))
    
    # check out: https://bashtage.github.io/linearmodels/doc/index.html
    quantity = pd.Series([0.5], index=["$\rho$"], name="value").to_frame()
    return quantity

In [266]:
def combine_parts(part4, part5, part6):
    keys = ["part4", "part5", "part6"]
    df = pd.concat((part4, part5, part6), axis=1, keys=keys)
    df = df.droplevel(level=1, axis=1)
    df = df.convert_dtypes()
    return df

### Computation

In [255]:
df = load_and_clean_data()

In [256]:
model = fit_dummy_regression(df)

In [257]:
df = add_residuals_to_df(df, model)

In [258]:
part4, var = estimate_quantities_approach_4(df)

In [262]:
part5 = estimate_quantities_approach_5(df, var)

In [267]:
part6 = estimate_rho_approach_iv(df)

In [268]:
result = combine_parts(part4, part5, part6)

In [269]:
result

Unnamed: 0,part4,part5,part6
$\rho$,1.023501,1.568958,0.5
$\sigma_{\epsilon}^2$,0.00116,-0.0,
$\sigma_{\mu\tau}^2$,0.194442,,
$\sigma_{\mu}^2$,,0.01292,
$\sigma_{\tau}^2$,,0.182683,
