# Data analysis and wrangling

## Dependencies

In [12]:
"""
Fetch and analyze the following datasets:
    - MIMIC-3 from the server in COMP-WELL lab
    - Adult Census dataset from Fairlearn
    - Intersectional bias assessment for depression prediction from OpenML

docs for fetch_openml: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_openml.html 
"""

from sklearn.datasets import fetch_openml
import pandas as pd

In [13]:
# Dataset names; used as the `choice` argument when fetching and as keys of DATA.
ACS_INCOME = "ACSincome"
IBADepression = "IBADepression"

# Registry of loaded datasets: dataset name -> (X, y_true).
DATA = dict()

## Fetching data from OpenML

In [14]:
def fetch_openml_dataset(choice):
    """Load one of the supported OpenML datasets.

    Args:
        choice (str): dataset name; either ``ACS_INCOME`` or
            ``IBADepression``.

    Returns:
        X (pd.DataFrame): features, passed through ``pd.get_dummies`` so
            categorical/object columns become indicator columns
        y_true (pd.Series): true labels (the dataset's ``target``)

    Raises:
        ValueError: if ``choice`` is not one of the supported dataset names.
    """
    # Map the requested name to its OpenML data ID.
    if choice == ACS_INCOME:
        data_id = 43141
    elif choice == IBADepression:
        data_id = 45040
    else:
        # Fail early with a clear message; otherwise `data` would never be
        # bound and the code below would die with an UnboundLocalError.
        raise ValueError(
            f"Unknown dataset choice {choice!r}; "
            f"expected {ACS_INCOME!r} or {IBADepression!r}."
        )

    data = fetch_openml(data_id=data_id, as_frame=True, parser='auto')

    X = pd.get_dummies(data.data)
    y_true = data.target
    return X, y_true

In [15]:
def test_fetch_openml_dataset():
    """Fetch every supported dataset and store it in DATA under its name.

    Makes no assertions: it passes as long as ``fetch_openml_dataset``
    returns for both names without raising.
    """
    for dataset_name in (ACS_INCOME, IBADepression):
        DATA[dataset_name] = fetch_openml_dataset(dataset_name)


# Run immediately so DATA holds both datasets once this cell has executed.
test_fetch_openml_dataset()

## Evaluating fairness-related metrics