In [None]:
import pandas as pd
liver = pd.read_csv('https://hds5210-data.s3.amazonaws.com/indian_liver_patient.csv')

# Single Grouping Field

In [None]:
# Count the non-null values in each remaining column per Gender;
# the grouping field (Gender) becomes the index of the result.
liver.groupby("Gender").count()

In [None]:
# Same counts, but reset_index() moves Gender out of the index and back into an ordinary column.
liver.groupby("Gender").count().reset_index()

In [None]:
# as_index=False keeps Gender as a regular column from the start, so no reset_index() is needed.
liver.groupby("Gender", as_index=False).count()

# Grouping by Multiple Fields at Once

In [None]:
# Passing a list groups on the combination of fields: one output row per (Gender, Age) pair found in the data.
liver.groupby(["Gender","Age"], as_index=False).count()

# Common Custom Grouping Procedure

1. Create the new field you want to group by
2. Group by

In [None]:
# Floor-divide Age by 10 to get the decade of life (0 for ages 0-9, 1 for 10-19, ...).
liver["Decade"] = liver["Age"] // 10

In [None]:
liver.groupby("Decade").count()

# Custom Age Buckets

0-18, 19-44, 45-64, 65-84, and 85 and up

1. Create a function that generates the grouping buckets you need
2. Set your input field to be the data frame index
3. Create a groupby using your custom function

In [None]:
# 1. Create a function to generate the grouping buckets
#    Note that the labels here are prefixed with a letter to make them more easily sortable.
#    There's also an "unknown" that sorts at the bottom as Z

def cms_ages(age):
    """Return the CMS age-bucket label for a single age.

    The buckets are contiguous: 0-18, 19-44, 45-64, 65-84, and 85 and up.
    Labels carry a letter prefix so they sort in age order, with
    'Z. Unknown' sorting last.

    Parameters
    ----------
    age : int or float
        Age in years. A fractional age belongs to the bucket of its
        completed years (18.5 is still in '0-18').

    Returns
    -------
    str
        The bucket label, or 'Z. Unknown' when the age is negative or NaN.
    """
    # (inclusive lower bound, label) in ascending order.  Each bucket extends
    # up to the next bucket's lower bound, so there are no gaps between
    # buckets and the last bucket has no upper limit.
    buckets = [
        (0,  'A.  0-18'),
        (19, 'B. 19-44'),
        (45, 'C. 45-64'),
        (65, 'D. 65-84'),
        (85, 'E. 85-up'),
    ]

    label = 'Z. Unknown'
    for lower, name in buckets:
        # Written as "not >=" so that NaN, which fails every comparison,
        # stops at the first bucket and stays 'Z. Unknown'.
        if not age >= lower:
            break
        label = name

    return label

In [None]:
liver.head()

In [None]:
liver.set_index('Age').head()

In [None]:
# When groupby is given a function, pandas calls it on each index value,
# so Age is moved into the index first and cms_ages supplies the group label for every row.
by_cms_age = liver.set_index("Age").groupby(cms_ages)

In [None]:
by_cms_age.count().reset_index()

# Using a Categorical Series

A categorical is a special data type with a fixed, defined set of allowed values. For example: "Gender is always Male, Female, or Unknown."

In [None]:
# Allowed values for Gender. The spelling must match the data exactly:
# any value not listed here becomes NaN when a column is cast to this dtype.
gender_type = pd.CategoricalDtype(categories=["Female","Male","Unknown"], ordered=True)

In [None]:
# Reload a fresh copy of the dataset (without the Decade column added earlier),
# using the same source as the load at the top of the notebook.
liver = pd.read_csv('https://hds5210-data.s3.amazonaws.com/indian_liver_patient.csv')

In [None]:
# Cast Gender to the categorical dtype; any value not listed in the dtype's categories becomes NaN.
liver["Gender"] = liver["Gender"].astype(gender_type)

In [None]:
liver.head()

In [None]:
liver.count()

In [None]:
# Fill only the Gender column. Filling the whole frame would also write the
# string 'Unknown' into any other column that happens to contain missing values.
liver["Gender"] = liver["Gender"].fillna("Unknown")

In [None]:
# observed=False keeps every declared category in the result, even categories that have no rows.
liver.groupby("Gender", observed=False).count()

# Grouping DateTime Fields


In [None]:
condemnations = pd.read_csv('/data/condemn.txt')

In [None]:
condemnations.columns

In [None]:
condemnations["InspectDate"].head()

In [None]:
# Take an explicit copy so that later column assignments modify this frame
# rather than a slice of `condemnations` (avoids SettingWithCopyWarning).
condemn = condemnations[["InspectDate","Status"]].copy()

In [None]:
# Convert InspectDate to datetime so it can be grouped by calendar period below;
# values are cast to str first and are parsed with the YYYY-MM-DD format.
condemn['InspectDate'] = pd.to_datetime(condemn['InspectDate'].astype(str), format='%Y-%m-%d')

In [None]:
condemn.head()

In [None]:
# pd.Grouper buckets rows by a time frequency on the InspectDate column; 'Y' is the yearly alias.
# NOTE(review): pandas 2.2+ deprecates 'Y' in favor of 'YE' — confirm the target pandas version.
by_year = condemn.groupby(pd.Grouper(key="InspectDate", freq='Y'))

In [None]:
by_year.count().plot()

In [None]:
by_year.count().head()

In [None]:
# Same idea at quarterly resolution ('Q' is the quarterly alias).
# NOTE(review): pandas 2.2+ deprecates 'Q' in favor of 'QE' — confirm the target pandas version.
condemn.groupby(pd.Grouper(key="InspectDate", freq='Q')).count().plot()

In [None]:
# Grouping on the raw column (no Grouper) yields one group per distinct InspectDate value.
condemn.groupby("InspectDate").count().plot()