<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-the-NLS-data" data-toc-modified-id="Import-pandas-and-the-NLS-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and the NLS data</a></span></li><li><span><a href="#Create-a-function-for-defining-the-interquartile-range" data-toc-modified-id="Create-a-function-for-defining-the-interquartile-range-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Create a function for defining the interquartile range</a></span></li><li><span><a href="#Run-the-interquartile-range-function" data-toc-modified-id="Run-the-interquartile-range-function-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Run the interquartile range function</a></span></li><li><span><a href="#Define-a-function-to-return-selected-summary-statistics-as-a-series" data-toc-modified-id="Define-a-function-to-return-selected-summary-statistics-as-a-series-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Define a function to return selected summary statistics as a series</a></span></li><li><span><a href="#Use-apply-to-run-the-function" data-toc-modified-id="Use-apply-to-run-the-function-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Use apply to run the function</a></span></li><li><span><a href="#Use-reset_index-to-use-the-default-index-instead-of-the-index-created-from-the-groupby-DataFrame" data-toc-modified-id="Use-reset_index-to-use-the-default-index-instead-of-the-index-created-from-the-groupby-DataFrame-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Use reset_index to use the default index instead of the index created from the groupby DataFrame</a></span></li><li><span><a href="#Chain-with-unstack-instead-to-create-columns-based-on-the-summary-variables" data-toc-modified-id="Chain-with-unstack-instead-to-create-columns-based-on-the-summary-variables-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Chain with unstack instead to create columns based on the summary variables</a></span></li></ul></div>

# Import pandas and the NLS data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# pd.set_option('display.width', 200)
# pd.set_option('display.max_columns', 35)
# pd.set_option('display.max_rows', 200)
# Render floats with thousands separators and one decimal place.
pd.set_option('display.float_format', '{:,.1f}'.format)

In [3]:
# watermark is imported explicitly, presumably so that its own version is
# listed in the report printed below -- TODO confirm.
import watermark
%load_ext watermark

# Print the versions of the imported packages (see the cell output).
%watermark -n -i -iv

watermark: 2.1.0
numpy    : 1.19.2
json     : 2.0.9
pandas   : 1.2.1



In [4]:
# Load the NLS data and index it by person identifier in a single expression.
nls97 = pd.read_csv('data/nls97b.csv').set_index('personid')

# Create a function for defining the interquartile range

In [5]:
def iqr(x):
    """Return the interquartile range of ``x``.

    ``x`` must provide a ``quantile`` method (e.g. a pandas Series); the
    result is the 75th percentile minus the 25th percentile.
    """
    upper_quartile = x.quantile(0.75)
    lower_quartile = x.quantile(0.25)
    return upper_quartile - lower_quartile

# Run the interquartile range function

In [10]:
# Apply the same three summaries (count, mean, interquartile range) to each
# variable of interest.
aggdict = {
    column: ['count', 'mean', iqr]
    for column in ('weeksworked06', 'childathome')
}

In [11]:
# Summarise both variables within each highest-degree group.
(
    nls97
    .groupby(['highestdegree'])
    .agg(aggdict)
)

Unnamed: 0_level_0,weeksworked06,weeksworked06,weeksworked06,childathome,childathome,childathome
Unnamed: 0_level_1,count,mean,iqr,count,mean,iqr
highestdegree,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0. None,703,29.7,47.0,439,1.8,3.0
1. GED,1104,33.2,39.0,693,1.7,3.0
2. High School,3368,39.4,21.0,1961,1.9,2.0
3. Associates,722,40.7,18.0,428,2.0,2.0
4. Bachelors,1642,42.2,14.0,827,1.9,1.0
5. Masters,601,42.2,13.0,333,1.9,1.0
6. PhD,53,38.2,23.0,32,2.1,2.0
7. Professional,117,27.1,45.0,57,1.8,1.0


# Define a function to return selected summary statistics as a series

In [12]:
def gettots(x):
    """Return selected summary statistics of ``x`` as a pandas Series.

    The returned Series is labelled, in order, ``qr1`` (25th percentile),
    ``med`` (median), ``qr3`` (75th percentile) and ``count`` (the value of
    ``x.count()``).
    """
    return pd.Series({
        'qr1': x.quantile(0.25),
        'med': x.median(),
        'qr3': x.quantile(0.75),
        'count': x.count(),
    })

# Use apply to run the function

In [13]:
# From here on, render floats with thousands separators and no decimals.
pd.set_option('display.float_format', '{:,.0f}'.format)

In [14]:
# Run gettots on weeksworked06 within each highest-degree group.
(
    nls97
    .groupby(['highestdegree'])['weeksworked06']
    .apply(gettots)
)

highestdegree         
0. None          qr1         5
                 med        34
                 qr3        52
                 count     703
1. GED           qr1        13
                 med        42
                 qr3        52
                 count   1,104
2. High School   qr1        31
                 med        52
                 qr3        52
                 count   3,368
3. Associates    qr1        34
                 med        52
                 qr3        52
                 count     722
4. Bachelors     qr1        38
                 med        52
                 qr3        52
                 count   1,642
5. Masters       qr1        39
                 med        52
                 qr3        52
                 count     601
6. PhD           qr1        29
                 med        50
                 qr3        52
                 count      53
7. Professional  qr1         4
                 med        29
                 qr3        49
                 count     117

# Use reset_index to use the default index instead of the index created from the groupby DataFrame

In [15]:
# Same summary as before, with the group/statistic index moved into columns.
(
    nls97
    .groupby(['highestdegree'])['weeksworked06']
    .apply(gettots)
    .reset_index()
)

Unnamed: 0,highestdegree,level_1,weeksworked06
0,0. None,qr1,5
1,0. None,med,34
2,0. None,qr3,52
3,0. None,count,703
4,1. GED,qr1,13
5,1. GED,med,42
6,1. GED,qr3,52
7,1. GED,count,1104
8,2. High School,qr1,31
9,2. High School,med,52


# Chain with unstack instead to create columns based on the summary variables

In [16]:
# Pivot the statistic labels into columns: one row per degree level.
nlssums = (
    nls97
    .groupby(['highestdegree'])['weeksworked06']
    .apply(gettots)
    .unstack()
)

In [17]:
# Display the unstacked summary table.
nlssums

Unnamed: 0_level_0,qr1,med,qr3,count
highestdegree,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0. None,5,34,52,703
1. GED,13,42,52,1104
2. High School,31,52,52,3368
3. Associates,34,52,52,722
4. Bachelors,38,52,52,1642
5. Masters,39,52,52,601
6. PhD,29,50,52,53
7. Professional,4,29,49,117


In [18]:
# Inspect the index, non-null counts and dtypes of the summary columns.
nlssums.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 0. None to 7. Professional
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   qr1     8 non-null      float64
 1   med     8 non-null      float64
 2   qr3     8 non-null      float64
 3   count   8 non-null      float64
dtypes: float64(4)
memory usage: 320.0+ bytes


# Use as_index=False to keep the groupby column as a regular column instead of the index

In [20]:
# With as_index=False, highestdegree stays a regular column in the result.
(
    nls97
    .groupby(['highestdegree'], as_index=False)['weeksworked06']
    .apply(gettots)
)

Unnamed: 0,highestdegree,qr1,med,qr3,count
0,0. None,5,34,52,703
1,1. GED,13,42,52,1104
2,2. High School,31,52,52,3368
3,3. Associates,34,52,52,722
4,4. Bachelors,38,52,52,1642
5,5. Masters,39,52,52,601
6,6. PhD,29,50,52,53
7,7. Professional,4,29,49,117
