<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-numpy-and-load-the-NLS-data" data-toc-modified-id="Import-pandas-and-numpy-and-load-the-NLS-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and numpy and load the NLS data</a></span></li><li><span><a href="#Gather-some-descriptive-statistics" data-toc-modified-id="Gather-some-descriptive-statistics-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Gather some descriptive statistics</a></span></li><li><span><a href="#Show-descriptives-for-a-subset-of-the-series" data-toc-modified-id="Show-descriptives-for-a-subset-of-the-series-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Show descriptives for a subset of the series</a></span></li><li><span><a href="#Test-for-a-condition-across-all-values" data-toc-modified-id="Test-for-a-condition-across-all-values-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Test for a condition across all values</a></span></li><li><span><a href="#Show-descriptives-for-a-subset-of-the-series-based-on-values-in-a-different-column" data-toc-modified-id="Show-descriptives-for-a-subset-of-the-series-based-on-values-in-a-different-column-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Show descriptives for a subset of the series based on values in a different column</a></span></li><li><span><a href="#Show-descriptives-and-frequencies-for-a-series-containing-categorical-data" data-toc-modified-id="Show-descriptives-and-frequencies-for-a-series-containing-categorical-data-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Show descriptives and frequencies for a series containing categorical data</a></span></li></ul></div>

# Import pandas and numpy and load the NLS data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Optional display tweaks -- uncomment to widen/lengthen printed frames.
# pd.set_option('display.width', 200)
# pd.set_option('display.max_columns', 35)
# pd.set_option('display.max_rows', 200)
# Render every float with a thousands separator and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [3]:
# Import the package itself (presumably so its own version shows up in the
# -iv listing below), then load the IPython extension that provides %watermark.
import watermark
%load_ext watermark

# Record the environment for reproducibility: -iv lists the versions of the
# imported packages. NOTE(review): -n / -i look like date/time stamp flags,
# but no timestamp appears in the saved output -- confirm against watermark docs.
%watermark -n -i -iv

pandas   : 1.2.1
json     : 2.0.9
watermark: 2.1.0
numpy    : 1.19.2



In [5]:
# Load the NLS extract and key every row by its person identifier.
# Chaining set_index keeps the load a single expression with no in-place mutation.
nls97 = pd.read_csv('data/nls97b.csv').set_index('personid')

# Gather some descriptive statistics

In [6]:
# Keep the overall GPA column as a standalone Series; later cells reuse it.
gpaoverall = nls97.loc[:, 'gpaoverall']
# Arithmetic mean of the GPAs (pandas skips missing values by default).
gpaoverall.mean()

2.8184077281812128

In [7]:
# One-call summary: count, mean, std, min, quartiles and max.
# The count covers only the non-missing GPA values.
gpaoverall.describe()

count   6,004.00
mean        2.82
std         0.62
min         0.10
25%         2.43
50%         2.86
75%         3.26
max         4.17
Name: gpaoverall, dtype: float64

In [8]:
# Deciles of the GPA distribution (10th through 100th percentile).
# np.linspace fixes both the number of points and the exact endpoints; a
# float-step np.arange has a rounding-dependent length/endpoint, and any value
# drifting above 1.0 would make quantile() raise.
gpaoverall.quantile(np.linspace(0.1, 1.0, 10))

0.10   2.02
0.20   2.31
0.30   2.52
0.40   2.70
0.50   2.86
0.60   3.01
0.70   3.17
0.80   3.36
0.90   3.60
1.00   4.17
Name: gpaoverall, dtype: float64

# Show descriptives for a subset of the series

In [10]:
# First five people whose overall GPA is between 3 and 3.5 (both ends included).
gpaoverall[gpaoverall.between(3, 3.5)].head()

personid
100061   3.06
100292   3.45
101526   3.37
101527   3.26
102125   3.14
Name: gpaoverall, dtype: float64

In [11]:
# Total of all GPA values that fall between 3 and 3.5 (both ends included).
gpaoverall[gpaoverall.between(3, 3.5)].sum()

5416.26

In [13]:
# Five randomly drawn GPAs from the tails: below 2 or above 4.
# random_state pins the draw so the same rows come back on every run.
gpaoverall[gpaoverall.lt(2) | gpaoverall.gt(4)].sample(n=5, random_state=2)

personid
932782   1.90
561335   1.82
850001   4.10
292455   1.97
644271   1.97
Name: gpaoverall, dtype: float64

In [14]:
# Count and range of the GPAs strictly above the 99th percentile.
gpaoverall[gpaoverall.gt(gpaoverall.quantile(0.99))].agg(
    ['count', 'min', 'max'])

count   60.00
min      3.98
max      4.17
Name: gpaoverall, dtype: float64

# Test for a condition across all values

In [15]:
# Does at least one person have a GPA greater than 4?
gpaoverall.gt(4).any()

True

In [16]:
# Does every person have a GPA greater than or equal to 0?
# Missing GPAs compare as False, so a single NaN is enough to make this False
# even when no recorded GPA is negative.
gpaoverall.ge(0).all()

False

In [18]:
# Number of people with a GPA of exactly 0 (each True counts as 1).
gpaoverall.eq(0).sum()

0

In [19]:
# Number of people whose GPA value is missing.
gpaoverall.isna().sum()

2980

# Show descriptives for a subset of the series based on values in a different column

In [20]:
# Mean high school GPA among people whose 2016 wage income is above the
# 75th percentile. The callable row selector receives the frame itself.
nls97.loc[lambda df: df['wageincome'].gt(df['wageincome'].quantile(0.75)),
          'gpaoverall'].mean()

3.080417101147028

In [21]:
# Mean high school GPA among people whose 2016 wage income is below the
# 25th percentile. The callable row selector receives the frame itself.
nls97.loc[lambda df: df['wageincome'].lt(df['wageincome'].quantile(0.25)),
          'gpaoverall'].mean()

2.7201434159061284

# Show descriptives and frequencies for a series containing categorical data

In [22]:
# For a text (object) column, describe() reports the non-missing count, the
# number of distinct values, and the most frequent value with its frequency.
nls97['maritalstatus'].describe()

count        6672
unique          5
top       Married
freq         3066
Name: maritalstatus, dtype: object

In [23]:
# Frequency of each marital status, most common first; missing values are
# left out of the tally by default.
nls97['maritalstatus'].value_counts()

Married          3066
Never-married    2766
Divorced          663
Separated         154
Widowed            23
Name: maritalstatus, dtype: int64