<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-numpy,-and-then-load-the-NLS-and-land-temperatures-data" data-toc-modified-id="Import-pandas-and-numpy,-and-then-load-the-NLS-and-land-temperatures-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and numpy, and then load the NLS and land temperatures data</a></span></li><li><span><a href="#Use-NumPy's-where-function-to-create-a-categorical-series-containing-two-values" data-toc-modified-id="Use-NumPy's-where-function-to-create-a-categorical-series-containing-two-values-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Use NumPy's where function to create a categorical series containing two values</a></span></li><li><span><a href="#Use-NumPy's-where-method-to-create-a-categorical-series-containing-three-values" data-toc-modified-id="Use-NumPy's-where-method-to-create-a-categorical-series-containing-three-values-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Use NumPy's where method to create a categorical series containing three values</a></span></li><li><span><a href="#Use-NumPy's-select-method-to-evaluate-a-list-of-conditions" data-toc-modified-id="Use-NumPy's-select-method-to-evaluate-a-list-of-conditions-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Use NumPy's select method to evaluate a list of conditions</a></span></li><li><span><a href="#Use-lambda-to-test-several-columns-in-one-statement" data-toc-modified-id="Use-lambda-to-test-several-columns-in-one-statement-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Use lambda to test several columns in one statement</a></span></li><li><span><a href="#Create-a-function-that-assigns-a-value-based-on-the-value-of-several-series" data-toc-modified-id="Create-a-function-that-assigns-a-value-based-on-the-value-of-several-series-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Create a function that assigns a value based on the value of several series</a></span></li><li><span><a href="#Another-way-to-perform-steps-6-and-7-is-to-add-a-lambda-function-to-apply.-This-produces-the-same-results" data-toc-modified-id="Another-way-to-perform-steps-6-and-7-is-to-add-a-lambda-function-to-apply.-This-produces-the-same-results-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Another way to perform steps 6 and 7 is to add a lambda function to apply. This produces the same results</a></span></li></ul></div>

# Import pandas and numpy, and then load the NLS and land temperatures data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# pd.set_option('display.width', 200)
# pd.set_option('display.max_columns', 35)
# pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
import watermark
%load_ext watermark

%watermark -n -i -iv

json     : 2.0.9
numpy    : 1.19.2
pandas   : 1.2.1
watermark: 2.1.0



In [4]:
nls97 = pd.read_csv('data/nls97b.csv')
nls97.set_index('personid', inplace=True)

In [5]:
landtemps = pd.read_csv('data/landtemps2019avgs.csv')

# Use NumPy's where function to create a categorical series containing two values

In [6]:
landtemps['elevation'].quantile(np.arange(0.2, 1.1, 0.2))

0.20      48.00
0.40     190.50
0.60     393.20
0.80   1,066.80
1.00   9,999.00
Name: elevation, dtype: float64

In [7]:
landtemps['elevationgroup'] = np.where(
    landtemps['elevation'] > landtemps['elevation'].quantile(0.8), 'Low',
    'High')

In [9]:
landtemps['elevationgroup'] = landtemps['elevationgroup'].astype('category')

In [10]:
landtemps.groupby(['elevationgroup'])['elevation'].agg(['count', 'min', 'max'])

Unnamed: 0_level_0,count,min,max
elevationgroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,9686,-350.0,1066.8
Low,2409,1067.0,9999.0


# Use NumPy's where method to create a categorical series containing three values

In [11]:
landtemps['elevation'].median()

271.3

In [13]:
landtemps['elevationgroup'] = np.where(
    landtemps['elevation'] > landtemps['elevation'].quantile(0.8), 'High',
    np.where(landtemps['elevation'] > landtemps['elevation'].median(),
             'Median', 'Low'))
landtemps['elevationgroup'] = landtemps['elevationgroup'].astype('category')
landtemps.groupby(['elevationgroup'])['elevation'].agg(['count', 'min', 'max'])

Unnamed: 0_level_0,count,min,max
elevationgroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,2409,1067.0,9999.0
Low,6056,-350.0,271.3
Median,3630,271.4,1066.8


# Use NumPy's select method to evaluate a list of conditions

In [14]:
test = [(nls97['gpaoverall'] < 2) & (nls97['highestdegree'] == '0. None'),
        nls97['highestdegree'] == '0. None', nls97['gpaoverall'] < 2]

In [15]:
result = ['1. Low GPA and No Diploma', '2. No Diploma', '3. Low GPA']

In [18]:
nls97['hsachieve'] = np.select(test, result, '4. Did Okay')

In [22]:
nls97['hsachieve'].head()

personid
100061      4. Did Okay
100139      4. Did Okay
100284    2. No Diploma
100292      4. Did Okay
100583      4. Did Okay
Name: hsachieve, dtype: object

In [25]:
nls97[['hsachieve', 'gpaoverall', 'highestdegree']].head()

Unnamed: 0_level_0,hsachieve,gpaoverall,highestdegree
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,4. Did Okay,3.06,2. High School
100139,4. Did Okay,,2. High School
100284,2. No Diploma,,0. None
100292,4. Did Okay,3.45,4. Bachelors
100583,4. Did Okay,2.91,2. High School


In [26]:
nls97['hsachieve'].value_counts().sort_index()

1. Low GPA and No Diploma      95
2. No Diploma                 858
3. Low GPA                    459
4. Did Okay                  7572
Name: hsachieve, dtype: int64

# Use lambda to test several columns in one statement

In [28]:
nls97.loc[[100292, 100583, 100139], 'colenrfeb00':'colenroct04'].T

personid,100292,100583,100139
colenrfeb00,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct00,3. 4-year college,1. Not enrolled,1. Not enrolled
colenrfeb01,3. 4-year college,1. Not enrolled,1. Not enrolled
colenroct01,3. 4-year college,3. 4-year college,1. Not enrolled
colenrfeb02,3. 4-year college,3. 4-year college,1. Not enrolled
colenroct02,3. 4-year college,1. Not enrolled,1. Not enrolled
colenrfeb03,3. 4-year college,1. Not enrolled,1. Not enrolled
colenroct03,3. 4-year college,1. Not enrolled,1. Not enrolled
colenrfeb04,3. 4-year college,1. Not enrolled,1. Not enrolled
colenroct04,1. Not enrolled,1. Not enrolled,1. Not enrolled


In [29]:
# The DataFrame apply method can be used to send each row of a DataFrame to a function by specifying axis=1.

nls97['baenrollment'] = nls97.filter(
    like='colenr').apply(lambda x: x.str[0:1] == '3').any(axis=1)

In [30]:
nls97.loc[[100292, 100583, 100139], ['baenrollment']].T

personid,100292,100583,100139
baenrollment,True,True,False


# Create a function that assigns a value based on the value of several series

In [31]:
def get_sleep_deprived_reason(row):
    sleep_deprived_reason = 'Unknown'
    if row['nightlyhrssleep'] >= 6:
        sleep_deprived_reason = 'Not sleep deprived'
    elif row['nightlyhrssleep'] > 0:
        if (row['weeksworked16'] + row['weeksworked17'] < 80):
            if (row['childathome'] > 2):
                sleep_deprived_reason = 'Child rearing'
            else:
                sleep_deprived_reason = 'Other reasons'
        if (row['wageincome'] >= 62000 or row['highestgradecompleted'] >= 16):
            sleep_deprived_reason = 'Work Pressure'
        else:
            sleep_deprived_reason = 'Income Pressure'
    else:
        sleep_deprived_reason = 'Unknown'
    return sleep_deprived_reason

In [32]:
nls97['sleep_deprived_reason'] = nls97.apply(get_sleep_deprived_reason, axis=1)
nls97['sleep_deprived_reason'].astype('category')
nls97['sleep_deprived_reason'].value_counts()

Not sleep deprived    5595
Unknown               2286
Income Pressure        760
Work Pressure          343
Name: sleep_deprived_reason, dtype: int64

# Another way to perform steps 6 and 7 is to add a lambda function to apply. This produces the same results

In [33]:
def getsleepdeprivedreason(childathome, nightlyhrssleep, wageincome,
                           weeksworked16, weeksworked17,
                           highestgradecompleted):
    sleep_deprived_reason = 'Unknown'
    if nightlyhrssleep >= 6:
        sleep_deprived_reason = 'Not sleep deprived'
    elif nightlyhrssleep > 0:
        if (weeksworked16 + weeksworked17 < 80):
            if (childathome > 2):
                sleep_deprived_reason = 'Child rearing'
            else:
                sleep_deprived_reason = 'Other reasons'
        if (wageincome >= 62000 or highestgradecompleted >= 16):
            sleep_deprived_reason = 'Work Pressure'
        else:
            sleep_deprived_reason = 'Income Pressure'
    else:
        sleep_deprived_reason = 'Unknown'
    return sleep_deprived_reason

In [35]:
nls97['sleep_deprived_reason'] = nls97.apply(lambda x: getsleepdeprivedreason(
    x.childathome, x.nightlyhrssleep, x.wageincome, x.weeksworked16, x.
    weeksworked17, x.highestgradecompleted),
                                             axis=1)
nls97['sleep_deprived_reason'].astype('category')
nls97['sleep_deprived_reason'].value_counts()

Not sleep deprived    5595
Unknown               2286
Income Pressure        760
Work Pressure          343
Name: sleep_deprived_reason, dtype: int64