<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-numpy,-and-then-load-the-NLS-data" data-toc-modified-id="Import-pandas-and-numpy,-and-then-load-the-NLS-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and numpy, and then load the NLS data</a></span></li><li><span><a href="#Test-whether-a-pattern-exists-in-a-string" data-toc-modified-id="Test-whether-a-pattern-exists-in-a-string-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Test whether a pattern exists in a string</a></span></li><li><span><a href="#Handle-leading-or-trailing-spaces-in-a-string" data-toc-modified-id="Handle-leading-or-trailing-spaces-in-a-string-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Handle leading or trailing spaces in a string</a></span></li><li><span><a href="#Use-isin-to-compare-a-string-value-to-a-list-of-values" data-toc-modified-id="Use-isin-to-compare-a-string-value-to-a-list-of-values-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Use isin to compare a string value to a list of values</a></span></li><li><span><a href="#Use-findall-to-extract-numeric-values-from-a-text-string" data-toc-modified-id="Use-findall-to-extract-numeric-values-from-a-text-string-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Use findall to extract numeric values from a text string</a></span></li><li><span><a href="#Use-the-list-created-by-findall-to-create-a-numeric-series-from-the-weeklyhrstv-text" data-toc-modified-id="Use-the-list-created-by-findall-to-create-a-numeric-series-from-the-weeklyhrstv-text-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Use the list created by findall to create a numeric series from the weeklyhrstv text</a></span></li><li><span><a href="#Replace-the-values-in-a-series-with-alternative-values" data-toc-modified-id="Replace-the-values-in-a-series-with-alternative-values-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Replace the values in a series with alternative values</a></span></li></ul></div>

# Import pandas and numpy, and then load the NLS data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Optional display tweaks, currently disabled:
# pd.set_option('display.width', 200)
# pd.set_option('display.max_columns', 35)
# pd.set_option('display.max_rows', 200)

# Render floats with a thousands separator and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [3]:
# Presumably imported explicitly so that watermark's own version shows up
# in the report below — confirm before removing.
import watermark
%load_ext watermark

# -iv lists the versions of the packages imported so far.
# NOTE(review): -n and -i look like date-stamp flags — confirm against the watermark docs.
%watermark -n -i -iv

json     : 2.0.9
watermark: 2.1.0
numpy    : 1.19.2
pandas   : 1.2.1



In [4]:
# Read the NLS extract and index the frame by personid in a single chained step.
nls97 = pd.read_csv('data/nls97c.csv').set_index('personid')

# Test whether a pattern exists in a string

In [5]:
# Frequency of each govprovidejobs response (value_counts skips missing values).
nls97.govprovidejobs.value_counts()

2. Probably          617
3. Probably not      462
1. Definitely        454
4. Definitely not    300
Name: govprovidejobs, dtype: int64

In [8]:
# Collapse the four responses into Yes/No: any answer containing 'not' is a No.
# np.where(mask, np.nan, <strings>) promotes the result to a string dtype and
# turns NaN into the literal text 'nan', so the Yes/No values are built first
# and missing responses are then blanked with Series.where to stay real NaN.
govjobs = nls97['govprovidejobs']
nls97['govprovidejobsdefprob'] = pd.Series(
    np.where(govjobs.str.contains('not', na=False), 'No', 'Yes'),
    index=nls97.index,
).where(govjobs.notnull())

In [9]:
# Check the recode: original responses in rows, collapsed Yes/No flag in columns.
pd.crosstab(
    index=nls97['govprovidejobs'],
    columns=nls97['govprovidejobsdefprob'],
)

govprovidejobsdefprob,No,Yes
govprovidejobs,Unnamed: 1_level_1,Unnamed: 2_level_1
1. Definitely,0,454
2. Probably,0,617
3. Probably not,462,0
4. Definitely not,300,0


# Handle leading or trailing spaces in a string

In [10]:
# Frequency of each marital status label; identical-looking labels that differ
# only by whitespace are counted as separate categories.
nls97.maritalstatus.value_counts()

Married          3064
Never-married    2766
Divorced          663
Separated         154
Widowed            23
Married             2
Name: maritalstatus, dtype: int64

In [11]:
# Does any marital status value begin with a space?
nls97.maritalstatus.str.startswith(' ').any()

False

In [12]:
# Does any marital status value end with a space?
nls97.maritalstatus.str.endswith(' ').any()

True

In [13]:
# Flag respondents who have ever been married. strip() removes the stray
# whitespace before comparing, so padded labels are classified correctly.
# np.where(mask, np.nan, <strings>) would store missing values as the text
# 'nan'; Series.where keeps them as real NaN instead.
marital = nls97['maritalstatus']
nls97['evermarried'] = pd.Series(
    np.where(marital.str.strip() == 'Never-married', 'No', 'Yes'),
    index=nls97.index,
).where(marital.notnull())

In [14]:
# Check the recode: marital status labels in rows, evermarried flag in columns.
pd.crosstab(
    index=nls97['maritalstatus'],
    columns=nls97['evermarried'],
)

evermarried,No,Yes
maritalstatus,Unnamed: 1_level_1,Unnamed: 2_level_1
Divorced,0,663
Married,0,3064
Married,0,2
Never-married,2766,0
Separated,0,154
Widowed,0,23


# Use isin to compare a string value to a list of values

In [15]:
# Flag respondents whose highest degree code (the label's first character)
# is one of '4', '5', '6' or '7'.
# np.where(mask, np.nan, <strings>) would store missing values as the text
# 'nan'; Series.where keeps them as real NaN instead.
degree = nls97['highestdegree']
nls97['receivedba'] = pd.Series(
    np.where(degree.str[0:1].isin(['4', '5', '6', '7']), 'Yes', 'No'),
    index=nls97.index,
).where(degree.notnull())

In [16]:
# Check the recode: highest degree labels in rows, receivedba flag in columns.
pd.crosstab(
    index=nls97['highestdegree'],
    columns=nls97['receivedba'],
)

receivedba,No,Yes
highestdegree,Unnamed: 1_level_1,Unnamed: 2_level_1
0. None,953,0
1. GED,1146,0
2. High School,3667,0
3. Associates,737,0
4. Bachelors,0,1673
5. Masters,0,603
6. PhD,0,54
7. Professional,0,120


# Use findall to extract numeric values from a text string

In [17]:
# Show the first few weeklyhrstv labels beside the digit runs findall extracts.
# The pattern is a raw string so that '\d' reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape sequence.
pd.concat(
    [
        nls97['weeklyhrstv'].head(),
        nls97['weeklyhrstv'].str.findall(r'\d+').head(),
    ],
    axis=1,
)

Unnamed: 0_level_0,weeklyhrstv,weeklyhrstv
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
100061,11 to 20 hours a week,"[11, 20]"
100139,3 to 10 hours a week,"[3, 10]"
100284,11 to 20 hours a week,"[11, 20]"
100292,,
100583,3 to 10 hours a week,"[3, 10]"


# Use the list created by findall to create a numeric series from the weeklyhrstv text

In [18]:
def getnum(numlist):
    """Turn the digit strings found in an hours label into a single number.

    Parameters
    ----------
    numlist : list of str, or any non-list value
        Digit strings in the order they appeared in the label, for example
        ``['11', '20']``. A non-list value (such as NaN for a missing label)
        or an empty list means there is nothing to convert.

    Returns
    -------
    int or float
        * ``45`` when the first entry is ``'40'``;
        * ``1`` when the last entry equals 2;
        * the last entry minus 5 otherwise;
        * ``np.nan`` when ``numlist`` is not a list or is empty.
    """
    # A non-list input has no digits to read, and an empty list would make
    # numlist[-1] raise IndexError, so both map to NaN.
    if not isinstance(numlist, list) or not numlist:
        return np.nan

    lastval = int(numlist[-1])
    if numlist[0] == '40':
        return 45
    if lastval == 2:
        return 1
    return lastval - 5

In [19]:
# Extract the digit runs from each label and convert them with getnum.
# The pattern is a raw string so that '\d' is not parsed as an (invalid)
# string escape sequence.
nls97['weeklyhrstvnum'] = (
    nls97['weeklyhrstv'].str.findall(r'\d+').apply(getnum)
)

In [20]:
# Check the conversion: text labels in rows, derived numeric values in columns.
pd.crosstab(
    index=nls97['weeklyhrstv'],
    columns=nls97['weeklyhrstvnum'],
)

weeklyhrstvnum,1.00,5.00,15.00,25.00,35.00,45.00
weeklyhrstv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
11 to 20 hours a week,0,0,1145,0,0,0
21 to 30 hours a week,0,0,0,299,0,0
3 to 10 hours a week,0,3625,0,0,0,0
31 to 40 hours a week,0,0,0,0,116,0
Less than 2 hours per week,1350,0,0,0,0,0
More than 40 hours a week,0,0,0,0,0,176


# Replace the values in a series with alternative values

In [21]:
# Existing weeklyhrscomputer labels, in the order they should be lettered.
comphrsold = [
    'None',
    'Less than 1 hour a week',
    '1 to 3 hours a week',
    '4 to 6 hours a week',
    '7 to 9 hours a week',
    '10 hours or more a week',
]

# Replacement labels: the same text behind a letter prefix ('A. ', 'B. ', ...).
comphrsnew = [
    prefix + '. ' + label for prefix, label in zip('ABCDEF', comphrsold)
]

In [23]:
# Label frequencies before relabelling, ordered alphabetically by label.
nls97.weeklyhrscomputer.value_counts().sort_index()

1 to 3 hours a week         733
10 hours or more a week    3669
4 to 6 hours a week         726
7 to 9 hours a week         368
Less than 1 hour a week     296
None                        918
Name: weeklyhrscomputer, dtype: int64

In [24]:
# Swap each old label for its lettered counterpart (matched by list position).
# The result is assigned back rather than using inplace=True on a column
# selection: under pandas Copy-on-Write that selection is a temporary object,
# so an in-place replace would leave nls97 unchanged.
nls97['weeklyhrscomputer'] = nls97['weeklyhrscomputer'].replace(
    comphrsold, comphrsnew)

In [25]:
# Label frequencies after relabelling, ordered alphabetically by label.
nls97.weeklyhrscomputer.value_counts().sort_index()

A. None                        918
B. Less than 1 hour a week     296
C. 1 to 3 hours a week         733
D. 4 to 6 hours a week         726
E. 7 to 9 hours a week         368
F. 10 hours or more a week    3669
Name: weeklyhrscomputer, dtype: int64