<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-numpy,-and-then-load-the-NLS-data" data-toc-modified-id="Import-pandas-and-numpy,-and-then-load-the-NLS-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and numpy, and then load the NLS data</a></span></li><li><span><a href="#Look-at-some-of-the-employment-and-education-data" data-toc-modified-id="Look-at-some-of-the-employment-and-education-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Look at some of the employment and education data</a></span></li><li><span><a href="#Show-individuals-with-wage-income-but-no-weeks-worked" data-toc-modified-id="Show-individuals-with-wage-income-but-no-weeks-worked-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Show individuals with wage income but no weeks worked</a></span></li><li><span><a href="#Check-for-whether-an-individual-was-ever-enrolled-in-a-4-year-college-course" data-toc-modified-id="Check-for-whether-an-individual-was-ever-enrolled-in-a-4-year-college-course-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Check for whether an individual was ever enrolled in a 4-year college course</a></span></li><li><span><a href="#Show-individuals-with-post-graduate-enrollment-but-no-bachelor's-enrollment" data-toc-modified-id="Show-individuals-with-post-graduate-enrollment-but-no-bachelor's-enrollment-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Show individuals with post-graduate enrollment but no bachelor's enrollment</a></span></li><li><span><a href="#Show-individuals-with-bachelor's-degrees-or-more,-but-no-4-year-college-enrollment" data-toc-modified-id="Show-individuals-with-bachelor's-degrees-or-more,-but-no-4-year-college-enrollment-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Show individuals with bachelor's degrees or more, but no 4-year college enrollment</a></span></li><li><span><a href="#Show-individuals-with-a-high-wage-income" data-toc-modified-id="Show-individuals-with-a-high-wage-income-7"><span 
class="toc-item-num">7&nbsp;&nbsp;</span>Show individuals with a high wage income</a></span></li><li><span><a href="#Show-individuals-with-large-changes-in-weeks-worked-for-the-most-recent-year" data-toc-modified-id="Show-individuals-with-large-changes-in-weeks-worked-for-the-most-recent-year-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Show individuals with large changes in weeks worked for the most recent year</a></span></li><li><span><a href="#Show-inconsistencies-in-the-highest-grade-completed-and-the-highest-degree" data-toc-modified-id="Show-inconsistencies-in-the-highest-grade-completed-and-the-highest-degree-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Show inconsistencies in the highest grade completed and the highest degree</a></span></li></ul></div>

# Import pandas and numpy, and then load the NLS data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Optional pandas display settings, intentionally left disabled.
# Uncomment to cap the output width at 78 characters, limit the display to
# 7 columns and 100 rows, and print floats as comma-separated whole numbers.
# pd.set_option('display.width', 78)
# pd.set_option('display.max_columns', 7)
# pd.set_option('display.max_rows', 100)
# pd.options.display.float_format = '{:,.0f}'.format

In [3]:
# Load the watermark IPython extension, which provides the %watermark magic
# used to record the environment this notebook was run in.
import watermark
%load_ext watermark

In [4]:
# Print the Python/IPython versions and the versions of the imported packages
# so the results below can be tied to a specific environment.
%watermark -n -v -iv

Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.20.0

numpy    : 1.19.2
watermark: 2.1.0
json     : 2.0.9
pandas   : 1.2.1



In [5]:
# Read the NLS97 extract and key every row by the respondent's personid.
# Chaining set_index keeps the load a single expression with no in-place mutation.
nls97 = pd.read_csv('data/nls97.csv').set_index('personid')

# Look at some of the employment and education data

In [6]:
# Wage income and schooling for the first three respondents, transposed so
# that each respondent becomes a column.
nls97.loc[:, ['wageincome', 'highestgradecompleted',
              'highestdegree']].iloc[:3].transpose()

personid,100061,100139,100284
wageincome,12500.0,120000.0,58000.0
highestgradecompleted,13.0,12.0,7.0
highestdegree,2. High School,2. High School,0. None


In [7]:
# Weeks-worked columns weeksworked12 through weeksworked17 for the first
# three respondents, one respondent per column.
nls97.iloc[:3].loc[:, 'weeksworked12':'weeksworked17'].transpose()

personid,100061,100139,100284
weeksworked12,40.0,52.0,0.0
weeksworked13,52.0,52.0,
weeksworked14,52.0,52.0,11.0
weeksworked15,52.0,52.0,52.0
weeksworked16,48.0,53.0,47.0
weeksworked17,48.0,52.0,0.0


In [8]:
# College-enrollment status columns colenroct09 through colenroct14 for the
# first three respondents, one respondent per column.
nls97.iloc[:3].loc[:, 'colenroct09':'colenroct14'].transpose()

personid,100061,100139,100284
colenroct09,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb10,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct10,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb11,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct11,3. 4-year college,1. Not enrolled,1. Not enrolled
colenrfeb12,3. 4-year college,1. Not enrolled,1. Not enrolled
colenroct12,3. 4-year college,1. Not enrolled,1. Not enrolled
colenrfeb13,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct13,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb14,1. Not enrolled,1. Not enrolled,1. Not enrolled


# Show individuals with wage income but no weeks worked

In [9]:
# Flag respondents who report positive wage income although weeksworked16 is 0.
# Both comparisons are parenthesised because `&` binds more tightly than `>`:
# without the second pair of parentheses the mask is evaluated as
# ((weeksworked16 == 0) & wageincome) > 0, which only yields sensible rows by
# relying on pandas coercing the float wageincome column to booleans.
nls97.loc[(nls97['weeksworked16'] == 0) & (nls97['wageincome'] > 0),
          ['weeksworked16', 'wageincome']]

Unnamed: 0_level_0,weeksworked16,wageincome
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
102625,0.0,1200.0
109403,0.0,5000.0
118704,0.0,25000.0
130701,0.0,12000.0
131151,0.0,65000.0
...,...,...
957344,0.0,90000.0
966697,0.0,65000.0
969334,0.0,5000.0
991756,0.0,9000.0


# Check for whether an individual was ever enrolled in a 4-year college course

In [10]:
# For every enrollment column, test whether the status starts with '3'
# (the "3. 4-year college" category); show the first two respondents as columns.
(nls97.filter(like='colenr')
      .apply(lambda col: col.str.slice(0, 1).eq('3'))
      .iloc[:2]
      .transpose())

personid,100061,100139
colenrfeb97,False,False
colenroct97,False,False
colenrfeb98,False,False
colenroct98,False,False
colenrfeb99,False,False
colenroct99,False,False
colenrfeb00,False,False
colenroct00,False,False
colenrfeb01,False,False
colenroct01,False,False


In [11]:
# Collapse the per-column flags into one boolean per respondent: True when any
# enrollment column starts with '3' (4-year college). Show the first two.
(nls97.filter(like='colenr')
      .apply(lambda col: col.str.slice(0, 1).eq('3'))
      .any(axis='columns')
      .iloc[:2])

personid
100061     True
100139    False
dtype: bool

# Show individuals with post-graduate enrollment but no bachelor's enrollment

In [12]:
# Leading character of every enrollment status (missing values stay missing
# and therefore never match a code below).
enroll_code = nls97.filter(like='colenr').apply(lambda col: col.str[0:1])

# Code '4' is treated as post-graduate enrollment (per this section's heading)
# and code '3' is "4-year college".
ever_postgrad = enroll_code.eq('4').any(axis=1)
ever_fouryear = enroll_code.eq('3').any(axis=1)

# Respondents with some code-4 enrollment but no code-3 enrollment at all,
# keeping the enrollment columns from colenrfeb97 through colenroct17.
nobach = nls97.loc[ever_postgrad & ~ever_fouryear,
                   'colenrfeb97':'colenroct17']

In [13]:
# Number of respondents flagged in nobach.
nobach.shape[0]

22

In [14]:
# Enrollment history of the first three flagged respondents, one per column.
nobach.iloc[:3].transpose()

personid,153051,154535,184721
colenrfeb97,,,
colenroct97,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb98,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct98,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb99,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct99,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb00,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct00,2. 2-year college,1. Not enrolled,1. Not enrolled
colenrfeb01,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct01,2. 2-year college,1. Not enrolled,1. Not enrolled


# Show individuals with bachelor's degrees or more, but no 4-year college enrollment

In [15]:
# Frequency of each highest-degree category; sort=False skips the default
# ordering by count. The leading digit of each label is used as the degree
# code in the filter that follows.
nls97.loc[:, 'highestdegree'].value_counts(sort=False)

1. GED             1146
0. None             953
6. PhD               54
3. Associates       737
4. Bachelors       1673
7. Professional     120
2. High School     3667
5. Masters          603
Name: highestdegree, dtype: int64

In [18]:
# Degree codes 4-7 are Bachelors, Masters, PhD and Professional.
bachelors_or_more = nls97['highestdegree'].str[0:1].isin(['4', '5', '6', '7'])

# True when any enrollment column starts with '3' ("3. 4-year college").
ever_in_4yr = (nls97.filter(like='colenr')
                    .apply(lambda col: col.str[0:1] == '3')
                    .any(axis=1))

# Respondents holding a bachelor's degree or higher with no recorded 4-year
# enrollment, keeping the columns from colenrfeb97 through colenroct17.
no4yearenrollment = nls97.loc[bachelors_or_more & ~ever_in_4yr,
                              'colenrfeb97':'colenroct17']

In [19]:
# Enrollment history of the first three flagged respondents, one per column.
no4yearenrollment.iloc[:3].transpose()

personid,113486,118749,124616
colenrfeb97,1. Not enrolled,,1. Not enrolled
colenroct97,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb98,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct98,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb99,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct99,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb00,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct00,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb01,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct01,2. 2-year college,1. Not enrolled,1. Not enrolled


# Show individuals with a high wage income

In [22]:
# Treat wage income more than three standard deviations above the mean as high.
wage = nls97['wageincome']
highwage_cutoff = wage.mean() + (wage.std() * 3)

# Keep only the wageincome column for respondents above the cutoff; missing
# wage incomes compare as False and are excluded.
highwages = nls97.loc[wage.gt(highwage_cutoff), ['wageincome']]

In [23]:
# Display the respondents above the high-wage cutoff (pandas elides the
# middle rows of a long frame).
highwages

Unnamed: 0_level_0,wageincome
personid,Unnamed: 1_level_1
131858,235884.0
133619,235884.0
151863,235884.0
164058,235884.0
164897,235884.0
...,...
964406,235884.0
966024,235884.0
976141,235884.0
983819,235884.0


# Show individuals with large changes in weeks worked for the most recent year

In [26]:
# Average weeks worked over weeksworked12..weeksworked16 for each respondent.
prior_mean = nls97.loc[:, 'weeksworked12':'weeksworked16'].mean(axis=1)
latest_weeks = nls97['weeksworked17']

# "Stable" means the prior average lies between half and double the latest value.
# NOTE(review): between() is False when prior_mean is NaN, so respondents with
# no earlier data but a non-null weeksworked17 are also flagged - confirm intended.
is_stable = prior_mean.between(latest_weeks * 0.5, latest_weeks * 2.0)

# Flag respondents that are not stable and have a non-missing weeksworked17.
workchanges = nls97.loc[~is_stable & latest_weeks.notnull(),
                        'weeksworked12':'weeksworked17']

In [27]:
# Number of respondents flagged in workchanges.
workchanges.shape[0]

1160

In [28]:
# Weeks-worked history of the first seven flagged respondents, one per column.
workchanges.iloc[:7].transpose()

personid,100284,101526,101718,101724,102228,102454,102625
weeksworked12,0.0,0.0,52.0,52.0,52.0,52.0,14.0
weeksworked13,,0.0,9.0,52.0,52.0,52.0,3.0
weeksworked14,11.0,0.0,0.0,52.0,17.0,7.0,52.0
weeksworked15,52.0,0.0,32.0,17.0,0.0,0.0,44.0
weeksworked16,47.0,0.0,0.0,0.0,0.0,0.0,0.0
weeksworked17,0.0,45.0,0.0,17.0,0.0,0.0,0.0


# Show inconsistencies in the highest grade completed and the highest degree

In [29]:
# Respondents whose highest grade completed is below 12, keeping only the
# grade and degree columns; missing grades compare as False and are excluded.
grade_cols = ['highestgradecompleted', 'highestdegree']
ltgrade12 = nls97.loc[nls97['highestgradecompleted'].lt(12), grade_cols]

In [31]:
# Cross-tabulate grade completed (rows) against highest degree (columns) to
# surface combinations such as a high-school diploma with fewer than 12 grades.
pd.crosstab(index=ltgrade12['highestgradecompleted'],
            columns=ltgrade12['highestdegree'])

highestdegree,0. None,1. GED,2. High School
highestgradecompleted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5.0,0,0,1
6.0,11,5,0
7.0,24,6,1
8.0,113,78,7
9.0,112,169,8
10.0,111,204,13
11.0,120,200,41
