<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-the-NLS-data" data-toc-modified-id="Import-pandas-and-the-NLS-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and the NLS data</a></span></li><li><span><a href="#View-some-of-the-values-for-the-number-of-weeks-worked" data-toc-modified-id="View-some-of-the-values-for-the-number-of-weeks-worked-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>View some of the values for the number of weeks worked</a></span></li><li><span><a href="#Use-stack-to-transform-the-data-from-wide-to-long" data-toc-modified-id="Use-stack-to-transform-the-data-from-wide-to-long-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Use stack to transform the data from wide to long</a></span></li><li><span><a href="#Fix-the-year-values" data-toc-modified-id="Fix-the-year-values-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Fix the year values</a></span></li><li><span><a href="#Alternatively,-use-melt-to-transform-the-data-from-wide-to-long" data-toc-modified-id="Alternatively,-use-melt-to-transform-the-data-from-wide-to-long-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Alternatively, use melt to transform the data from wide to long</a></span></li><li><span><a href="#Reshape-the-college-enrollment-columns-with-melt." data-toc-modified-id="Reshape-the-college-enrollment-columns-with-melt.-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Reshape the college enrollment columns with melt.</a></span></li><li><span><a href="#Merge-the-weeks-worked-and-college-enrollment-data" data-toc-modified-id="Merge-the-weeks-worked-and-college-enrollment-data-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Merge the weeks worked and college enrollment data</a></span></li></ul></div>

# Import pandas and the NLS data

In [1]:
import pandas as pd

In [2]:
# Optional pandas display settings, currently disabled: uncomment to widen the
# printed output, show more rows/columns, and format floats without decimals.
# pd.set_option('display.width', 200)
# pd.set_option('display.max_columns', 30)
# pd.set_option('display.max_rows', 200)
# pd.options.display.float_format = '{:,.0f}'.format

In [3]:
# Load the watermark IPython extension and print environment information
# so the versions used for this run are recorded next to the results.
import watermark
%load_ext watermark

# NOTE(review): judging by the output below, -iv lists the versions of the
# modules imported so far; -n and -i presumably request date/time fields --
# confirm against the watermark documentation.
%watermark -n -i -iv

watermark: 2.1.0
json     : 2.0.9
pandas   : 1.2.1



In [4]:
# Read the NLS data; the path is relative to the notebook's working directory.
nls97 = pd.read_csv('data/nls97f.csv')

# View some of the values for the number of weeks worked

In [5]:
# Index the wide frame by originalid without mutating it in place.
# The guard keeps this cell safe to re-run: a second set_index would raise
# KeyError because the column has already moved into the index.
# verify_integrity=True fails fast if originalid is not unique, which the
# reshapes and the merge further down rely on.
if nls97.index.name != 'originalid':
    nls97 = nls97.set_index('originalid', verify_integrity=True)

In [6]:
# Wide-format columns holding weeks worked, one per year suffix 00 through 04.
weeksworkedcols = [f'weeksworked{yy:02d}' for yy in range(5)]

In [7]:
# Transpose the first two rows so each weeksworked column reads as a row
# and each originalid becomes a column.
nls97[weeksworkedcols].head(2).T

originalid,8245,3962
weeksworked00,46.0,5.0
weeksworked01,52.0,49.0
weeksworked02,52.0,52.0
weeksworked03,48.0,52.0
weeksworked04,52.0,52.0


In [8]:
# Row and column counts of the wide frame; each long frame built from five
# columns should end up with five times this many rows.
nls97.shape

(8984, 89)

# Use stack to transform the data from wide to long

In [10]:
# Wide -> long: stack() pivots the five weeksworked columns into rows, giving a
# Series indexed by (originalid, source column name). dropna=False keeps rows
# whose value is missing. reset_index() then exposes both index levels as
# columns; the unnamed level and the value column get readable names.
weeksworked = (
    nls97.loc[:, weeksworkedcols]
    .stack(dropna=False)
    .reset_index()
    .rename(columns={'level_1': 'year', 0: 'weeksworked'})
)

In [11]:
# After stacking, each originalid spans five consecutive rows and 'year'
# still holds the source column name.
weeksworked.head(10)

Unnamed: 0,originalid,year,weeksworked
0,8245,weeksworked00,46.0
1,8245,weeksworked01,52.0
2,8245,weeksworked02,52.0
3,8245,weeksworked03,48.0
4,8245,weeksworked04,52.0
5,3962,weeksworked00,5.0
6,3962,weeksworked01,49.0
7,3962,weeksworked02,52.0
8,3962,weeksworked03,52.0
9,3962,weeksworked04,52.0


# Fix the year values

In [12]:
# Keep only the two-digit suffix of each source column name and offset it
# by 2000 to obtain a four-digit year.
weeksworked['year'] = 2000 + weeksworked['year'].str.slice(start=-2).astype(int)

In [13]:
# 'year' should now show four-digit integers (2000-2004) instead of column names.
weeksworked.head(10)

Unnamed: 0,originalid,year,weeksworked
0,8245,2000,46.0
1,8245,2001,52.0
2,8245,2002,52.0
3,8245,2003,48.0
4,8245,2004,52.0
5,3962,2000,5.0
6,3962,2001,49.0
7,3962,2002,52.0
8,3962,2003,52.0
9,3962,2004,52.0


In [14]:
# Expect len(nls97) * len(weeksworkedcols) rows (8984 * 5 = 44920) and 3 columns.
weeksworked.shape

(44920, 3)

# Alternatively, use melt to transform the data from wide to long

In [15]:
# Alternative wide -> long reshape using melt(). melt() works on columns, so
# originalid is moved out of the index first. id_vars/value_vars already
# restrict the result to the columns of interest, so no separate column
# selection is needed beforehand.
weeksworked = nls97.reset_index().melt(
    id_vars=['originalid'],
    value_vars=weeksworkedcols,
    var_name='year',
    value_name='weeksworked',
)

In [16]:
# Turn the melted column names into four-digit years: take the two-digit
# suffix and offset it by 2000.
weeksworked['year'] = 2000 + weeksworked['year'].str.slice(start=-2).astype(int)

In [17]:
# Put originalid back into the index so rows can be looked up by id with .loc.
weeksworked = weeksworked.set_index('originalid')

In [18]:
# Spot-check the same two originalid values previewed earlier; passing a list
# to .loc returns every row carrying each label (five per id here).
weeksworked.loc[[8245, 3962]]

Unnamed: 0_level_0,year,weeksworked
originalid,Unnamed: 1_level_1,Unnamed: 2_level_1
8245,2000,46.0
8245,2001,52.0
8245,2002,52.0
8245,2003,48.0
8245,2004,52.0
3962,2000,5.0
3962,2001,49.0
3962,2002,52.0
3962,2003,52.0
3962,2004,52.0


# Reshape the college enrollment columns with melt.

In [19]:
# Wide-format college-enrollment columns, one per year suffix 00 through 04.
colenrcols = [f'colenroct{yy:02d}' for yy in range(5)]

In [22]:
# Reshape the enrollment columns from wide to long with melt(). originalid is
# moved out of the index first because melt() works on columns;
# id_vars/value_vars already limit the result to the columns of interest.
colenr = nls97.reset_index().melt(
    id_vars=['originalid'],
    value_vars=colenrcols,
    var_name='year',
    value_name='colenr',
)

In [23]:
# Turn the melted column names into four-digit years: take the two-digit
# suffix and offset it by 2000.
colenr['year'] = 2000 + colenr['year'].str.slice(start=-2).astype(int)

In [24]:
# Put originalid back into the index so rows can be looked up by id with .loc.
colenr = colenr.set_index('originalid')

In [25]:
# Spot-check the same two originalid values; each should show one enrollment
# value per year from 2000 to 2004.
colenr.loc[[8245, 3962]]

Unnamed: 0_level_0,year,colenr
originalid,Unnamed: 1_level_1,Unnamed: 2_level_1
8245,2000,1. Not enrolled
8245,2001,1. Not enrolled
8245,2002,1. Not enrolled
8245,2003,1. Not enrolled
8245,2004,1. Not enrolled
3962,2000,1. Not enrolled
3962,2001,1. Not enrolled
3962,2002,1. Not enrolled
3962,2003,1. Not enrolled
3962,2004,1. Not enrolled


# Merge the weeks worked and college enrollment data

In [26]:
# Combine the two long frames on originalid and year. Both frames carry
# originalid as their index and year as a column; pandas accepts that mix of
# index level name and column name in `on`.
# validate='one_to_one' makes pandas raise MergeError if either side has
# duplicate (originalid, year) keys, instead of silently multiplying rows.
workschool = pd.merge(weeksworked,
                      colenr,
                      on=['originalid', 'year'],
                      how='inner',
                      validate='one_to_one')

In [27]:
# The row count matches the 44920 rows reported for weeksworked earlier, so the
# inner join kept every (originalid, year) pair; the three columns are year,
# weeksworked and colenr.
workschool.shape

(44920, 3)

In [28]:
# Spot-check the merged result for the same two originalid values: each year
# should carry both the weeks-worked and the enrollment value.
workschool.loc[[8245, 3962]]

Unnamed: 0_level_0,year,weeksworked,colenr
originalid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8245,2000,46.0,1. Not enrolled
8245,2001,52.0,1. Not enrolled
8245,2002,52.0,1. Not enrolled
8245,2003,48.0,1. Not enrolled
8245,2004,52.0,1. Not enrolled
3962,2000,5.0,1. Not enrolled
3962,2001,49.0,1. Not enrolled
3962,2002,52.0,1. Not enrolled
3962,2003,52.0,1. Not enrolled
3962,2004,52.0,1. Not enrolled
