<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-the-pandas-library-and-load-the-NLS-data-into-pandas" data-toc-modified-id="Import-the-pandas-library-and-load-the-NLS-data-into-pandas-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import the pandas library and load the NLS data into pandas</a></span></li><li><span><a href="#Select-a-column-using-the-pandas-[]-bracket-operator,-and-the-loc-and-iloc-accessors." data-toc-modified-id="Select-a-column-using-the-pandas-[]-bracket-operator,-and-the-loc-and-iloc-accessors.-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Select a column using the pandas [] bracket operator, and the <code>loc</code> and <code>iloc</code> accessors.</a></span></li><li><span><a href="#Select-multiple-columns-from-a-pandas-DataFrame" data-toc-modified-id="Select-multiple-columns-from-a-pandas-DataFrame-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Select multiple columns from a pandas DataFrame</a></span></li><li><span><a href="#Select-multiple-columns-based-on-a-list-of-columns" data-toc-modified-id="Select-multiple-columns-based-on-a-list-of-columns-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Select multiple columns based on a list of columns</a></span></li><li><span><a href="#Select-one-or-more-columns-by-filtering-on-column-name" data-toc-modified-id="Select-one-or-more-columns-by-filtering-on-column-name-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Select one or more columns by filtering on column name</a></span></li><li><span><a href="#Select-all-columns-with-the-category-data-type" data-toc-modified-id="Select-all-columns-with-the-category-data-type-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Select all columns with the category data type</a></span></li><li><span><a href="#Select-all-columns-with-numeric-data-types" data-toc-modified-id="Select-all-columns-with-numeric-data-types-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Select all columns with numeric data 
types</a></span></li><li><span><a href="#Organize-columns-using-lists-of-column-names" data-toc-modified-id="Organize-columns-using-lists-of-column-names-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Organize columns using lists of column names</a></span></li><li><span><a href="#Create-the-new,-reorganized-DataFrame" data-toc-modified-id="Create-the-new,-reorganized-DataFrame-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Create the new, reorganized DataFrame</a></span></li><li><span><a href="#There's-more…" data-toc-modified-id="There's-more…-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>There's more…</a></span></li></ul></div>

# Import the pandas library and load the NLS data into pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Optional pandas display settings, left disabled. Uncomment to cap the printed
# width, the number of columns/rows shown, and to render floats as
# comma-grouped whole numbers.
# pd.set_option('display.width', 100)
# pd.set_option('display.max_columns', 20)
# pd.set_option('display.max_rows', 15)
# pd.options.display.float_format = '{:,.0f}'.format

In [3]:
# Load the watermark IPython extension used to record the environment.
# NOTE(review): the explicit import also puts `watermark` in the notebook
# namespace — presumably so that `%watermark -iv` lists its version; confirm.
import watermark
%load_ext watermark

In [4]:
%watermark -n -v -iv

Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.20.0

watermark: 2.1.0
pandas   : 1.2.1
json     : 2.0.9
numpy    : 1.19.2



In [5]:
# Read the NLS97 extract from a path relative to the notebook's working
# directory. At this point personid is still an ordinary column.
nls97 = pd.read_csv('data/nls97.csv')

In [6]:
# Use personid as the row index. The result is rebound to the name rather than
# mutating the frame with inplace=True, which offers no performance benefit
# and hides state changes between cells.
nls97 = nls97.set_index('personid')

In [7]:
# Convert every object-dtype (string) column to the category dtype.
# Assigning through `.loc[:, mask] = ...` writes the values into the existing
# object columns on recent pandas releases, so the columns would silently stay
# object. `astype` with a per-column mapping returns a frame whose columns
# really carry the new dtype, on old and new pandas alike.
nls97 = nls97.astype({
    colname: 'category'
    for colname in nls97.select_dtypes(['object']).columns
})

In [8]:
nls97.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 88 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   birthmonth             8984 non-null   int64   
 2   birthyear              8984 non-null   int64   
 3   highestgradecompleted  6663 non-null   float64 
 4   maritalstatus          6672 non-null   category
 5   childathome            4791 non-null   float64 
 6   childnotathome         4791 non-null   float64 
 7   wageincome             5091 non-null   float64 
 8   weeklyhrscomputer      6710 non-null   category
 9   weeklyhrstv            6711 non-null   category
 10  nightlyhrssleep        6706 non-null   float64 
 11  satverbal              1406 non-null   float64 
 12  satmath                1407 non-null   float64 
 13  gpaoverall             6004 non-null   float64 
 14  gpaenglish             5798 non-n

# Select a column using the pandas [] bracket operator, and the `loc` and `iloc` accessors.

In [18]:
# A single column label inside [] returns a one-dimensional Series.
analysisdemo = nls97['gender']

In [19]:
type(analysisdemo)

pandas.core.series.Series

In [20]:
analysisdemo.tail(2)

personid
999698    Female
999963    Female
Name: gender, dtype: category
Categories (2, object): ['Female', 'Male']

In [21]:
# A list of labels inside [] (note the double brackets) returns a DataFrame,
# even when the list names only one column.
analysisdemo = nls97[['gender']]

In [22]:
type(analysisdemo)

pandas.core.frame.DataFrame

In [23]:
analysisdemo.sample(2)

Unnamed: 0_level_0,gender
personid,Unnamed: 1_level_1
173637,Male
868922,Female


In [24]:
# loc selects by label: ':' keeps every row, and passing the column name in a
# list yields a DataFrame.
analysisdemo = nls97.loc[:, ['gender']]

In [25]:
type(analysisdemo)

pandas.core.frame.DataFrame

In [26]:
analysisdemo.sample(2)

Unnamed: 0_level_0,gender
personid,Unnamed: 1_level_1
797651,Male
520655,Male


In [27]:
# iloc selects by integer position; gender is the column at position 0.
# Passing the position in a list yields a DataFrame.
analysisdemo = nls97.iloc[:, [0]]

In [28]:
type(analysisdemo)

pandas.core.frame.DataFrame

In [29]:
analysisdemo.sample(2)

Unnamed: 0_level_0,gender
personid,Unnamed: 1_level_1
644266,Male
865040,Male


# Select multiple columns from a pandas DataFrame

In [30]:
# Pass a list of column names to [] to pull several columns into a DataFrame.
analysisdemo = nls97[['gender', 'maritalstatus', 'highestgradecompleted']]

In [31]:
analysisdemo.shape

(8984, 3)

In [32]:
analysisdemo.head(3)

Unnamed: 0_level_0,gender,maritalstatus,highestgradecompleted
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,Female,Married,13.0
100139,Male,Married,12.0
100284,Male,Never-married,7.0


In [33]:
# The same three columns selected by label with loc; ':' keeps every row.
analysisdemo = nls97.loc[:,
                         ['gender', 'maritalstatus', 'highestgradecompleted']]

In [34]:
analysisdemo.shape

(8984, 3)

In [35]:
analysisdemo.head(3)

Unnamed: 0_level_0,gender,maritalstatus,highestgradecompleted
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,Female,Married,13.0
100139,Male,Married,12.0
100284,Male,Never-married,7.0


# Select multiple columns based on a list of columns

In [36]:
# Columns of interest, kept in a named list so the selection can be reused.
keyvars = [
    'gender',
    'maritalstatus',
    'highestgradecompleted',
    'wageincome',
    'gpaoverall',
    'weeksworked17',
    'colenroct17',
]

In [37]:
analysiskeys = nls97[keyvars]

In [38]:
analysiskeys.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   maritalstatus          6672 non-null   category
 2   highestgradecompleted  6663 non-null   float64 
 3   wageincome             5091 non-null   float64 
 4   gpaoverall             6004 non-null   float64 
 5   weeksworked17          6670 non-null   float64 
 6   colenroct17            6734 non-null   category
dtypes: category(3), float64(4)
memory usage: 377.8 KB


# Select one or more columns by filtering on column name

In [39]:
# filter(like=...) keeps the columns whose name contains the given substring.
analysiswork = nls97.filter(like='weeksworked')

In [40]:
analysiswork.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   weeksworked00  8603 non-null   float64
 1   weeksworked01  8564 non-null   float64
 2   weeksworked02  8556 non-null   float64
 3   weeksworked03  8490 non-null   float64
 4   weeksworked04  8458 non-null   float64
 5   weeksworked05  8403 non-null   float64
 6   weeksworked06  8340 non-null   float64
 7   weeksworked07  8272 non-null   float64
 8   weeksworked08  8186 non-null   float64
 9   weeksworked09  8146 non-null   float64
 10  weeksworked10  8054 non-null   float64
 11  weeksworked11  7968 non-null   float64
 12  weeksworked12  7747 non-null   float64
 13  weeksworked13  7680 non-null   float64
 14  weeksworked14  7612 non-null   float64
 15  weeksworked15  7389 non-null   float64
 16  weeksworked16  7068 non-null   float64
 17  weeksworked17  6670 non-null   float64
dtypes

# Select all columns with the category data type

In [41]:
# select_dtypes picks columns by data type; here only the category columns.
analysiscats = nls97.select_dtypes(include=['category'])

In [42]:
analysiscats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 57 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   maritalstatus          6672 non-null   category
 2   weeklyhrscomputer      6710 non-null   category
 3   weeklyhrstv            6711 non-null   category
 4   highestdegree          8953 non-null   category
 5   govprovidejobs         1833 non-null   category
 6   govpricecontrols       1859 non-null   category
 7   govhealthcare          1874 non-null   category
 8   govelderliving         1872 non-null   category
 9   govindhelp             1815 non-null   category
 10  govunemp               1811 non-null   category
 11  govincomediff          1775 non-null   category
 12  govcollegefinance      1875 non-null   category
 13  govdecenthousing       1847 non-null   category
 14  govprotectenvironment  1860 non-n

# Select all columns with numeric data types

In [43]:
# 'number' matches every numeric dtype, i.e. the int64 and float64 columns here.
analysisnums = nls97.select_dtypes(include=['number'])

In [44]:
analysisnums.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   birthmonth             8984 non-null   int64  
 1   birthyear              8984 non-null   int64  
 2   highestgradecompleted  6663 non-null   float64
 3   childathome            4791 non-null   float64
 4   childnotathome         4791 non-null   float64
 5   wageincome             5091 non-null   float64
 6   nightlyhrssleep        6706 non-null   float64
 7   satverbal              1406 non-null   float64
 8   satmath                1407 non-null   float64
 9   gpaoverall             6004 non-null   float64
 10  gpaenglish             5798 non-null   float64
 11  gpamath                5766 non-null   float64
 12  gpascience             5684 non-null   float64
 13  weeksworked00          8603 non-null   float64
 14  weeksworked01          8564 non-null   float64
 1

# Organize columns using lists of column names

In [45]:
# Demographic columns.
demo = [
    'gender',
    'birthmonth',
    'birthyear',
]

In [46]:
# High-school record: the two SAT scores followed by the four GPA measures.
highschoolrecord = ['satverbal', 'satmath'] + [
    f'gpa{subject}' for subject in ('overall', 'english', 'math', 'science')
]

In [47]:
# Government-responsibility attitude items.
govresp = [
    'govprovidejobs',
    'govpricecontrols',
    'govhealthcare',
    'govelderliving',
    'govindhelp',
    'govunemp',
    'govincomediff',
    'govcollegefinance',
    'govdecenthousing',
    'govprotectenvironment',
]

# Adult demographic and lifestyle columns.
demoadult = [
    'highestgradecompleted',
    'maritalstatus',
    'childathome',
    'childnotathome',
    'wageincome',
    'weeklyhrscomputer',
    'weeklyhrstv',
    'nightlyhrssleep',
    'highestdegree',
]

In [48]:
# One weeks-worked column per year, suffixed 00 through 17 (zero-padded).
weeksworked = [f'weeksworked{year:02d}' for year in range(18)]

In [49]:
# College-enrollment columns: a February and an October entry for every year
# from 1997 through 2017, named with the two-digit year.
colenr = [
    f'colenr{month}{year % 100:02d}'
    for year in range(1997, 2018)
    for month in ('feb', 'oct')
]

# Create the new, reorganized DataFrame

In [50]:
# Rebuild the frame with its columns arranged group by group, in the order the
# group lists are unpacked below.
nls97 = nls97[[
    *demoadult,
    *demo,
    *highschoolrecord,
    *govresp,
    *weeksworked,
    *colenr,
]]

In [51]:
nls97.dtypes

highestgradecompleted     float64
maritalstatus            category
childathome               float64
childnotathome            float64
wageincome                float64
                           ...   
colenroct15              category
colenrfeb16              category
colenroct16              category
colenrfeb17              category
colenroct17              category
Length: 88, dtype: object

# There's more…

In [52]:
# exclude= inverts the selection: every column that is not of category dtype.
nls97.select_dtypes(exclude=['category']).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   highestgradecompleted  6663 non-null   float64
 1   childathome            4791 non-null   float64
 2   childnotathome         4791 non-null   float64
 3   wageincome             5091 non-null   float64
 4   nightlyhrssleep        6706 non-null   float64
 5   birthmonth             8984 non-null   int64  
 6   birthyear              8984 non-null   int64  
 7   satverbal              1406 non-null   float64
 8   satmath                1407 non-null   float64
 9   gpaoverall             6004 non-null   float64
 10  gpaenglish             5798 non-null   float64
 11  gpamath                5766 non-null   float64
 12  gpascience             5684 non-null   float64
 13  weeksworked00          8603 non-null   float64
 14  weeksworked01          8564 non-null   float64
 1

In [53]:
# regex= keeps the columns whose name matches the pattern anywhere in the
# label, so 'income' picks up both wageincome and govincomediff.
nls97.filter(regex='income')

Unnamed: 0_level_0,wageincome,govincomediff
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
100061,12500.0,
100139,120000.0,
100284,58000.0,
100292,,
100583,30000.0,
...,...,...
999291,35000.0,
999406,116000.0,
999543,,
999698,,
