<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas,-numpy,-and-pyreadstat" data-toc-modified-id="Import-pandas,-numpy,-and-pyreadstat-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas, numpy, and pyreadstat</a></span></li><li><span><a href="#Retrieve-the-SPSS-data" data-toc-modified-id="Retrieve-the-SPSS-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Retrieve the SPSS data</a></span></li><li><span><a href="#Grab-the-metadata-to-improve-column-labels-and-value-labels" data-toc-modified-id="Grab-the-metadata-to-improve-column-labels-and-value-labels-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Grab the metadata to improve column labels and value labels</a></span></li><li><span><a href="#Use-column-labels-in-the-metadata-to-rename-the-columns" data-toc-modified-id="Use-column-labels-in-the-metadata-to-rename-the-columns-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Use column labels in the metadata to rename the columns</a></span></li><li><span><a href="#Simplify-the-process-by-applying-the-value-labels-from-the-beginning" data-toc-modified-id="Simplify-the-process-by-applying-the-value-labels-from-the-beginning-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Simplify the process by applying the value labels from the beginning</a></span></li><li><span><a href="#Show-the-columns-and-a-few-rows" data-toc-modified-id="Show-the-columns-and-a-few-rows-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Show the columns and a few rows</a></span></li><li><span><a href="#Run-frequencies-on-one-of-the-columns-and-set-the-index" data-toc-modified-id="Run-frequencies-on-one-of-the-columns-and-set-the-index-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Run frequencies on one of the columns and set the index</a></span></li><li><span><a href="#Import-the-Stata-data,-apply-value-labels,-and-improve-the-column-headings" data-toc-modified-id="Import-the-Stata-data,-apply-value-labels,-and-improve-the-column-headings-8"><span 
class="toc-item-num">8&nbsp;&nbsp;</span>Import the Stata data, apply value labels, and improve the column headings</a></span></li><li><span><a href="#View-a-few-rows-of-the-data-and-run-frequency" data-toc-modified-id="View-a-few-rows-of-the-data-and-run-frequency-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>View a few rows of the data and run frequency</a></span></li><li><span><a href="#Fix-the-logical-missing-values-that-show-up-with-the-Stata-data-and-set-an-index" data-toc-modified-id="Fix-the-logical-missing-values-that-show-up-with-the-Stata-data-and-set-an-index-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>Fix the logical missing values that show up with the Stata data and set an index</a></span></li><li><span><a href="#Retrieve-the-SAS-data,-using-the-SAS-catalog-file-for-value-labels" data-toc-modified-id="Retrieve-the-SAS-data,-using-the-SAS-catalog-file-for-value-labels-11"><span class="toc-item-num">11&nbsp;&nbsp;</span>Retrieve the SAS data, using the SAS catalog file for value labels</a></span></li></ul></div>

# Import pandas, numpy, and pyreadstat

In [1]:
# import pandas, numpy, and pyreadstat
import pandas as pd
import numpy as np
import pyreadstat

In [2]:
# Record the environment for reproducibility: load the watermark extension and
# print the Python/IPython versions, the git hash, and the versions of the
# imported packages.
import watermark
%load_ext watermark

%watermark -n -v -g -iv

Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.20.0

Git hash: e46d7d9e1e5b40118eae31fc01d4a7fc17b6eed9

pyreadstat: 1.0.8
watermark : 2.1.0
numpy     : 1.19.2
pandas    : 1.2.1
json      : 2.0.9



In [3]:
# pd.set_option('display.max_columns', 5)
# Display settings: floats rendered with thousands separators and two
# decimals, and text output limited to a width of 75 characters.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.options.display.width = 75

# Retrieve the SPSS data

In [4]:
# Read the SPSS file; read_sav returns the data frame together with a metadata
# object (used below for value labels and column labels).
nls97spss, metaspss = pyreadstat.read_sav('data/nls97.sav')

In [5]:
# Inspect the dtype of every column as loaded.
nls97spss.dtypes

R0000100    float64
R0536300    float64
R0536401    float64
R0536402    float64
R1235800    float64
R1482600    float64
R9793800    float64
R9793900    float64
R9871900    float64
R9872000    float64
R9872200    float64
R9872400    float64
S8646900    float64
S8647000    float64
S8647100    float64
S8647200    float64
S8647300    float64
S8647400    float64
S8647500    float64
S8647600    float64
S8647700    float64
S8647800    float64
T6651700    float64
U1836800    float64
U1836900    float64
U1837000    float64
U1837100    float64
U1837200    float64
U1837300    float64
U1845400    float64
U1852400    float64
U1852600    float64
U1852700    float64
U2166200    float64
U2166300    float64
U2166400    float64
U2166500    float64
U2857300    float64
U2962800    float64
U2962900    float64
U2963000    float64
Z9063900    float64
dtype: object

In [6]:
# Preview the first rows of the raw (still numerically coded) data.
nls97spss.head()

Unnamed: 0,R0000100,R0536300,R0536401,R0536402,R1235800,R1482600,R9793800,R9793900,R9871900,R9872000,...,U1852700,U2166200,U2166300,U2166400,U2166500,U2857300,U2962800,U2962900,U2963000,Z9063900
0,1.0,2.0,9.0,1981.0,1.0,4.0,350.0,470.0,309.0,310.0,...,,,,,,,,,,52.0
1,2.0,1.0,7.0,1982.0,1.0,2.0,460.0,440.0,217.0,280.0,...,,,,,,,4.0,2.0,6.0,0.0
2,3.0,2.0,9.0,1983.0,1.0,2.0,,,,,...,0.0,,,,,,6.0,2.0,6.0,0.0
3,4.0,2.0,2.0,1981.0,1.0,2.0,,,253.0,216.0,...,1.0,,,,,,3.0,2.0,6.0,4.0
4,5.0,1.0,10.0,1982.0,1.0,2.0,,,243.0,235.0,...,0.0,,,,,,2.0,2.0,5.0,12.0


In [7]:
# Relative frequencies of the coded values in column R0536300.
nls97spss.R0536300.value_counts(normalize=True)

1.00   0.51
2.00   0.49
Name: R0536300, dtype: float64

# Grab the metadata to improve column labels and value labels

In [8]:
# Look up the code -> label mapping that the metadata holds for R0536300.
metaspss.variable_value_labels['R0536300']

{0.0: 'No Information', 1.0: 'Male', 2.0: 'Female'}

In [9]:
# Translate the R0536300 codes to their labels via the metadata mapping, then
# show the relative frequency of each label.
r0536300_labels = metaspss.variable_value_labels['R0536300']
nls97spss['R0536300'].map(r0536300_labels).value_counts(normalize=True)

Male     0.51
Female   0.49
Name: R0536300, dtype: float64

In [10]:
# Apply the metadata value labels to the whole data frame, asking for the
# labelled columns to be returned as categoricals.
nls97spss = pyreadstat.set_value_labels(nls97spss, metaspss, formats_as_category=True)

In [11]:
# Preview the first rows again, now with value labels applied.
nls97spss.head()

Unnamed: 0,R0000100,R0536300,R0536401,R0536402,R1235800,R1482600,R9793800,R9793900,R9871900,R9872000,...,U1852700,U2166200,U2166300,U2166400,U2166500,U2857300,U2962800,U2962900,U2963000,Z9063900
0,1.0,Female,9.0,1981.0,Cross-sectional,Non-Black / Non-Hispanic,350.0,470.0,309.0,310.0,...,,,,,,,,,,52.0
1,2.0,Male,7.0,1982.0,Cross-sectional,Hispanic,460.0,440.0,217.0,280.0,...,,,,,,,4 to 6 hours a week,3 to 10 hours a week,6.0,0.0
2,3.0,Female,9.0,1983.0,Cross-sectional,Hispanic,,,,,...,0.0,,,,,,10 hours or more a week,3 to 10 hours a week,6.0,0.0
3,4.0,Female,2.0,1981.0,Cross-sectional,Hispanic,,,253.0,216.0,...,1.0,,,,,,1 to 3 hours a week,3 to 10 hours a week,6.0,4.0
4,5.0,Male,10.0,1982.0,Cross-sectional,Hispanic,,,243.0,235.0,...,0.0,,,,,,Less than 1 hour a week,3 to 10 hours a week,5.0,12.0


# Use column labels in the metadata to rename the columns

In [12]:
# Replace the variable codes with the descriptive column labels from the metadata.
# NOTE(review): the labels are not unique - 'KEY!BDATE M/Y (SYMBOL) 1997'
# appears twice in the dtypes listing - so label-based selection of that
# name returns two columns.
nls97spss.columns = metaspss.column_labels

In [13]:
# Preview two rows to check the new column headings.
nls97spss.head(2)

Unnamed: 0,PUBID - YTH ID CODE 1997,KEY!SEX (SYMBOL) 1997,KEY!BDATE M/Y (SYMBOL) 1997,KEY!BDATE M/Y (SYMBOL) 1997.1,CV_SAMPLE_TYPE 1997,KEY!RACE_ETHNICITY (SYMBOL) 1997,TRANS_SAT_VERBAL HSTR,TRANS_SAT_MATH HSTR,TRANS CRD GPA OVERALL HSTR,TRANS CRD GPA ENG HSTR,...,CV_BIO_CHILD_NR 2017,DIPLOMA/DEGREE RCVD? L1 2017,DIPLOMA/DEGREE RCVD? L2 2017,DIPLOMA/DEGREE RCVD? L3 2017,DIPLOMA/DEGREE RCVD? L4 2017,"EST INC WAGES, TIPS PAST YR 2017",HRS/WK R USES A COMPUTER 2017,HRS/WK R WATCHES TELEVISION 2017,HRS/NIGHT R SLEEPS 2017,CVC_WKSWK_YR_ALL L99
0,1.0,Female,9.0,1981.0,Cross-sectional,Non-Black / Non-Hispanic,350.0,470.0,309.0,310.0,...,,,,,,,,,,52.0
1,2.0,Male,7.0,1982.0,Cross-sectional,Hispanic,460.0,440.0,217.0,280.0,...,,,,,,,4 to 6 hours a week,3 to 10 hours a week,6.0,0.0


In [15]:
# Relative frequencies for the sex column, selected by its descriptive label.
nls97spss['KEY!SEX (SYMBOL) 1997'].value_counts(normalize=True)

Male     0.51
Female   0.49
Name: KEY!SEX (SYMBOL) 1997, dtype: float64

In [16]:
# Inspect dtypes after applying value labels and column labels.
nls97spss.dtypes

PUBID - YTH ID CODE 1997                        float64
KEY!SEX (SYMBOL) 1997                          category
KEY!BDATE M/Y (SYMBOL) 1997                     float64
KEY!BDATE M/Y (SYMBOL) 1997                     float64
CV_SAMPLE_TYPE 1997                            category
KEY!RACE_ETHNICITY (SYMBOL) 1997               category
TRANS_SAT_VERBAL HSTR                           float64
TRANS_SAT_MATH HSTR                             float64
TRANS CRD GPA OVERALL HSTR                      float64
TRANS CRD GPA ENG HSTR                          float64
TRANS CRD GPA MATH HSTR                         float64
TRANS CRD GPA LP SCI HSTR                       float64
GOVT RESPONSIBILITY - PROVIDE JOBS 2006        category
GOVT RESPNSBLTY - KEEP PRICES UND CTRL 2006    category
GOVT RESPNSBLTY - HLTH CARE FOR SICK 2006      category
GOVT RESPNSBLTY -PROV ELD LIV STAND 2006       category
GOVT RESPNSBLTY -PROV IND HELP 2006            category
GOVT RESPNSBLTY -PROV UNEMP LIV STAND 2006     c

In [17]:
# Turn the descriptive labels into identifier-friendly names: lowercase,
# spaces to underscores, then drop every character that is not a-z, 0-9 or '_'.
# The regex flag is passed explicitly: relying on the default raises a
# FutureWarning on pandas 1.2, and pandas 2.0 changed the default to
# regex=False, which would treat the character class as a literal string and
# leave the names uncleaned.
nls97spss.columns = (
    nls97spss.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('[^a-z0-9_]', '', regex=True)
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
# Check the cleaned column names and their dtypes.
nls97spss.dtypes

pubid__yth_id_code_1997                        float64
keysex_symbol_1997                            category
keybdate_my_symbol_1997                        float64
keybdate_my_symbol_1997                        float64
cv_sample_type_1997                           category
keyrace_ethnicity_symbol_1997                 category
trans_sat_verbal_hstr                          float64
trans_sat_math_hstr                            float64
trans_crd_gpa_overall_hstr                     float64
trans_crd_gpa_eng_hstr                         float64
trans_crd_gpa_math_hstr                        float64
trans_crd_gpa_lp_sci_hstr                      float64
govt_responsibility__provide_jobs_2006        category
govt_respnsblty__keep_prices_und_ctrl_2006    category
govt_respnsblty__hlth_care_for_sick_2006      category
govt_respnsblty_prov_eld_liv_stand_2006       category
govt_respnsblty_prov_ind_help_2006            category
govt_respnsblty_prov_unemp_liv_stand_2006     category
govt_respn

In [19]:
# Make the PUBID id-code column the index; inplace=True mutates the frame, so
# re-running this cell without reloading fails because the column is gone.
nls97spss.set_index('pubid__yth_id_code_1997', inplace=True)

# Simplify the process by applying the value labels from the beginning

In [20]:
# Re-read the SPSS file, this time applying the value labels during the read
# and requesting categoricals. This rebinds nls97spss/metaspss, discarding the
# changes made to the earlier frame.
nls97spss, metaspss = pyreadstat.read_sav('data/nls97.sav', apply_value_formats=True, formats_as_category=True)

In [21]:
# Preview two rows of the freshly loaded, already labelled data.
nls97spss.head(2)

Unnamed: 0,R0000100,R0536300,R0536401,R0536402,R1235800,R1482600,R9793800,R9793900,R9871900,R9872000,...,U1852700,U2166200,U2166300,U2166400,U2166500,U2857300,U2962800,U2962900,U2963000,Z9063900
0,1.0,Female,9.0,1981.0,Cross-sectional,Non-Black / Non-Hispanic,350.0,470.0,309.0,310.0,...,,,,,,,,,,52.0
1,2.0,Male,7.0,1982.0,Cross-sectional,Hispanic,460.0,440.0,217.0,280.0,...,,,,,,,4 to 6 hours a week,3 to 10 hours a week,6.0,0.0


In [22]:
# Replace the variable codes with the descriptive column labels from the metadata.
nls97spss.columns = metaspss.column_labels

In [23]:
# Turn the descriptive labels into identifier-friendly names: lowercase,
# spaces to underscores, then drop every character that is not a-z, 0-9 or '_'.
# The regex flag is passed explicitly: relying on the default raises a
# FutureWarning on pandas 1.2, and pandas 2.0 changed the default to
# regex=False, which would treat the character class as a literal string and
# leave the names uncleaned.
nls97spss.columns = (
    nls97spss.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('[^a-z0-9_]', '', regex=True)
)

  """Entry point for launching an IPython kernel.


# Show the columns and a few rows

In [24]:
# Check the cleaned column names and their dtypes.
nls97spss.dtypes

pubid__yth_id_code_1997                        float64
keysex_symbol_1997                            category
keybdate_my_symbol_1997                        float64
keybdate_my_symbol_1997                        float64
cv_sample_type_1997                           category
keyrace_ethnicity_symbol_1997                 category
trans_sat_verbal_hstr                          float64
trans_sat_math_hstr                            float64
trans_crd_gpa_overall_hstr                     float64
trans_crd_gpa_eng_hstr                         float64
trans_crd_gpa_math_hstr                        float64
trans_crd_gpa_lp_sci_hstr                      float64
govt_responsibility__provide_jobs_2006        category
govt_respnsblty__keep_prices_und_ctrl_2006    category
govt_respnsblty__hlth_care_for_sick_2006      category
govt_respnsblty_prov_eld_liv_stand_2006       category
govt_respnsblty_prov_ind_help_2006            category
govt_respnsblty_prov_unemp_liv_stand_2006     category
govt_respn

In [26]:
# Preview two rows with the cleaned column names.
nls97spss.head(2)

Unnamed: 0,pubid__yth_id_code_1997,keysex_symbol_1997,keybdate_my_symbol_1997,keybdate_my_symbol_1997.1,cv_sample_type_1997,keyrace_ethnicity_symbol_1997,trans_sat_verbal_hstr,trans_sat_math_hstr,trans_crd_gpa_overall_hstr,trans_crd_gpa_eng_hstr,...,cv_bio_child_nr_2017,diplomadegree_rcvd_l1_2017,diplomadegree_rcvd_l2_2017,diplomadegree_rcvd_l3_2017,diplomadegree_rcvd_l4_2017,est_inc_wages_tips_past_yr_2017,hrswk_r_uses_a_computer_2017,hrswk_r_watches_television_2017,hrsnight_r_sleeps_2017,cvc_wkswk_yr_all_l99
0,1.0,Female,9.0,1981.0,Cross-sectional,Non-Black / Non-Hispanic,350.0,470.0,309.0,310.0,...,,,,,,,,,,52.0
1,2.0,Male,7.0,1982.0,Cross-sectional,Hispanic,460.0,440.0,217.0,280.0,...,,,,,,,4 to 6 hours a week,3 to 10 hours a week,6.0,0.0


# Run frequencies on one of the columns and set the index

In [27]:
# Counts per response category, reported without sorting by frequency.
nls97spss['govt_responsibility__provide_jobs_2006'].value_counts(sort=False)

Definitely should be        454
Definitely should not be    300
Probably should be          617
Probably should not be      462
Name: govt_responsibility__provide_jobs_2006, dtype: int64

In [28]:
# Make the PUBID id-code column the index; inplace=True mutates the frame, so
# re-running this cell without reloading fails because the column is gone.
nls97spss.set_index('pubid__yth_id_code_1997',inplace=True)

# Import the Stata data, apply value labels, and improve the column headings

In [29]:
# Read the Stata file with value labels applied during the read and labelled
# columns requested as categoricals; returns the data frame and its metadata.
nls97stata, metastata = pyreadstat.read_dta('data/nls97.dta', apply_value_formats=True, formats_as_category=True)

In [30]:
# Replace the variable codes with the descriptive column labels from the metadata.
nls97stata.columns = metastata.column_labels

In [31]:
# Turn the descriptive labels into identifier-friendly names: lowercase,
# spaces to underscores, then drop every character that is not a-z, 0-9 or '_'.
# The regex flag is passed explicitly: relying on the default raises a
# FutureWarning on pandas 1.2, and pandas 2.0 changed the default to
# regex=False, which would treat the character class as a literal string and
# leave the names uncleaned.
nls97stata.columns = (
    nls97stata.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('[^a-z0-9_]', '', regex=True)
)

  """Entry point for launching an IPython kernel.


In [32]:
# Check the cleaned column names and their dtypes for the Stata data.
nls97stata.dtypes

pubid__yth_id_code_1997                        float64
keysex_symbol_1997                            category
keybdate_my_symbol_1997                        float64
keybdate_my_symbol_1997                        float64
cv_sample_type_1997                           category
keyrace_ethnicity_symbol_1997                 category
trans_sat_verbal_hstr                          float64
trans_sat_math_hstr                            float64
trans_crd_gpa_overall_hstr                     float64
trans_crd_gpa_eng_hstr                         float64
trans_crd_gpa_math_hstr                        float64
trans_crd_gpa_lp_sci_hstr                      float64
govt_responsibility__provide_jobs_2006        category
govt_respnsblty__keep_prices_und_ctrl_2006    category
govt_respnsblty__hlth_care_for_sick_2006      category
govt_respnsblty_prov_eld_liv_stand_2006       category
govt_respnsblty_prov_ind_help_2006            category
govt_respnsblty_prov_unemp_liv_stand_2006     category
govt_respn

# View a few rows of the data and run frequency

In [33]:
# Preview two rows; unlike the SPSS load, negative codes (e.g. -5.0, -4.0)
# appear where the SPSS data showed missing values.
nls97stata.head(2)

Unnamed: 0,pubid__yth_id_code_1997,keysex_symbol_1997,keybdate_my_symbol_1997,keybdate_my_symbol_1997.1,cv_sample_type_1997,keyrace_ethnicity_symbol_1997,trans_sat_verbal_hstr,trans_sat_math_hstr,trans_crd_gpa_overall_hstr,trans_crd_gpa_eng_hstr,...,cv_bio_child_nr_2017,diplomadegree_rcvd_l1_2017,diplomadegree_rcvd_l2_2017,diplomadegree_rcvd_l3_2017,diplomadegree_rcvd_l4_2017,est_inc_wages_tips_past_yr_2017,hrswk_r_uses_a_computer_2017,hrswk_r_watches_television_2017,hrsnight_r_sleeps_2017,cvc_wkswk_yr_all_l99
0,1.0,Female,9.0,1981.0,Cross-sectional,Non-Black / Non-Hispanic,350.0,470.0,309.0,310.0,...,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,52.0
1,2.0,Male,7.0,1982.0,Cross-sectional,Hispanic,460.0,440.0,217.0,280.0,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,4 to 6 hours a week,3 to 10 hours a week,6.0,0.0


In [34]:
# Counts per response category, reported without sorting by frequency; the
# negative codes show up as categories of their own in the Stata data.
nls97stata['govt_responsibility__provide_jobs_2006'].value_counts(sort=False)

-5.0                        1425
-4.0                        5665
-2.0                          56
-1.0                           5
Definitely should be         454
Definitely should not be     300
Probably should be           617
Probably should not be       462
Name: govt_responsibility__provide_jobs_2006, dtype: int64

# Fix the logical missing values that show up with the Stata data and set an index

In [35]:
# Minimum of each numeric column; the negative minimums expose the coded
# missing values. numeric_only=True makes the exclusion of the category
# columns explicit instead of relying on pandas silently dropping them, which
# newer pandas versions no longer do (they raise instead).
nls97stata.min(numeric_only=True)

pubid__yth_id_code_1997          1.00
keybdate_my_symbol_1997          1.00
keybdate_my_symbol_1997      1,980.00
trans_sat_verbal_hstr           -4.00
trans_sat_math_hstr             -4.00
trans_crd_gpa_overall_hstr      -9.00
trans_crd_gpa_eng_hstr          -9.00
trans_crd_gpa_math_hstr         -9.00
trans_crd_gpa_lp_sci_hstr       -9.00
cv_ba_credits_l1_2011           -5.00
cv_bio_child_hh_2017            -5.00
cv_bio_child_nr_2017            -5.00
hrsnight_r_sleeps_2017          -5.00
cvc_wkswk_yr_all_l99            -4.00
dtype: float64

In [36]:
# Treat every integer code from -9 through -1 as missing: replace those values
# with NaN across the whole frame, in place.
missing_codes = list(range(-9, 0))
nls97stata.replace(missing_codes, np.nan, inplace=True)

In [37]:
# Re-check the numeric minimums to confirm the negative codes are gone.
# numeric_only=True makes the exclusion of the category columns explicit
# instead of relying on pandas silently dropping them, which newer pandas
# versions no longer do (they raise instead).
nls97stata.min(numeric_only=True)

pubid__yth_id_code_1997          1.00
keybdate_my_symbol_1997          1.00
keybdate_my_symbol_1997      1,980.00
trans_sat_verbal_hstr           14.00
trans_sat_math_hstr              7.00
trans_crd_gpa_overall_hstr      10.00
trans_crd_gpa_eng_hstr           0.00
trans_crd_gpa_math_hstr          0.00
trans_crd_gpa_lp_sci_hstr        0.00
cv_ba_credits_l1_2011            0.00
cv_bio_child_hh_2017             0.00
cv_bio_child_nr_2017             0.00
hrsnight_r_sleeps_2017           0.00
cvc_wkswk_yr_all_l99             0.00
dtype: float64

In [38]:
# Make the PUBID id-code column the index; inplace=True mutates the frame, so
# re-running this cell without reloading fails because the column is gone.
nls97stata.set_index('pubid__yth_id_code_1997', inplace=True)

# Retrieve the SAS data, using the SAS catalog file for value labels

In [39]:
# Read the SAS data file, passing the separate SAS catalog file as
# catalog_file (the value labels come from it) and requesting categoricals;
# returns the data frame and its metadata.
nls97sas, metasas = pyreadstat.read_sas7bdat(
    'data/nls97.sas7bdat',
    catalog_file='data/nlsformats3.sas7bcat',
    formats_as_category=True)

In [40]:
# Replace the variable codes with the descriptive column labels from the metadata.
nls97sas.columns = metasas.column_labels

In [41]:
# Turn the descriptive labels into identifier-friendly names: lowercase,
# spaces to underscores, then drop every character that is not a-z, 0-9 or '_'.
# The regex flag is passed explicitly: relying on the default raises a
# FutureWarning on pandas 1.2, and pandas 2.0 changed the default to
# regex=False, which would treat the character class as a literal string and
# leave the names uncleaned.
nls97sas.columns = (
    nls97sas.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('[^a-z0-9_]', '', regex=True)
)

  after removing the cwd from sys.path.


In [42]:
# Preview the first rows of the SAS data with labels and cleaned column names.
nls97sas.head()

Unnamed: 0,pubid__yth_id_code_1997,keysex_symbol_1997,keybdate_my_symbol_1997,keybdate_my_symbol_1997.1,cv_sample_type_1997,keyrace_ethnicity_symbol_1997,trans_sat_verbal_hstr,trans_sat_math_hstr,trans_crd_gpa_overall_hstr,trans_crd_gpa_eng_hstr,...,cv_bio_child_nr_2017,diplomadegree_rcvd_l1_2017,diplomadegree_rcvd_l2_2017,diplomadegree_rcvd_l3_2017,diplomadegree_rcvd_l4_2017,est_inc_wages_tips_past_yr_2017,hrswk_r_uses_a_computer_2017,hrswk_r_watches_television_2017,hrsnight_r_sleeps_2017,cvc_wkswk_yr_all_l99
0,1.0,Female,9.0,1981.0,Cross-sectional,Non-Black / Non-Hispanic,350.0,470.0,309.0,310.0,...,,,,,,,,,,52.0
1,2.0,Male,7.0,1982.0,Cross-sectional,Hispanic,460.0,440.0,217.0,280.0,...,,,,,,,4 to 6 hours a week,3 to 10 hours a week,6.0,0.0
2,3.0,Female,9.0,1983.0,Cross-sectional,Hispanic,,,,,...,0.0,,,,,,10 hours or more a week,3 to 10 hours a week,6.0,0.0
3,4.0,Female,2.0,1981.0,Cross-sectional,Hispanic,,,253.0,216.0,...,1.0,,,,,,1 to 3 hours a week,3 to 10 hours a week,6.0,4.0
4,5.0,Male,10.0,1982.0,Cross-sectional,Hispanic,,,243.0,235.0,...,0.0,,,,,,Less than 1 hour a week,3 to 10 hours a week,5.0,12.0


In [43]:
# Counts per label for the sex column in the SAS data.
nls97sas['keysex_symbol_1997'].value_counts()

Male      4599
Female    4385
Name: keysex_symbol_1997, dtype: int64