### Import some packages.

In [5]:
import pandas as pd
import numpy as np
import pyreadstat

### Retrieve the SPSS data.

In [7]:
# read_sav hands back two objects: the data frame and a separate metadata
# object. The metadata is kept because later cells read its value labels
# (variable_value_labels) and column labels (column_labels).
nls97spss, metaspss = pyreadstat.read_sav('/work/nls97.sav')
# Inspect the dtypes of the columns exactly as they were loaded.
nls97spss.dtypes

R0000100    float64
R0536300    float64
R0536401    float64
R0536402    float64
R1235800    float64
R1482600    float64
R9793800    float64
R9793900    float64
R9871900    float64
R9872000    float64
R9872200    float64
R9872400    float64
S8646900    float64
S8647000    float64
S8647100    float64
S8647200    float64
S8647300    float64
S8647400    float64
S8647500    float64
S8647600    float64
S8647700    float64
S8647800    float64
T6651700    float64
U1836800    float64
U1836900    float64
U1837000    float64
U1837100    float64
U1837200    float64
U1837300    float64
U1845400    float64
U1852400    float64
U1852600    float64
U1852700    float64
U2166200    float64
U2166300    float64
U2166400    float64
U2166500    float64
U2857300    float64
U2962800    float64
U2962900    float64
U2963000    float64
Z9063900    float64
dtype: object

In [8]:
# Preview the first rows; at this point the cells still hold numeric codes.
nls97spss.head()

Unnamed: 0,R0000100,R0536300,R0536401,R0536402,R1235800,R1482600,R9793800,R9793900,R9871900,R9872000,...,U1852700,U2166200,U2166300,U2166400,U2166500,U2857300,U2962800,U2962900,U2963000,Z9063900
0,1.0,2.0,9.0,1981.0,1.0,4.0,350.0,470.0,309.0,310.0,...,,,,,,,,,,52.0
1,2.0,1.0,7.0,1982.0,1.0,2.0,460.0,440.0,217.0,280.0,...,,,,,,,4.0,2.0,6.0,0.0
2,3.0,2.0,9.0,1983.0,1.0,2.0,,,,,...,0.0,,,,,,6.0,2.0,6.0,0.0
3,4.0,2.0,2.0,1981.0,1.0,2.0,,,253.0,216.0,...,1.0,,,,,,3.0,2.0,6.0,4.0
4,5.0,1.0,10.0,1982.0,1.0,2.0,,,243.0,235.0,...,0.0,,,,,,2.0,2.0,5.0,12.0


In [9]:
# Share of rows per raw code in R0536300 (normalize=True -> proportions,
# not counts).
nls97spss['R0536300'].value_counts(normalize=True)

1.0    0.51191
2.0    0.48809
Name: R0536300, dtype: float64

### Grab the metadata to improve column labels and value labels.

In [10]:
# Look up the code -> label mapping stored in the SPSS metadata for R0536300.
metaspss.variable_value_labels['R0536300']

{0.0: 'No Information', 1.0: 'Male', 2.0: 'Female'}

In [11]:
# Translate the numeric codes of R0536300 through the metadata's value-label
# mapping, then show the proportion of rows per label.
(
    nls97spss['R0536300']
    .map(metaspss.variable_value_labels['R0536300'])
    .value_counts(normalize=True)
)

Male      0.51191
Female    0.48809
Name: R0536300, dtype: float64

In [12]:
# Apply the metadata's value labels to the whole frame in one call instead of
# mapping column by column. With formats_as_category=True the labelled
# columns come back as `category` dtype (see the dtypes listing further down).
# Note: this rebinds nls97spss, so the raw numeric codes are no longer
# available under this name after the cell runs.
nls97spss = pyreadstat.set_value_labels(nls97spss, metaspss, formats_as_category=True)

### Use column labels in the metadata to rename the columns.

In [15]:
# Swap the variable codes (e.g. R0536300) for the descriptive column labels
# held in the metadata. The assignment is positional -- presumably
# column_labels is in the same order as the frame's columns; confirm if the
# frame has been reordered or subset since loading.
# NOTE(review): the labels are not unique -- R0536401 and R0536402 both carry
# 'KEY!BDATE M/Y (SYMBOL) 1997' -- so selecting that name afterwards returns
# two columns rather than one.
nls97spss.columns = metaspss.column_labels

In [18]:
# Same frequency check as before, now addressed through the descriptive
# column label and showing value labels instead of numeric codes.
nls97spss['KEY!SEX (SYMBOL) 1997'].value_counts(normalize=True)

Male      0.51191
Female    0.48809
Name: KEY!SEX (SYMBOL) 1997, dtype: float64

In [20]:
# Re-check dtypes after labelling and renaming: columns that had value
# labels are now `category`, the rest remain float64.
nls97spss.dtypes

PUBID - YTH ID CODE 1997                        float64
KEY!SEX (SYMBOL) 1997                          category
KEY!BDATE M/Y (SYMBOL) 1997                     float64
KEY!BDATE M/Y (SYMBOL) 1997                     float64
CV_SAMPLE_TYPE 1997                            category
KEY!RACE_ETHNICITY (SYMBOL) 1997               category
TRANS_SAT_VERBAL HSTR                           float64
TRANS_SAT_MATH HSTR                             float64
TRANS CRD GPA OVERALL HSTR                      float64
TRANS CRD GPA ENG HSTR                          float64
TRANS CRD GPA MATH HSTR                         float64
TRANS CRD GPA LP SCI HSTR                       float64
GOVT RESPONSIBILITY - PROVIDE JOBS 2006        category
GOVT RESPNSBLTY - KEEP PRICES UND CTRL 2006    category
GOVT RESPNSBLTY - HLTH CARE FOR SICK 2006      category
GOVT RESPNSBLTY -PROV ELD LIV STAND 2006       category
GOVT RESPNSBLTY -PROV IND HELP 2006            category
GOVT RESPNSBLTY -PROV UNEMP LIV STAND 2006     c

In [23]:
# Turn the descriptive labels into identifier-friendly names: lower-case
# them, then replace every character outside [a-z0-9_] (spaces, '-', '!',
# '(', ')', '/', ...) with an underscore.
#
# Two details matter here:
#   * The replacement is '_' rather than '' so that the resulting names match
#     the ones used further down (e.g. 'pubid___yth_id_code_1997' in the
#     set_index cell). Deleting the characters instead would yield
#     'pubid__yth_id_code_1997' and break that lookup.
#   * regex=True is passed explicitly. Relying on the default emits a
#     FutureWarning on older pandas and, on pandas 2.x where the default is
#     regex=False, the character class would be matched as a literal string
#     and nothing would be replaced.
#
# NOTE(review): names that were duplicated before this step (the two
# 'KEY!BDATE M/Y (SYMBOL) 1997' columns) are still duplicated afterwards.
nls97spss.columns = (
    nls97spss.columns
    .str.lower()
    .str.replace('[^a-z0-9_]', '_', regex=True)
)

  """Entry point for launching an IPython kernel.


In [24]:
# Confirm the cleaned column names; dtypes are unchanged by the rename.
nls97spss.dtypes

pubid___yth_id_code_1997                        float64
key_sex__symbol__1997                          category
key_bdate_m_y__symbol__1997                     float64
key_bdate_m_y__symbol__1997                     float64
cv_sample_type_1997                            category
key_race_ethnicity__symbol__1997               category
trans_sat_verbal_hstr                           float64
trans_sat_math_hstr                             float64
trans_crd_gpa_overall_hstr                      float64
trans_crd_gpa_eng_hstr                          float64
trans_crd_gpa_math_hstr                         float64
trans_crd_gpa_lp_sci_hstr                       float64
govt_responsibility___provide_jobs_2006        category
govt_respnsblty___keep_prices_und_ctrl_2006    category
govt_respnsblty___hlth_care_for_sick_2006      category
govt_respnsblty__prov_eld_liv_stand_2006       category
govt_respnsblty__prov_ind_help_2006            category
govt_respnsblty__prov_unemp_liv_stand_2006     c

### Set the Index

In [26]:
# Make the PUBID youth id code column the row index. The result is rebound
# rather than using inplace=True, which offers no performance benefit and
# prevents method chaining.
# Like the in-place version, this cell can only run once per kernel session:
# after it runs the column has moved into the index, so a second run raises
# KeyError.
nls97spss = nls97spss.set_index('pubid___yth_id_code_1997')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=48bf7820-5e38-454a-a6db-ca906c5b65d0' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>