<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-load-the-NLS-data" data-toc-modified-id="Import-pandas-and-load-the-NLS-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and load the NLS data</a></span></li><li><span><a href="#Review-the-structure-of-the-data" data-toc-modified-id="Review-the-structure-of-the-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Review the structure of the data</a></span></li><li><span><a href="#Review-some-of-the-categorical-data" data-toc-modified-id="Review-some-of-the-categorical-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Review some of the categorical data</a></span></li><li><span><a href="#Review-some-descriptive-statistics" data-toc-modified-id="Review-some-descriptive-statistics-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Review some descriptive statistics</a></span></li><li><span><a href="#Look-at-Scholastic-Assessment-Test-(SAT)-math-scores-by-gender" data-toc-modified-id="Look-at-Scholastic-Assessment-Test-(SAT)-math-scores-by-gender-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Look at Scholastic Assessment Test (SAT) math scores by gender</a></span></li><li><span><a href="#Look-at-the-SAT-math-scores-by-gender-and-highest-degree-earned" data-toc-modified-id="Look-at-the-SAT-math-scores-by-gender-and-highest-degree-earned-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Look at the SAT math scores by gender and highest degree earned</a></span></li><li><span><a href="#Add-columns-for-the-count,-max,-and-standard-deviation" data-toc-modified-id="Add-columns-for-the-count,-max,-and-standard-deviation-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Add columns for the count, max, and standard deviation</a></span></li><li><span><a href="#Use-a-dictionary-for-more-complicated-aggregations" data-toc-modified-id="Use-a-dictionary-for-more-complicated-aggregations-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Use a dictionary for more complicated 
aggregations</a></span></li></ul></div>

# Import pandas and load the NLS data

In [1]:
import pandas as pd

In [2]:
# Optional display tweaks, currently disabled:
# pd.set_option('display.width', 90)
# pd.set_option('display.max_columns', 10)
# pd.set_option('display.max_rows', 30)

# Render every float with a thousands separator and one decimal place.
pd.set_option('display.float_format', '{:,.1f}'.format)

In [3]:
# Record the environment this notebook was run in.
# The explicit import appears to be what makes watermark itself show up in the
# version listing printed below.
import watermark
%load_ext watermark

# -iv lists the versions of the imported modules; -n and -i presumably add
# date/time stamps (TODO confirm against the watermark documentation).
%watermark -n -i -iv

watermark: 2.1.0
pandas   : 1.2.1
json     : 2.0.9



In [6]:
# Load the NLS data and use the personid column as the row index.
# set_index() is chained instead of called with inplace=True, so the frame is
# built in one expression and never mutated after it has been assigned.
nls97 = pd.read_csv('data/nls97b.csv').set_index('personid')

# Review the structure of the data

In [7]:
# Column names, non-null counts and dtypes for the first seven columns.
nls97.iloc[:, :7].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 8984 non-null   object 
 1   birthmonth             8984 non-null   int64  
 2   birthyear              8984 non-null   int64  
 3   highestgradecompleted  6663 non-null   float64
 4   maritalstatus          6672 non-null   object 
 5   childathome            4791 non-null   float64
 6   childnotathome         4791 non-null   float64
dtypes: float64(3), int64(2), object(2)
memory usage: 561.5+ KB


# Review some of the categorical data

In [8]:
# Categorical columns whose value distributions are reviewed.
catvars = [
    'gender',
    'maritalstatus',
    'highestdegree',
]

In [9]:
# Print the frequency of each category, ordered by category label, with the
# column name as a heading and blank lines separating the columns.
for col in catvars:
    print(f"{col}\n\n{nls97[col].value_counts().sort_index()}\n\n")

gender

Female    4385
Male      4599
Name: gender, dtype: int64


maritalstatus

Divorced          663
Married          3066
Never-married    2766
Separated         154
Widowed            23
Name: maritalstatus, dtype: int64


highestdegree

0. None             953
1. GED             1146
2. High School     3667
3. Associates       737
4. Bachelors       1673
5. Masters          603
6. PhD               54
7. Professional     120
Name: highestdegree, dtype: int64




# Review some descriptive statistics

In [10]:
# Continuous columns to summarise.
contvars = [
    'satmath',
    'satverbal',
    'weeksworked06',
    'gpaoverall',
    'childathome',
]
# Count, mean, spread and quartiles for each continuous column.
nls97.loc[:, contvars].describe()

Unnamed: 0,satmath,satverbal,weeksworked06,gpaoverall,childathome
count,1407.0,1406.0,8340.0,6004.0,4791.0
mean,500.6,499.7,38.4,2.8,1.9
std,115.0,112.2,18.9,0.6,1.3
min,7.0,14.0,0.0,0.1,0.0
25%,430.0,430.0,27.0,2.4,1.0
50%,500.0,500.0,51.0,2.9,2.0
75%,580.0,570.0,52.0,3.3,3.0
max,800.0,800.0,52.0,4.2,9.0


# Look at Scholastic Assessment Test (SAT) math scores by gender

In [11]:
# Average SAT math score within each gender.
(
    nls97
    .groupby('gender')['satmath']
    .mean()
)

gender
Female   486.6
Male     516.9
Name: satmath, dtype: float64

# Look at the SAT math scores by gender and highest degree earned

In [12]:
# Average SAT math score for every gender / highest-degree combination.
(
    nls97
    .groupby(['gender', 'highestdegree'])['satmath']
    .mean()
)

gender  highestdegree  
Female  0. None           332.6
        1. GED            405.0
        2. High School    430.8
        3. Associates     458.0
        4. Bachelors      501.9
        5. Masters        508.3
        6. PhD            575.5
        7. Professional   599.4
Male    0. None           540.0
        1. GED            320.0
        2. High School    467.7
        3. Associates     481.1
        4. Bachelors      542.2
        5. Masters        574.4
        6. PhD            621.4
        7. Professional   587.7
Name: satmath, dtype: float64

# Add columns for the count, max, and standard deviation

In [13]:
# Several statistics at once: passing a list to agg() yields one output
# column per statistic of overall GPA for each gender / degree group.
(
    nls97
    .groupby(['gender', 'highestdegree'])['gpaoverall']
    .agg(['count', 'mean', 'max', 'std'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max,std
gender,highestdegree,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,0. None,148,2.5,4.0,0.7
Female,1. GED,227,2.3,3.9,0.7
Female,2. High School,1212,2.8,4.2,0.5
Female,3. Associates,290,2.9,4.0,0.5
Female,4. Bachelors,734,3.2,4.1,0.5
Female,5. Masters,312,3.3,4.1,0.4
Female,6. PhD,22,3.5,4.0,0.5
Female,7. Professional,53,3.5,4.1,0.4
Male,0. None,193,2.2,4.0,0.6
Male,1. GED,345,2.2,4.0,0.6


# Use a dictionary for more complicated aggregations

In [16]:
# Map each column to aggregate onto the list of statistics wanted for it.
# Each column gets its own list object.
aggdict = {
    column: ['count', 'mean', 'max', 'std']
    for column in ('weeksworked06', 'childathome')
}
# Apply the per-column statistics to each highest-degree group.
nls97.groupby('highestdegree').agg(aggdict)

Unnamed: 0_level_0,weeksworked06,weeksworked06,weeksworked06,weeksworked06,childathome,childathome,childathome,childathome
Unnamed: 0_level_1,count,mean,max,std,count,mean,max,std
highestdegree,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0. None,703,29.7,52.0,21.6,439,1.8,8.0,1.6
1. GED,1104,33.2,52.0,20.6,693,1.7,9.0,1.5
2. High School,3368,39.4,52.0,18.6,1961,1.9,7.0,1.3
3. Associates,722,40.7,52.0,17.7,428,2.0,6.0,1.1
4. Bachelors,1642,42.2,52.0,16.1,827,1.9,8.0,1.0
5. Masters,601,42.2,52.0,16.1,333,1.9,5.0,0.9
6. PhD,53,38.2,52.0,18.6,32,2.1,6.0,1.1
7. Professional,117,27.1,52.0,20.4,57,1.8,4.0,0.8


In [17]:
# Alternative to the dictionary form: select the columns first, then apply one
# list of statistics to all of them.
# The columns are selected with a list (double brackets). Indexing a GroupBy
# with a bare tuple of names -- ['weeksworked06', 'childathome'] -- has been
# deprecated since pandas 1.0 and raises a ValueError from pandas 2.0 on.
nls97.groupby(['highestdegree'])[['weeksworked06', 'childathome']].agg(
    ['count', 'mean', 'max', 'std']
)

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,weeksworked06,weeksworked06,weeksworked06,weeksworked06,childathome,childathome,childathome,childathome
Unnamed: 0_level_1,count,mean,max,std,count,mean,max,std
highestdegree,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0. None,703,29.7,52.0,21.6,439,1.8,8.0,1.6
1. GED,1104,33.2,52.0,20.6,693,1.7,9.0,1.5
2. High School,3368,39.4,52.0,18.6,1961,1.9,7.0,1.3
3. Associates,722,40.7,52.0,17.7,428,2.0,6.0,1.1
4. Bachelors,1642,42.2,52.0,16.1,827,1.9,8.0,1.0
5. Masters,601,42.2,52.0,16.1,333,1.9,5.0,0.9
6. PhD,53,38.2,52.0,18.6,32,2.1,6.0,1.1
7. Professional,117,27.1,52.0,20.4,57,1.8,4.0,0.8


In [18]:
# Reuse the same aggregation dictionary, this time grouping by marital status.
nls97.groupby('maritalstatus').agg(aggdict)

Unnamed: 0_level_0,weeksworked06,weeksworked06,weeksworked06,weeksworked06,childathome,childathome,childathome,childathome
Unnamed: 0_level_1,count,mean,max,std,count,mean,max,std
maritalstatus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Divorced,660,37.5,52.0,19.1,524,1.5,5.0,1.2
Married,3033,40.3,52.0,17.9,2563,2.1,8.0,1.1
Never-married,2734,37.2,52.0,19.1,1502,1.6,9.0,1.3
Separated,153,33.8,52.0,20.2,137,1.5,8.0,1.4
Widowed,23,37.1,52.0,19.3,18,1.8,5.0,1.4
