<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-load-the-NLS-data" data-toc-modified-id="Import-pandas-and-load-the-NLS-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and load the NLS data</a></span></li><li><span><a href="#Check-data-for-missing-values" data-toc-modified-id="Check-data-for-missing-values-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Check data for missing values</a></span></li><li><span><a href="#Remove-rows-where-nearly-all-the-data-is-missing" data-toc-modified-id="Remove-rows-where-nearly-all-the-data-is-missing-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Remove rows where nearly all the data is missing</a></span></li><li><span><a href="#Assign-the-mean-of-the-GPA-values-where-it's-missing" data-toc-modified-id="Assign-the-mean-of-the-GPA-values-where-it's-missing-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Assign the mean of the GPA values where it's missing</a></span></li><li><span><a href="#Use-forward-fill-to-replace-missing-values" data-toc-modified-id="Use-forward-fill-to-replace-missing-values-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Use forward fill to replace missing values</a></span></li><li><span><a href="#Fill-missing-values-with-the-mean-by-group" data-toc-modified-id="Fill-missing-values-with-the-mean-by-group-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Fill missing values with the mean by group</a></span></li></ul></div>

# Import pandas and load the NLS data

In [1]:
import pandas as pd

In [2]:
# Optional pandas display settings, currently disabled (commented out).
# Uncomment a line to apply that option for the rest of the session.
# pd.set_option('display.width', 200)
# pd.set_option('display.max_columns', 12)
# pd.set_option('display.max_rows', 100)
# pd.options.display.float_format = '{:,.0f}'.format

In [3]:
# Load the watermark IPython extension so the %watermark magic is available.
import watermark
%load_ext watermark

# Record the environment for reproducibility: -iv lists the versions of the
# imported modules (see the output below).
# NOTE(review): -n and -i presumably add date/time stamps; confirm against the watermark docs.
%watermark -n -i -iv

json     : 2.0.9
pandas   : 1.2.1
watermark: 2.1.0



In [4]:
# Read the NLS extract and key every row by respondent id in a single
# chained expression (no in-place mutation of the freshly loaded frame).
nls97 = pd.read_csv('data/nls97c.csv').set_index('personid')

In [6]:
# Names of the school-record columns to pull out of nls97, one per line.
schoolrecordlist = [
    # SAT scores
    'satverbal',
    'satmath',
    # grade point averages
    'gpaoverall',
    'gpaenglish',
    'gpamath',
    'gpascience',
    # educational attainment
    'highestdegree',
    'highestgradecompleted',
]

In [7]:
# Names of the demographic / lifestyle columns to pull out of nls97, one per line.
demolist = [
    # household
    'maritalstatus',
    'childathome',
    'childnotathome',
    # income
    'wageincome',
    # time use
    'weeklyhrscomputer',
    'weeklyhrstv',
    'nightlyhrssleep',
]

In [8]:
# Subset nls97 to the columns named in schoolrecordlist; all rows are kept.
schoolrecord = nls97[schoolrecordlist]

In [9]:
# Subset nls97 to the columns named in demolist; all rows are kept.
demo = nls97[demolist]

In [10]:
# (rows, columns) of the school-record subset: one column per name in schoolrecordlist.
schoolrecord.shape

(8984, 8)

In [11]:
# (rows, columns) of the demographic subset: one column per name in demolist.
demo.shape

(8984, 7)

# Check data for missing values

In [12]:
# axis=0 sums down each column, giving the number of missing values per
# column. (axis=1 would instead count the missing values in each row.)

schoolrecord.isnull().sum(axis=0)

satverbal                7578
satmath                  7577
gpaoverall               2980
gpaenglish               3186
gpamath                  3218
gpascience               3300
highestdegree              31
highestgradecompleted    2321
dtype: int64

In [13]:
# Row-wise tally: how many of the school-record columns are missing for each respondent.
missing_count = schoolrecord.isna().sum(axis='columns')

In [14]:
# Number of rows at each missing-value count (0 through 8), ordered by that count.
missing_count.value_counts().sort_index()

0    1087
1     312
2    3210
3    1102
4     176
5     101
6    2039
7     946
8      11
dtype: int64

In [15]:
# Preview four respondents missing at least 7 of the 8 columns; transpose so
# each respondent is a column and each field is a row.
schoolrecord[missing_count.ge(7)].head(4).transpose()

personid,101705,102061,102648,104627
satverbal,,,,
satmath,,,,
gpaoverall,,,,
gpaenglish,,,,
gpamath,,,,
gpascience,,,,
highestdegree,1. GED,0. None,1. GED,0. None
highestgradecompleted,,,,


# Remove rows where nearly all the data is missing

In [16]:
# thresh=2 keeps only rows that have at least two non-missing values, so rows
# missing 7 or all 8 of the columns are dropped.
schoolrecord = schoolrecord.dropna(thresh=2)

In [17]:
# Recount the missing values per row to confirm that no row is missing
# more than 6 columns after the drop.
(
    schoolrecord
    .isna()
    .sum(axis='columns')
    .value_counts()
    .sort_index()
)

0    1087
1     312
2    3210
3    1102
4     176
5     101
6    2039
dtype: int64

# Assign the mean of the GPA values where it's missing

In [18]:
# Mean overall GPA; note that int() truncates it to a whole number
# (2 in the output below), discarding the fractional part.
int(schoolrecord['gpaoverall'].mean())

2

In [20]:
# How many overall GPA values are still missing before imputation.
schoolrecord['gpaoverall'].isna().sum()

2023

In [21]:
# Impute missing overall GPAs with the column mean. The mean is used as-is:
# wrapping it in int() truncates it to 2, which biases every imputed value
# downward and is not the mean this section sets out to assign.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on a selected column; that chained form does not
# update the frame under pandas Copy-on-Write.
gpa_mean = schoolrecord['gpaoverall'].mean()
schoolrecord = schoolrecord.assign(
    gpaoverall=schoolrecord['gpaoverall'].fillna(gpa_mean))

In [22]:
# Confirm that no overall GPA values remain missing after the fill.
schoolrecord['gpaoverall'].isna().sum()

0

# Use forward fill to replace missing values

In [23]:
# First five wage incomes before filling. A Series is its own transpose,
# so no .T is needed to display it.
demo['wageincome'].head()

personid
100061     12500.0
100139    120000.0
100284     58000.0
100292         NaN
100583     30000.0
Name: wageincome, dtype: float64

In [24]:
# How many wage income values are missing before the forward fill.
demo['wageincome'].isna().sum()

3893

In [27]:
# Forward fill: replace each missing wage income with the nearest preceding
# non-missing value in row order.
# Series.ffill() is used with an explicit assignment instead of
# fillna(method='ffill', inplace=True) on a selected column: the `method`
# argument is deprecated in newer pandas, and the chained in-place form does
# not update nls97 under Copy-on-Write.
nls97['wageincome'] = nls97['wageincome'].ffill()

In [28]:
# demo was built before wageincome was filled in nls97, so rebuild it from
# nls97 to pick up the filled column.
demo = nls97[demolist]

In [29]:
# Same five rows after the forward fill; the previously missing entry now
# repeats the value above it. (.T is omitted: it is a no-op on a Series.)
demo['wageincome'].head()

personid
100061     12500.0
100139    120000.0
100284     58000.0
100292     58000.0
100583     30000.0
Name: wageincome, dtype: float64

In [30]:
# Confirm that no wage income values remain missing after the forward fill.
demo['wageincome'].isna().sum()

0

# Fill missing values with the mean by group

In [31]:
# Preview the grouping column (highestdegree) next to the column to be
# imputed (weeksworked17); the fourth row shown has a missing weeksworked17.
nls97[['highestdegree', 'weeksworked17']].head()

Unnamed: 0_level_0,highestdegree,weeksworked17
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
100061,2. High School,48.0
100139,2. High School,52.0
100284,0. None,0.0
100292,4. Bachelors,
100583,2. High School,52.0


In [32]:
# One row per highestdegree value holding the mean of weeksworked17 for that
# group, stored in a column named meanweeksworked17.
workbydegree = (
    nls97
    .groupby('highestdegree')['weeksworked17']
    .mean()
    .rename('meanweeksworked17')
    .reset_index()
)

In [33]:
# Attach each respondent's group mean (meanweeksworked17) by joining on
# highestdegree, then restore personid as the index.
# - Any meanweeksworked17 column left by a previous run of this cell is
#   dropped first, so re-running does not create meanweeksworked17_x/_y
#   columns and break the cells that follow.
# - how='left' keeps every respondent, including those whose highestdegree
#   has no match in workbydegree (their mean stays NaN).
# - validate='many_to_one' raises if workbydegree ever holds duplicate keys,
#   which would otherwise silently duplicate respondents.
nls97 = (
    nls97
    .drop(columns='meanweeksworked17', errors='ignore')
    .reset_index()
    .merge(workbydegree,
           on='highestdegree',
           how='left',
           validate='many_to_one')
    .set_index('personid')
)

In [34]:
# Fill each missing weeksworked17 with the mean for the respondent's
# highestdegree group (values are aligned on the personid index).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a selected column, which does not update nls97
# under pandas Copy-on-Write.
nls97['weeksworked17'] = nls97['weeksworked17'].fillna(
    nls97['meanweeksworked17'])

In [35]:
# Check the result: the previously missing weeksworked17 now equals its
# group's meanweeksworked17, while observed values are unchanged.
nls97[['highestdegree', 'weeksworked17', 'meanweeksworked17']].head()

Unnamed: 0_level_0,highestdegree,weeksworked17,meanweeksworked17
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,2. High School,48.0,38.150469
100139,2. High School,52.0,38.150469
100284,0. None,0.0,28.719608
100292,4. Bachelors,43.565574,43.565574
100583,2. High School,52.0,38.150469
