<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-the-pandas,-os,-and-sys-libraries" data-toc-modified-id="Import-the-pandas,-os,-and-sys-libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import the pandas, os, and sys libraries</a></span></li><li><span><a href="#Import-the-basicdescriptives-module" data-toc-modified-id="Import-the-basicdescriptives-module-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import the basicdescriptives module</a></span></li><li><span><a href="#Show-summary-statistics-for-continuous-variables" data-toc-modified-id="Show-summary-statistics-for-continuous-variables-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Show summary statistics for continuous variables</a></span></li><li><span><a href="#Create-a-function-to-count-missing-values-by-columns-and-rows" data-toc-modified-id="Create-a-function-to-count-missing-values-by-columns-and-rows-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Create a function to count missing values by columns and rows</a></span></li><li><span><a href="#Call-the-getmissings-function" data-toc-modified-id="Call-the-getmissings-function-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Call the getmissings function</a></span></li><li><span><a href="#Call-the-makefreqs-function" data-toc-modified-id="Call-the-makefreqs-function-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Call the makefreqs function</a></span></li><li><span><a href="#Pass-the-marital-status,-gender,-and-college-enrollment-columns-to-the-getcnts-function" data-toc-modified-id="Pass-the-marital-status,-gender,-and-college-enrollment-columns-to-the-getcnts-function-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Pass the marital status, gender, and college enrollment columns to the getcnts function</a></span></li><li><span><a href="#Use-the-rowsel-parameter-of-getcnts-to-limit-the-output-to-specific-rows" data-toc-modified-id="Use-the-rowsel-parameter-of-getcnts-to-limit-the-output-to-specific-rows-8"><span 
class="toc-item-num">8&nbsp;&nbsp;</span>Use the rowsel parameter of getcnts to limit the output to specific rows</a></span></li></ul></div>

# Import the pandas, os, and sys libraries

In [1]:
import pandas as pd
import os
import sys

In [2]:
# Importing the package (in addition to loading the extension) puts it in the
# namespace, which is why its own version appears in the report below.
import watermark
%load_ext watermark

# Record the environment: -iv lists the versions of the modules imported so far.
# NOTE(review): -n and -i presumably add date/timestamp details — confirm
# against the watermark documentation.
%watermark -n -i -iv

watermark: 2.1.0
pandas   : 1.2.1
sys      : 3.7.9 (default, Aug 31 2020, 17:10:11) [MSC v.1916 64 bit (AMD64)]
json     : 2.0.9



In [3]:
# Read the NLS97 extract and use the personid column as the index, in one
# chained expression rather than an in-place mutation.
nls97 = pd.read_csv('data/nls97f.csv').set_index('personid')

# Import the basicdescriptives module

In [4]:
# Make the helperfunctions folder (under the current working directory)
# importable. os.path.join supplies the right separator on every platform; the
# earlier '\helperfunctions' literal hard-coded the Windows separator and
# relied on the invalid escape sequence '\h'.
helperdir = os.path.join(os.getcwd(), 'helperfunctions')
if helperdir not in sys.path:  # re-running this cell must not add duplicates
    sys.path.append(helperdir)
# print(sys.path)  # uncomment to inspect the module search path

In [5]:
import basicdescriptives as bd

# Show summary statistics for continuous variables

In [6]:
# Summary statistics for the two SAT score columns, transposed so that each
# statistic is a row and each variable is a column (see the output below).
bd.gettots(nls97[['satverbal', 'satmath']]).T

Unnamed: 0,satverbal,satmath
min,14.0,7.0
per15,390.0,390.0
qr1,430.0,430.0
med,500.0,500.0
qr3,570.0,580.0
per85,620.0,621.0
max,800.0,800.0
count,1406.0,1407.0
mean,499.72404,500.590618
iqr,140.0,150.0


In [7]:
# Same summary for every column whose name contains 'weeksworked'; left
# untransposed, so the output has one row per variable.
bd.gettots(nls97.filter(like='weeksworked'))

Unnamed: 0,min,per15,qr1,med,qr3,per85,max,count,mean,iqr
weeksworked00,0.0,0.0,5.0,26.0,50.0,53.0,53.0,8603,26.417761,45.0
weeksworked01,0.0,0.0,10.0,33.0,51.0,52.0,52.0,8564,29.784096,41.0
weeksworked02,0.0,0.0,13.0,38.0,52.0,52.0,52.0,8556,31.8054,39.0
weeksworked03,0.0,0.0,14.0,43.0,52.0,52.0,52.0,8490,33.469611,38.0
weeksworked04,0.0,1.0,18.0,46.0,52.0,52.0,52.0,8458,35.104635,34.0
weeksworked05,0.0,5.0,22.0,50.0,53.0,53.0,53.0,8403,37.316435,31.0
weeksworked06,0.0,9.0,27.0,51.0,52.0,52.0,52.0,8340,38.429976,25.0
weeksworked07,0.0,10.0,30.0,52.0,52.0,52.0,52.0,8272,39.241296,22.0
weeksworked08,0.0,9.0,30.0,52.0,52.0,52.0,52.0,8186,39.287564,22.0
weeksworked09,0.0,0.0,22.0,52.0,52.0,52.0,52.0,8146,37.419961,30.0


# Create a function to count missing values by columns and rows

In [8]:
# Missing-value summary for weeksworked16 and weeksworked17, unpacked into a
# by-column result and a by-row result.
# The second positional argument is True here; judging by the outputs below it
# makes the by-row result a proportion of rows rather than a raw count.
# NOTE(review): confirm the parameter's name and meaning in basicdescriptives.
missingsbycols, missingsbyrows = bd.getmissings(
    nls97[['weeksworked16', 'weeksworked17']], True)

In [9]:
# Number of missing values in each of the two columns.
missingsbycols

weeksworked16    1916
weeksworked17    2314
dtype: int64

In [10]:
# missingsbyrows is indexed by the number of missing values in a row (0, 1 or 2).
# 73.9% of rows have 0 missing values for weeksworked16 and weeksworked17,
# while 21.0% of rows are missing both.

missingsbyrows

0    0.739203
1    0.050757
2    0.210040
dtype: float64

In [11]:
# (rows, columns) of the full frame, for reference when reading row counts.
nls97.shape

(8984, 89)

# Call the getmissings function

In [12]:
# Same columns, but without the second argument: the by-row result now comes
# back as integer row counts (see the output below).
missingsbycols, missingsbyrows = bd.getmissings(
    nls97[['weeksworked16', 'weeksworked17']])

In [13]:
# Number of rows with 0, 1 or 2 missing values across the two columns.
missingsbyrows

0    6641
1     456
2    1887
dtype: int64

In [14]:
# Frequencies for all categorical variables are calculated by bd.makefreqs (from the basicdescriptives module), which is called below

# Call the makefreqs function

In [15]:
# Change the data type of each object column to category.
# Assignment through nls97.loc[:, mask] = ... is avoided on purpose: recent
# pandas versions try to write the new values into the existing object columns
# in place, which leaves their dtype as object. astype with a per-column
# mapping returns a new frame carrying the requested dtypes instead.
objcols = nls97.select_dtypes(['object']).columns
nls97 = nls97.astype({col: 'category' for col in objcols})

In [16]:
# Check the result of the conversion: the Dtype column of the listing below
# should show category for the former object columns (e.g. gender).
nls97.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 89 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   birthmonth             8984 non-null   int64   
 2   birthyear              8984 non-null   int64   
 3   highestgradecompleted  6663 non-null   float64 
 4   maritalstatus          6672 non-null   category
 5   childathome            4791 non-null   float64 
 6   childnotathome         4791 non-null   float64 
 7   wageincome             5091 non-null   float64 
 8   weeklyhrscomputer      6710 non-null   category
 9   weeklyhrstv            6711 non-null   category
 10  nightlyhrssleep        6706 non-null   float64 
 11  satverbal              1406 non-null   float64 
 12  satmath                1407 non-null   float64 
 13  gpaoverall             6004 non-null   float64 
 14  gpaenglish             5798 non-n

In [17]:
# Run makefreqs over the whole frame, passing the path 'views/nlsfreqs.txt'.
# The call displays nothing in the notebook.
# NOTE(review): presumably the frequency tables are written to that text file,
# which would require the views folder to exist — confirm in basicdescriptives.
bd.makefreqs(nls97, 'views/nlsfreqs.txt')

# Pass the marital status, gender, and college enrollment columns to the getcnts function

In [18]:
# group counts and percentages for subgroups within groups:
# in the output below, cat_count is the count for each
# maritalstatus/gender/colenroct00 combination, tot_count is the total for the
# enclosing maritalstatus/gender group, and percent is cat_count / tot_count
bd.get_counts(nls97, ['maritalstatus', 'gender', 'colenroct00'])

Unnamed: 0,maritalstatus,gender,colenroct00,cat_count,tot_count,percent
0,Divorced,Female,1. Not enrolled,317,393,0.806616
1,Divorced,Female,2. 2-year college,35,393,0.089059
2,Divorced,Female,3. 4-year college,41,393,0.104326
3,Divorced,Male,1. Not enrolled,238,270,0.881481
4,Divorced,Male,2. 2-year college,15,270,0.055556
5,Divorced,Male,3. 4-year college,17,270,0.062963
6,Married,Female,1. Not enrolled,1168,1636,0.713936
7,Married,Female,2. 2-year college,143,1636,0.087408
8,Married,Female,3. 4-year college,325,1636,0.198655
9,Married,Male,1. Not enrolled,1094,1430,0.765035


# Use the rowsel parameter of getcnts to limit the output to specific rows

In [19]:
# The third argument is a row-selection expression: only result rows whose
# colenroct00 value starts with "1" (Not enrolled) are shown below.
# The original row labels (0, 3, 6, ...) and tot_count values are unchanged, so
# percent still refers to the full maritalstatus/gender group.
# NOTE(review): presumably evaluated with DataFrame.query — confirm in
# basicdescriptives.
bd.get_counts(nls97, ['maritalstatus', 'gender', 'colenroct00'],
              'colenroct00.str[0:1] == "1"')

Unnamed: 0,maritalstatus,gender,colenroct00,cat_count,tot_count,percent
0,Divorced,Female,1. Not enrolled,317,393,0.806616
3,Divorced,Male,1. Not enrolled,238,270,0.881481
6,Married,Female,1. Not enrolled,1168,1636,0.713936
9,Married,Male,1. Not enrolled,1094,1430,0.765035
12,Never-married,Female,1. Not enrolled,1094,1307,0.837031
15,Never-married,Male,1. Not enrolled,1268,1459,0.869088
18,Separated,Female,1. Not enrolled,66,79,0.835443
21,Separated,Male,1. Not enrolled,67,75,0.893333
24,Widowed,Female,1. Not enrolled,16,19,0.842105
27,Widowed,Male,1. Not enrolled,3,4,0.75
