<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas,-numpy,-and-the-datetime-module,-and-then-load-the-NLS-and-COVID-case-daily-data:" data-toc-modified-id="Import-pandas,-numpy,-and-the-datetime-module,-and-then-load-the-NLS-and-COVID-case-daily-data:-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas, numpy, and the datetime module, and then load the NLS and COVID case daily data:</a></span></li><li><span><a href="#Show-the-birth-month-and-year-values" data-toc-modified-id="Show-the-birth-month-and-year-values-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Show the birth month and year values</a></span></li><li><span><a href="#Use-the-series-fillna-method-to-set-a-value-for-the-missing-birth-month" data-toc-modified-id="Use-the-series-fillna-method-to-set-a-value-for-the-missing-birth-month-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Use the series fillna method to set a value for the missing birth month</a></span></li><li><span><a href="#Use-month-and-date-integers-to-create-a-datetime-column" data-toc-modified-id="Use-month-and-date-integers-to-create-a-datetime-column-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Use month and date integers to create a datetime column</a></span></li><li><span><a href="#Calculate-age-values-using-a-datetime-column" data-toc-modified-id="Calculate-age-values-using-a-datetime-column-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Calculate age values using a datetime column</a></span></li><li><span><a href="#Convert-a-string-column-into-a-datetime-column" data-toc-modified-id="Convert-a-string-column-into-a-datetime-column-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Convert a string column into a datetime column</a></span></li><li><span><a href="#Show-descriptive-statistics-on-the-datetime-column" data-toc-modified-id="Show-descriptive-statistics-on-the-datetime-column-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Show descriptive statistics on the datetime 
column</a></span></li><li><span><a href="#Create-a-timedelta-object-to-capture-a-date-interval" data-toc-modified-id="Create-a-timedelta-object-to-capture-a-date-interval-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Create a timedelta object to capture a date interval</a></span></li></ul></div>

# Import pandas, numpy, and the datetime module, and then load the NLS and COVID case daily data:

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# pd.set_option('display.width', 200)
# pd.set_option('display.max_columns', 35)
# pd.set_option('display.max_rows', 220)
# pd.options.display.float_format = '{:,.0f}'.format

In [3]:
import watermark
%load_ext watermark

%watermark -n -i -iv

numpy    : 1.19.2
watermark: 2.1.0
pandas   : 1.2.1
json     : 2.0.9



In [4]:
# Read both source files; nls97 is indexed by its personid column.
covidcases = pd.read_csv('data/covidcases720.csv')
nls97 = pd.read_csv('data/nls97c.csv').set_index('personid')

# Show the birth month and year values

In [5]:
nls97[['birthmonth', 'birthyear']].isnull().sum()

birthmonth    1
birthyear     0
dtype: int64

In [6]:
nls97['birthmonth'].value_counts().sort_index()

1.0     815
2.0     693
3.0     760
4.0     659
5.0     689
6.0     720
7.0     762
8.0     782
9.0     839
10.0    765
11.0    763
12.0    736
Name: birthmonth, dtype: int64

In [7]:
nls97['birthyear'].value_counts().sort_index()

1980    1691
1981    1874
1982    1841
1983    1807
1984    1771
Name: birthyear, dtype: int64

# Use the series fillna method to set a value for the missing birth month

In [8]:
# Impute the missing birth month with the mean month truncated to an integer.
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the column selection: with pandas copy-on-write that selection is a
# temporary object, so an in-place fill would leave nls97 itself unchanged.
nls97['birthmonth'] = nls97['birthmonth'].fillna(
    int(nls97['birthmonth'].mean()))

In [9]:
nls97['birthmonth'].value_counts().sort_index()

1.0     815
2.0     693
3.0     760
4.0     659
5.0     689
6.0     721
7.0     762
8.0     782
9.0     839
10.0    765
11.0    763
12.0    736
Name: birthmonth, dtype: int64

In [10]:
nls97.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 88 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 8984 non-null   object 
 1   birthmonth             8984 non-null   float64
 2   birthyear              8984 non-null   int64  
 3   highestgradecompleted  6663 non-null   float64
 4   maritalstatus          6672 non-null   object 
 5   childathome            4791 non-null   float64
 6   childnotathome         4791 non-null   float64
 7   wageincome             5091 non-null   float64
 8   weeklyhrscomputer      6710 non-null   object 
 9   weeklyhrstv            6711 non-null   object 
 10  nightlyhrssleep        6706 non-null   float64
 11  satverbal              1406 non-null   float64
 12  satmath                1407 non-null   float64
 13  gpaoverall             6004 non-null   float64
 14  gpaenglish             5798 non-null   float64
 1

# Use month and date integers to create a datetime column

In [11]:
# Assemble birthdate from components: year and month come from the frame,
# and the day is fixed at 15 for every row.
nls97['birthdate'] = pd.to_datetime(
    nls97[['birthyear', 'birthmonth']]
    .rename(columns={'birthyear': 'year', 'birthmonth': 'month'})
    .assign(day=15)
)

In [12]:
nls97[['birthmonth', 'birthyear', 'birthdate']].head()

Unnamed: 0_level_0,birthmonth,birthyear,birthdate
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,5.0,1980,1980-05-15
100139,9.0,1983,1983-09-15
100284,11.0,1984,1984-11-15
100292,4.0,1982,1982-04-15
100583,6.0,1980,1980-06-15


In [13]:
nls97[['birthmonth', 'birthyear', 'birthdate']].isnull().sum()

birthmonth    0
birthyear     0
birthdate     0
dtype: int64

# Calculate age values using a datetime column

In [24]:
def calc_age(startdate, enddate):
    """Return the number of whole years elapsed from startdate to enddate.

    Both arguments must expose ``year``, ``month`` and ``day`` attributes.
    The difference in calendar years is reduced by one when enddate's
    (month, day) is earlier than startdate's, i.e. the anniversary has not
    yet been reached in enddate's year.
    """
    years = enddate.year - startdate.year
    # Tuples compare element by element: month first, then day on a tie.
    anniversary_pending = (enddate.month, enddate.day) < (startdate.month,
                                                          startdate.day)
    return years - 1 if anniversary_pending else years

In [25]:
# Reference date against which ages are computed.
rundate = pd.Timestamp('2020-07-20')

In [26]:
# calc_age only needs the birth date, so map it over that single column
# instead of building a full row object per person with
# DataFrame.apply(axis=1). rundate is passed through as calc_age's enddate.
nls97['age'] = nls97['birthdate'].apply(calc_age, enddate=rundate)

In [27]:
nls97.loc[100061:100583, ['age', 'birthdate']]

Unnamed: 0_level_0,age,birthdate
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
100061,40,1980-05-15
100139,36,1983-09-15
100284,35,1984-11-15
100292,38,1982-04-15
100583,40,1980-06-15


# Convert a string column into a datetime column

In [28]:
covidcases.iloc[:, 0:6].dtypes

iso_code        object
continent       object
location        object
casedate        object
total_cases    float64
new_cases      float64
dtype: object

In [29]:
covidcases.iloc[:, 0:6].sample(2, random_state=1).T

Unnamed: 0,13482,2445
iso_code,IMN,BRB
continent,Europe,North America
location,Isle of Man,Barbados
casedate,2020-06-20,2020-04-28
total_cases,336.0,80.0
new_cases,0.0,1.0


In [30]:
# casedate is loaded from the CSV as text (object dtype); parse it with an
# explicit year-month-day format so the column becomes datetime64.
CASEDATE_FORMAT = '%Y-%m-%d'
covidcases['casedate'] = pd.to_datetime(covidcases['casedate'],
                                        format=CASEDATE_FORMAT)

In [31]:
covidcases.iloc[:, 0:6].dtypes

iso_code               object
continent              object
location               object
casedate       datetime64[ns]
total_cases           float64
new_cases             float64
dtype: object

In [32]:
covidcases.dtypes

iso_code                                   object
continent                                  object
location                                   object
casedate                           datetime64[ns]
total_cases                               float64
new_cases                                 float64
total_deaths                              float64
new_deaths                                float64
total_cases_per_million                   float64
new_cases_per_million                     float64
total_deaths_per_million                  float64
new_deaths_per_million                    float64
total_tests                               float64
new_tests                                 float64
total_tests_per_thousand                  float64
new_tests_per_thousand                    float64
new_tests_smoothed                        float64
new_tests_smoothed_per_thousand           float64
tests_units                                object
stringency_index                          float64


# Show descriptive statistics on the datetime column

In [33]:
covidcases['casedate'].describe()

  """Entry point for launching an IPython kernel.


count                   29529
unique                    195
top       2020-05-23 00:00:00
freq                      209
first     2019-12-31 00:00:00
last      2020-07-12 00:00:00
Name: casedate, dtype: object

# Create a timedelta object to capture a date interval

In [34]:
# First reported case per location: keep rows with new cases, order them by
# location and date, and retain the earliest row for each location.
firstcase = (
    covidcases
    .query('new_cases > 0')[['location', 'casedate']]
    .sort_values(['location', 'casedate'])
    .drop_duplicates(['location'], keep='first')
    .rename(columns={'casedate': 'firstcasedate'})
)

In [36]:
# Attach each location's first-case date to every daily row.
# - Any firstcasedate column left by a previous run of this cell is dropped
#   first, so re-running does not produce firstcasedate_x / firstcasedate_y.
# - validate='many_to_one' makes the merge raise if firstcase ever holds more
#   than one row per location, instead of silently duplicating daily rows.
covidcases = (
    covidcases
    .drop(columns=['firstcasedate'], errors='ignore')
    .merge(firstcase, on='location', how='left', validate='many_to_one')
)

In [37]:
# Interval between each row's casedate and its location's first case date;
# the result is negative for rows dated before that first case.
covidcases['dayssincefirstcase'] = covidcases['casedate'].sub(
    covidcases['firstcasedate'])

In [39]:
covidcases['dayssincefirstcase'].describe()

count                         29529
mean     56 days 00:15:12.892410850
std      47 days 00:35:41.813685246
min              -62 days +00:00:00
25%                21 days 00:00:00
50%                57 days 00:00:00
75%                92 days 00:00:00
max               194 days 00:00:00
Name: dayssincefirstcase, dtype: object