In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import pathlib
import os
import zipfile
import seaborn as sns
from multiprocessing import Pool
from src.data.data_prep_utils import df_from_csv_no_geo_extra, df_from_csv_no_geo

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Explore the Different CSVs

In [2]:
# Report how many CPUs Python sees on this machine
import multiprocessing
print('CPUs avail:', multiprocessing.cpu_count()) # or os.cpu_count()

# Print the lines of /proc/meminfo that contain "Mem", using an IPython shell escape.
# NOTE(review): /proc/meminfo is Linux-specific, so this presumably fails on other platforms.
# https://stackoverflow.com/a/48140392/9214620
!cat /proc/meminfo | grep Mem

CPUs avail: 32
MemTotal:       263745244 kB
MemFree:        70700172 kB
MemAvailable:   227684088 kB


In [3]:
# All data folders hang off the parent of the current working directory.
root_dir = Path.cwd().parent
folder_raw_data, folder_processed_data, folder_external_data = (
    root_dir.joinpath(sub_folder)
    for sub_folder in ('data/raw', 'data/processed', 'data/external')
)

In [4]:
# get a list of file names
files = os.listdir(folder_raw_data)
file_list = [folder_raw_data / filename for filename in files if filename.endswith('.csv')]
# file_list

Each CSV is very large, so we will only load certain columns. The column names are listed in the description PDF. These are the ones we will use for **2003-2004**:
* **dob_yy**: date of birth year
* **dob_mm**: date of birth month
* **dob_wk**: date of birth weekday
* **mrstate**: mother's resident state
* **mrcntyfips**: mother's resident county FIPS code
* **mrcityfips**: mother's place of residence (city) FIPS code
* **apgar5**: five minute Apgar score
* **apgar5r**: five minute Apgar score, recoded

Geographic data is not available from **2005 onwards** (see [column description for 2005 on NBER](https://data.nber.org/natality/2005/desc/natl2005/desc.txt)). Therefore, only these columns will be used for those years:
* **dob_yy**: date of birth year
* **dob_mm**: date of birth month
* **dob_wk**: date of birth weekday
* **apgar5**: five minute Apgar score
* **apgar5r**: five minute Apgar score, recoded


The column naming conventions differ depending on the year. Here are the ones we'll use for **1989 through 2002** (see [column description for 1989 on NBER](https://data.nber.org/natality/1989/desc/natl1989/desc.txt)):

* **biryr**: date of birth year (can also use 'datayear')
* **birmon**: date of birth month
* **weekday**: date of birth weekday
* **stresfip**: mother's resident state FIPS code
* **cntyrfip**: mother's resident county FIPS code
* **cityres**: mother's place of residence (city) - unsure if fips code
* **fmaps**: five minute Apgar score
* **fmapsr**: five minute Apgar score, recoded


Here are the ones we'll use for **1982 through 1988** (see [column description on NBER](https://data.nber.org/natality/1988/desc/natl1988/desc.txt)):

* **datayear**: date of birth year
* **birmon**: date of birth month
* **birday**: birth date - day (like the 15th of June) (1968-1988 only)
* ~~**weekday**: date of birth weekday~~ (does not exist in this date range)
* **stresfip**: mother's resident state FIPS code
* **cntyrfip**: mother's resident county FIPS code
* **cityres**: mother's place of residence (city) - unsure if fips code
* **fmaps**: five minute Apgar score
* **fmapsr**: five minute Apgar score, recoded

From 1978 onwards, APGAR score is included. (fmaps)

In [None]:
# Data folders, resolved relative to the parent of the current working directory.
# These are the same four assignments made in an earlier cell of this notebook.
root_dir = Path.cwd().parent
folder_raw_data = root_dir / 'data/raw'
folder_processed_data = root_dir / 'data/processed'
folder_external_data = root_dir / 'data/external'

def main(folder_raw_data, n_processes=12):
    """Combine every raw CSV in a folder into a single dataframe.

    Each ``*.csv`` file found directly inside ``folder_raw_data`` is handed to
    ``df_from_csv_no_geo_extra`` in a worker process, and the per-file
    dataframes are concatenated row-wise.

    Parameters
    ----------
    folder_raw_data : str or pathlib.Path
        Folder that holds the raw ``.csv`` files.
    n_processes : int or None, optional
        Size of the worker pool. Defaults to 12, the value that used to be
        hard-coded. ``None`` lets ``multiprocessing.Pool`` pick its own default.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file dataframes with a fresh integer index.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` files.
    """
    folder_raw_data = Path(folder_raw_data)

    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the combined dataframe reproducible between runs
    file_list = sorted(
        folder_raw_data / filename
        for filename in os.listdir(folder_raw_data)
        if filename.endswith(".csv")
    )

    # fail early with a clear message instead of starting a pool and then
    # hitting pandas' "No objects to concatenate" error
    if not file_list:
        raise ValueError(f"No .csv files found in {folder_raw_data}")

    # pool.map blocks until every file is processed, so the pool can be
    # closed before the results are combined
    with Pool(processes=n_processes) as pool:
        df_list = pool.map(df_from_csv_no_geo_extra, file_list)

    return pd.concat(df_list, ignore_index=True)


if __name__ == "__main__":

    # project root: the parent of the current working directory
    project_dir = Path.cwd().parent

    # build the combined dataframe from every CSV in data/raw
    df = main(project_dir / "data/raw/")
    print("Final df shape:", df.shape)

    # make sure the output folder exists before writing into it
    processed_dir = project_dir / "data/processed"
    processed_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(processed_dir / "birth_no_geo_apgar.csv", index=False)

In [None]:
df.head()

## Fix 2019 and 2020 input

In [20]:
# Raw file for the year under inspection
year = 2019
file_natl = folder_raw_data.joinpath(f'natl{year}.csv')

In [15]:
# Read a single row with every column as a string, and lower-case the column names.
df = pd.read_csv(file_natl, nrows=1, dtype=str).rename(columns=str.lower)
df.head()

Unnamed: 0,dob_yy,dob_mm,dob_tt,dob_wk,octerr,ocntyfips,ocntypop,bfacil,f_facility,bfacil3,...,ca_cleft,ca_clpal,f_ca_cleft,f_ca_downs,f_ca_chrom,f_ca_hypos,no_congen,itran,ilive,f_bfed
0,2020,1,1123,4,,,,1,1,1,...,N,N,1,1,1,1,1,N,Y,1


In [21]:
df_from_csv_no_geo_extra(file_natl, nrows=3)

2019 processing complete


Unnamed: 0,dob_yy,dob_mm,apgar5,births
0,2019,1,8,2
1,2019,1,9,1


In [25]:
df_from_csv_no_geo(file_natl, nrows=3)

2019 processing complete


Unnamed: 0,dob_yy,dob_mm,births
0,2019,1,3


# Load Previously Created Data

In [9]:
# Read the aggregated births table back from the processed-data folder.
root_dir = Path.cwd().parent
folder_processed_data = root_dir.joinpath('data/processed')

df = pd.read_csv(folder_processed_data.joinpath("birth_no_geo_apgar.csv"))
df

Unnamed: 0,dob_yy,dob_mm,apgar5,births
0,2011,1,0,167
1,2011,1,1,720
2,2011,1,10,10864
3,2011,1,2,492
4,2011,1,3,586
...,...,...,...,...
6043,1995,9,6,1498
6044,1995,9,7,3924
6045,1995,9,8,18047
6046,1995,9,9,207028


In [None]:
# Keep only the rows whose Apgar score is not 99.
# NOTE(review): 99 presumably encodes "unknown / not stated" — confirm against the data dictionary.
df = df.loc[df['apgar5'].ne(99)]
df['apgar5'].unique()

In [None]:
# total births for every (year, Apgar score) pair
df_count = df.groupby(['dob_yy', 'apgar5']).agg({'births':'sum'})

# Share of each score within its year, in percent.
# transform('sum') returns the yearly total aligned to the existing
# (dob_yy, apgar5) index. groupby(...).apply(...) would prepend the group key as an
# extra index level on pandas >= 2.0, and a later reset_index() would then fail.
df_count = 100 * df_count / df_count.groupby(level=0).transform('sum')
df_count

In [None]:
# The 'births' column now holds percentages: rename it, then turn the index levels into columns.
df_count = df_count.rename(columns={'births':'births_pct'}).reset_index()
df_count.head()

In [None]:
# plot the apgar5 distribution by year
df_a = df_count[df_count['apgar5']==1]

plt.plot(df_a['dob_yy'], df_a['births_pct'])

In [None]:
# plot the "births_pct" distribution by year all on the same plot
df_b = df_count[df_count['apgar5']==2]

In [None]:
years = np.sort(df_count['dob_yy'].unique())
years

In [None]:
# percentage of births for each Apgar score below 3, one scatter series per year
fig, ax = plt.subplots(figsize=(12,8))
for year in np.sort(df_count['dob_yy'].unique()):
    df_year = df_count[(df_count['dob_yy']==year) & (df_count['apgar5']<3)]
    ax.scatter(df_year['apgar5'], df_year['births_pct'], label=year)

# set y-axis to log scale
ax.set_yscale('log')

# name the axes so the figure can be read on its own
ax.set_xlabel('apgar5')
ax.set_ylabel('births_pct')

# every scatter call passes label=year; without a legend those labels are never shown
ax.legend(title='dob_yy', ncol=3, fontsize='small')


In [None]:
# One line per year (2008, then 2009): births_pct against the Apgar score.
for plot_year in (2008, 2009):
    df_plot_year = df_count[df_count['dob_yy']==plot_year]
    plt.plot(df_plot_year['apgar5'], df_plot_year['births_pct'],)

# change the y-axis scale to log
plt.yscale('log')

## Create Data with Percentage by Month
Decent article on groupby: https://towardsdatascience.com/data-grouping-in-python-d64f1203f8d3

In [None]:
# Start again from the CSV on disk.
root_dir = Path.cwd().parent
folder_processed_data = root_dir.joinpath('data/processed')

df = pd.read_csv(folder_processed_data.joinpath("birth_no_geo_apgar.csv"))

In [None]:
# drop rows with an Apgar score of 99 and order by year and month
df = df[df['apgar5']!=99].sort_values(by=['dob_yy', 'dob_mm'])

# one row per (year, month, Apgar score) with the summed births
df = df.groupby(['dob_yy', 'dob_mm', 'apgar5']).agg({'births':'sum'}).reset_index()

# total births of the row's (year, month), repeated on every row of that month
df['yy_mm_births_total'] = df.groupby(['dob_yy', 'dob_mm'])['births'].transform('sum')

# share of the month's births with this score, in percent; named 'births_pct'
# (not 'birth_pct') to match the column name used in the rest of the notebook
df['births_pct'] = df['births'] / df['yy_mm_births_total'] * 100
df[:12]

How can we best see if the apgar score is changing over time?
* Heatmap of the apgar score by year or month?
* Should we measure the change in apgar score as a percentage of change over some average?

Let's start working on the heatmap.

## Heatmap Experiment

In [None]:
# Start again from the CSV on disk.
root_dir = Path.cwd().parent
folder_processed_data = root_dir.joinpath('data/processed')

df = pd.read_csv(folder_processed_data.joinpath("birth_no_geo_apgar.csv"))

In [None]:
# Drop score 99, sum births per (year, month, score), then add the monthly total and
# each score's percentage of that total.
df = (
    df.loc[df['apgar5'] != 99]
    .sort_values(by=['dob_yy', 'dob_mm'])
    .groupby(['dob_yy', 'dob_mm', 'apgar5'])
    .agg({'births': 'sum'})
    .reset_index()
    .assign(
        yy_mm_births_total=lambda d: d.groupby(['dob_yy', 'dob_mm'])['births'].transform('sum'),
        births_pct=lambda d: d['births'] / d['yy_mm_births_total'] * 100,
    )
)
df.head()

In [None]:
df[(df['apgar5']==8) & (df['dob_yy']==2008)]

In [None]:
# Percentage of births with an Apgar score of 10, as a year (rows) x month (columns) grid.
# DataFrame.pivot only accepts keyword arguments on pandas >= 2.0, so name them explicitly.
sns.heatmap(
    df[df['apgar5'] == 10][['dob_yy', 'dob_mm', 'births_pct']].pivot(
        index='dob_yy', columns='dob_mm', values='births_pct'
    )
)

# Scratch
https://www.kite.com/python/answers/how-to-generate-percentages-of-pandas-columns-in-python

In [None]:
dfp = df.groupby(['dob_yy', 'dob_mm', 'apgar5'], as_index=False).agg({'births': 'sum'})
dfp

In [None]:
# dfp has a plain integer index (as_index=False), so groupby(level=0) would put every
# row in its own group and turn every value into 100 (or NaN). Group on the year
# column instead and replace 'births' with its percentage of the year's total,
# leaving the key columns untouched so they can still be filtered on.
dp_perc = dfp.assign(
    births=100.0 * dfp['births'] / dfp.groupby('dob_yy')['births'].transform('sum')
)
dp_perc

In [None]:
dp_perc[dp_perc['dob_yy']==1978]

In [None]:
desc = df.describe()
desc

In [None]:
# list.sort() sorts in place and returns None, so `a` would be None;
# sorted() returns the sorted list itself
a = sorted(df['dob_yy'].unique())
a

In [None]:
# NOTE(review): no assignment to `unique` is visible in this notebook, so this cell
# presumably raises NameError on a fresh kernel — define it or delete the cell.
unique

In [None]:
# Run the project loader on the first 2000 rows of the 1968 file.
year = 1968

df = df_from_csv_no_geo_extra(folder_raw_data.joinpath(f'natl{year}.csv'), nrows=2000)
df

In [None]:
df[(df['dob_yy']==1989) & (df['apgar5']==1) & (df['dob_mm']==1)]['apgar5']

In [None]:
df.dtypes

In [None]:
df = pd.DataFrame(columns=["dob_yy", "dob_mm", "apgar5", "births"])
df

In [None]:
# columns for 2003 through 2004
col_load_1 = ['dob_yy','dob_mm','dob_wk','mrstate','mrcntyfips','mrcityfips', 'apgar5']
col_1_dtype = [int, int, int, str, int, int, int]

# columns for 1989 through 2002
col_load_2 =['biryr', 'birmon', 'weekday', 'stresfip', 'cntyrfip', 'cityres']

# columns for 2005+
col_load_3 =['dob_yy','dob_mm','dob_wk']

# columns for 1982 through 1988
col_load_4 =['datayear', 'birmon','birday','stresfip', 'cntyrfip', 'cityres']
rename_col4 = ['dob_yy','dob_mm','dob_day','mrstate','mrcntyfips','mrcityfips',]

# create dictionary to rename older csvs
col_rename_dict = dict(zip(col_load_2, col_load_1))
col_rename_dict4 = dict(zip(col_load_4, rename_col4))
col_rename_dict

Load csv from 1972. This does not have any geographic data...

In [None]:
# One row of the 1972 file, date columns only, renamed to the common column names.
year = 1972
df = pd.read_csv(
    folder_raw_data.joinpath(f'natl{year}.csv'),
    nrows=1,
    usecols=['datayear', 'birmon','birday',],
    dtype=str,
).rename(columns=col_rename_dict4)
df.head()

Load "older" csv (from 1991-2002)

In [None]:
# One row of the 1993 file, renamed to the common column names; the state column is
# then renamed to 'mrstatefips'.
year = 1993
df = (
    pd.read_csv(folder_raw_data.joinpath(f'natl{year}.csv'), nrows=1, usecols=col_load_2, dtype=str)
    .rename(columns=col_rename_dict)
    .rename(columns={'mrstate':'mrstatefips'})
)
df.head()

In [None]:
# Load 'all-geocodes-v2017.csv' from the external-data folder, every column as a string.
df_fips = pd.read_csv(folder_external_data.joinpath('all-geocodes-v2017.csv'), dtype=str)

df_fips.head()

In [None]:
# State-level rows are those whose county, county-subdivision, place and consolidated-city
# codes are all zero strings. ('Consolidtated' is spelled this way in the column name used here.)
is_state_row = (
    df_fips['County Code (FIPS)'].eq('000')
    & df_fips['County Subdivision Code (FIPS)'].eq('00000')
    & df_fips['Place Code (FIPS)'].eq('00000')
    & df_fips['Consolidtated City Code (FIPS)'].eq('00000')
)

# keep only the state code and the area name
df_state_fips = df_fips[is_state_row][
    ['State Code (FIPS)', 'Area Name (including legal/statistical area description)']
]

# give the two columns short names
df_state_fips.columns = ['state_fips', 'state_name_mr']
df_state_fips.head()

In [None]:
# Attach the state name by FIPS code, then drop both code columns.
df = (
    pd.merge(df, df_state_fips, left_on='mrstatefips', right_on='state_fips', how='inner', copy=False)
    .drop(columns=['state_fips', 'mrstatefips'])
)
df.head()

In [None]:
# change the dtype for the numerical columns
df = df.astype({'dob_mm':int, 'dob_wk':int, 'dob_yy':int})
df.head()

Load 1982-1988.

In [None]:
# One row of the 1982 file, renamed to the common column names; the state column is
# then renamed to 'mrstatefips'.
year = 1982
df = (
    pd.read_csv(folder_raw_data.joinpath(f'natl{year}.csv'), nrows=1, usecols=col_load_4, dtype=str)
    .rename(columns=col_rename_dict4)
    .rename(columns={'mrstate':'mrstatefips'})
)
df.head()

In [None]:
# Attach the state name by FIPS code, then drop both code columns.
df = (
    pd.merge(df, df_state_fips, left_on='mrstatefips', right_on='state_fips', how='inner', copy=False)
    .drop(columns=['state_fips', 'mrstatefips'])
)
df.head()

Load 2005+.

In [None]:
year = 2007

# columns for 2005+ (date fields only)
col_load_3 = ['dob_yy', 'dob_mm', 'dob_wk']

# one row, every column read as a string
df = pd.read_csv(folder_raw_data.joinpath(f'natl{year}.csv'), nrows=1, usecols=col_load_3, dtype=str)
df

Load 2003-2004 csv.

In [None]:
# One row of the 2003 file, restricted to the col_load_1 columns, every column as a string.
year = 2003
df = pd.read_csv(folder_raw_data.joinpath(f'natl{year}.csv'), nrows=1, usecols=col_load_1, dtype=str)
df

In [None]:
# load 'state_abbreviations.csv'
df_abbr = pd.read_csv(folder_external_data / 'state_abbreviations.csv',header=None, names=['state','abbr'])
df_abbr.head()

In [None]:
# Join on the state abbreviation, drop both abbreviation columns right away, name the
# full state name 'state_name_mr', and cast the date columns to int.
df = (
    pd.merge(df, df_abbr, left_on='mrstate', right_on='abbr', how='inner', copy=False)
    .drop(columns=['abbr', 'mrstate'])
    .rename(columns={'state':'state_name_mr'})
    .astype({'dob_mm':int, 'dob_wk':int, 'dob_yy':int})
)
df.head()

In [None]:
# Look up the state FIPS code by state name and store it as 'mrstatefips'.
df = pd.merge(
    df, df_state_fips, on='state_name_mr', how='inner', copy=False
).rename(columns={'state_fips':'mrstatefips'})
df.head()

Practice grouping a table together by birth year/month/week-day.

In [None]:
year = 2005

# columns for 2005+ (date fields only)
col_load_3 = ['dob_yy', 'dob_mm', 'dob_wk']

# first 100 rows, every column read as int
df = pd.read_csv(folder_raw_data.joinpath(f'natl{year}.csv'), nrows=100, usecols=col_load_3, dtype=int)
df.head()

In [None]:
df.shape

In [None]:
# Give every record a 'births' value of 1.0, then count the records in each
# (year, month, weekday) group and order the result by the same keys.
df['births'] = np.ones(len(df))
date_keys = ['dob_yy', 'dob_mm', 'dob_wk']
df1 = df.groupby(date_keys, as_index=False).count().sort_values(by=date_keys)
df1[:10]

The final dataframe is much smaller - only 84 rows.

In [None]:
df.columns

In [None]:
# Read the gzip-compressed test file that sits directly under the project root.
root_dir = Path.cwd().parent
data_file = "birth_geo_births_ind_test.csv.gz"

# the county and city FIPS columns are read as strings, the state FIPS code as int
dtype_dict = {
    'dob_yy': int,
    'dob_mm': int,
    'mrcntyfips': str,
    'mrcityfips': str,
    'state_name_mr': str,
    'mrstatefips': int,
    'apgar5': int,
    'births': int,
}

df = pd.read_csv(root_dir.joinpath(data_file), compression='gzip', dtype=dtype_dict)
df.head()

In [None]:
np.sort(df['dob_yy'].unique())

In [None]:
df.shape

In [None]:
# Same read as for the previous test file, pointed at the "summed" file.
data_file = "birth_geo_births_summed_test.csv.gz"

dtype_dict = {
    'dob_yy': int,
    'dob_mm': int,
    'mrcntyfips': str,
    'mrcityfips': str,
    'state_name_mr': str,
    'mrstatefips': int,
    'apgar5': int,
    'births': int,
}

df = pd.read_csv(root_dir.joinpath(data_file), compression='gzip', dtype=dtype_dict)
df.head()

In [None]:
df.shape