Product Injury
Data Prep and Load
***  
# Introduction  
TBD - Repeatable project intro from slides

TBD - Add links to data


***
# Notebook Setup
***

In [1]:
# Import libraries
import pandas as pd

***  
# Read Raw Data
***

In [2]:
# Read in the NEIS_FMT worksheet from each workbook
path = '../Data/'
files = ['neiss' + str(x) + '.xlsx' for x in range(2013, 2023)]

all_codes_df = pd.DataFrame()

for file in files:
    raw_df = pd.read_excel(path + file, sheet_name = 'NEISS_FMT')
    raw_df.insert(0, 'Year', file[-9:-5])
    all_codes_df = pd.concat([all_codes_df, raw_df])

all_codes_df

Unnamed: 0,Year,Format name,Starting value for format,Ending value for format,Format value label
0,2013,AGELTTWO,0,0,UNK
1,2013,AGELTTWO,2,120,2 YEARS AND OLDER
2,2013,AGELTTWO,201,201,1 MONTH
3,2013,AGELTTWO,202,202,2 MONTHS
4,2013,AGELTTWO,203,203,3 MONTHS
...,...,...,...,...,...
1244,2022,RACE,2,2,BLACK/AFRICAN AMERICAN
1245,2022,RACE,3,3,OTHER
1246,2022,RACE,4,4,ASIAN
1247,2022,RACE,5,5,AMERICAN INDIAN/ALASKA NATIVE


In [3]:
# Reset index
all_codes_df.reset_index(drop=True, inplace=True)

***  
# Verify Data and Types
***  

In [4]:
all_codes_df.columns[all_codes_df.isnull().any()]

Index(['Format value label'], dtype='object')

In [5]:
# Delete all codes that do not contain a label
rows_to_del = all_codes_df[all_codes_df['Format value label'].isnull()].index
all_codes_df.drop(rows_to_del, axis='index', inplace=True)

In [6]:
all_codes_df.dtypes

Year                         object
Format name                  object
Starting value for format    object
Ending value for format      object
Format value label           object
dtype: object

In [7]:
# Verify all Year data is in our present, and in our date range
all_codes_df['Year'].unique()

array(['2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022'], dtype=object)

In [8]:
# Set Year data type to int
all_codes_df['Year'] = all_codes_df['Year'].astype(int)

In [9]:
# Verify all Format name data is value
all_codes_df['Format name'].unique()

array(['AGELTTWO', 'ALC_DRUG', 'BDYPT', 'DIAG', 'DISP', 'FIRE', 'GENDER',
       'HISP', 'LOC', 'PROD', 'RACE'], dtype=object)

In [10]:
# Review Starting Value data
all_codes_df['Starting value for format'].unique()

array(['               0', '               2', '             201', ...,
       '            9999', '             714', '            1552'],
      dtype=object)

In [11]:
# Strip leading spaces and check that the remaining data is numeric
all_codes_df['Starting value for format'] = all_codes_df['Starting value for format'].str.strip()
all_codes_df[~all_codes_df['Starting value for format'].str.isnumeric()]

Unnamed: 0,Year,Format name,Starting value for format,Ending value for format,Format value label
25,2013,ALC_DRUG,.,.,NA before 2019
105,2013,HISP,.,.,NA before 2019
1273,2014,ALC_DRUG,.,.,NA before 2019
1353,2014,HISP,.,.,NA before 2019
2521,2015,ALC_DRUG,.,.,NA before 2019
2601,2015,HISP,.,.,NA before 2019
3769,2016,ALC_DRUG,.,.,NA before 2019
3849,2016,HISP,.,.,NA before 2019
5017,2017,ALC_DRUG,.,.,NA before 2019
5097,2017,HISP,.,.,NA before 2019


In [12]:
# Remove rows with Starting value for format of period and convert column type to int
rows_to_del = all_codes_df[~all_codes_df['Starting value for format'].str.isnumeric()].index
all_codes_df.drop(rows_to_del, axis='index', inplace=True)
all_codes_df['Starting value for format'] = all_codes_df['Starting value for format'].astype(int)

In [13]:
# Review Starting Value data
all_codes_df['Ending value for format'].unique()

array(['               0', '             120', '             201', ...,
       '            9999', '             714', '            1552'],
      dtype=object)

In [14]:
# Strip leading spaces and check that the remaining data is numeric
all_codes_df['Ending value for format'] = all_codes_df['Ending value for format'].str.strip()
all_codes_df[~all_codes_df['Ending value for format'].str.isnumeric()]

Unnamed: 0,Year,Format name,Starting value for format,Ending value for format,Format value label


In [15]:
# Convert Ending value data to int
all_codes_df['Ending value for format'] = all_codes_df['Ending value for format'].astype(int)

***
# <font color='red'> Pick up here </font>
Review labels for any additional issues
*** 

In [16]:
all_codes_df.dtypes

Year                          int32
Format name                  object
Starting value for format     int32
Ending value for format       int32
Format value label           object
dtype: object

***  
# Validate Code Categories
*** 

In [17]:
# View unique categories and look for duplication
code_cats_list = all_codes_df['Format name'].unique()
code_cats_list

array(['AGELTTWO', 'ALC_DRUG', 'BDYPT', 'DIAG', 'DISP', 'FIRE', 'GENDER',
       'HISP', 'LOC', 'PROD', 'RACE'], dtype=object)

In [18]:
# Verify that a all code categories are present in each annual set of data
missing_cats = 0
for cat in code_cats_list:
    for yr in range(2013, 2023):
        if len(all_codes_df[all_codes_df['Format name'] == cat]) == 0:
            mssing_cats += 1
            print(f'{yr}: Missing {cat}')
if missing_cats == 0:
    print('Validation check passed - all categories are included in all years.')

Validation check passed - all categories are included in all years.


***  
# Validate Codes
*** 