In [18]:
from pathlib import Path
import pandas as pd
import numpy as np
import os

pd.set_option("mode.copy_on_write", True)

repo_root = Path("../..")
# source files
src_folder = repo_root / "00_source_data" / "mortality_data"

# output file
out_file = repo_root / "20_intermediate_file" / "overdose_03-15.tsv"

In [19]:
df_list = [pd.read_csv(p, sep="\t") for p in src_folder.glob("*")]
df = pd.concat(df_list)
df

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
0,,"Autauga County, AL",1001.0,2009.0,2009.0,All other non-drug and non-alcohol causes,O9,408.0
1,,"Baldwin County, AL",1003.0,2009.0,2009.0,Drug poisonings (overdose) Unintentional (X40-...,D1,29.0
2,,"Baldwin County, AL",1003.0,2009.0,2009.0,All other non-drug and non-alcohol causes,O9,1669.0
3,,"Barbour County, AL",1005.0,2009.0,2009.0,All other non-drug and non-alcohol causes,O9,299.0
4,,"Bibb County, AL",1007.0,2009.0,2009.0,All other non-drug and non-alcohol causes,O9,207.0
...,...,...,...,...,...,...,...,...
4604,Suggested Citation: Centers for Disease Contro...,,,,,,,
4605,"1999-2017 on CDC WONDER Online Database, relea...",,,,,,,
4606,compiled from data provided by the 57 vital st...,,,,,,,
4607,at http://wonder.cdc.gov/ucd-icd10.html on Oct...,,,,,,,


In [20]:
causes = df[
    ["Drug/Alcohol Induced Cause", "Drug/Alcohol Induced Cause Code"]
].value_counts()
causes

Drug/Alcohol Induced Cause                          Drug/Alcohol Induced Cause Code
All other non-drug and non-alcohol causes           O9                                 40337
Drug poisonings (overdose) Unintentional (X40-X44)  D1                                  7573
All other alcohol-induced causes                    A9                                  6123
Drug poisonings (overdose) Suicide (X60-X64)        D2                                  1465
Drug poisonings (overdose) Undetermined (Y10-Y14)   D4                                   761
All other drug-induced causes                       D9                                   628
Alcohol poisonings (overdose) (X45, X65, Y15)       A1                                   349
Drug poisonings (overdose) Homicide (X85)           D3                                     5
Name: count, dtype: int64

In [21]:
overdoses = ["D1", "D2", "D3", "D4"]

In [22]:
df.County.nunique(), df[
    df["Drug/Alcohol Induced Cause Code"].isin(overdoses)
].County.nunique()

(3132, 1047)

In [23]:
df[df["Deaths"].isna()]

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
4364,---,,,,,,,
4365,"Dataset: Underlying Cause of Death, 1999-2017",,,,,,,
4366,Query Parameters:,,,,,,,
4367,Group By: County; Year; Drug/Alcohol Induced C...,,,,,,,
4368,Show Totals: Disabled,,,,,,,
...,...,...,...,...,...,...,...,...
4604,Suggested Citation: Centers for Disease Contro...,,,,,,,
4605,"1999-2017 on CDC WONDER Online Database, relea...",,,,,,,
4606,compiled from data provided by the 57 vital st...,,,,,,,
4607,at http://wonder.cdc.gov/ucd-icd10.html on Oct...,,,,,,,


In [24]:
# check that all these entries are notes
assert (df["Notes"].notna() == df["Deaths"].isna()).all()

In [25]:
clean_df = df[df["Notes"].isna()]
clean_df

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
0,,"Autauga County, AL",1001.0,2009.0,2009.0,All other non-drug and non-alcohol causes,O9,408.0
1,,"Baldwin County, AL",1003.0,2009.0,2009.0,Drug poisonings (overdose) Unintentional (X40-...,D1,29.0
2,,"Baldwin County, AL",1003.0,2009.0,2009.0,All other non-drug and non-alcohol causes,O9,1669.0
3,,"Barbour County, AL",1005.0,2009.0,2009.0,All other non-drug and non-alcohol causes,O9,299.0
4,,"Bibb County, AL",1007.0,2009.0,2009.0,All other non-drug and non-alcohol causes,O9,207.0
...,...,...,...,...,...,...,...,...
4589,,"Sweetwater County, WY",56037.0,2013.0,2013.0,All other non-drug and non-alcohol causes,O9,238.0
4590,,"Teton County, WY",56039.0,2013.0,2013.0,All other non-drug and non-alcohol causes,O9,83.0
4591,,"Uinta County, WY",56041.0,2013.0,2013.0,All other non-drug and non-alcohol causes,O9,130.0
4592,,"Washakie County, WY",56043.0,2013.0,2013.0,All other non-drug and non-alcohol causes,O9,72.0


In [26]:
clean_df.drop(
    inplace=True, columns=["Notes", "Year Code", "Drug/Alcohol Induced Cause"]
)
clean_df

Unnamed: 0,County,County Code,Year,Drug/Alcohol Induced Cause Code,Deaths
0,"Autauga County, AL",1001.0,2009.0,O9,408.0
1,"Baldwin County, AL",1003.0,2009.0,D1,29.0
2,"Baldwin County, AL",1003.0,2009.0,O9,1669.0
3,"Barbour County, AL",1005.0,2009.0,O9,299.0
4,"Bibb County, AL",1007.0,2009.0,O9,207.0
...,...,...,...,...,...
4589,"Sweetwater County, WY",56037.0,2013.0,O9,238.0
4590,"Teton County, WY",56039.0,2013.0,O9,83.0
4591,"Uinta County, WY",56041.0,2013.0,O9,130.0
4592,"Washakie County, WY",56043.0,2013.0,O9,72.0


In [27]:
clean_df[clean_df["Drug/Alcohol Induced Cause Code"].isin(overdoses)]

Unnamed: 0,County,County Code,Year,Drug/Alcohol Induced Cause Code,Deaths
1,"Baldwin County, AL",1003.0,2009.0,D1,29.0
8,"Calhoun County, AL",1015.0,2009.0,D1,12.0
28,"Escambia County, AL",1053.0,2009.0,D4,10.0
30,"Etowah County, AL",1055.0,2009.0,D1,14.0
38,"Houston County, AL",1069.0,2009.0,D1,10.0
...,...,...,...,...,...
4552,"Washington County, WI",55131.0,2013.0,D1,19.0
4555,"Waukesha County, WI",55133.0,2013.0,D1,39.0
4561,"Winnebago County, WI",55139.0,2013.0,D1,14.0
4576,"Laramie County, WY",56021.0,2013.0,D1,15.0


In [28]:
clean_df = clean_df[clean_df["Drug/Alcohol Induced Cause Code"].isin(overdoses)]
clean_df["County Code"] = clean_df["County Code"].astype(int)
clean_df["Year"] = clean_df["Year"].astype(int)
clean_df

Unnamed: 0,County,County Code,Year,Drug/Alcohol Induced Cause Code,Deaths
1,"Baldwin County, AL",1003,2009,D1,29.0
8,"Calhoun County, AL",1015,2009,D1,12.0
28,"Escambia County, AL",1053,2009,D4,10.0
30,"Etowah County, AL",1055,2009,D1,14.0
38,"Houston County, AL",1069,2009,D1,10.0
...,...,...,...,...,...
4552,"Washington County, WI",55131,2013,D1,19.0
4555,"Waukesha County, WI",55133,2013,D1,39.0
4561,"Winnebago County, WI",55139,2013,D1,14.0
4576,"Laramie County, WY",56021,2013,D1,15.0


In [29]:
death_values = clean_df["Deaths"].unique().astype(str)

In [30]:
death_values.sort()
death_values

array(['10', '10.0', '100', '100.0', '101', '101.0', '102', '102.0',
       '103', '103.0', '104', '104.0', '105', '105.0', '106', '106.0',
       '107', '107.0', '108', '108.0', '109', '109.0', '11', '11.0',
       '110', '110.0', '111.0', '112.0', '113.0', '114', '114.0', '115',
       '115.0', '116', '116.0', '117.0', '118', '118.0', '119', '119.0',
       '12', '12.0', '120.0', '121.0', '122.0', '123.0', '124', '124.0',
       '125', '125.0', '126', '126.0', '127', '127.0', '128.0', '129.0',
       '13', '13.0', '130', '130.0', '131.0', '132', '132.0', '133.0',
       '134.0', '135.0', '136.0', '137', '137.0', '138', '138.0', '139',
       '139.0', '14', '14.0', '140', '140.0', '141.0', '142', '142.0',
       '143', '143.0', '144.0', '145.0', '146', '146.0', '147.0', '148',
       '148.0', '149.0', '15', '15.0', '150.0', '151.0', '152', '152.0',
       '153.0', '154', '154.0', '155', '155.0', '156', '156.0', '157',
       '157.0', '158.0', '159.0', '16', '16.0', '160.0', '161', '16

In [31]:
clean_df = clean_df[clean_df["Deaths"] != "Missing"]
clean_df["Deaths"] = clean_df["Deaths"].astype(int)
clean_df.drop(inplace=True, columns="Drug/Alcohol Induced Cause Code")
clean_df

Unnamed: 0,County,County Code,Year,Deaths
1,"Baldwin County, AL",1003,2009,29
8,"Calhoun County, AL",1015,2009,12
28,"Escambia County, AL",1053,2009,10
30,"Etowah County, AL",1055,2009,14
38,"Houston County, AL",1069,2009,10
...,...,...,...,...
4552,"Washington County, WI",55131,2013,19
4555,"Waukesha County, WI",55133,2013,39
4561,"Winnebago County, WI",55139,2013,14
4576,"Laramie County, WY",56021,2013,15


In [32]:
clean_df.rename(inplace=True, columns={"County Code": "FIPS"})
# Remove Alaska
clean_df = clean_df[clean_df["FIPS"] // 1000 != 2]
clean_df = clean_df.groupby(["FIPS", "County", "Year"], as_index=False).sum()
clean_df

Unnamed: 0,FIPS,County,Year,Deaths
0,1003,"Baldwin County, AL",2003,10
1,1003,"Baldwin County, AL",2004,18
2,1003,"Baldwin County, AL",2005,14
3,1003,"Baldwin County, AL",2006,11
4,1003,"Baldwin County, AL",2007,24
...,...,...,...,...
7850,56025,"Natrona County, WY",2012,12
7851,56025,"Natrona County, WY",2013,14
7852,56025,"Natrona County, WY",2014,17
7853,56025,"Natrona County, WY",2015,13


In [33]:
clean_df.FIPS.nunique()

1038

In [34]:
clean_df.to_csv(out_file, index=False, sep="\t")