In [1]:
import pandas as pd
from zipfile import ZipFile
import re

In [2]:
original_zip = ZipFile('US_VitalStatistics.zip', 'r')
new_zip = ZipFile('new_archve.zip', 'w')
for item in original_zip.infolist():
    buffer = original_zip.read(item.filename)
    if not str(item.filename).startswith('__MACOSX/'):
        new_zip.writestr(item, buffer)
new_zip.close()
original_zip.close()

In [3]:
new_zip = ZipFile('new_archve.zip', 'r')

In [4]:
dfs = {}

for text_file in new_zip.infolist():
    dfs[re.search('2\d\d\d', text_file.filename).group(0)] = pd.read_csv(new_zip.open(text_file.filename), sep = "\t", usecols = [1, 2, 3, 5, 7])[:-15]

In [5]:
dfs.keys()

dict_keys(['2009', '2008', '2003', '2014', '2015', '2005', '2011', '2010', '2004', '2012', '2006', '2007', '2013'])

In [6]:
origin = pd.DataFrame({})
for key in dfs.keys():
    origin = origin.append(dfs[key])

In [7]:
origin[origin['Deaths'] == 'Missing']['County'].unique()

array(['Prince of Wales-Outer Ketchikan Census Area, AK',
       'Skagway-Hoonah-Angoon Census Area, AK',
       'Wrangell-Petersburg Census Area, AK', 'Bedford city, VA',
       'Clifton Forge city, VA'], dtype=object)

In [8]:
index_names = origin[origin['Deaths'] == 'Missing'].index

In [9]:
origin = origin.drop(index_names)

In [10]:
origin['Deaths'] = origin['Deaths'].astype('int64')
origin['Year'] = origin['Year'].astype('int64')
origin['County Code'] = origin['County Code'].astype('int64')

In [11]:
totalDeath = origin.groupby(['County','Year','County Code'], as_index = False).sum()[['County','County Code','Year','Deaths']].rename({'Deaths':'TotalDeath'}, axis = 'columns')

In [12]:
totalDeath

Unnamed: 0,County,County Code,Year,TotalDeath
0,"Abbeville County, SC",45001,2003,266
1,"Abbeville County, SC",45001,2004,239
2,"Abbeville County, SC",45001,2005,212
3,"Abbeville County, SC",45001,2006,278
4,"Abbeville County, SC",45001,2007,265
...,...,...,...,...
40183,"Ziebach County, SD",46137,2011,10
40184,"Ziebach County, SD",46137,2012,10
40185,"Ziebach County, SD",46137,2013,11
40186,"Ziebach County, SD",46137,2014,11


In [13]:
names = []
for name in origin['Drug/Alcohol Induced Cause'].unique():
    if re.match('Drug poisonings.*', name):
        names.append(name)

In [14]:
interDose = origin[origin['Drug/Alcohol Induced Cause'].isin(names)]

In [15]:
finalDose = interDose.groupby(['County', 'County Code', 'Year'], as_index = False).sum()[['County','County Code','Year','Deaths']].rename({'Deaths':'TotalOverdose'}, axis = 'columns')

In [16]:
finalDose

Unnamed: 0,County,County Code,Year,TotalOverdose
0,"Acadia Parish, LA",22001,2003,11
1,"Acadia Parish, LA",22001,2005,23
2,"Acadia Parish, LA",22001,2006,19
3,"Acadia Parish, LA",22001,2007,19
4,"Acadia Parish, LA",22001,2009,11
...,...,...,...,...
7826,"Yuma County, AZ",4027,2011,20
7827,"Yuma County, AZ",4027,2012,32
7828,"Yuma County, AZ",4027,2013,22
7829,"Yuma County, AZ",4027,2014,35


In [17]:
final = pd.merge(finalDose, totalDeath, on = ['County', 'County Code', 'Year'])

In [18]:
final['OverdoseProp'] = final['TotalOverdose'] / final['TotalDeath']

In [19]:
final[['County','State']] = final.County.str.split(", ",expand=True,)

In [20]:
final['PolicyState'] = (final['State'] == 'FL') | ((final['State'] == 'TX')) | (final['State'] == 'WA')

In [21]:
final['Post'] = ((final['State'] == 'FL') & (final['Year'] >= 2010)) | ((final['State'] == 'TX') & (final['Year'] >= 2007)) | ((final['State'] == 'WA') & (final['Year'] >= 2012))

In [22]:
final

Unnamed: 0,County,County Code,Year,TotalOverdose,TotalDeath,OverdoseProp,State,PolicyState,Post
0,Acadia Parish,22001,2003,11,602,0.018272,LA,False,False
1,Acadia Parish,22001,2005,23,674,0.034125,LA,False,False
2,Acadia Parish,22001,2006,19,636,0.029874,LA,False,False
3,Acadia Parish,22001,2007,19,655,0.029008,LA,False,False
4,Acadia Parish,22001,2009,11,603,0.018242,LA,False,False
...,...,...,...,...,...,...,...,...,...
7826,Yuma County,4027,2011,20,1380,0.014493,AZ,False,False
7827,Yuma County,4027,2012,32,1415,0.022615,AZ,False,False
7828,Yuma County,4027,2013,22,1331,0.016529,AZ,False,False
7829,Yuma County,4027,2014,35,1423,0.024596,AZ,False,False


In [23]:
final.to_csv('death.csv', index = False)