In [11]:
%matplotlib inline
import os
from glob import glob
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import censusgeocode as cg
import geopandas as gpd

In [12]:
data_dir = '/cluster/tufts/hugheslab/datasets/NSF_OD/'
result_dir = os.path.join(data_dir, 'results_20220606_update')
mass_shapefile = os.path.join(data_dir,'shapefiles','MA_2021')

if not os.path.exists(result_dir):
    os.makedirs(result_dir)

In [13]:

# not every file is an xlsx, 🤌
pre_2017_wildcard = os.path.join(data_dir,'MAOpiDths20*_clean.xls*')
post_2017_wildcard = os.path.join(data_dir, 'RVRS_Opioids*')

pre_2017_files = glob(pre_2017_wildcard)
pre_2017_years = [int(os.path.basename(file).split('_')[0][-4:]) for file in pre_2017_files]

post_2017_files = glob(post_2017_wildcard)
post_2017_years = [int(os.path.splitext(os.path.basename(file))[0].split('_')[2]) for file in post_2017_files]

all_files = pre_2017_files + post_2017_files
all_years = pre_2017_years + post_2017_years

relevant_files = []
relevant_years = []
for file, year in zip(all_files, all_years):
    filename = os.path.basename(file)
    
    relevant_files.append(file)
    relevant_years.append(year)
    
# Make sure years are unique
assert(len(set(relevant_years))==len(relevant_years))



In [14]:
address_df = pd.DataFrame()
all_filtered_df = pd.DataFrame()
for file, year in zip(relevant_files, relevant_years):
    
    single_year_df =  pd.read_excel(file, na_filter=False)
    
    if year < 2015:
        dod_col = "DOD"
        dod_format = "%Y%m%d"
    else:
        dod_col = "DOD_4_FD"
        dod_format = "%m/%d/%Y"
        
    
    

    # add year/quarter
    if year == 2014:
        missing_date = single_year_df['death_year']=='NA'
        num_miss = np.sum(missing_date)
        print(f'{num_miss} rows in 2014 dont have a death date, {num_miss/len(single_year_df)*100:.2f}% of total')
        single_year_df = single_year_df[~missing_date]
        single_year_df['dod_dt'] = pd.to_datetime({'year':single_year_df['death_year'],
                                                   'month': single_year_df['death_month'],
                                                   'day': single_year_df['death_day']})
    else:
        missing_date = single_year_df[dod_col]==''
        num_miss = np.sum(missing_date)
        print(f'{num_miss} rows in {year} dont have a death date, {num_miss/len(single_year_df)*100:.2f}% of total')
        single_year_df = single_year_df[~missing_date]
        single_year_df['dod_dt'] = pd.to_datetime(single_year_df[dod_col], format=dod_format)
        
    single_year_df['year'] = single_year_df['dod_dt'].dt.year
    single_year_df['quarter'] = single_year_df['dod_dt'].dt.quarter

    has_ffix = year > 2014

    if has_ffix:
        address_cols = ['RES_ADDR_NUM', 'RES_STREET_PREFIX',
                    'RES_ADDR1', 'RES_STREET_DESIG',
                   'RES_STREET_SUFFIX']
        state = 'MASSACHUSETTS'
    else:
        address_cols = ['RES_ADDR_NUM',
                    'RES_ADDR1', 'RES_STREET_DESIG',
                   ]
        state = 'MA'

    if has_ffix and 'RES_STREET_PREFIX' not in single_year_df.columns:
        print(f'No decdent address column in {year}')
        #continue
        
    if year==2014:
        single_year_df[['RES_ADDR_NUM', 'RES_ADDR1', 'RES_CITY', 'RES_STATE']] = single_year_df['res_addres'].str.split(',', 3, expand=True)
        single_year_df.loc[:,['RES_STATE']] = single_year_df['RES_STATE'].str.strip()
        single_year_df[['RES_STREET_DESIG']] = ''
        single_year_df.loc[:,['RES_ZIP']] =  single_year_df['Postal'].apply(lambda x: '0'+str(x))
         
    tot_rows = single_year_df.shape[0]


    # remove unknown address
    filtered_df = single_year_df[single_year_df['RES_ADDR1'] != 'UNKNOWN']
    filtered_df = filtered_df[filtered_df['RES_ADDR1'] != 'UNK']
    try:
        count_filtered = single_year_df['RES_ADDR1'].value_counts()['UNKNOWN']
        count_filtered += single_year_df['RES_ADDR1'].value_counts()['UNK']
    except KeyError:
        count_filtered=0
    # remove blank address
    filtered_df = filtered_df[filtered_df['RES_ADDR1'] != '']
    try:
        count_filtered += single_year_df['RES_ADDR1'].value_counts()[''] 
    except KeyError:
        count_filtered += 0

    print(f'In {year} {count_filtered} rows have missing decdent address, '
          f'{count_filtered/tot_rows*100:.1f}% of total')

    # If street number is hyphenated, take first (123-125 -> 123)
    hyphenated_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].apply(lambda x: x.split('-')[0])).sum()
    if hyphenated_rows >0 :
        print(f'Adjusting {hyphenated_rows} hyphenated addresses in {year}.')
    filtered_df.loc[:,'RES_ADDR_NUM'] =filtered_df['RES_ADDR_NUM'].apply(lambda x: x.split('-')[0])

    # Remove letters from street number
    alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
    if alphabetic_rows > 0:
        print(f'Adjusting {alphabetic_rows} addresses with letters in {year}.')

    filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')



    filtered_df.loc[:,'address'] = filtered_df[address_cols].agg(' '.join, axis=1)

    if 'SFN_NUM' not in filtered_df.columns:
        print(f'{year} is missing SFN_NUM, creating new column [year]_[row]') 
        filtered_df['SFN_NUM'] = f'{year}_' + filtered_df.index.astype(str)
        
    count_other_states = filtered_df[filtered_df['RES_STATE']!=state].shape[0]
    print(f'Ignoring {count_other_states} decdencts not from  {state}')
    filtered_df = filtered_df[filtered_df['RES_STATE']==state]

    address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
    all_filtered_df = all_filtered_df.append(filtered_df)
    
    

0 rows in 2000 dont have a death date, 0.00% of total
In 2000 0 rows have missing decdent address, 0.0% of total
Adjusting 1 hyphenated addresses in 2000.
Adjusting 16 addresses with letters in 2000.
2000 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 12 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2001 dont have a death date, 0.00% of total
In 2001 0 rows have missing decdent address, 0.0% of total
Adjusting 2 hyphenated addresses in 2001.
Adjusting 6 addresses with letters in 2001.
2001 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 21 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2002 dont have a death date, 0.00% of total
In 2002 0 rows have missing decdent address, 0.0% of total
Adjusting 15 addresses with letters in 2002.
2002 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 22 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2003 dont have a death date, 0.00% of total
In 2003 0 rows have missing decdent address, 0.0% of total
Adjusting 3 hyphenated addresses in 2003.
Adjusting 11 addresses with letters in 2003.
2003 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 17 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2004 dont have a death date, 0.00% of total
In 2004 0 rows have missing decdent address, 0.0% of total
Adjusting 2 hyphenated addresses in 2004.
Adjusting 10 addresses with letters in 2004.
2004 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 17 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2005 dont have a death date, 0.00% of total
In 2005 0 rows have missing decdent address, 0.0% of total
Adjusting 13 addresses with letters in 2005.
2005 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 23 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2006 dont have a death date, 0.00% of total
In 2006 1 rows have missing decdent address, 0.1% of total
Adjusting 3 hyphenated addresses in 2006.
Adjusting 15 addresses with letters in 2006.
2006 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 25 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2007 dont have a death date, 0.00% of total
In 2007 0 rows have missing decdent address, 0.0% of total
Adjusting 3 hyphenated addresses in 2007.
Adjusting 14 addresses with letters in 2007.
2007 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 19 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2008 dont have a death date, 0.00% of total
In 2008 0 rows have missing decdent address, 0.0% of total
Adjusting 2 hyphenated addresses in 2008.
Adjusting 14 addresses with letters in 2008.
2008 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 15 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2009 dont have a death date, 0.00% of total
In 2009 0 rows have missing decdent address, 0.0% of total
Adjusting 2 hyphenated addresses in 2009.
Adjusting 10 addresses with letters in 2009.
2009 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 19 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2010 dont have a death date, 0.00% of total
In 2010 1 rows have missing decdent address, 0.2% of total
Adjusting 1 hyphenated addresses in 2010.
Adjusting 14 addresses with letters in 2010.
2010 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 14 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2011 dont have a death date, 0.00% of total
In 2011 0 rows have missing decdent address, 0.0% of total
Adjusting 2 hyphenated addresses in 2011.
Adjusting 23 addresses with letters in 2011.
2011 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 26 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2012 dont have a death date, 0.00% of total
In 2012 0 rows have missing decdent address, 0.0% of total
Adjusting 1 hyphenated addresses in 2012.
Adjusting 22 addresses with letters in 2012.
2012 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 23 decdencts not from  MA


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2013 dont have a death date, 0.00% of total
In 2013 0 rows have missing decdent address, 0.0% of total
Adjusting 2 hyphenated addresses in 2013.
Adjusting 14 addresses with letters in 2013.
2013 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 32 decdencts not from  MA
13 rows in 2014 dont have a death date, 0.96% of total


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)
  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')


In 2014 0 rows have missing decdent address, 0.0% of total
Adjusting 5 hyphenated addresses in 2014.
Adjusting 27 addresses with letters in 2014.
2014 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 21 decdencts not from  MA


  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2015 dont have a death date, 0.00% of total
In 2015 0 rows have missing decdent address, 0.0% of total
Adjusting 8 hyphenated addresses in 2015.
Adjusting 28 addresses with letters in 2015.
2015 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 56 decdencts not from  MASSACHUSETTS


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2016 dont have a death date, 0.00% of total
In 2016 0 rows have missing decdent address, 0.0% of total
Adjusting 2 hyphenated addresses in 2016.
Adjusting 56 addresses with letters in 2016.
Ignoring 80 decdencts not from  MASSACHUSETTS


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


12 rows in 2018 dont have a death date, 0.59% of total
In 2018 0 rows have missing decdent address, 0.0% of total
Adjusting 4 hyphenated addresses in 2018.
Adjusting 44 addresses with letters in 2018.
2018 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 67 decdencts not from  MASSACHUSETTS


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


12 rows in 2019 dont have a death date, 0.59% of total
In 2019 0 rows have missing decdent address, 0.0% of total
Adjusting 3 hyphenated addresses in 2019.
Adjusting 38 addresses with letters in 2019.
2019 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 81 decdencts not from  MASSACHUSETTS


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


12 rows in 2020 dont have a death date, 0.56% of total
In 2020 0 rows have missing decdent address, 0.0% of total
Adjusting 5 hyphenated addresses in 2020.
Adjusting 39 addresses with letters in 2020.
2020 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 75 decdencts not from  MASSACHUSETTS


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


12 rows in 2021 dont have a death date, 0.53% of total
In 2021 0 rows have missing decdent address, 0.0% of total
Adjusting 4 hyphenated addresses in 2021.
Adjusting 53 addresses with letters in 2021.
2021 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 72 decdencts not from  MASSACHUSETTS


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


0 rows in 2017 dont have a death date, 0.00% of total
In 2017 0 rows have missing decdent address, 0.0% of total
Adjusting 3 hyphenated addresses in 2017.
Adjusting 50 addresses with letters in 2017.
2017 is missing SFN_NUM, creating new column [year]_[row]
Ignoring 62 decdencts not from  MASSACHUSETTS


  alphabetic_rows = (filtered_df['RES_ADDR_NUM'] != filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')).sum()
  filtered_df.loc[:,'RES_ADDR_NUM'] = filtered_df['RES_ADDR_NUM'].str.replace(r'\D','')
  address_df = address_df.append(filtered_df[['SFN_NUM', 'address','RES_CITY', 'RES_STATE', 'RES_ZIP']])
  all_filtered_df = all_filtered_df.append(filtered_df)


In [22]:
address_file1 = os.path.join(result_dir,'decedent_addresses_1.csv')
address_df.iloc[:7000,:].to_csv(address_file1, index=False)
address_file2 = os.path.join(result_dir,'decedent_addresses_2.csv')
address_df.iloc[7000:14000,:].to_csv(address_file2, index=False)
address_file3 = os.path.join(result_dir,'decedent_addresses_3.csv')
address_df.iloc[14000:,:].to_csv(address_file3, index=False)

In [26]:
start = time.time()

for pt, address_file in enumerate([address_file1,
                                   address_file2,
                                   address_file3]):
    response = cg.addressbatch(address_file)
    response_df = pd.DataFrame(response)
    response_df.to_csv(os.path.join(result_dir,f'res_response_pt{pt}.csv'),index=False)
    curr = time.time()
    print(f'Elapsed: {curr-start}')
    
    

Elapsed: 99.11540365219116
Elapsed: 179.93642902374268
Elapsed: 309.226154088974


In [27]:
response_1 = pd.read_csv(os.path.join(result_dir,'res_response_pt0.csv'))
response_2 = pd.read_csv(os.path.join(result_dir,'res_response_pt1.csv'))
response_3 = pd.read_csv(os.path.join(result_dir,'res_response_pt2.csv'))

response_df = response_1.append(response_2).append(response_3)
response_df.to_csv(os.path.join(result_dir,'res_response_2000_2020.csv'), index=False)

  response_df = response_1.append(response_2).append(response_3)


In [15]:
response_df = pd.read_csv(os.path.join(result_dir,'res_response_2000_2020.csv'))

In [16]:
matched_df_no_year = response_df[response_df['match']]
count_matched = matched_df_no_year.shape[0]
count_filtered = response_df.shape[0]
print(f'Matched {count_matched} rows, {count_matched/count_filtered*100:.2f}% of all filtered rows')

Matched 22125 rows, 94.83% of all filtered rows


In [17]:
# get year into response
matched_df_no_year.loc[:,'id'] = matched_df_no_year.loc[:,'id'].astype(str)
all_filtered_df.loc[:,'SFN_NUM'] = all_filtered_df.loc[:,'SFN_NUM'].astype(str)
matched_df = matched_df_no_year.merge(all_filtered_df[['SFN_NUM','year', 'quarter', 'dod_dt']], left_on='id', right_on='SFN_NUM')
assert (len(matched_df)==len(matched_df_no_year))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched_df_no_year.loc[:,'id'] = matched_df_no_year.loc[:,'id'].astype(str)


In [31]:
matched_df.to_csv(os.path.join(result_dir,'geocoded_deaths_2000_2020.csv'), index=False)

In [18]:
matched_df

Unnamed: 0,id,address,match,matchtype,parsed,tigerlineid,side,statefp,countyfp,tract,block,lat,lon,SFN_NUM,year,quarter,dod_dt
0,2007_605,"58 MYRTLE ST, NORWOOD, MA, 2062",True,Exact,"58 MYRTLE ST, NORWOOD, MA, 02062",87243536.0,L,25.0,21.0,413202.0,1012.0,42.201649,-71.202264,2007_605,2007,4,2007-12-15
1,2002_349,"646 BIRCH ST, FALL RIVER, MA, 2724",True,Exact,"646 BIRCH ST, FALL RIVER, MA, 02724",46767810.0,L,25.0,5.0,640400.0,2005.0,41.684234,-71.176381,2002_349,2002,3,2002-07-15
2,2007_607,"573 OSBORN ST, FALL RIVER, MA, 2724",True,Exact,"573 OSBORN ST, FALL RIVER, MA, 02724",651813019.0,R,25.0,5.0,640500.0,1005.0,41.690942,-71.160056,2007_607,2007,4,2007-12-16
3,2002_348,"8 MICHIGAN AV, LYNN, MA, 1902",True,Exact,"8 MICHIGAN AVE, LYNN, MA, 01902",86646827.0,L,25.0,9.0,206600.0,4000.0,42.467733,-70.924900,2002_348,2002,3,2002-07-17
4,2007_606,"175 OSBORNE ST, FALL RIVER, MA, 2724",True,Non_Exact,"175 OSBORN ST, FALL RIVER, MA, 02724",46770133.0,R,25.0,5.0,640500.0,3001.0,41.692127,-71.166843,2007_606,2007,4,2007-11-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22120,2017_1707,"863 WASHINGTON STREET , WHITMAN, MASSACHUSETT...",True,Exact,"863 WASHINGTON ST, WHITMAN, MA, 02382",87445852.0,R,25.0,23.0,521201.0,2004.0,42.087970,-70.941268,2017_1707,2017,4,2017-12-12
22121,2021_1252,"29 KILLDEER ISLAND ROAD , WEBSTER, MASSACHUSE...",True,Exact,"29 KILLDEER ISLAND RD, WEBSTER, MA, 01570",40020553.0,L,25.0,27.0,754100.0,3000.0,42.054861,-71.843274,2021_1252,2021,3,2021-07-29
22122,2021_1253,"75 HOPKINS PLACE , LONGMEADOW, MASSACHUSETTS,...",True,Exact,"75 HOPKINS PL, LONGMEADOW, MA, 01106",4038067.0,R,25.0,13.0,813301.0,1015.0,42.056212,-72.578282,2021_1253,2021,3,2021-07-29
22123,2021_1254,"86 BELLEVUE STREET , WORCESTER, MASSACHUSETTS...",True,Exact,"86 BELLEVUE ST, WORCESTER, MA, 01610",40009694.0,R,25.0,27.0,731400.0,4002.0,42.260255,-71.817195,2021_1254,2021,3,2021-07-25
