In [1]:
# The purpose of this notebook was to extract Federal Payment data from each report file (pdf).
#
# The outputs of this notebook are two separate csv files, one for each of the two year ranges
#    (1990-2000 and 2004-2011), because the Federal Payment table structure differs between them.
#
# See: `fed_benefs_joining` for restructuring and joining these two files.

In [1]:
import string
import re
import pdfminer
import pandas as pd
import numpy as np
import os

from pdfminer.high_level import extract_text

In [8]:
# Requires local report files

# Report years to process, in order. Other cells address reports by their
# position in this list, so the order matters.
years = [str(year) for year in (
    1990, 1991, 1992, 1994, 1995, 1996, 1997, 1998,
    1999, 2000, 2004, 2005, 2006, 2007, 2008, 2011,
)]
# Directory holding the report PDFs, and the prefix every report file name starts with.
dir_ = 'ssi_reports/'
file_name_prefix = 'ssi_'

In [12]:
#read in all files (WARNING this will take a long time)
# One extracted text string per entry of `years`, in the same order.
# Each report is read from <dir_><file_name_prefix><year>s.pdf
raw_texts = []
for report_year in years:
    report_path = dir_ + file_name_prefix + report_year + "s.pdf"
    raw_texts.append(extract_text(report_path))

In [92]:
#remove all double+ spaces in raw_texts
# Every run of one or more spaces becomes a single space. Slice assignment
# rewrites the existing list in place rather than rebinding the name.
raw_texts[:] = [re.sub(' +', ' ', report_text) for report_text in raw_texts]

In [197]:
# Section of a report that holds the federal benefit table: it starts at the
# "FEDERAL BENEFIT" / "Federal benefit rates," heading and runs (non-greedily,
# across newlines because of re.S) up to, but not including, the
# state-assistance heading.
pattern_text_block = r"(FEDERAL BENEFIT|Federal benefit rates,).*?(?=STATE ASSISTANCE|State Assistance)"
# One amount, optionally preceded by '$': either 1-3 digits and two decimals
# (e.g. 386.00) or a single thousands digit form (e.g. 1,011.00).
pattern_fed_benef = r"((\$)*[\d]{1,3}\.[\d]{2}|(\$)*[\d],[\d]{3}\.[\d]{2})"

# fed_benefs[i] is the list of amount strings extracted from raw_texts[i].
fed_benefs = []
for idx, ele in enumerate(raw_texts):
    #grab text block containing benefits (only the first matching block is used)
    block_match = re.search(pattern_text_block, ele, flags=re.S)
    if block_match is None:
        # Fail with the report position instead of an anonymous
        # "list index out of range" when the section headings are missing.
        raise ValueError(
            "no federal benefit text block found in raw_texts[{}]".format(idx)
        )

    #isolate benefits and remove '$' from benefit amounts
    amounts = [
        found.group().replace('$', '')
        for found in re.finditer(pattern_fed_benef, block_match.group(), flags=re.S)
    ]

    #handle where report structure changes
    # Positions 0-9 get two '--' placeholders, later positions get one.
    # NOTE(review): presumably this pads each list to the length the
    # DataFrame-building cell slices -- confirm against the reports.
    if idx <= 9:
        amounts.extend(('--', '--'))
    else:
        amounts.append('--')

    fed_benefs.append(amounts)

#hardfixes for known misalignment
# Each index list names, for every output position, which extracted entry to
# take. Only the 8 listed entries are kept; anything beyond them is dropped.

#shuffle 1992
_1992_idxs = [3, 2, 4, 5, 0, 1, 6, 7]

#shuffle 1994-1996
_1994_96_idxs = [2, 3, 4, 5, 0, 1, 6, 7]

# (position in fed_benefs, reorder to apply)
for report_pos, new_order in (
    (2, _1992_idxs),
    (3, _1994_96_idxs),
    (4, _1994_96_idxs),
    (5, _1994_96_idxs),
):
    fed_benefs[report_pos] = [fed_benefs[report_pos][src] for src in new_order]

In [256]:
# Row labels for the two table layouts: the earlier layout has four living
# arrangements, the later one has three plus a separate 'essential person' column.
fed_liv_arrs = [
    'Living independently',
    'Living in household of another',
    'Medicaid facility',
    'Essential person',
]
_fed_liv_arrs = [
    'Living independently',
    'Living in household of another',
    'Medicaid facility',
]

# One DataFrame per report, in the same order as `years`.
dataframes = []

#1990-2000
# Four rows per year; the first four amounts are 'individual', the next four 'couple'.
for pos in range(10):
    amounts = fed_benefs[pos]
    dataframes.append(pd.DataFrame({
        'year': [years[pos]] * 4,
        'living arrangements': fed_liv_arrs,
        'individual': amounts[0:4],
        'couple': amounts[4:8],
    }))

#2004-2011
# Three rows per year; amounts come in consecutive groups of three for
# 'individual', 'couple' and 'essential person'.
for pos in range(10, 16):
    amounts = fed_benefs[pos]
    dataframes.append(pd.DataFrame({
        'year': [years[pos]] * 3,
        'living arrangements': _fed_liv_arrs,
        'individual': amounts[0:3],
        'couple': amounts[3:6],
        'essential person': amounts[6:9],
    }))

In [262]:
# The first ten frames in `dataframes` use the 1990-2000 layout; the remaining
# frames use the 2004-2011 layout.
_1990_2000_fed_benefs = dataframes[:10]
_2004_2011_fed_benefs = dataframes[10:]

# Stack each group into a single table.
# NOTE(review): reset_index() without drop=True keeps the old per-frame row
# numbers as an 'index' column in the result -- confirm that is intended.
df_1990_2000_fed_benefs = pd.concat(_1990_2000_fed_benefs).reset_index()
df_2004_2011_fed_benefs = pd.concat(_2004_2011_fed_benefs).reset_index()

In [199]:
# mutator - modifies input token list
# void, returns nothing
def list_slice_inner_move(llist, left_index, right_index, replace_index):
    """Move the slice llist[left_index:right_index] to another position, in place.

    The slice is removed first, so `replace_index` refers to a position in the
    list as it looks AFTER the removal. Returns None.
    """
    moved_items = llist[left_index:right_index]
    # Cut the slice out of the list...
    llist[left_index:right_index] = []
    # ...then splice it back in at the requested position.
    llist[replace_index:replace_index] = moved_items

In [263]:
# Write the 1990-2000 table to a CSV file (path is relative to the working
# directory); index=False leaves the DataFrame's row index out of the file.
df_1990_2000_fed_benefs.to_csv('1990_2000_fed_benefits.csv', index=False)

In [264]:
# Write the 2004-2011 table to a CSV file (path is relative to the working
# directory); index=False leaves the DataFrame's row index out of the file.
df_2004_2011_fed_benefs.to_csv('2004_2011_fed_benefits.csv', index=False)