In [None]:
#imports

import string
import re
import pdfminer
import pandas as pd
import numpy as np
import os

from pdfminer.high_level import extract_text

In [None]:
#define constants
# Report years handled by this notebook (the sequence is not contiguous).
YEARS = [1990, 1991, 1992, 1994, 1995, 1996, 1997, 1998,
         1999, 2000, 2004, 2005, 2006, 2007, 2008, 2011]

# The same years, in the same order, as strings.
YEARS_STRING = [str(year) for year in YEARS]

In [None]:
#load in state_constants as dataframe from external file
# The path is relative to the notebook's working directory.
# extract_no_program_states reads the 'full_name_upper' column of this frame.
STATE_CONSTANTS = pd.read_csv("util/state_constants.csv")

In [None]:
#load errata_list constant from file
%load util/errata_list.py

In [None]:
#data cleaning functions

#removes any new line characters from string and drops right-end spaces
def drop_new_lines_plus_strip(string):
    """Return `string` with every newline removed and trailing whitespace stripped."""
    without_newlines = string.replace('\n', '')
    return without_newlines.rstrip()

#removes any dollar sign characters or commas plus new lines from string
def drop_dollar_sign_and_comma(string):
    """Return `string` with every dollar sign, comma and newline removed.

    The pattern is a raw string so that the escaped dollar sign reaches the
    regex engine as written instead of relying on Python passing an unknown
    string escape through (which triggers an invalid-escape warning).
    """
    return re.sub(r'\$|,|\n', '', string)

In [None]:
#params: 
# - raw_text: body of pdf report file as String
#return: cleaned report text as a single String

# Step 1 -> clean converted pdf text (drop formfeeds, collapse repeated spaces, fix known misrecognitions).
def do_preprocessing(raw_text):
    """Clean the text converted from a pdf report and return it as one string.

    Formfeed characters are deleted, runs of spaces are collapsed to a single
    space, and every regex listed in ERRATA_LIST is replaced by its correction.
    Each ERRATA_LIST entry is indexed as entry[0] (the replacement text) and
    entry[1] (an iterable of regex patterns to replace).
    """
    #delete the formfeed (0x0C) characters left in the converted text
    cleaned = raw_text.replace('\f', '')
    #drop any duplicate spaces
    cleaned = re.sub(' +', ' ', cleaned)

    #replace known misrecognition(s) in converted text
    for errata in ERRATA_LIST:
        for misreading in errata[1]:
            cleaned = re.sub(misreading, errata[0], cleaned)

    return cleaned
    

In [None]:
#params: 
# - working_text: body of pdf report file as String after preprocessing
#return: list of States without optional supplementation, list of States with program

# Step 2 -> iterate through pdf pages and capture States without optional program.
def extract_no_program_states(working_text):

    #define Regex for capturing separate pages within pdf
    pages_pattern = r"(Digitized by Google).*?(?=Digitized by Google)"
    __matches = [x.group() for x in re.finditer(pages_pattern, working_text, flags=re.M|re.S)]

    # This string is present on all payment pages where state does not have optional program.
    _tofind = "State does not provide optional supplementation."

    saved_pages = []
    no_program_states = []

    for ele in __matches:
        #if page contains "no-program" string
        if _tofind in ele:
            saved_pages.append(re.sub('\n', '', ele))

    states_upper = STATE_CONSTANTS['full_name_upper'].tolist()
            
    for page in saved_pages:
        #split contents of page into separate strings
        temp_tokens = page.split()
        for idx, token in enumerate(temp_tokens):
            #joining "West Virginia"
            if token == 'WEST':
                token = 'WEST VIRGINIA'
            #check for "duplicate" Virginia
            if token == 'VIRGINIA':
                if (temp_tokens[idx-1] == 'WEST'):
                    token = '/'
            #find token in states list
            for state in states_upper:
                if token == state.upper():
                    no_program_states.append(state.upper())

    #create list of states with program from difference of those with no program    
    states_w_program = list(set(states_upper).difference(no_program_states))
    states_w_program.sort()

    return no_program_states, states_with_program

In [None]:
#params: 
# - working_text: body of pdf report file as String after preprocessing
#return: list of state payment pages as separate strings

# Step 3 -> split pdf body into separate pages based on State Payment Table heading.
def split_payment_pages(working_text):
    """Return the State Payment Table chunks found in `working_text`.

    A chunk starts at "PAYMENT LEVELS" or "Optional supplement amount" and
    runs up to (not including) the next occurrence of "Digitized". A start
    marker with no later "Digitized" produces no chunk.
    """
    #define Regex for capturing State Payment Tables
    pattern_payment_levels = r"((PAYMENT LEVELS)|(Optional supplement amount)).*?(?=Digitized)"
    page_finder = re.compile(pattern_payment_levels, flags=re.M|re.S)

    #one string per payment page, in document order
    return [hit.group(0) for hit in page_finder.finditer(working_text)]

In [None]:
#params: 
# - raw_payment_pages: list of payment page Strings (output of split_payment_pages)
# - table_data: empty list -> contents of payment table for page
# - footnotes: empty list -> footnotes for table from bottom of page (String)
# - temp: empty list -> receives, per page, the text that is searched for footnotes
#return: void, mutates input lists

#separator = 'STATE ASSISTANCE FOR SPECIAL NEEDS'
# This text was a consistent heading in the report(s) following the conclusion of the State Payment Table, however 
#   it may or may not be present on the same page depending on the length of the specific table.

#pattern_footnotes = r"(Unless|Blind ind|Add).*"
# Regex for capturing footnotes using prefixes section text for all state tables.

#pattern_payments = r"(PAYMENT LEVELS).*(?=Unles)"
# Regex for capturing payment tables and bounding with footnotes start for all state tables.

# Step 4 -> convert separate payment pages into lists of raw table data + table footnotes.
def parse_pages(raw_payment_pages, table_data, footnotes, temp):
    """Fill `table_data` and `footnotes` from the raw payment pages.

    Reads the module-level `separator`, `pattern_footnotes` and
    `pattern_payments`. The three list arguments are mutated in place and
    nothing is returned:
     - table_data: one entry per page -- the first `pattern_payments` match in
       the text before the separator, or that text unchanged when nothing
       matches
     - temp: one entry per page -- the text after the first separator, or the
       whole page when the separator is absent
     - footnotes: one list per entry of `temp` with every `pattern_footnotes`
       match found in it
    """
    #cut every page at the first separator heading
    for page in raw_payment_pages:
        before, found, after = page.partition(separator)
        table_data.append(before)
        #without a separator the whole page doubles as the footnote source
        temp.append(after if found else before)

    #grab footnotes
    for footnote_source in temp:
        footnotes.append([hit.group() for hit in re.finditer(pattern_footnotes, footnote_source, flags=re.S)])

    #grab table data
    for position, chunk in enumerate(table_data):
        first_hit = re.search(pattern_payments, chunk, flags=re.S)
        if first_hit is not None:
            table_data[position] = first_hit.group()

In [None]:
#params: none
#return: void, mutates local structures

#pattern_liv_arr = r"(Rec|Living (in|with|alone|I)|Req|(Un)*((L|l)ice)|Medicaid facility|Inde|Nonm|Disabled \w|(Care)* Home|Individual w|Adult|Foster|Comm|Long-|Small|Large.*/Res|Domiciliary( |-)(C|c)are|Specialized s|Semi-i|Room|Hotel-|Family|Resid|In-|Personal|Caret|Flat|Cost reimb|DOMCARE|Shared|Home|Aid to|Group home|Child|Trans|Certi|Center|Congre|Blind in|Rest|Supervised|Custo|House|Private|Shelter).*?(?=(\d{2,}\.\d{2})|[$\"\.:•A-Z\('~]|([\d]{1,2} )|([\d!]/))"
# Regex for capturing living arrangements defined and covered across all state tables.

#pattern_benef = r"((N/A)|((\$)*[\d]{1,3}\.[\d]{2})|((\$)*[\d],[\d]{3}\.[\d]{2}))"
# Regex for capturing benefit amounts from within table body text for matching living arrangements.

# Step 5 -> split raw table data into lists of living arrangements and benefit amounts.
def parse_table_data():
    """Append one list of living arrangements and one list of benefit amounts per table.

    Reads the module-level `table_data`, `pattern_liv_arr` and `pattern_benef`;
    appends to the module-level `liv_arrs` and `benefs`. Living-arrangement
    matches go through drop_new_lines_plus_strip, benefit matches through
    drop_dollar_sign_and_comma. Returns nothing.
    """
    for table_text in table_data:
        raw_arrangements = [hit.group() for hit in re.finditer(pattern_liv_arr, table_text, flags=re.S)]
        raw_amounts = [hit.group() for hit in re.finditer(pattern_benef, table_text, flags=re.S)]

        cleaned_arrangements = [drop_new_lines_plus_strip(text) for text in raw_arrangements]
        cleaned_amounts = [drop_dollar_sign_and_comma(text) for text in raw_amounts]

        #pushing data to local lists
        liv_arrs.append(cleaned_arrangements)
        benefs.append(cleaned_amounts)

In [None]:
#Step 6 -> manual fixes
def manual_fix(report_year):
    """Execute the hand-written correction files for one report year.

    params:
     - report_year: report year as String (e.g. '1990')
    return: void

    Runs util/liv_arr_hc/<year>_lahc.py and then util/benef_hc/<year>_bhc.py
    in this module's global namespace, so the files can read and modify the
    notebook's module-level structures.

    The previous body used `%load util/liv_arr_hc/lahc`, which looks for a file
    literally named "lahc" (the variable is not interpolated) and, even when a
    file is found, only pastes its text into a cell without running it.

    raises: OSError if a correction file is missing, plus whatever the
    executed files raise.
    """
    #create path to files
    hc_paths = (
        os.path.join("util", "liv_arr_hc", report_year + "_lahc.py"),
        os.path.join("util", "benef_hc", report_year + "_bhc.py"),
    )

    #load and execute hc files for report year
    for hc_path in hc_paths:
        with open(hc_path) as hc_file:
            hc_source = hc_file.read()
        # These are trusted project files; compile() keeps the file name in tracebacks.
        exec(compile(hc_source, hc_path, "exec"), globals())

In [None]:
#params: none
#return: void, mutates local structures

# Step 7 -> modify living arrangements list to include state defined subcategories.
def add_subcats():
    """Append one list of [living arrangement, sub-category] pairs per state to `adj_liv_arrs`.

    Reads the module-level `liv_arrs` and LIVING_ARRANGEMENT_ADJUSTMENTS_BASE
    (indexed by the same state position). Each adjustment is indexed as
    adjustment[0] (arrangement position) and adjustment[1] (list of
    sub-category labels). An arrangement with matching adjustments is emitted
    once per label; every other arrangement is emitted once with '' as its
    sub-category. Returns nothing.
    """
    for state_pos, arrangements in enumerate(liv_arrs):
        state_adjustments = LIVING_ARRANGEMENT_ADJUSTMENTS_BASE[state_pos]

        #if no adjustments, push as is
        if len(state_adjustments) == 0:
            adj_liv_arrs.append([[arrangements[pos], ''] for pos in range(len(arrangements))])
            continue

        expanded = []
        for arr_pos in range(len(arrangements)):
            #adjustments whose index matches this arrangement
            matching = [adjustment for adjustment in state_adjustments if adjustment[0] == arr_pos]
            if matching:
                for adjustment in matching:
                    for add_on in adjustment[1]:
                        expanded.append([arrangements[arr_pos], add_on])
            else:
                expanded.append([arrangements[arr_pos], ''])
        adj_liv_arrs.append(expanded)

In [None]:
#params: none
#return: void, mutates local structures

#Step 8 -> create list of `State_Obj` containing covered living arrangements and benefits.
def build_state_objs():
    """Append one dict per entry of STATES_W_PROGRAM to `state_objs`.

    Each dict has the keys 'state', 'liv_arrs' and 'benefs'; the last two are
    taken from the module-level `adj_liv_arrs` and `benefs` at the same
    position as the state. Returns nothing.
    """
    for position, state_name in enumerate(STATES_W_PROGRAM):
        record = dict(
            state=state_name,
            liv_arrs=adj_liv_arrs[position],
            benefs=benefs[position],
        )
        state_objs.append(record)

#### Benefits Parsing Functions

In [None]:
# Maps each letter A-O to the (start, stop) slice bounds of one block of four
# consecutive positions: A -> (0, 4), B -> (4, 8), ..., O -> (56, 60).
token_to_index_mappings = {
    letter: (4 * position, 4 * position + 4)
    for position, letter in enumerate('ABCDEFGHIJKLMNO')
}

In [None]:
def shuffle(string, state_idx):
    """Reorder part of one state's benefit list according to a letter pattern.

    params:
     - string: pattern of letters that are keys of token_to_index_mappings
     - state_idx: position of the state in the module-level `benefs`
    return: list [reordered, start, stop, tokens] where
     - start/stop are the slice bounds spanned by the smallest and largest
       letter of the pattern,
     - reordered holds benefs[state_idx][start:stop][i] for every position i
       of the pattern, first the positions holding the smallest letter and
       then all remaining positions (each group in pattern order),
     - tokens is the sorted list of distinct letters in the pattern.

    Unlike the previous version, a pattern made of a single distinct letter no
    longer fails on an unused lookup of a second letter.

    raises: IndexError for an empty pattern (or a pattern longer than the
    slice), KeyError for a letter missing from token_to_index_mappings.
    """
    tokens_ = sorted(set(string))

    #every letter must be known, not only the two that bound the slice
    for token in tokens_:
        if token not in token_to_index_mappings:
            raise KeyError(token)

    fir_tkn = tokens_[0]
    lst_tkn = tokens_[-1]
    slice_start = token_to_index_mappings[fir_tkn][0]
    slice_stop = token_to_index_mappings[lst_tkn][1]
    benef_slice = benefs[state_idx][slice_start:slice_stop]

    #positions of the first letter come first, everything else follows
    x_token_idxs = [idx for idx, char in enumerate(string) if char == fir_tkn]
    y_token_idxs = [idx for idx, char in enumerate(string) if char != fir_tkn]
    joined_idxs = x_token_idxs + y_token_idxs

    reordered = [benef_slice[idx] for idx in joined_idxs]

    return [reordered, slice_start, slice_stop, tokens_]

In [None]:
def payment_pattern_parser(pattern_str):
    """Split `pattern_str` into tokens at every single whitespace character.

    Each whitespace character ends the current token, so consecutive or
    trailing whitespace yields empty-string tokens, and an empty input yields
    [''].
    """
    tokens = ['']

    for char in pattern_str:
        if char.isspace():
            #close the current token and open a new, empty one
            tokens.append('')
        else:
            tokens[-1] += char

    return tokens

In [None]:
def token_matcher(tokens):
    """Translate pattern tokens into a list of instruction tuples.

    Token forms, tried in this order:
     - a lone capital letter         -> ('single_split', start, stop) using the
       bounds of the token's first character
     - 2 to 6 letters in parentheses -> ('swap_xy' ... 'swap_uvwxyz', start, stop)
       with start from the first letter and stop from the last letter
     - letters between two '%'       -> ('shuffle', letters)
    Bounds come from token_to_index_mappings. A token matching none of the
    forms is printed and produces no instruction.
    """
    x_type_pattern = r"(?<![A-Z_*])[A-Z](?![A-Z])"
    shuffle_rows_pattern = r"%[A-Z]*%"
    #(regex, instruction name, offset of the group's last letter in the token)
    swap_forms = (
        (r"\([A-Z]{2}\)", 'swap_xy', 2),        #(AB)
        (r"\([A-Z]{3}\)", 'swap_xyz', 3),       #(ABC)
        (r"\([A-Z]{4}\)", 'swap_wxyz', 4),      #(ABCD)
        (r"\([A-Z]{5}\)", 'swap_vwxyz', 5),     #(ABCDE)
        (r"\([A-Z]{6}\)", 'swap_uvwxyz', 6),    #(ABCDEF)
    )

    instruction_list = []

    for token in tokens:
        if re.search(x_type_pattern, token): #A
            bounds = token_to_index_mappings[token[0]]
            instruction_list.append(('single_split', bounds[0], bounds[1]))
            continue

        for swap_pattern, instruction_name, last_offset in swap_forms:
            if re.search(swap_pattern, token):
                first_arr = token[1:2]
                last_arr = token[last_offset:last_offset + 1]
                start = token_to_index_mappings[first_arr][0]
                stop = token_to_index_mappings[last_arr][1]
                instruction_list.append((instruction_name, start, stop))
                break
        else:
            if re.search(shuffle_rows_pattern, token): #%...%
                instruction_list.append(('shuffle', token[1:-1]))
            else:
                print(token)

    return instruction_list

In [None]:
def interpreter(_instruction_list, states_idx):
    """Execute instruction tuples against one state's benefits and return the rows.

    params:
     - _instruction_list: tuples as produced by token_matcher
     - states_idx: position of the state in the module-level `state_objs`
    return: list of rows, concatenated in instruction order

    'single_split' and the 'swap_*' instructions operate on
    state_objs[states_idx]['benefs'][start:stop]; 'shuffle' delegates to
    shuffle() and splits its first return element into rows. Instructions with
    any other name are ignored.
    """
    #number of interleaved groups handed to swap() for each swap instruction
    swap_counts = (
        ('swap_xy', 2),
        ('swap_xyz', 3),
        ('swap_wxyz', 4),
        ('swap_vwxyz', 5),
        ('swap_uvwxyz', 6),
    )

    sorted_payments = []
    for instr in _instruction_list:
        opcode = instr[0]

        if opcode == 'single_split':
            values = state_objs[states_idx]['benefs'][instr[1]:instr[2]]
            sorted_payments.extend(single_split(values))
            continue

        group_count = next((count for name, count in swap_counts if opcode == name), None)
        if group_count is not None:
            values = state_objs[states_idx]['benefs'][instr[1]:instr[2]]
            sorted_payments.extend(swap(values, group_count))
        elif opcode == 'shuffle':
            reordered = shuffle(instr[1], states_idx)[0]
            sorted_payments.extend(single_split(reordered))

    return sorted_payments

In [None]:
def single_split(_list):
    """Cut `_list` into consecutive chunks of four items; the last chunk may be shorter."""
    rows = []
    for start in range(0, len(_list), 4):
        rows.append(_list[start:start + 4])
    return rows

In [None]:
def swap(_list, count):
    """De-interleave `_list` into `count` groups and return their rows back to back.

    Item i goes to group i % count. Each group is cut into rows of four with
    single_split, and the rows of group 0 come first, then group 1, and so on.
    Only counts 2 through 6 are supported; any other count returns [].
    """
    supported_counts = (2, 3, 4, 5, 6)
    if count not in supported_counts:
        return []
    group_total = supported_counts[supported_counts.index(count)]

    groups = [[] for _ in range(group_total)]
    for position, value in enumerate(_list):
        groups[position % group_total].append(value)

    rows = []
    for group in groups:
        rows += single_split(group)
    return rows


#### Parse Payments

In [None]:
#params: none
#return: void, mutates local structures

#Step 9 -> transform benefits list to list-of-list format matching table structure.
def parse_payments():
    """Append one list of payment rows per state object to `state_payments`.

    For every entry of the module-level `state_objs`, the pattern string at
    STATE_PAYMENT_PATTERNS_BASE[position][1] is tokenised, translated into
    instructions and interpreted. States listed in `troublesome_states` get an
    empty list instead. Returns nothing.

    NOTE(review): `troublesome_states` is not defined in the visible cells --
    presumably it comes from one of the loaded files; confirm.
    """
    for position, state_obj in enumerate(state_objs):
        #ignore unique implementation states
        if state_obj['state'] in troublesome_states:
            state_payments.append([])
            continue

        pattern_tokens = payment_pattern_parser(STATE_PAYMENT_PATTERNS_BASE[position][1])
        instructions = token_matcher(pattern_tokens)
        state_payments.append(interpreter(instructions, position))

### Execution

In [None]:
#load liv_arr_subcat_base from outside file
#
# LIVING_ARRANGEMENT_ADJUSTMENTS_BASE
%load util/subcats/LIVING_ARRANGEMENT_ADJUSTMENTS_BASE.py

In [None]:
#load benefs_base_patterns from outside file
#
# STATE_PAYMENT_PATTERNS_BASE
%load util/benef_patterns/STATE_PAYMENT_PATTERNS_BASE.py

In [None]:
# Regex patterns
# These module-level names are read as globals by parse_pages and
# parse_table_data, so they must be defined before those functions are called.

# Living arrangement labels: one of the listed prefixes, then lazily up to
# (not including) the first amount-like number or one of the stop characters
# in the lookahead. Used by parse_table_data.
pattern_liv_arr = r"(Rec|Living (in|with|alone|I)|Req|(Un)*((L|l)ice)|Medicaid facility|Inde|Nonm|Disabled \w|(Care)* Home|Individual w|Adult|Foster|Comm|Long-|Small|Large.*/Res|Domiciliary( |-)(C|c)are|Specialized s|Semi-i|Room|Hotel-|Family|Resid|In-|Personal|Caret|Flat|Cost reimb|DOMCARE|Shared|Home|Aid to|Group home|Child|Trans|Certi|Center|Congre|Blind in|Rest|Supervised|Custo|House|Private|Shelter).*?(?=(\d{2,}\.\d{2})|[$\"\.:•A-Z\('~]|([\d]{1,2} )|([\d!]/))"
# Benefit amounts: "N/A", or a number with two decimals, optionally preceded by
# '$' and optionally with one thousands comma. Used by parse_table_data.
pattern_benef = r"((N/A)|((\$)*[\d]{1,3}\.[\d]{2})|((\$)*[\d],[\d]{3}\.[\d]{2}))"
# Heading at which parse_pages cuts a page into table text and trailing text.
separator = 'STATE ASSISTANCE FOR SPECIAL NEEDS'
# Footnotes: from the first "Unless", "Blind ind" or "Add" to the end of the
# searched text (parse_pages applies it with re.S).
pattern_footnotes = r"(Unless|Blind ind|Add).*"
# Payment table: from "PAYMENT LEVELS" up to the last "Unles" (greedy match).
pattern_payments = r"(PAYMENT LEVELS).*(?=Unles)"
# '$' followed by two digits. Not referenced by any of the visible cells.
pattern_footnote_benefs = r"\$[\d]{2}"

for idx, year_str in enumerate(YEARS_STRING):
    
    #print("now working with " + year_str + " year ------------")
    
    #local structures
    table_data, footnotes, temp = [], [], []
    liv_arrs = []
    benefs = []
    adj_liv_arrs = []
    state_objs = []
    state_payments = []
    
    #import data
    raw_text = extract_text("ssi_reports/ssi_" + year_str + "s.pdf")
    
    # Step 1
    working_text = do_preprocessing(raw_text)
    # Step 2
    NO_PROGRAM_STATES, STATES_W_PROGRAM = extract_no_program_states(working_text)
    # Step 3
    raw_payment_pages = split_payment_pages(working_text)
    # Step 4
    parse_pages(raw_payment_pages, table_data, footnotes, temp)
    
    #if necessary, clean duplicate or run-on pages
    
    # Step 5
    parse_table_data()
    # Step 6
    manual_fix(YEARS_STRING)

    #modify for current year
    _path = "subcats_" + year_str + ".py"
    %load util/subcats/_path

    # Step 7
    add_subcats()
    # Step 8
    build_state_objs()

    #modify for current year
    _path = "patt_" + year_str + ".py"
    PATTERN_ADJUSTMENTS = %load util/benef_patterns/_path

    for idx, replacement in enumerate(PATTERN_ADJUSTMENTS):
        for jdx, pattern in enumerate(STATE_PAYMENT_PATTERNS_BASE):
            if (replacement[0] == pattern[0]):
                STATE_PAYMENT_PATTERNS_BASE[jdx] = replacement

    # Step 9
    parse_payments()
    
    column_names = ["year", "state", "liv_arr", "sub_cat", "combn_indv", "combn_cpl", "state_indv", "state_cpl"]
    liv_arr_x_benef_df = pd.DataFrame(columns = column_names)
    
    for idx, ele in enumerate(STATES_W_PROGRAM):
        for num_rows in range(len(adj_liv_arrs[idx])):
            liv_arr_x_benef_df.loc[len(liv_arr_x_benef_df.index)] = [year_str, ele, 
                                                    adj_liv_arrs[idx][num_rows][0],
                                                    adj_liv_arrs[idx][num_rows][1],
                                                    state_payments[idx][num_rows][0],
                                                    state_payments[idx][num_rows][1],
                                                    state_payments[idx][num_rows][2],
                                                    state_payments[idx][num_rows][3]]
    
    liv_arr_x_benef_df = liv_arr_x_benef_df.replace({'combn_indv':{'N/A':'0'}, 
                                                     'combn_cpl':{'N/A':'0'},
                                                     'state_indv':{'N/A':'0'},
                                                     'state_cpl':{'N/A':'0'}})
    
    liv_arr_x_benef_df = liv_arr_x_benef_df.astype({'combn_indv':'float64', 'combn_cpl':'float64', 
                                                    'state_indv':'float64', 'state_cpl':'float64'})
    
    liv_arr_x_benef_df.to_csv('_' + year_str + '_liv_arr_x_benef.csv')

In [None]:
#optimizing notebook view

#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_colwidth', None)
#pd.set_option('display.precision', 2)

In [None]:
#styling output table in notebook view

# final = liv_arr_x_benef_df.style
# final.set_table_styles([{
#     'selector': 'td', 'props': [
#         ('font-size', '10pt'),
#         ('border-style', 'solid'),
#         ('border-width', '1px'),
#         ('border-color', 'black')]}])