In [1]:
import os, requests, re, time, numpy as np, pandas as pd, pickle
from IPython.display import clear_output

import xmltodict

In [2]:
def get_node_names(parent):
    """Return the dot-separated paths of every string leaf under ``parent``.

    ``parent`` is a nested mapping (anything exposing ``.items()``), such as
    the structure this notebook obtains from ``xmltodict.parse``.

    Traversal rules, applied to each ``(key, value)`` pair in order:

    * ``None`` values are skipped.
    * ``str`` values are leaves; the key itself is recorded.
    * ``list`` values are represented by their FIRST element only; if that
      element cannot be traversed as a mapping (``AttributeError``), the
      whole list is ignored.
    * anything else is treated as a nested mapping and recursed into, with
      the current key prefixed to each child path.

    The order of the returned paths matches the order in which
    ``get_node_values`` emits values, which is what allows the two results
    to be zipped together.
    """
    names = []

    for key, value in parent.items():
        if value is None:
            continue

        if type(value) is str:
            names.append(key)
            continue

        if type(value) is list:
            # Only the first list entry is inspected; non-mapping entries
            # (e.g. plain strings) raise AttributeError and are dropped.
            try:
                nested = get_node_names(value[0])
            except AttributeError:
                continue
            names.extend(key + '.' + suffix for suffix in nested)
            continue

        names.extend(key + '.' + suffix for suffix in get_node_names(value))

    return names

def get_node_values(parent):
    """Return every string leaf value under ``parent`` in traversal order.

    ``parent`` is a nested mapping (anything exposing ``.items()``).  The
    walk mirrors ``get_node_names``:

    * ``None`` values are skipped.
    * ``str`` values are collected as-is.
    * ``list`` values contribute only the leaves of their FIRST element; a
      first element that is not a mapping raises ``AttributeError``, which
      is swallowed so the list contributes nothing.
    * anything else is recursed into as a nested mapping.
    """
    values = []
    for _, node in parent.items():
        if node is None:
            continue

        if type(node) is str:
            values.append(node)
        elif type(node) is list:
            # Same first-element-only policy as the name walker, so the two
            # result lists stay index-aligned.
            try:
                values.extend(get_node_values(node[0]))
            except AttributeError:
                pass
        else:
            values.extend(get_node_values(node))

    return values

In [3]:
# Download one published Form 990 e-file and flatten it into {dotted path: value}.
# NOTE: despite its name, `monroe_html` holds the URL of an XML document.
monroe_html = 'https://s3.amazonaws.com/irs-form-990/201911159349300636_public.xml'

# Bound the wait and fail loudly on HTTP errors: without the status check an
# error response body would be handed to the XML parser as if it were the return.
monroe_response = requests.get(monroe_html, timeout=60)
monroe_response.raise_for_status()

monroe_baseline = xmltodict.parse(monroe_response.content)
monroe_node_list = get_node_names(monroe_baseline)
# Names and values come from two parallel walks that emit leaves in the same order.
monroe_dictionary = dict(zip(monroe_node_list, get_node_values(monroe_baseline)))
key_list = list(monroe_dictionary.keys())

In [4]:
# Display the flattened, dot-separated node paths collected from the parsed return.
key_list

['Return.@xmlns',
 'Return.@xmlns:xsi',
 'Return.@xsi:schemaLocation',
 'Return.@returnVersion',
 'Return.ReturnHeader.@binaryAttachmentCnt',
 'Return.ReturnHeader.ReturnTs',
 'Return.ReturnHeader.TaxPeriodEndDt',
 'Return.ReturnHeader.PreparerFirmGrp.PreparerFirmEIN',
 'Return.ReturnHeader.PreparerFirmGrp.PreparerFirmName.BusinessNameLine1Txt',
 'Return.ReturnHeader.PreparerFirmGrp.PreparerUSAddress.AddressLine1Txt',
 'Return.ReturnHeader.PreparerFirmGrp.PreparerUSAddress.CityNm',
 'Return.ReturnHeader.PreparerFirmGrp.PreparerUSAddress.StateAbbreviationCd',
 'Return.ReturnHeader.PreparerFirmGrp.PreparerUSAddress.ZIPCd',
 'Return.ReturnHeader.ReturnTypeCd',
 'Return.ReturnHeader.TaxPeriodBeginDt',
 'Return.ReturnHeader.Filer.EIN',
 'Return.ReturnHeader.Filer.BusinessName.BusinessNameLine1Txt',
 'Return.ReturnHeader.Filer.BusinessNameControlTxt',
 'Return.ReturnHeader.Filer.PhoneNum',
 'Return.ReturnHeader.Filer.USAddress.AddressLine1Txt',
 'Return.ReturnHeader.Filer.USAddress.CityNm',


In [6]:
# Keep only the final dot-separated segment of every node path
# (a path without any '.' is kept whole).
lasttext = [path.rpartition('.')[2] for path in key_list]

In [7]:
# Display the last path segment of every key.
lasttext

['@xmlns',
 '@xmlns:xsi',
 '@xsi:schemaLocation',
 '@returnVersion',
 '@binaryAttachmentCnt',
 'ReturnTs',
 'TaxPeriodEndDt',
 'PreparerFirmEIN',
 'BusinessNameLine1Txt',
 'AddressLine1Txt',
 'CityNm',
 'StateAbbreviationCd',
 'ZIPCd',
 'ReturnTypeCd',
 'TaxPeriodBeginDt',
 'EIN',
 'BusinessNameLine1Txt',
 'BusinessNameControlTxt',
 'PhoneNum',
 'AddressLine1Txt',
 'CityNm',
 'StateAbbreviationCd',
 'ZIPCd',
 'PersonNm',
 'PersonTitleTxt',
 'PhoneNum',
 'SignatureDt',
 'DiscussWithPaidPreparerInd',
 'PreparerPersonNm',
 'PTIN',
 'PhoneNum',
 'IPv4AddressTxt',
 'IPDt',
 'IPTm',
 'IPTimezoneCd',
 'FilingLicenseTypeCd',
 'AtSubmissionCreationDeviceId',
 'AtSubmissionFilingDeviceId',
 'TaxYr',
 'BuildTS',
 '@documentCnt',
 '@documentId',
 '@referenceDocumentId',
 'PrincipalOfficerNm',
 'AddressLine1Txt',
 'CityNm',
 'StateAbbreviationCd',
 'ZIPCd',
 'GrossReceiptsAmt',
 'GroupReturnForAffiliatesInd',
 'Organization501c3Ind',
 'WebsiteAddressTxt',
 'TypeOfOrganizationCorpInd',
 'FormationYr

In [9]:
def duplicates(lst):
    """Return the items of ``lst`` that occur more than once.

    Each repeated item appears exactly once in the result, ordered by the
    position at which it was first seen a second time.  The ordering is
    deterministic, so re-running the notebook yields the same output
    (a plain ``list(set(...))`` varies with string hash randomisation).

    Parameters
    ----------
    lst : iterable of hashable items

    Returns
    -------
    list
        The repeated items; empty when nothing repeats.
    """
    seen = set()
    repeated = {}  # used as an insertion-ordered set
    for item in lst:
        if item in seen:
            repeated.setdefault(item, None)
        else:
            seen.add(item)
    return list(repeated)
    
    

In [10]:
# Show which last-segment names occur more than once across the key paths.
duplicates(lasttext)

['ExpenseAmt',
 'DepreciationAmt',
 'ManagementAndGeneralAmt',
 'FormAndLineReferenceDesc',
 'CurrentTaxYearMinus4YearsAmt',
 'BOYAmt',
 'BookValueAmt',
 'PhoneNum',
 'CurrentTaxYearMinus2YearsAmt',
 'ExclusionAmt',
 'RevenueAmt',
 'TotalRevenueColumnAmt',
 'RelatedOrExemptFuncIncomeAmt',
 '@referenceDocumentId',
 'EOYAmt',
 'CurrentTaxYearMinus3YearsAmt',
 'TotalAmt',
 'StateAbbreviationCd',
 'OtherAmt',
 'CurrentTaxYearMinus1YearAmt',
 'FundraisingAmt',
 '#text',
 'ZIPCd',
 'CityNm',
 'ExplanationTxt',
 'PersonNm',
 'CurrentTaxYearAmt',
 'ProgramServicesAmt',
 'OtherCostOrOtherBasisAmt',
 'AddressLine1Txt',
 '@documentId',
 'Desc',
 'BusinessCd',
 'BusinessNameLine1Txt']

In [41]:
# Show the full dot-separated node paths again for reference.
key_list

['Return.@xmlns',
 'Return.@xmlns:xsi',
 'Return.@xsi:schemaLocation',
 'Return.@returnVersion',
 'Return.ReturnHeader.@binaryAttachmentCnt',
 'Return.ReturnHeader.ReturnTs',
 'Return.ReturnHeader.TaxPeriodEndDt',
 'Return.ReturnHeader.PreparerFirmGrp.PreparerFirmEIN',
 'Return.ReturnHeader.PreparerFirmGrp.PreparerFirmName.BusinessNameLine1Txt',
 'Return.ReturnHeader.PreparerFirmGrp.PreparerUSAddress.AddressLine1Txt',
 'Return.ReturnHeader.PreparerFirmGrp.PreparerUSAddress.CityNm',
 'Return.ReturnHeader.PreparerFirmGrp.PreparerUSAddress.StateAbbreviationCd',
 'Return.ReturnHeader.PreparerFirmGrp.PreparerUSAddress.ZIPCd',
 'Return.ReturnHeader.ReturnTypeCd',
 'Return.ReturnHeader.TaxPeriodBeginDt',
 'Return.ReturnHeader.Filer.EIN',
 'Return.ReturnHeader.Filer.BusinessName.BusinessNameLine1Txt',
 'Return.ReturnHeader.Filer.BusinessNameControlTxt',
 'Return.ReturnHeader.Filer.PhoneNum',
 'Return.ReturnHeader.Filer.USAddress.AddressLine1Txt',
 'Return.ReturnHeader.Filer.USAddress.CityNm',


# start by cutting off the excess text

In [46]:
# Leading path prefixes to drop, tried in order (first match wins).
# The dotted 'IRS990.' form must come before the undotted one: the undotted
# form also matches keys whose third segment merely begins with 'IRS990',
# and would otherwise leave a leading '.' on plain IRS990 children.
strip_prefixes = (
    'Return.ReturnHeader.',
    'Return.ReturnData.IRS990.',
    'Return.ReturnData.IRS990',
)

trimmed_list = []
for key in key_list:
    for prefix in strip_prefixes:
        if key.startswith(prefix):
            # Slice off exactly the leading prefix; splitting on the prefix
            # text would cut at its last occurrence anywhere in the key.
            trimmed_list.append(key[len(prefix):])
            break
    else:
        # No known prefix: keep the key unchanged.
        trimmed_list.append(key)

In [47]:
# Display the keys after the leading path prefixes have been removed.
trimmed_list

['Return.@xmlns',
 'Return.@xmlns:xsi',
 'Return.@xsi:schemaLocation',
 'Return.@returnVersion',
 '@binaryAttachmentCnt',
 'ReturnTs',
 'TaxPeriodEndDt',
 'PreparerFirmGrp.PreparerFirmEIN',
 'PreparerFirmGrp.PreparerFirmName.BusinessNameLine1Txt',
 'PreparerFirmGrp.PreparerUSAddress.AddressLine1Txt',
 'PreparerFirmGrp.PreparerUSAddress.CityNm',
 'PreparerFirmGrp.PreparerUSAddress.StateAbbreviationCd',
 'PreparerFirmGrp.PreparerUSAddress.ZIPCd',
 'ReturnTypeCd',
 'TaxPeriodBeginDt',
 'Filer.EIN',
 'Filer.BusinessName.BusinessNameLine1Txt',
 'Filer.BusinessNameControlTxt',
 'Filer.PhoneNum',
 'Filer.USAddress.AddressLine1Txt',
 'Filer.USAddress.CityNm',
 'Filer.USAddress.StateAbbreviationCd',
 'Filer.USAddress.ZIPCd',
 'BusinessOfficerGrp.PersonNm',
 'BusinessOfficerGrp.PersonTitleTxt',
 'BusinessOfficerGrp.PhoneNum',
 'BusinessOfficerGrp.SignatureDt',
 'BusinessOfficerGrp.DiscussWithPaidPreparerInd',
 'PreparerPersonGrp.PreparerPersonNm',
 'PreparerPersonGrp.PTIN',
 'PreparerPersonGrp.P

# swap special characters ('#', '@', '.') for spelled-out names joined by underscores

In [53]:
# Three passes, order matters: '#' and '@' are first spelled out with a
# trailing '.', then every '.' (original or just inserted) becomes '_'.

# '#' -> 'hashtag.'
no_hashtag_list = [name.replace('#', 'hashtag.') for name in trimmed_list]

# '@' -> 'at.'
no_at_list = [name.replace('@', 'at.') for name in no_hashtag_list]

# '.' -> '_'
outlist = [name.replace('.', '_') for name in no_at_list]

In [54]:
# Display the final shortened key names.
outlist

['Return_at_xmlns',
 'Return_at_xmlns:xsi',
 'Return_at_xsi:schemaLocation',
 'Return_at_returnVersion',
 'at_binaryAttachmentCnt',
 'ReturnTs',
 'TaxPeriodEndDt',
 'PreparerFirmGrp_PreparerFirmEIN',
 'PreparerFirmGrp_PreparerFirmName_BusinessNameLine1Txt',
 'PreparerFirmGrp_PreparerUSAddress_AddressLine1Txt',
 'PreparerFirmGrp_PreparerUSAddress_CityNm',
 'PreparerFirmGrp_PreparerUSAddress_StateAbbreviationCd',
 'PreparerFirmGrp_PreparerUSAddress_ZIPCd',
 'ReturnTypeCd',
 'TaxPeriodBeginDt',
 'Filer_EIN',
 'Filer_BusinessName_BusinessNameLine1Txt',
 'Filer_BusinessNameControlTxt',
 'Filer_PhoneNum',
 'Filer_USAddress_AddressLine1Txt',
 'Filer_USAddress_CityNm',
 'Filer_USAddress_StateAbbreviationCd',
 'Filer_USAddress_ZIPCd',
 'BusinessOfficerGrp_PersonNm',
 'BusinessOfficerGrp_PersonTitleTxt',
 'BusinessOfficerGrp_PhoneNum',
 'BusinessOfficerGrp_SignatureDt',
 'BusinessOfficerGrp_DiscussWithPaidPreparerInd',
 'PreparerPersonGrp_PreparerPersonNm',
 'PreparerPersonGrp_PTIN',
 'PreparerP

In [56]:
# Count the shortened key names.
len(outlist)

470

In [57]:
# Persist the shortened key names with pickle, written in binary mode to the
# current working directory. Note the file name contains spaces and has no extension.
with open('shorter key names', 'wb') as tofile:
    pickle.dump(outlist, tofile)

In [None]:


# NOTE(review): the original stub, `pandas.dataframe(blaha )`, could not run:
# the module is imported as `pd`, the class is `DataFrame`, and `blaha` was
# never defined.
# Assumed intent (TODO confirm): a single-row frame holding this return's
# values, labelled with the shortened key names. `outlist` is derived
# one-to-one, in order, from the keys of `monroe_dictionary`.
data = pd.DataFrame([list(monroe_dictionary.values())], columns=outlist)

