# Reading Data, Transforming, and Loading into ChromaDB

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

In [49]:
def parse_lobbyactivity_xml_to_dataframe(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    data = []

    for sm in [row.find('.//SMXML//SM') for row in root.findall('.//ROW')]:
        # check to make sure information is within the file
        if sm is not None:
            row_data = {}
            for element in sm:
                # get nested data
                if len(element) > 0:
                    for sub_element in element:
                        if len(sub_element) > 0:
                            for sub_sub_element in sub_element:                    
                                row_data[sub_sub_element.tag] = sub_sub_element.text
                        else: 
                            row_data[sub_element.tag] = sub_element.text        
                else:
                    row_data[element.tag] = element.text

            # add all data from the row to the array
            data.append(row_data)

    # Convert the list of dictionaries to a Pandas DataFrame
    df = pd.DataFrame(data)

    return df

In [50]:
xml_file = './staging/lobbyactivity-active.xml'
df = parse_lobbyactivity_xml_to_dataframe(xml_file)

# Print the resulting DataFrame
print(df)

     SMNumber      Status        Type  \
0     SM24178  Superseded    In-house   
1     SM30420  Superseded    In-house   
2     SM33270      Active  Consultant   
3     SM34184      Active    In-house   
4     SM34131      Active    In-house   
...       ...         ...         ...   
2560  SM34440      Active    In-house   
2561  SM34690      Active    In-house   
2562  SM34807      Active  Consultant   
2563  SM28894      Active    In-house   
2564  SM23235      Active      Parent   

                                          SubjectMatter  \
0                          Environment;Technology;Water   
1                                            Technology   
2     Affordable Housing;Planning and Development;Pl...   
3                             Technology;Transportation   
4                                        Transportation   
...                                                 ...   
2560               Planning and Development Application   
2561                           Plan

In [22]:
df.columns

Index(['SMNumber', 'Status', 'Type', 'SubjectMatter', 'Particulars',
       'SubjectMatterDefinition', 'InitialApprovalDate', 'EffectiveDate',
       'ProposedStartDate', 'ProposedEndDate', 'RegistrationNUmber',
       'RegistrationNUmberWithSoNum', 'Prefix', 'FirstName', 'MiddleInitials',
       'LastName', 'Suffix', 'PositionTitle', 'PreviousPublicOfficeHolder',
       'PreviousPublicOfficeHoldPosition',
       'PreviousPublicOfficePositionProgramName',
       'PreviousPublicOfficeHoldLastDate', 'BusinessAddress', 'Communication',
       'Firm', 'GMTFUNDING', 'BENEFICIARY', 'Privatefunding', 'GRASSROOT',
       'Gmtfunding', 'Meeting'],
      dtype='object')

In [23]:
df.head

<bound method NDFrame.head of      SMNumber      Status        Type  \
0     SM24178  Superseded    In-house   
1     SM30420  Superseded    In-house   
2     SM33270      Active  Consultant   
3     SM34184      Active    In-house   
4     SM34131      Active    In-house   
...       ...         ...         ...   
2560  SM34440      Active    In-house   
2561  SM34690      Active    In-house   
2562  SM34807      Active  Consultant   
2563  SM28894      Active    In-house   
2564  SM23235      Active    In-house   

                                          SubjectMatter  \
0                          Environment;Technology;Water   
1                                            Technology   
2     Affordable Housing;Planning and Development;Pl...   
3                             Technology;Transportation   
4                                        Transportation   
...                                                 ...   
2560               Planning and Development Application   
2561 

## Set up the collections

In [15]:
documents = []
metadatas = []
ids = []

## loop through the dataframe and add elements to the different collections

In [16]:
for index, row in df.iterrows():
    r_document = row['VARIABLEDESC']
    r_metadata = {}
    r_metadata['COUNTRY'] = row['COUNTRY']
    r_metadata['DATASETNAME'] = row['DATASETNAME']
    r_metadata['VINTAGE'] = row['VINTAGE']
    r_metadata['VARIABLENAME'] = row['VARIABLENAME']
    r_id = row['UniqueId']

    documents.append(r_document)
    metadatas.append(r_metadata)
    ids.append(r_id)

In [17]:
documents

['South Asian AccultuRates Segment SA01',
 'Chinese AccultuRates Segment CA00',
 'Chinese AccultuRates Segment CA03',
 'South Asian AccultuRates Segment SA04',
 'Visible Minority South Asian Population',
 'South Asian AccultuRates Segment SA05',
 'Chinese AccultuRates Segment CA04',
 'South Asian AccultuRates Segment SA03',
 'Household Population for Visible Minority',
 'Visible Minority Population',
 'South Asian AccultuRates Segment SA02',
 'Chinese AccultuRates Segment CA01',
 'Chinese AccultuRates Segment CA05',
 'South Asian AccultuRates Segment SA06',
 'Chinese AccultuRates Segment CA02',
 'Chinese AccultuRates Segment CA06',
 'South Asian AccultuRates Segment SA00',
 'Visible Minority Chinese Population',
 'Maintainers Under 35 Years and Household Income $0 to $19,999',
 'Maintainers Under 35 Years and Household Income $80,000 to $99,999',
 'Maintainers Under 35 Years and Household Income $200,000 or Over',
 'Aggregate Income of Maintainers 35 to 44 Years',
 'Maintainers 35 to 4

In [18]:
metadatas

[{'COUNTRY': 1,
  'DATASETNAME': 'AccultuRates',
  'VINTAGE': 2022,
  'VARIABLENAME': 'SA01'},
 {'COUNTRY': 1,
  'DATASETNAME': 'AccultuRates',
  'VINTAGE': 2022,
  'VARIABLENAME': 'CA00'},
 {'COUNTRY': 1,
  'DATASETNAME': 'AccultuRates',
  'VINTAGE': 2022,
  'VARIABLENAME': 'CA03'},
 {'COUNTRY': 1,
  'DATASETNAME': 'AccultuRates',
  'VINTAGE': 2022,
  'VARIABLENAME': 'SA04'},
 {'COUNTRY': 1,
  'DATASETNAME': 'AccultuRates',
  'VINTAGE': 2022,
  'VARIABLENAME': 'VISSA'},
 {'COUNTRY': 1,
  'DATASETNAME': 'AccultuRates',
  'VINTAGE': 2022,
  'VARIABLENAME': 'SA05'},
 {'COUNTRY': 1,
  'DATASETNAME': 'AccultuRates',
  'VINTAGE': 2022,
  'VARIABLENAME': 'CA04'},
 {'COUNTRY': 1,
  'DATASETNAME': 'AccultuRates',
  'VINTAGE': 2022,
  'VARIABLENAME': 'SA03'},
 {'COUNTRY': 1,
  'DATASETNAME': 'AccultuRates',
  'VINTAGE': 2022,
  'VARIABLENAME': 'VISHPOP'},
 {'COUNTRY': 1,
  'DATASETNAME': 'AccultuRates',
  'VINTAGE': 2022,
  'VARIABLENAME': 'VISVM'},
 {'COUNTRY': 1,
  'DATASETNAME': 'AccultuRate

In [19]:
ids

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [20]:
# are all ids unique

def are_all_items_unique(arr):
    return len(arr) == len(set(arr))

result = are_all_items_unique(ids)

print(result)

True
