In [1]:
import pandas as pd
import re
import os

In [2]:
# Empty results table: one row per submission is added further down.
SUBMISSION_COLUMNS = [
    "no",
    "org_ind",
    "type",
    "name",
    "state",
    "num_pages",
    "file_size",
    "sentiment",
]
df = pd.DataFrame(columns=SUBMISSION_COLUMNS)

In [3]:
# Load the flat listing of submissions as one string.
file_path = 'downloaded_html/Flat_Submissions.rawh'
with open(file_path, encoding='utf-8') as file:
    content = file.read()

# Every pattern captures the submission number from a <td> cell; the cell that
# follows tells the kinds apart: a <strong> name, "Name Withheld", or "Confidential".
named_submissions = re.compile(r'<td>(\d+)</td><td><strong>(.*?)</strong>').findall(content)
# Lookup from submission number (int) to the captured name text.
named_submissions_dict = dict((int(number), name) for number, name in named_submissions)
namewithheld_submissions = re.compile(r'<td>(\d+)</td><td>Name Withheld&nbsp').findall(content)
confidential_submissions = re.compile(r'<td>(\d+)</td><td>Confidential</td>').findall(content)

In [4]:
# Submission numbers found for each category, as sets of ints.
named_set = {int(number) for number, _name in named_submissions}
namewithheld_set = {int(number) for number in namewithheld_submissions}
confidential_set = {int(number) for number in confidential_submissions}

# A submission must fall in exactly one category, so check every pair.
# (A single three-way intersection is empty as soon as any one pair is
# disjoint, so it would not notice a number that sits in two categories.)
assert not (named_set & namewithheld_set), \
    "Both named and name-withheld: %s" % sorted(named_set & namewithheld_set)
assert not (named_set & confidential_set), \
    "Both named and confidential: %s" % sorted(named_set & confidential_set)
assert not (namewithheld_set & confidential_set), \
    "Both name-withheld and confidential: %s" % sorted(namewithheld_set & confidential_set)

In [5]:
# Every submission number seen in any category, then a quick look at the
# count and the smallest/largest number (displayed as the cell output).
all_numbers = set().union(named_set, namewithheld_set, confidential_set)
nn = len(all_numbers)
(nn, min(all_numbers), max(all_numbers))

(180, 1, 180)

In [6]:
# Add one row per submission number 1..nn, labelled with the category it was
# found in. Empty strings and -1 are placeholders for columns this loop does
# not populate.
for i in range(1, nn + 1):
    if i in named_set:
        sub_type, name = "named", named_submissions_dict[i]
    elif i in namewithheld_set:
        sub_type, name = "name-withheld", ""
    elif i in confidential_set:
        sub_type, name = "confidential", ""
    else:
        # The loop assumes the numbers run 1..nn without gaps; report which
        # number is missing instead of a bare "Error".
        raise ValueError(
            f"Submission number {i} was not found among the named, "
            f"name-withheld or confidential submissions (expected 1..{nn})"
        )
    df.loc[i] = [i, "", sub_type, name, "", -1, -1, -1]

### Determine the state

In [7]:
# Load the listing that groups submissions under "Submissions from" headings.
file_path = 'downloaded_html/State_Ter_Submissions.rawh'
with open(file_path, encoding='utf-8') as file:
    content = file.read()

# Split the page into sections. Group 1 is the heading text, group 2 is the
# HTML that follows it, up to the next heading or the end of the text.
pattern = re.compile(
    r"Submissions from : <strong>(.*?)</strong></p>(.*?)((?=<p>Submissions from : <strong>)|$)",
    re.DOTALL,
)
matches = pattern.findall(content)

# Headings found, in page order.
states_options = [heading for heading, _section, _tail in matches]
print(states_options)

['ACT', 'NSW', 'QLD', 'SA', 'Unavailable Information', 'VIC', 'WA']


In [8]:
# Record, for each submission number found in a section, the heading of that
# section in the "state" column.
# NOTE(review): only rows whose second cell is a <strong> element are matched
# here — confirm whether other rows in this listing should get a state too.
numbered_strong_row = re.compile(r'<td>(\d+)</td><td><strong>.*?</strong>')
for state, section_html, _tail in matches:
    for number_text in numbered_strong_row.findall(section_html):
        df.at[int(number_text), "state"] = state

### Determine whether each submission is from an organisation or an individual

In [9]:
# Load the listing that groups submissions under "Submissions by" headings.
file_path = 'downloaded_html/Org_Ind_Submissions.rawh'
with open(file_path, encoding='utf-8') as file:
    content = file.read()

# Split the page into sections. Group 1 is the heading text, group 2 is the
# HTML that follows it, up to the next heading or the end of the text.
pattern = re.compile(
    r"Submissions by : <strong>(.*?)</strong></p>(.*?)((?=<p>Submissions by : <strong>)|$)",
    re.DOTALL,
)
matches = pattern.findall(content)

In [10]:
# Section heading -> value stored in the "org_ind" column. Sections with any
# other heading leave the column untouched.
org_ind_labels = {'Individual': "individual", 'Organisation': "organisation"}
# NOTE(review): only rows whose second cell is a <strong> element are matched
# here — confirm whether other rows in this listing should be labelled too.
numbered_strong_row = re.compile(r'<td>(\d+)</td><td><strong>.*?</strong>')
for type_sub, section_html, _tail in matches:
    if type_sub not in org_ind_labels:
        continue
    for number_text in numbered_strong_row.findall(section_html):
        df.at[int(number_text), "org_ind"] = org_ind_labels[type_sub]

### Save the dataframe

In [11]:
# Write the assembled table to disk; the row index is left out of the file.
df.to_csv(path_or_buf='submissions.csv', index=False)