In [1]:
import pandas as pd
import re
import os
import ollama
import pymupdf
from datetime import datetime

In [2]:
# Make sure to download all pdf files to the raw_docs folder.
# Builds pdf_file_dict: submission number -> path of its PDF inside raw_docs.
pattern = re.compile(r"Sub(\d{2,3})\.pdf")
pdf_file_dict = {}
# Sorted so the result does not depend on the order the OS lists the files in.
for filename in sorted(os.listdir("raw_docs")):
    # fullmatch (not match) also anchors the end of the name, so leftovers such
    # as "Sub12.pdf.crdownload" are not mistaken for a submission PDF.
    match = pattern.fullmatch(filename)
    if match:
        number = int(match.group(1))
        pdf_file_dict[number] = "raw_docs/"+filename

In [3]:
# Empty table of submission metadata; rows are added further down.
submission_columns = [
    "no", "org_ind", "type", "name", "state",
    "pdf_file_path", "num_pages", "file_size", "support_sentiment",
]
df = pd.DataFrame(columns=submission_columns)

In [4]:
# Read the flat listing page and pull out the submission numbers per category.
file_path = 'downloaded_html/Flat_Submissions.rawh'
with open(file_path, 'r', encoding='utf-8') as flat_html:
    content = flat_html.read()

# Row patterns: a number cell followed by a bold name, by "Name Withheld",
# or by "Confidential".
named_row_re = re.compile(r'<td>(\d+)</td><td><strong>(.*?)</strong>')
withheld_row_re = re.compile(r'<td>(\d+)</td><td>Name Withheld&nbsp')
confidential_row_re = re.compile(r'<td>(\d+)</td><td>Confidential</td>')

# (number, name) string pairs, plus the same data keyed by integer number.
named_submissions = named_row_re.findall(content)
named_submissions_dict = dict((int(number), name) for number, name in named_submissions)
# Plain lists of number strings for the two unnamed categories.
namewithheld_submissions = withheld_row_re.findall(content)
confidential_submissions = confidential_row_re.findall(content)

In [5]:
# Integer submission numbers per category.
named_set = {int(number) for number, _name in named_submissions}
namewithheld_set = {int(number) for number in namewithheld_submissions}
confidential_set = {int(number) for number in confidential_submissions}
# The categories must be pairwise disjoint. Intersecting all three at once
# (a & b & c) only catches a number that is in *every* set, so instead check
# that the union has as many members as the three sets combined.
assert len(named_set | namewithheld_set | confidential_set) == (
    len(named_set) + len(namewithheld_set) + len(confidential_set)
), "a submission number appears in more than one category"

In [6]:
# Every submission number seen in any category, and how many there are.
all_numbers = set().union(named_set, namewithheld_set, confidential_set)
nn = len(all_numbers)
# Displayed as (count, lowest number, highest number).
(nn, min(all_numbers), max(all_numbers))

(180, 1, 180)

In [7]:
# Add one row per submission, indexed by submission number.
# Iterate over the numbers actually found instead of range(1, nn+1): the count
# nn only equals the highest number when the numbering starts at 1 and has no
# gaps, otherwise valid submissions would be missed or the loop would abort.
# The "" and -1 entries are initial values for columns not known at this point.
for i in sorted(all_numbers):
    fn = pdf_file_dict.get(i, "")  # "" when raw_docs holds no PDF for i
    if i in named_set:
        df.loc[i] = [i, "", "named", named_submissions_dict[i], "", fn, -1, -1, ""]
    elif i in namewithheld_set:
        df.loc[i] = [i, "", "name-withheld", "", "", fn, -1, -1, ""]
    elif i in confidential_set:
        df.loc[i] = [i, "", "confidential", "", "", fn, -1, -1, ""]
    else:
        # Defensive: all_numbers should be exactly the union of the three sets.
        raise ValueError(f"Submission {i} is not in the named, name-withheld or confidential set")

### Determine the state

In [8]:
# Split the by-state listing page into one tuple per "Submissions from :" section.
file_path = 'downloaded_html/State_Ter_Submissions.rawh'
with open(file_path, 'r', encoding='utf-8') as state_html:
    content = state_html.read()
# Groups: heading text, section body, terminator. A section body runs up to the
# next heading (lookahead, not consumed) or to the end of the text.
pattern = re.compile(
    r"Submissions from : <strong>(.*?)</strong></p>(.*?)((?=<p>Submissions from : <strong>)|$)",
    flags=re.DOTALL,
)
matches = pattern.findall(content)
states_options = [section[0] for section in matches]
print(states_options)

['ACT', 'NSW', 'QLD', 'SA', 'Unavailable Information', 'VIC', 'WA']


In [9]:
# Write the section heading (the state) onto every submission listed under it.
# NOTE(review): the row pattern requires a <strong> name cell, so rows without
# one (presumably name-withheld / confidential entries) get no state - confirm
# this is intended.
numbered_row_re = re.compile(r'<td>(\d+)</td><td><strong>.*?</strong>')
for state, section_html, _rest in matches:
    for number in numbered_row_re.findall(section_html):
        df.at[int(number), "state"] = state

### Determine if by organization or individual

In [10]:
# Split the organisation/individual listing page into one tuple per
# "Submissions by :" section: (heading text, section body, terminator).
file_path = 'downloaded_html/Org_Ind_Submissions.rawh'
with open(file_path, 'r', encoding='utf-8') as org_ind_html:
    content = org_ind_html.read()
pattern = re.compile(
    r"Submissions by : <strong>(.*?)</strong></p>(.*?)((?=<p>Submissions by : <strong>)|$)",
    flags=re.DOTALL,
)
matches = pattern.findall(content)

In [11]:
# Tag each listed submission as "individual" or "organisation" according to the
# heading of the section it appears under; other headings are ignored.
org_ind_labels = {'Individual': "individual", 'Organisation': "organisation"}
numbered_row_re = re.compile(r'<td>(\d+)</td><td><strong>.*?</strong>')
for type_sub, section_html, _rest in matches:
    label = org_ind_labels.get(type_sub)
    if label is None:
        continue
    for number in numbered_row_re.findall(section_html):
        df.at[int(number), "org_ind"] = label

In [12]:
# Save the table built so far; the row index is left out of the file.
df.to_csv(path_or_buf='submissions.csv', index=False)

### Run the LLM

In [13]:
# Number of leading characters of each PDF's text that are sent to the LLM.
max_text_length = 5_000

In [14]:
# Instruction text placed in front of each submission's text when querying the
# LLM; it asks for a one-word answer: 'Supports', 'Opposes' or 'Unclear'.
# NOTE(review): "needs to done" below looks like a typo for "needs to be done";
# left untouched because editing the prompt may change the model's answers.
prompt_supports_opposes = """
The text below is for a submission to a Senate Inquiry. 
A submission either supports the inquiry, meaning that something needs to done about the antisemitism.
Alternatively, a submission does NOT support the inquiry meaning that while opposing antisemitism, 
the submission states that an inquiry of the nature proposed is not needed.
Please state if the text supports the inquiry by responding with one word: 'Supports'.
Or if the text opposes the inquiry by responding with 'Opposes'.
If it is not very clear if the answer is `Supports' or `Opposes'  respond with `Unclear'.
*******
"""

In [15]:
def clean_support_or_oppose_or_unclear(text):
    """Reduce a free-form LLM reply to 'supports', 'opposes' or 'unclear'.

    The reply is searched case-insensitively for the three answer words and
    the one that occurs first wins. Checking for 'supports' before 'opposes'
    regardless of position would mislabel a reply such as
    "Opposes. The author supports action but ..." as 'supports'.

    Args:
        text: Reply text from the model; None or "" is treated as no answer.

    Returns:
        'supports', 'opposes' or 'unclear' ('unclear' when no answer word is
        present).
    """
    lowered = (text or "").lower()
    hits = [(lowered.find(label), label) for label in ('supports', 'opposes', 'unclear')]
    found = [hit for hit in hits if hit[0] != -1]
    if not found:
        return 'unclear'
    # Smallest start offset = earliest answer word in the reply.
    return min(found)[1]

# Loop and use LLM, each time save the df, so the answers collected so far are
# on disk if the run is interrupted.
for index, row in df.iterrows():
    if row['pdf_file_path']:
        # Open the PDF in a with-block so it is closed once its text has been
        # extracted, instead of leaving every document open.
        with pymupdf.open(row['pdf_file_path']) as doc:
            # chr(12) is the form-feed character, used here as page separator.
            text = chr(12).join([page.get_text() for page in doc])
        response = ollama.chat(model='llama3:8b', messages=[
          {
            'role': 'user',
            # Only the first max_text_length characters are sent to the model.
            'content': prompt_supports_opposes + text[:max_text_length],
          },
        ])
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        llm_answer = clean_support_or_oppose_or_unclear(response['message']['content'])
        df.at[index, "support_sentiment"] = llm_answer
        df.to_csv('submissions.csv', index=False)
        print(f"Submission {index}: {llm_answer}\t\t\t{current_time}")
    else:
        # Rows with an empty pdf_file_path have nothing to classify.
        print(f"Skipping {index}")

Submission 1: supports			2024-08-27 17:45:07
Submission 2: supports			2024-08-27 17:45:17
Submission 3: supports			2024-08-27 17:45:21
Submission 4: supports			2024-08-27 17:45:27
Submission 5: opposes			2024-08-27 17:45:37
Submission 6: supports			2024-08-27 17:45:42
Submission 7: supports			2024-08-27 17:45:49
Submission 8: supports			2024-08-27 17:45:56
Submission 9: supports			2024-08-27 17:46:03
Submission 10: supports			2024-08-27 17:46:12
Submission 11: supports			2024-08-27 17:46:18
Submission 12: supports			2024-08-27 17:46:25
Submission 13: supports			2024-08-27 17:46:32
Skipping 14
Skipping 15
Submission 16: supports			2024-08-27 17:46:37
Submission 17: supports			2024-08-27 17:46:41
Submission 18: supports			2024-08-27 17:46:45
Submission 19: supports			2024-08-27 17:46:55
Skipping 20
Submission 21: opposes			2024-08-27 17:47:00
Submission 22: supports			2024-08-27 17:47:04
Submission 23: supports			2024-08-27 17:47:12
Submission 24: supports			2024-08-27 17:47:19
Submissio

In [19]:
# Reload the saved table from file, in case the notebook is started from here.
df = pd.read_csv(filepath_or_buffer='submissions.csv')

In [39]:
# Keep every row that is not tagged as an organisation submission.
is_organisation = df['org_ind'].eq('organisation')
not_org_df = df[~is_organisation]
total_not_org = not_org_df.shape[0]
print(total_not_org)

130


In [40]:
# Count the non-organisation submissions classified as supporting.
supports_mask = not_org_df['support_sentiment'].eq('supports')
total_supporting = int(supports_mask.sum())
print(total_supporting)

114


In [42]:
# Supporting submissions that were published under a name.
supporting_and_named = (
    not_org_df['support_sentiment'].eq('supports')
    & not_org_df['type'].eq('named')
)
total_supporting_named = int(supporting_and_named.sum())
print(total_supporting_named)

52


In [43]:
# Count the non-organisation submissions classified as opposing.
opposes_mask = not_org_df['support_sentiment'].eq('opposes')
total_opposing = int(opposes_mask.sum())
print(total_opposing)

7


In [44]:
# Opposing submissions that were published under a name.
opposing_and_named = (
    not_org_df['support_sentiment'].eq('opposes')
    & not_org_df['type'].eq('named')
)
total_opposing_named = int(opposing_and_named.sum())
print(total_opposing_named)

6


In [46]:
# Fraction of named submissions among the supporting and the opposing ones.
named_share_supporting = total_supporting_named / total_supporting
named_share_opposing = total_opposing_named / total_opposing
(named_share_supporting, named_share_opposing)

(0.45614035087719296, 0.8571428571428571)

# Summary based on 180 submissions (partial) - ignoring organization submissions at this point

From 130 individual submissions, there are 114 supporting and 7 opposing. Others are unclear.

From the 114 supporting, 52 are named. This is only 46%.

From the 7 opposing, 6 are named. This is 86%.

Also note that submission 176 is atypical: it has 8 named people on it. It is an opposing submission. Counting each of these people separately, the number opposing is 14 and the number named is 13. That is 93%.

While the numbers are small, it appears clear that individuals supporting the inquiry want to remain unnamed at a much higher rate (about one in two people), while those opposing it are confident to do so publicly (about 9 in 10 people).