In [13]:
import os, sys

# Add repo
git_dir = os.path.abspath('../')
sys.path.append(os.path.join(git_dir, 'lib', 'utils'))

# Define data path
data_path = os.path.join(git_dir, 'data', 'hansard-xml')

# Define data output path
data_output_path = os.path.join(git_dir, 'data', 'hansard-parsed')

# Make output dir
if not os.path.exists(data_output_path):
    os.makedirs(data_output_path)

# Hansard Parsing

## Prereq: Dataset: hansard-xml
See `GetHansard.ipynb` to get the list of urls for Austalian Hansard for a particular year and saved to the `data/remote/hansard-urls` - then use the following to be able to get the data.
```
data/get-dataset.sh hansard-xml | sed 's/YEAR=2020/YEAR=2019/g' | sh
````

In [14]:
YEAR=2020

In [15]:
import glob, io, re
import xmlplain
from bs4 import BeautifulSoup
from IPython.display import JSON

In [16]:
input_xml_files = glob.glob("{}/*/*/*".format(data_path))

In [17]:
soup = BeautifulSoup(open(input_xml_files[0]), "lxml-xml")

In [7]:
input_xml_files[0]

'/scratch/language-of-leadership/data/hansard-xml/senate/2018/Senate_2018_03_20_5990_Official.xml'

## Removing xmlns attributes and other attributes

https://stackoverflow.com/questions/9044088/beautifulsoup-strip-specified-attributes-but-preserve-the-tag-and-its-contents

In [49]:
REMOVE_ATTRIBUTES = [
    'lang','language','onmouseover','onmouseout','script','style','font',
    'dir','face','size','color','style','class','width','height','hspace',
    'border','valign','align','background','bgcolor','text','link','vlink',
    'alink','cellpadding','cellspacing']

REMOVE_ATTRIBUTES += ['xmlns:WX', 'xmlns:a', 'xmlns:aml', 'xmlns:o', 'xmlns:pic', 
                      'xmlns:r', 'xmlns:v', 'xmlns:v', 'xmlns:wp', 'xmlns:w', 'xmlns:w10',
                     'role']

def remove_attributes_from_soup(soup):
    for attribute in REMOVE_ATTRIBUTES:
        for tag in soup.find_all(attrs={attribute: True}):
            del tag[attribute]
    return soup

## Removing tags but keep the text

https://stackoverflow.com/questions/1765848/remove-a-tag-using-beautifulsoup-but-keep-its-contents

In [9]:
REMOVE_TAGS = ['span', 'a']

def strip_tags_keep_text_from_soup(soup):
    for tag in REMOVE_TAGS: 
        for match in soup.find_all(tag):
            match.unwrap()
    return soup

# Combine consecutive p tags into one

https://stackoverflow.com/questions/50026264/beautifulsoup-combine-consecutive-tags

**Does not seem to be working**

In [139]:
for p in soup.find_all('p'):
    try:
        if p.find_previous().name=='p':
            p.find_previous().text += p.text
            p.decompose()
    except:
        pass

# Clean up the body tag

I want to find within `<p>` the string `* THIS_SPEAKER_SURNAME *:` and remove just the string. If `*:` exists but it does not have `THIS_SPEAKER_SURNAME` in it, then remove the whole flag.

In [50]:
soup = BeautifulSoup(open(input_xml_files[0]), "lxml-xml")
soup = remove_attributes_from_soup(soup)
soup = strip_tags_keep_text_from_soup(soup)

In [98]:
split_by_name = re.split('<name>|</name>',str(soup))[1:]
name_text = (zip(split_by_name[0::2], split_by_name[1::2]))

In [118]:
# add the party if it exists
for  in name_text:
    

BeautifulSoup(split_by_name[11]).find('party').text.lower()

'nats'

In [10]:
def parse_debate(debates):
    for debate in debates:
        debateinfo = get_debate_info(debate)
        try:
            print(debateinfo.prettify())
        except AttributeError:
            pass
        try:
            debatetext =  get_debate_text(debate)
            print(debatetext)
        except Exception as e:
            print(e)
            debate.decompose()
            parse_debate(get_subdebates(debate))
    return soup

In [11]:
def get_debates(soup):
    return soup.find_all('debate')

def get_debate_info(debate):
    return debate.find(re.compile('(.+)(debateinfo)'))

def get_debate_text(debate):
    t = debate.find_all(re.compile('(.+)(debate.text)'))
    if len(t) > 1:
        raise Exception("??? : multiple debate.text in {}".format(t))
    return t[0].get_text(strip=False).replace('\n', '')
        
def get_subdebates(debate):
    return debate.find_all(re.compile('(.+)(debate)(.[0-9]+)'))

def fix_spaces(mystring : str):
    mystring = re.sub(r'(?<=[.,])(?=[^\s])', r' ', mystring)
    return mystring

In [12]:
soup = BeautifulSoup(open(input_xml_files[0]), "lxml-xml")
soup = remove_attributes_from_soup(soup)
soup = strip_tags_keep_text_from_soup(soup)
soup.find_all('body')[0].get_text()

'\n\n\n\nTuesday, 20 March 2018\n\n\n\n\nThe PRESIDENT (Senator the Hon. \nScott Ryan) took the chair at 12:00, read prayers and made an acknowledgement of country.\n\n\n\xa0\n\n'

# Visualise

In [140]:
visualise = str(soup)
JSON(xmlplain.xml_to_obj(io.StringIO(visualise), strip_space=True))

<IPython.core.display.JSON object>

## Loop and write out

In [None]:
def 

for i in input_xml_files:
    file_name = i.split("/")[-1]
    year = i.split("/")[-2]
    house = i.split("/")[-3]
    print("Processing: {}".format(file_name))
    soup = BeautifulSoup(open(i), "lxml-xml")
    remove_attributes_from_soup(soup)
    strip_tags_keep_text_from_soup(soup)
    
    # Make output dir
    if not os.path.exists("{}/{}/{}".format(data_output_path, house, year)):
        os.makedirs("{}/{}/{}".format(data_output_path, house, year))
    with open("{}/{}/{}/{}".format(data_output_path, house, year, file_name), "w", encoding='utf-8') as file:
        file.write(str(soup))
    