#### for Hansard xml version 2.0 ####

In [1]:
import os
import lxml.etree as etree
import re
import sys
import datetime as dt
from datetime import datetime 

import django
import platform

# Pick the project checkout and the XML download folder by host name.
if platform.node() != "srv-mcc-apsis":
    # local paths
    sys.path.append('/home/leey/Documents/Data/tmv/BasicBrowser/')
    xml_path = "/home/leey/Documents/Data/australian_parliament_downloads/downloads_coal"

else:
    # server paths
    sys.path.append('/home/leey/tmv/BasicBrowser/')
    xml_path = "/home/leey/australian_parliament_downloads/downloads_coal"

# Configure Django before the project modules on the appended path are imported.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BasicBrowser.settings")
django.setup()

# import from appended path
import parliament.models as pm
import cities.models as cmodels
from django.contrib.auth.models import User
from django.db.models import Q

from find_person_in_db_aph import *

In [2]:
# write output to file and terminal

import pprint

pretty_printer = pprint.PrettyPrinter(indent=4)

# One log file per run; the yymmdd_HHMMSS stamp is part of the file name.
time_stamp = datetime.now().strftime("%y%m%d_%H%M%S")
output_file = "".join(["./parlsessions_aph_parser_output_", time_stamp, ".log"])
print("log file: {}".format(output_file))


class Logger(object):
    """File-like object that duplicates everything written to it.

    Each message goes to the stream that was ``sys.stdout`` when the logger
    was created and to a log file opened in append mode. Intended use is
    ``sys.stdout = Logger()``.

    Args:
        path: Log file to append to. When omitted, the module-level
            ``output_file`` is used.
    """

    def __init__(self, path=None):
        self.terminal = sys.stdout
        # utf-8 so that non-ASCII text in the transcripts can always be logged.
        self.log = open(output_file if path is None else path, "a", encoding="utf-8")

    def write(self, message):
        """Write ``message`` to the terminal stream and to the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both targets so the log file is current after each flush."""
        self.terminal.flush()
        self.log.flush()

    def close(self):
        """Flush, close the log file and restore ``sys.stdout`` if we replaced it."""
        if self.log.closed:
            return
        self.flush()
        self.log.close()
        if sys.stdout is self:
            sys.stdout = self.terminal

log file: ./parlsessions_aph_parser_output_200206_101920.log


In [3]:
class parse_xml_items(object):
    """Load one Hansard XML (version 2.0) sitting into the parliament database.

    The constructor reads the parliament number, session number and sitting
    date from the tree; ``run()`` then stores the document, its agenda items,
    utterances, paragraphs and interjections through the ``pm`` models.

    Relies on names provided by the surrounding module: ``pm``, ``cmodels``,
    ``find_person_in_db_aph`` and ``clean_text``.

    Args:
        xtree: Parsed lxml tree of one Hansard file.
        v: Verbosity; 0 is quiet, values above 1 also print every element.
        period: Expected parliament number; a warning is printed on mismatch.
        session: Expected session number; a warning is printed on mismatch.
        replace_old_documents: When true, reuse an existing Document with the
            same period, type and date instead of always creating a new one.
    """

    # Parent tags of a ``talk.start`` element that open a regular utterance.
    ACCEPTED_TAGS = ('speech', 'question', 'answer')

    # (info_dict key, child tag of ``talker``) pairs passed to the person lookup.
    TALKER_FIELDS = (
        ('nameid', 'name.id'),
        ('party', 'party'),
        ('electorate', 'electorate'),
        ('role', 'role'),
    )

    def __init__(self, xtree, v=1, period=None, session=None,
                 replace_old_documents=False):
        self.v = v
        self.replace_old_documents = replace_old_documents
        # Relative path: an absolute "//debate" on an ElementTree is rewritten
        # to ".//debate" by lxml anyway and is rejected outright on an Element.
        self.divs = xtree.findall(".//debate")
        self.pp = int(xtree.xpath("//parliament.no//text()")[0])
        # The session entry may contain more than the number; take the first integer.
        self.session = int(re.findall(r'\b\d+\b', xtree.xpath("//session.no//text()")[0])[0])

        if period is not None and period != self.pp:
            print("! Warning: period number not matching: {} {}".format(period, self.pp))

        if session is not None and session != self.session:
            print("! Warning: session number not matching: {} {}".format(session, self.session))

        self.date = datetime.strptime(xtree.xpath("//date//text()")[0], '%Y-%m-%d')
        if self.v > 0:
            print("xml with protocol {}/{} from {}".format(self.pp, self.session, self.date))

    def get_or_create_objects(self):
        """Create (or fetch) the Parl, ParlPeriod and Document rows for this file.

        Returns:
            The saved ``pm.Document``; it is also stored as ``self.doc``.
        """
        parl, created = pm.Parl.objects.get_or_create(
            country=cmodels.Country.objects.get(name="Australia"),
            level='N')
        if created and self.v > 0:
            print("created new object for parliament")

        pp, created = pm.ParlPeriod.objects.get_or_create(
            parliament=parl,
            n=self.pp)
        if created and self.v > 0:
            print("created new object for legislative period")

        if self.replace_old_documents:
            doc, created = pm.Document.objects.get_or_create(
                parlperiod=pp,
                doc_type="Parliamentary Debate",
                date=self.date
            )
            if created and self.v > 0:
                print("created new object for parliamentary debate")
        else:
            doc = pm.Document(
                parlperiod=pp,
                doc_type="Parliamentary Debate",
                date=self.date
            )

        doc.sitting = self.session
        doc.save()

        # delete old utterances associated with the doc
        doc.utterance_set.all().delete()
        self.doc = doc
        return doc

    def create_paragraph(self, text, utterance):
        """Clean ``text``, save it as a Paragraph of ``utterance`` and return it."""
        text = text.replace("\n\n", "\n")
        text = clean_text(text)
        para = pm.Paragraph(
            utterance=utterance,
            text=text,
            word_count=len(text.split()),
            char_len=len(text)
        )
        para.save()
        return para

    def add_interjection(self, text, speaker, paragraph):
        """Save a speech-type Interjection on ``paragraph``.

        ``speaker`` is linked to the interjection only when it is truthy.
        """
        interjection = pm.Interjection(
            paragraph=paragraph,
            text=text
        )
        interjection.type = pm.Interjection.SPEECH
        interjection.save()

        if speaker:
            interjection.persons.add(speaker)

    @staticmethod
    def _display_name(raw):
        """Turn a metadata name "Surname, First" into "First Surname".

        A name without ", " is returned unchanged.
        """
        parts = raw.split(', ')
        if len(parts) == 1:
            return parts[0]
        return parts[1] + ' ' + parts[0]

    def _metadata_name(self, name_elements):
        """Return the display name of the last ``role="metadata"`` element.

        Returns None when no such element with text exists, so that a name
        left over from an earlier talker can never be reused.
        """
        found = None
        for element in name_elements:
            if element.get('role') == 'metadata' and element.text:
                found = self._display_name(element.text)
        return found

    def _speaker_info(self, element, talker_path, fields):
        """Collect the lookup hints for the talker found under ``talker_path``."""
        info = {}
        for key, tag in fields:
            # Several matches are possible; like before, the last one wins.
            for value in element.xpath('{}/{}/text()'.format(talker_path, tag)):
                info[key] = value
        info['pp'] = self.pp
        info['session'] = self.session
        info['date'] = self.date
        return info

    def _lookup_speaker(self, name, info):
        """Match a talker to a person; print the name when nobody is found."""
        speaker = find_person_in_db_aph(name, add_info=info, verbosity=self.v)
        if speaker is None:
            # Print the assembled name: indexing the split parts failed for
            # single-part names such as "Mr SPEAKER".
            print(name)
        return speaker

    def _add_paragraphs(self, elements, utterance, last=None):
        """Store the text of every element that has some.

        Returns the most recently created paragraph, or ``last`` if none was
        created.
        """
        for element in elements:
            if element.text:
                last = self.create_paragraph(element.text.strip(u'\u2014'), utterance)
        return last

    def _parse_speech(self, uts):
        """Store the utterance opened by ``uts`` and everything that follows it."""
        # get agenda item
        title = str(uts.xpath('ancestor::debate/child::debateinfo/child::title/text()')[0])
        agenda_item, _ = pm.AgendaItem.objects.get_or_create(
            title=title,
            document=self.doc
        )

        # match speaker to database; without a metadata name the lookup can
        # only use the hints in the info dict, so an empty name is passed
        name = self._metadata_name(uts.xpath('talker//name')) or ""
        speaker = self._lookup_speaker(
            name, self._speaker_info(uts, 'talker', self.TALKER_FIELDS))

        ut = pm.Utterance(
            document=self.doc,
            speaker=speaker,
            agenda_item=agenda_item,
        )
        ut.save()

        # Most recent paragraph of THIS utterance; interjections attach to it.
        para = None
        for c in uts.iter():
            if self.v > 1:
                print("{}: {}".format(c.tag, c.text))
            if c.tag == "para" and c.text:
                para = self.create_paragraph(c.text.strip(u'\u2014'), ut)

        for j in uts.xpath('following-sibling::*'):
            if j.tag == "para":
                para = self._add_paragraphs([j], ut, para)
            elif j.tag == "interjection":
                self._parse_interjection(j, para)
            elif j.tag == "continue":
                para = self._add_paragraphs(j.xpath('child::*/child::para'), ut, para)
            elif j.tag == "motion" or j.tag == "quote":
                para = self._add_paragraphs(j.xpath('child::para'), ut, para)

    def _parse_interjection(self, j, para):
        """Attach the paragraphs of interjection ``j`` to paragraph ``para``.

        The interjector is looked up only when a metadata name is present;
        otherwise the interjection is stored without a person. If the
        utterance has no paragraph yet there is nothing to attach to, so the
        interjection is skipped with a warning.
        """
        speaker_inj = None
        name = self._metadata_name(j.xpath('talk.start/talker/name'))
        if name is not None:
            info = self._speaker_info(j, 'talk.start/talker', (('nameid', 'name.id'),))
            speaker_inj = self._lookup_speaker(name, info)

        texts = j.xpath('child::*/child::para')
        if para is None:
            if texts:
                print("! Warning: interjection without a preceding paragraph skipped")
            return

        for k in texts:
            # add interjection text (empty string when the para has none)
            self.add_interjection(k.text or "", speaker_inj, para)

    def _parse_notice(self, uts):
        """Store a ``talk.start`` inside an interjection of a "Notices" debate.

        One utterance (without agenda item) is created for every preceding
        ``debateinfo`` sibling of the interjection whose type is "Notices".
        """
        for debate_type in uts.xpath('parent::interjection/preceding-sibling::debateinfo/child::type'):
            if debate_type.text != "Notices":
                continue

            name = self._metadata_name(uts.xpath('talker//name')) or ""
            speaker = self._lookup_speaker(
                name, self._speaker_info(uts, 'talker', self.TALKER_FIELDS))

            ut = pm.Utterance(
                document=self.doc,
                speaker=speaker,
            )
            ut.save()

            # text
            self._add_paragraphs(
                uts.xpath('child::para|following-sibling::motion/child::para|following-sibling::quote/child::para'),
                ut)

    def run(self):
        """Create the document and parse every ``talk.start`` of every debate."""
        self.get_or_create_objects()

        ### start parsing of speeches
        for div in self.divs:
            if self.v > 1:
                print("div type: {}".format(div.xpath("type/text()")))

            for uts in div.iter('talk.start'):
                parent_tag = uts.getparent().tag
                if parent_tag in self.ACCEPTED_TAGS:
                    self._parse_speech(uts)
                elif parent_tag == "interjection":
                    self._parse_notice(uts)

In [4]:
# main execution script
if __name__ == '__main__':

    # From here on, everything printed also goes to the log file.
    sys.stdout = Logger()

    # Run switches.
    single_doc, replace_docs = True, False

    # Destructive clean-up switches; all of them are off.
    delete_all_words = delete_all_parties = False
    delete_all_people = delete_additional_persons = False

    if delete_all_words:
        print("Deleting all documents, utterances, paragraphs and interjections.")
        pm.Interjection.objects.all().delete()
        pm.Paragraph.objects.all().delete()
        pm.Utterance.objects.all().delete()
        pm.Document.objects.all().delete()
        print("Deletion done.")

    if delete_all_parties:
        print("Deleting all parties added.")
        pm.Party.objects.all().delete()

    if delete_all_people:
        print("Deleting all people added.")
        pm.Person.objects.all().delete()

    if delete_additional_persons:
        print("Deleting all persons added from protocol parsing.")
        pm.Person.objects.filter(information_source__startswith="from protocol scraping").delete()

    if single_doc:
        # Parse one file only. Other sample files used so far:
        # "163-5858.xml", "168-7.xml", "173-4969.xml", "6593-3.xml"
        xml_file = os.path.join(xml_path, "6602-3.xml")

        print("reading from {}".format(xml_file))

        xtree = etree.parse(xml_file)
        # Merge the text of <inline> elements into their parents before parsing.
        etree.strip_tags(xtree, 'inline')

        parser = parse_xml_items(xtree)
        parser.run()
        print("Done.")

        exit()

reading from /home/leey/australian_parliament_downloads/downloads_coal/6602-3.xml
xml with protocol 42/1 from 2009-03-17 00:00:00
Person not found in database as speaker: Sid Sidebottom
Trying to find person by name
Found speaker by name: Sid Sidebottom
Person not found in database as speaker: Sid Sidebottom
Trying to find person by name
Found speaker by name: Sid Sidebottom
Person not found in database as speaker: Arch Bevis
Trying to find person by name
Found speaker by name: Arch Bevis
Done.


In [None]:
# Batch mode: iterate over period/session numbers and parse every file that exists.
# NOTE(review): `tei_path` and `parse_tei_items` are not defined anywhere in the
# visible file, and the "BT_..." file pattern and "GermaParlTEI" source tag do not
# match the single-file run above (which uses `xml_path` and `parse_xml_items`).
# Presumably left over from another parser - confirm before running this cell.
# NOTE(review): `replace_docs` is only assigned inside the `__main__` block above.

# range(13, 12, -1) yields only period 13.
for pperiod in range(13, 12, -1):
        for session in range(0, 300):

            xml_file = os.path.join(tei_path, "{wp:02d}/BT_{wp:02d}_{sn:03d}.xml".format(wp=pperiod, sn=session))

            if os.path.isfile(xml_file):
                print("reading from {}".format(xml_file))

                xtree = etree.parse(xml_file)
                # Optionally drop every stored document of this sitting first.
                if replace_docs:
                    pm.Document.objects.filter(parlperiod__n=pperiod, sitting=session).delete()
                # Always drop documents of this sitting whose text_source has this prefix.
                pm.Document.objects.filter(parlperiod__n=pperiod, sitting=session,
                                           text_source__startswith="GermaParlTEI from ").delete()

                parser = parse_tei_items(xtree, period=pperiod, session=session)
                parser.run()

                print("Done")
                # Stops after the first file that was found and parsed.
                exit()

### Examine results of scraper

In [2]:
# Delete one specific Document by primary key. The result displayed below
# lists how many objects of each model were removed together with it.
pm.Document.objects.filter(id=19861).delete()

(826,
 {'parliament.AgendaItem': 13,
  'parliament.Document': 1,
  'parliament.Document_search_matches': 0,
  'parliament.Interjection': 52,
  'parliament.Interjection_parties': 0,
  'parliament.Interjection_persons': 52,
  'parliament.Paragraph': 624,
  'parliament.Paragraph_search_matches': 0,
  'parliament.Utterance': 84,
  'parliament.Utterance_search_matches': 0,
  'scoping.DocOwnership': 0,
  'scoping.Note': 0})

In [2]:
# Check which Person rows carry a given name id (stored in `aph_id`).
name_id = "MK6"
query = pm.Person.objects.filter(aph_id=name_id)
print(query)

<QuerySet [<Person: Frank Mossfield>]>


In [5]:
# Scratch check of the speaker-name clean-up steps.
# Alternative input: 'Harry (Mr DEPUTY SPEAKER) Jenkins'
nametest = 'Mr SPEAKER'

# Remove parenthesised annotations, collapse the double space this can leave
# behind, then strip the titles.
name = (
    re.sub(r'\([^)]*\)', '', nametest)
    .replace('  ', ' ')
    .replace('Dr ', '')
    .replace('Mr ', '')
    .replace('The ', '')
)

# First word is the first name, last word the surname; a single word is
# treated as a surname only.
if ' ' in name:
    firstname, surname = name.split(' ')[0], name.split(' ')[-1]
else:
    firstname, surname = '', name

surname

'SPEAKER'

In [2]:
# Print the person attached to every Post row.
for i in pm.Post.objects.all():
    print(i.person)

Sir Frederick Holder
Sir Frederick Holder
Sir Frederick Holder
Charles Salmon
Charles McDonald
William Johnson
Charles McDonald
William Johnson
William Johnson
William Watt
Littleton Groom
Littleton Groom
Norman Makin
George Mackay
George Bell
George Bell
Walter Nairn
John Rosevear
John Rosevear
John Rosevear
Archie Cameron
Archie Cameron
Archie Cameron
Archie Cameron
William Aston
William Aston
James Cope
James Cope
Gordon Scholes
Billy Snedden
Billy Snedden
Billy Snedden
Gloria Child
Gloria Child
Leo McLeay
Leo McLeay
Stephen Martin
Robert Halverson
Ian Sinclair
John Andrew
John Andrew
David Hawker
Peter Slipper
Anna Burke
Bronwyn Bishop
John McLeay
John McLeay
John McLeay
John McLeay
Dr Henry Jenkins
Dr Henry Jenkins
Henry Jenkins
Henry Jenkins
Anthony Smith


In [4]:
# Check how many Person rows from the 'AustralianPoliticians' source match
# first name 'Henry' and surname 'Jenkins'.
query = pm.Person.objects.filter(surname='Jenkins', first_name='Henry', information_source='AustralianPoliticians')
print(query)

<QuerySet [<Person: Dr Henry Jenkins>, <Person: Henry Jenkins>]>


In [6]:
# Alternative first names stored for the second row of `query`.
query[1].alt_first_names

['Alfred', 'Henry', 'Henry Alfred', '(Jr)', 'Harry', 'Harry (Jr)']

In [25]:
# Reference date used by the Post queries in the following cells.
date = datetime.strptime('2000-02-16', '%Y-%m-%d')

In [12]:
# Deputy Speaker / Second Deputy Speaker posts whose start_date..end_date
# range contains `date`; the parliamentary-period filter is switched off.
rquery = pm.Post.objects.filter(
            Q(title='Deputy Speaker') | Q(title='Second Deputy Speaker'),
            #parlperiod__n=pp,
            start_date__lte = date,
            end_date__gte = date
            )

In [13]:
# Display the posts matched by `rquery`.
rquery

<QuerySet [<Post: Post object (277)>, <Post: Post object (296)>]>

In [16]:
# Narrow the matched posts to holders with the given surname and take the
# person of the first hit.
# NOTE(review): `.first()` returns None on an empty queryset, so `.person`
# would raise for an unknown surname.
surname='Jenkins'
squery = rquery.filter(person__surname = surname)
parl_speaker = squery.first().person
parl_speaker

<Person: Henry Jenkins>

In [2]:
# Manual test of the person lookup for a chair title instead of a real name.
info_dict = {
    'pp': 39,
    'date': datetime.strptime("2000-02-16", "%Y-%m-%d"),
    'nameid': "10000",
}
names = 'Mr SPEAKER'

sp = find_person_in_db_aph(names, add_info=info_dict, verbosity=2)
sp

Finding speaker with name id: 10000
Finding speaker from Speaker database...


<Person: John Andrew>

In [9]:
# Deputy-speaker posts of parliamentary period 39 whose date range contains
# 2000-02-16; show the holder of the second match.
pp = 39
date = datetime.strptime("2000-02-16", "%Y-%m-%d")

rquery = pm.Post.objects.filter(
    Q(title='Deputy Speaker') | Q(title='Second Deputy Speaker'),
    parlperiod__n=pp,
    start_date__lte=date,
    end_date__gte=date,
)
rquery[1].person

<Person: Henry Jenkins>

In [13]:
# Alternative first names stored for the first Person with this surname.
surname = "Sidebottom"
pm.Person.objects.filter(surname=surname)[0].alt_first_names

['Peter Sid', 'Peter', 'Sid']

In [16]:
# aph_id of the first "Sidebottom" whose alt_first_names contain "Sid".
pm.Person.objects.filter(surname="Sidebottom",alt_first_names__contains=["Sid"])[0].aph_id

'849'

In [17]:
# Reverse check: which Person rows have aph_id "849".
pm.Person.objects.filter(aph_id="849")

<QuerySet [<Person: Peter Sidebottom>]>