# Corpus walkthrough

Example of how to gather speeches from the Riksdagen corpus together with the metadata of unknown speakers.

In [None]:
import os

import pandas as pd
from lxml import etree
from pyparlaclarin.read import speech_iterator

## pyparlaclarin

The `pyparlaclarin` package is very useful for working with data in the Parla-Clarin format and can be installed with `pip install pyparlaclarin`.
Here we use an updated version of the `speech_iterator` function that has not yet been released in the package.
It has been updated to also yield introduction hashes, which makes it possible to use the metadata for unknown speakers that was added in Riksdagen corpus v. 4.0.0.

https://welfare-state-analytics.github.io/pyparlaclarin/pyparlaclarin/index.html

In [None]:
def speech_iterator(root):
    """
    Convert Parla-Clarin XML to an iterator of speeches (ignoring any notes).

    A speech starts at a speaker introduction (an element whose ``type``
    attribute is ``speaker``) and collects the text of every following ``u``
    element until the next introduction or the end of the document.
    Utterances that occur before the first introduction are skipped, and
    nothing is yielded for a document without any introduction.

    Args:
        root: Parla-Clarin document root, as an lxml tree root.
    Yields:
        A list ``[speaker, n, speech]``:
        speaker: ``who`` attribute of the most recent utterance
            (corpus person_id).
        n: ``n`` attribute of the introduction (introduction hash).
        speech: concatenated consecutive speech segments with all
            whitespace runs collapsed to single spaces.
    """
    tei_ns = "{http://www.tei-c.org/ns/1.0}"
    speaker = None
    n = None
    speech = []
    seen_intro = False

    for body in root.findall(f".//{tei_ns}body"):
        for div in body.findall(f"{tei_ns}div"):
            for elem in div:
                tag = elem.tag
                # Comments and processing instructions have a non-string
                # tag; they carry no speech and cannot be indexed.
                if not isinstance(tag, str):
                    continue

                if elem.get('type') == 'speaker':
                    # A new introduction closes the speech collected so far.
                    if seen_intro:
                        # split() treats newlines as whitespace, so words
                        # separated only by a line break stay separate.
                        yield [speaker, n, ' '.join(' '.join(speech).split())]
                    n = elem.get('n')
                    speech = []
                    seen_intro = True

                # Compare the local name so that only real <u> elements
                # match, with or without a namespace prefix.
                if tag.rsplit('}', 1)[-1] == 'u':
                    speaker = elem.get('who')
                    speech.extend(elem.itertext())

    # The last speech has no following introduction to trigger its yield.
    if seen_intro:
        yield [speaker, n, ' '.join(' '.join(speech).split())]


In [None]:
# Path of the protocol used as the running example
protocol = '../corpus/protocols/197980/prot-197980--165.xml'

# Parse the protocol and keep its root element for the following cells
parser = etree.XMLParser(remove_blank_text=True)
document = etree.parse(protocol, parser)
root = document.getroot()

In [None]:
# Creates a generator object of [person_id, intro_hash, speech]
speeches = speech_iterator(root)

# Display first speech; next() pulls only the first item from the generator
# instead of building a list of every speech in the protocol
person_id, n, speech = next(speeches)
print(f'Person id: {person_id}, hash: {n}')
print(f'Speech: {speech[:100]} ...')

In [None]:
# Collect all speeches of the protocol in a dataframe, one row per speech
speeches = speech_iterator(root)
column_names = ['person_id', 'hash', 'speech']
df = pd.DataFrame(data=list(speeches), columns=column_names)
print(df.head())

In [None]:
# Rows of the protocol dataframe whose speaker could not be identified
unknown = df[df['person_id'] == 'unknown']
# Show only the file name of the protocol, not the whole relative path
print(f'Unknowns in protocol: {os.path.basename(protocol)}')
print(unknown, '\n')

# Load metadata of unknown speakers
unknown_df = pd.read_csv('../input/matching/unknowns.csv')
print('Unknown database:')
print(unknown_df.head())

In [None]:
# Look up the metadata row(s) whose hash matches the first unknown speech
print('Unknown speakers metadata:')
first_unknown_hash = unknown['hash'].iloc[0]
unknown_df.loc[unknown_df['hash'] == first_unknown_hash]