In [1]:
# Calculate embeddings for ParlaMint data

## Load ParlaMint data

In [2]:
from serializers.parlamint_serializers import ParlaMint

# Load the ParlaMint corpus wrapper. Later cells iterate `parlamint.languages`
# to enumerate the languages and index it to get each language's datasets.
parlamint: ParlaMint = ParlaMint.load()

## Split into sentences using stanza

In [None]:
import stanza

# One tokenizer-only stanza pipeline per corpus language, stored under the same
# key object that iterating ``parlamint.languages`` yields.
pipelines: dict[str, stanza.Pipeline] = {}
for language in parlamint.languages:
    iso_code = language.get_iso_code()
    # Fetch the default model package for this language before building the pipeline.
    stanza.download(iso_code)
    pipelines[language] = stanza.Pipeline(
        iso_code,
        processors='tokenize',
        use_gpu=False,
    )

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-05-19 15:15:37 INFO: Downloading default packages for language: es (Spanish) ...
2023-05-19 15:15:38 INFO: File exists: /home/vidklopcic/stanza_resources/es/default.zip
2023-05-19 15:15:44 INFO: Finished downloading models and saved to /home/vidklopcic/stanza_resources.
2023-05-19 15:15:44 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-05-19 15:15:44 INFO: Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |

2023-05-19 15:15:44 INFO: Using device: cpu
2023-05-19 15:15:44 INFO: Loading: tokenize
2023-05-19 15:15:44 INFO: Loading: mwt
2023-05-19 15:15:44 INFO: Done loading processors!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-05-19 15:15:44 INFO: Downloading default packages for language: en (English) ...
2023-05-19 15:15:45 INFO: File exists: /home/vidklopcic/stanza_resources/en/default.zip
2023-05-19 15:15:51 INFO: Finished downloading models and saved to /home/vidklopcic/stanza_resources.
2023-05-19 15:15:51 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-05-19 15:15:51 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2023-05-19 15:15:51 INFO: Using device: cpu
2023-05-19 15:15:51 INFO: Loading: tokenize
2023-05-19 15:15:51 INFO: Done loading processors!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-05-19 15:15:51 INFO: Downloading default packages for language: hu (Hungarian) ...
2023-05-19 15:15:52 INFO: File exists: /home/vidklopcic/stanza_resources/hu/default.zip


In [None]:
import threading, time
from typing import Union
from serializers.parlamint_serializers import ParlaMintDataset
import re

# Matches ``[[text]]`` or ``[[speaker: text]]``.
# Group 1 is the optional speaker *including* its trailing colon; group 2 is the
# interjection text. The speaker part may not contain ``]``, ``:`` or a newline,
# so it cannot run past the closing ``]]`` and swallow following text when a
# colon happens to appear later in the speech.
re_interjection = re.compile(r'\[\[([^\]:\n]+:)?(.+?)\]\]')


def split_on_interjections(text: str) -> list[Union[str, tuple[Union[str, None], str]]]:
    """Split ``text`` into plain-speech chunks and ``[[...]]`` interjections.

    Args:
        text: Raw speech text that may contain ``[[speaker: text]]`` or
            ``[[text]]`` interjection markers.

    Returns:
        The pieces of ``text`` in their original order. Plain speech is returned
        as a stripped ``str``; each interjection is returned as a
        ``(speaker, interjection_text)`` tuple where ``speaker`` is ``None``
        when the marker has no ``speaker:`` prefix (otherwise it keeps its
        trailing colon) and ``interjection_text`` is returned exactly as matched.
        Plain-speech pieces that are empty after stripping are omitted, so an
        empty or whitespace-only ``text`` yields an empty list.
    """
    chunks: list[Union[str, tuple[Union[str, None], str]]] = []
    prev_match_end = 0
    for match in re_interjection.finditer(text):
        before = text[prev_match_end:match.start()].strip()
        # Leading or back-to-back interjections leave nothing in between.
        if before:
            chunks.append(before)
        chunks.append((match.group(1), match.group(2)))
        prev_match_end = match.end()
    # Remainder after the last interjection (or the whole text if none matched).
    tail = text[prev_match_end:].strip()
    if tail:
        chunks.append(tail)
    return chunks


# Number of datasets fully processed so far, keyed by language. Each worker
# thread only touches its own language's entry.
n = {}


def process_language(language):
    """Sentence-split every speech of every dataset of ``language`` and save it.

    Each speech text is split into plain-speech chunks and interjections with
    ``split_on_interjections``; every chunk is tokenised on its own with the
    language's stanza pipeline, and the resulting sentences are stored on
    ``speech.sentences`` flagged as interjection / non-interjection. After a
    dataset's speeches are saved, ``n[language]`` is incremented.

    Args:
        language: A key of ``parlamint.languages`` and ``pipelines``.
    """
    datasets: list[ParlaMintDataset] = parlamint.languages[language]
    nlp = pipelines[language]

    for dataset in datasets:
        speeches = list(dataset.speeches(ignore_artefact=True))
        for speech in speeches:
            speech.sentences = []
            for chunk in split_on_interjections(speech.text):
                is_interjection = isinstance(chunk, tuple)
                interjection_speaker = None
                if is_interjection:
                    # Interjections arrive as (speaker, text); speaker may be None.
                    interjection_speaker, chunk = chunk
                if not chunk.strip():
                    # Nothing to tokenise in an empty / whitespace-only chunk.
                    continue
                # Tokenise only this chunk (not the whole speech) so that each
                # sentence appears once and carries the right interjection flag.
                speech.sentences += [ParlaMintDataset.Speech.Sentence(
                    text=s.text,
                    interjection=is_interjection,
                    interjection_speaker=interjection_speaker,
                ) for s in nlp(chunk).sentences]
        dataset.save_speeches(speeches)
        n.setdefault(language, 0)
        n[language] += 1


print('Processing sentences...')
# One worker thread per language; each runs process_language for its language.
threads = []
for language in parlamint.languages:
    t = threading.Thread(target=process_language, args=(language,))
    t.daemon = True
    t.start()
    threads.append(t)

# Set once all workers have been joined so the progress reporter can exit
# instead of printing forever in the still-running notebook kernel.
workers_done = threading.Event()


def stats():
    """Print per-language progress about once a second until ``workers_done`` is set.

    One last progress line is printed after the event is set so the final
    counts are shown.
    """
    while True:
        # Event.wait returns True as soon as the event is set, False on timeout.
        finished = workers_done.wait(1)
        # Snapshot the counters: workers may insert keys while we format them.
        progress = list(n.items())
        print(' | '.join([f'{l}: {d} / {len(parlamint.languages[l])}' for l, d in progress]), end='\r')
        if finished:
            break


stats_thread = threading.Thread(target=stats)
stats_thread.daemon = True
stats_thread.start()

for t in threads:
    t.join()

# All workers are done: stop the reporter and wait for its final line.
workers_done.set()
stats_thread.join()
# Terminate the carriage-return progress line.
print()


