In [1]:
import sys
import pandas as pd
import json
import csv
import pickle
from pathlib import Path
from bs4 import BeautifulSoup
import json
import multiprocessing
from tqdm import tqdm
import pandas as pd
import math
import multiprocessing as mp
from bs4 import BeautifulSoup
from collections import defaultdict
from tqdm import tqdm

In [11]:
with open('artefacts/parlamint_with_embeddings.json', 'r') as f:
    parlamint = json.load(f)
parlamint_es_ga = parlamint['ES-GA']
parlaming_gb = parlamint['GB']
parlamint_hu = parlamint['HU']
parlamint_ua = parlamint['UA']
parlamint_si = parlamint['SI']
parlamints = [
    parlamint_es_ga,
    parlamint_si,
    parlaming_gb,
    parlamint_hu,
    parlamint_ua,
]
artefacts = Path('artefacts/pandas_frames')
artefacts.mkdir(exist_ok=True)

In [None]:
embedding_files = [
    ('SI', 'ParlaMint_SI_embeddings_truncated.pkl'),
    ('ES-GA', 'ParlaMint_EE_embeddings_truncated.pkl'),
    ('GB', 'ParlaMint_GB_commons_embeddings_truncated.pkl'),
    ('GB', 'ParlaMint_GB_lords_embeddings_truncated.pkl'),
    ('HU', 'ParlaMint_HU_embeddings_truncated.pkl'),
    ('UA', 'ParlaMint_UA_embeddings_truncated.pkl'),
]

embeddings = []
for language, embedding_file in embedding_files:
    with open(Path('artefacts/bojan') / embedding_file, 'rb') as f:
        print(f'Loading', embedding_file)
        embeddings.append((language, pickle.load(f)))

## Create embeddings dataframe

In [14]:
df = pd.DataFrame(columns=['id', 'language', 'embedding'])
for language, lang_embs in embeddings:
    df = pd.concat([df, pd.DataFrame([
        (eid, language, embedding)
        for eid, embedding in lang_embs.items()
    ], columns=['id', 'language', 'embedding'])])
df.reset_index().to_feather(artefacts / 'embeddings.feather')

## Create ParlaMint dataframe

In [17]:
def worker(args):
    language, xml_file = args
    with open(xml_file, 'r') as f:
        soup = BeautifulSoup(f.read(), 'xml')
    sentences = []
    for speech in soup.find_all('u'):
        speech_id = speech['xml:id']
        for sentence in speech.find_all('s'):
            sentence_id = sentence['xml:id']
            sentence_text = ''
            lemmas = {}
            for unit in sentence.find_all(['w', 'pc']):
                text = unit.get_text()
                if not text:
                    continue
                if unit.name == 'w':
                    sentence_text += ' ' + unit.get_text()
                    lemma = unit.get('lemma')
                    if lemma and lemma.isalpha():
                        lemmas[lemma] = lemmas.get(lemma, 0) + 1
                elif unit.name == 'pc':
                    sentence_text += unit.get_text()
            sentence_text = sentence_text.strip()
            sentences.append([
                sentence_id,
                speech_id,
                language,
                sentence_text,
                json.dumps(lemmas)
            ])
    return sentences


def get_xml_sentences(parlamints):
    all_sentences = []

    with mp.Pool(mp.cpu_count()) as pool:
        for parlamint in parlamints:
            parlamint = [(i['language'], i['xml_path']) for i in parlamint]
            for result in tqdm(pool.imap(worker, parlamint), total=len(parlamint)):
                all_sentences.extend(result)

    return all_sentences


all_sentences = get_xml_sentences(parlamints)
df = pd.DataFrame(all_sentences, columns=['sentence_id', 'speech_id', 'language', 'text', 'lemmas_tf'])
df.to_feather(artefacts / 'parlamint_sentences.feather')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [01:55<00:00,  2.61it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1572/1572 [06:49<00:00,  3.84it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 2209/2209 [10:49<00:00,  3.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 515/515 [02:44<00:00,  3.13it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1091/1091 [01:52<00:00,  9.70it/s]


In [15]:
import pandas as pd
import csv
from pathlib import Path
from multiprocessing import Pool, cpu_count
from tqdm import tqdm


def process_file(metadata):
    with open(metadata, 'r') as file:
        reader = csv.reader(file, delimiter='\t')
        columns = next(reader)
        rows = list(reader)
    return rows, columns


# get list of metadata files
files = list(Path('data/nonannotated').glob('ParlaMint-*.txt/*/ParlaMint-*meta.tsv'))

# set up a multiprocessing pool and process files in parallel
with Pool(cpu_count()) as pool:
    results = list(tqdm(pool.imap(process_file, files), total=len(files)))

# aggregate results
dataframes = [pd.DataFrame(rows, columns=columns) for rows, columns in results]
df = pd.concat(dataframes)

df.reset_index().to_feather(artefacts / 'parlamint_metadata.feather')
df

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 5689/5689 [00:07<00:00, 807.12it/s]


Unnamed: 0,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,Subcorpus,Speaker_role,Speaker_MP,Speaker_Minister,Speaker_party,Speaker_party_name,Party_status,Speaker_name,Speaker_gender,Speaker_birth
0,ParlaMint-ES-GA_2017-09-26-DSPG040.u1,Minutes of the Assembly of the Galician Parlia...,2017-09-26,Unicameralism,010,,040,2017-09-26,,Reference,Chairperson,MP,-,PPdeG,Partido Popular de Galicia,Coalition,"Santalices Vieira, Miguel Ángel",M,1955
1,ParlaMint-ES-GA_2017-09-26-DSPG040.u2,Minutes of the Assembly of the Galician Parlia...,2017-09-26,Unicameralism,010,,040,2017-09-26,,Reference,Chairperson,MP,-,PPdeG,Partido Popular de Galicia,Coalition,"Santalices Vieira, Miguel Ángel",M,1955
2,ParlaMint-ES-GA_2017-09-26-DSPG040.u3,Minutes of the Assembly of the Galician Parlia...,2017-09-26,Unicameralism,010,,040,2017-09-26,,Reference,Regular,MP,-,PPdeG,Partido Popular de Galicia,Coalition,"Arias Rodríguez, Raquel",F,1966
3,ParlaMint-ES-GA_2017-09-26-DSPG040.u4,Minutes of the Assembly of the Galician Parlia...,2017-09-26,Unicameralism,010,,040,2017-09-26,,Reference,Chairperson,MP,-,PPdeG,Partido Popular de Galicia,Coalition,"Santalices Vieira, Miguel Ángel",M,1955
4,ParlaMint-ES-GA_2017-09-26-DSPG040.u5,Minutes of the Assembly of the Galician Parlia...,2017-09-26,Unicameralism,010,,040,2017-09-26,,Reference,Chairperson,MP,-,PPdeG,Partido Popular de Galicia,Coalition,"Santalices Vieira, Miguel Ángel",M,1955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,ParlaMint-UA_2019-11-13-m0.u214,"Ukrainian parliamentary corpus ParlaMint-UA, t...",2019-11-13,Unicameralism,9,2,,2019-11-13,,Reference,Regular,MP,-,фЄС,"Фракція політичної партії ""Європейська солідар...",Opposition,"Олексійович Гончаренко, Олексій",M,1980
214,ParlaMint-UA_2019-11-13-m0.u215,"Ukrainian parliamentary corpus ParlaMint-UA, t...",2019-11-13,Unicameralism,9,2,,2019-11-13,,Reference,Chairperson,MP,-,,,,"Олександрович Разумков, Дмитро",M,1983
215,ParlaMint-UA_2019-11-13-m0.u216,"Ukrainian parliamentary corpus ParlaMint-UA, t...",2019-11-13,Unicameralism,9,2,,2019-11-13,,Reference,Regular,MP,-,фЄС,"Фракція політичної партії ""Європейська солідар...",Opposition,"Володимирівна Геращенко, Ірина",F,1971
216,ParlaMint-UA_2019-11-13-m0.u217,"Ukrainian parliamentary corpus ParlaMint-UA, t...",2019-11-13,Unicameralism,9,2,,2019-11-13,,Reference,Chairperson,MP,-,,,,"Олександрович Разумков, Дмитро",M,1983


## Generate speech_id to lemmas mapping

In [18]:
sentences = pd.read_feather('artefacts/pandas_frames/parlamint_sentences.feather')

In [23]:
speeches = {}
for language, sentences_by_language in tqdm(sentences.groupby('language')):
    speeches[language] = {}
    for speech_id, lemmas in sentences_by_language.groupby('speech_id'):
        speeches[language][speech_id] = set()
        for lemma_tf in lemmas.lemmas_tf:
            speeches[language][speech_id].update(json.loads(lemma_tf).keys())

for language, speeches in speeches.items():
    with open(artefacts / f'speech2lemmas_{language}.pkl', 'wb') as f:
        pickle.dump(speeches, f)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:15<00:00, 27.14s/it]


In [20]:
speeches

{'ParlaMint-ES-GA_2015-01-26-DSPGDP007.u1': {'a',
  'abrir',
  'asistencia',
  'benvido',
  'bo',
  'comprobación',
  'de',
  'deputación',
  'día',
  'e',
  'este',
  'favor',
  'moi',
  'o',
  'permanente',
  'por',
  'proceder',
  'que',
  'secretario',
  'sesión',
  'señor',
  'soneira',
  'tajes',
  'todo',
  'xaneiro'},
 'ParlaMint-ES-GA_2015-01-26-DSPGDP007.u10': {'a',
  'abaratar',
  'aberto',
  'abocar',
  'abordar',
  'abril',
  'absolutamente',
  'aclarar',
  'acoller',
  'acordo',
  'activo',
  'actuar',
  'ademais',
  'adianto',
  'adoptar',
  'advertir',
  'afortunadamente',
  'agora',
  'agosto',
  'alarmante',
  'algo',
  'algún',
  'alto',
  'amosar',
  'ano',
  'anual',
  'apoiar',
  'apoio',
  'apuntar',
  'argumento',
  'asegurar',
  'asinar',
  'así',
  'atento',
  'atopar',
  'aturrullo',
  'até',
  'ausencia',
  'aval',
  'axuda',
  'aínda',
  'baixada',
  'baixar',
  'barreira',
  'ben',
  'beneficiar',
  'beneficio',
  'bng',
  'bo',
  'borrador',
  'broma',
  