In [8]:
import json
from pathlib import Path

with open('artefacts/parlamint_with_embeddings.json', 'r') as f:
    parlamint = json.load(f)
parlamint_es_ga = parlamint['ES-GA']
parlaming_gb = parlamint['GB']
parlamint_hu = parlamint['HU']
parlamint_ua = parlamint['UA']
parlamint_si = parlamint['SI']
parlamints = [parlamint_si, parlaming_gb, parlamint_es_ga, parlamint_hu, parlamint_ua]
parlamint_si[0].keys()
artefacts = Path('artefacts/tf_idf')

In [24]:
from bs4 import BeautifulSoup
import json
import multiprocessing
from tqdm import tqdm


def score_tf_idf(document):
    lemmas = set()
    xml_file = document['xml_path']
    with open(xml_file, "r", encoding="utf8") as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'xml')

    for word in soup.find_all('w'):
        lemma: str = word.get('lemma')
        if not lemma or not lemma.isalpha():
            continue
        lemmas.add(lemma.lower())
    return lemmas


def process_parlamint(parlamint):
    print('processing', parlamint[0]['language'])
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        result_list = []
        for result in tqdm(pool.imap_unordered(score_tf_idf, parlamint), total=len(parlamint)):
            result_list.append(result)
    lemma_frequencies = {}
    for lemmas in result_list:
        for lemma in lemmas:
            lemma_frequencies[lemma] = lemma_frequencies.get(lemma, 0) + 1
    with open(artefacts / f'{parlamint[0]["language"]}_lemma_frequencies.json', 'w') as f:
        json.dump(lemma_frequencies, f)


for parlamint in parlamints:
    process_parlamint(parlamint)


processing ES-GA


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [01:26<00:00,  3.51it/s]


processing HU


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 515/515 [02:00<00:00,  4.29it/s]


processing UA


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1091/1091 [01:32<00:00, 11.86it/s]


In [33]:
import math

def get_sentence_scores(args):
    lemma_frequencies, document_count, xml_file = args
    with open(lemma_frequencies, 'r') as f:
        lemma_frequencies = json.load(f)

    with open(xml_file, "r", encoding="utf8") as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'xml')

    scores: dict[str, dict[str, int]] = {}
    speaker_ids = set([i.get('who') for i in soup.find_all('u') if i.get('who')])
    for speaker_id in speaker_ids:
        u_elements = soup.find_all('u', {'who': speaker_id})
        sentences = [u.find_all('s') for u in u_elements]
        sentences = [item for sublist in sentences for item in sublist]

        speaker_tf = {}
        speaker_sentences = {}
        for sentence in sentences:
            sentence_id = sentence.get('xml:id')
            speaker_sentences[sentence_id] = []
            for word in sentence.find_all('w'):
                lemma: str = word.get('lemma')
                if not lemma or not lemma.isalpha():
                    continue
                lemma = lemma.lower()
                speaker_tf[lemma] = speaker_tf.get(lemma, 0) + 1
                speaker_sentences[sentence_id].append(lemma)

        for sentence_id, sentence in speaker_sentences.items():
            scores[sentence_id] = []
            for lemma in sentence:
                tf = speaker_tf[lemma]
                df = lemma_frequencies[lemma]
                idf = math.log(document_count / df) if df else 0  # IDF
                score = tf * idf  # TF-IDF
                scores[sentence_id].append(score)

    return scores


def process_parlamint(parlamint):
    print('processing', parlamint[0]['language'])
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        sentences = {}
        for result in tqdm(pool.imap_unordered(get_sentence_scores, [
            (artefacts / f'{i["language"]}_lemma_frequencies.json', len(parlamint), i['xml_path']) for i in parlamint
        ]), total=len(parlamint)):
            sentences.update(result)
    with open(artefacts / f'{parlamint[0]["language"]}_sentence_scores.json', 'w') as f:
        json.dump(sentences, f)

for parlamint in parlamints[1:]:
    process_parlamint(parlamint)

processing SI


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1572/1572 [14:50<00:00,  1.77it/s]


In [10]:
import numpy as np

for parlamint in parlamints[1:]:
    print('processing', parlamint[0]['language'])
    with open(artefacts / f'{parlamint[0]["language"]}_sentence_scores.json', 'r') as f:
        sentence_scores = json.load(f)
    max_score = max([s for ss in sentence_scores.values() for s in ss])
    print('max_score', max_score)
    sentece_weights = {}
    for sentence_id in sentence_scores:
        sentece_weights[sentence_id] = np.mean(sentence_scores[sentence_id]) / max_score

    with open(artefacts / f'{parlamint[0]["language"]}_sentence_weights.json', 'w') as f:
        json.dump(sentece_weights, f)

processing SI
max_score 540.1589589347477


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [39]:
import heapq

def print_ids(ids):
    def get_path(id):
        year = id.split('_')[1].split('-')[0]
        return f'data/annotated/ParlaMint-SI.TEI.ana/{year}/{id.split(".seg")[0]}.xml'

    soups = {}
    for sid in ids:
        xml_path = get_path(sid)
        if xml_path not in soups:
            with open(xml_path, 'r') as f:
                contents = f.read()
            soups[xml_path] = BeautifulSoup(contents, 'xml')
        soup = soups[xml_path]
        sentence = soup.find('s', {'xml:id': sid})
        print(' '.join([w.get_text() for w in sentence.find_all('w')]))

for parlamint in parlamints[:1]:
    print('processing', parlamint[0]['language'])
    with open(artefacts / f'{parlamint[0]["language"]}_sentence_weights.json', 'r') as f:
        sentece_weights = json.load(f)
        top_50 = heapq.nlargest(50, sentece_weights, key=lambda x: sentece_weights[x])
        print_ids(top_50)
        print('--------')
        bottom_50 = heapq.nsmallest(50, sentece_weights, key=lambda x: sentece_weights[x])
        print_ids(bottom_50)


processing SI
243 ventilatorjev
453 ventilatorjev
Merilniki slanosti
404 ventilatorji 13. marca
Prehajamo na podprogram Štipendije
Podjetje Lastinski je prišlo po merilnike slanosti k Darsu
Multinacionalka je prodala žilno opornico recimo Mark Medicalu S. p.
Oklepnike
Ceca
Prehajamo na podprogram Dolgotrajna oskrba
Prehajamo na podprogram Univerzitetne knjižnice
Prehajamo na podprogram Kakovost zraka
Lastniki so bili Mark Medical S. p.
Prva pogodba med Darsom in podjetjem Lastinski
To se pravi podjetje Lastinski je vrnilo merilnike slanosti
Palestina
Palestina
Burka
Nazaj k ventilatorjem
Mark Medical Slovenija je v lastniški družbi KB 1909
Tako pride do posla podjetje Lastinski
Število ventilatorjev
Število ventilatorjev
Prehajamo na podprogram Podpora raziskovalni infrastrukturi
Prehajamo na podprogram Podpora raziskovalni infrastrukturi
Pomožnega motorja
Prehajamo na podprogram Kreiranje delovnih mest
Prehajamo na podprogram Kreiranje delovnih mest
Poglejte kaj je pa firma Lastinski 