# TF-IDF

## Initialize environment

In [52]:
import json
import pickle
from pathlib import Path
from bs4 import BeautifulSoup
import json
import multiprocessing
from tqdm import tqdm
import pandas as pd
import math
import multiprocessing as mp
from bs4 import BeautifulSoup
from collections import defaultdict
from tqdm import tqdm

# Load the corpus index. The encoding is given explicitly so the result does
# not depend on the platform's default text encoding.
with open('artefacts/parlamint_with_embeddings.json', 'r', encoding='utf8') as f:
    parlamint = json.load(f)

# One list of document records per corpus code found in the index.
parlamint_es_ga = parlamint['ES-GA']
parlamint_gb = parlamint['GB']
parlaming_gb = parlamint_gb  # misspelled alias, kept so existing references keep working
parlamint_hu = parlamint['HU']
parlamint_ua = parlamint['UA']
parlamint_si = parlamint['SI']
parlamints = [
    parlamint_si,
    parlamint_es_ga,
    parlamint_gb,
    parlamint_hu,
    parlamint_ua,
]
# Root directory for the files written by the TF-IDF steps.
artefacts = Path('artefacts/tf_idf')
# Cell output: the fields available on a single document record.
parlamint_si[0].keys()

dict_keys(['language', 'date', 'xml_path', 'embeddings_path'])

## Create mappings for speech ids to xml files

In [53]:
import multiprocessing as mp
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm


def worker(document):
    """Return one ``[speech_id, xml_path]`` row per ``<u>`` element of a document.

    ``document`` must provide an ``'xml_path'`` entry naming the XML file to
    parse. The speech id is the element's ``xml:id`` attribute (``None`` when
    the attribute is absent).
    """
    xml_path = document['xml_path']
    with open(xml_path, 'r', encoding='utf8') as xml_handle:
        parsed = BeautifulSoup(xml_handle.read(), 'xml')
    return [[utterance.get('xml:id'), xml_path] for utterance in parsed.find_all('u')]


columns = ['speech_id', 'xml_path']
all_rows = []

# A single pool is shared by all corpora. pool.imap yields results in input
# order, which lets tqdm report progress per processed document.
# NOTE: the loop variable reuses the name `parlamint`, which previously held
# the loaded corpus index.
with mp.Pool(mp.cpu_count()) as pool:
    for parlamint in parlamints:
        for result in tqdm(pool.imap(worker, parlamint), total=len(parlamint)):
            all_rows.extend(result)

df = pd.DataFrame(all_rows, columns=columns)
# Make sure the target directory exists; to_feather does not create it.
Path('artefacts/mappings').mkdir(parents=True, exist_ok=True)
df.to_feather('artefacts/mappings/speech2xml.feather')

  0%|                                                                                                                | 0/1572 [00:02<?, ?it/s]

KeyboardInterrupt



## TF-IDF

### Parameters
#### Utils

In [54]:
from dataclasses import dataclass
from typing import Optional, Any


@dataclass
class TFIDFJob:
    """Description of one TF-IDF run over a list of speech clusters.

    Only ``tf_idf_output_dir``, ``speech_cluster_labels`` and
    ``speech_clusters`` are persisted by :meth:`to_pickle`. The two
    ``*corpus_documents`` fields are not written, so a job restored with
    :meth:`from_pickle` has them set to ``None``.
    """

    # Directory that holds the job pickle.
    tf_idf_output_dir: Path
    # One label per entry of `speech_clusters`.
    speech_cluster_labels: list[str]
    # One set of speech ids per cluster.
    speech_clusters: list[set[str]]
    # Filled in after construction; never pickled.
    corpus_documents: Optional[Any] = None
    # Filled in after construction; never pickled.
    mp_corpus_documents: Optional[Any] = None

    @classmethod
    def pickle_path(cls, tf_idf_output_dir: Path):
        """Return the path of the job pickle inside ``tf_idf_output_dir``."""
        return tf_idf_output_dir / 'job.pickle'

    def to_pickle(self):
        """Write the persistent fields to ``job.pickle`` in the output directory.

        The output directory is created first, so a freshly constructed job
        can be saved before anything else has touched the directory.
        """
        self.tf_idf_output_dir.mkdir(parents=True, exist_ok=True)
        with open(self.pickle_path(self.tf_idf_output_dir), 'wb') as f:
            pickle.dump({
                'tf_idf_output_dir': self.tf_idf_output_dir,
                'speech_cluster_labels': self.speech_cluster_labels,
                'speech_clusters': self.speech_clusters,
            }, f)

    @classmethod
    def from_pickle(cls, tf_idf_output_dir: Path):
        """Restore a job saved by :meth:`to_pickle`.

        Returns ``None`` when ``tf_idf_output_dir`` contains no job pickle.
        """
        pickle_path = cls.pickle_path(tf_idf_output_dir)
        if not pickle_path.exists():
            return None
        # NOTE: unpickling can execute arbitrary code; only point this at
        # directories whose job.pickle was written by to_pickle.
        with open(pickle_path, 'rb') as f:
            data = pickle.load(f)
        return cls(**data)


def discover_speech_ids(xml_documents: list[str]):
    """Return the set of ``xml:id`` values of all ``<u>`` elements in the given files.

    Each entry of ``xml_documents`` is opened as UTF-8 text and parsed as XML.
    Elements without an ``xml:id`` attribute contribute ``None`` to the set.
    """
    found = set()
    for path in xml_documents:
        with open(path, "r", encoding="utf8") as handle:
            tree = BeautifulSoup(handle.read(), 'xml')
        found.update(utterance.get('xml:id') for utterance in tree.find_all('u'))
    return found


def get_speech_ids_by_speaker_worker(xml_file):
    """Group the speech ids of one XML file by speaker.

    Returns a plain dict mapping each ``who`` attribute value found on a
    ``<u>`` element to the set of ``xml:id`` values of that speaker's
    elements. A missing attribute shows up as ``None`` (as key or set member).
    """
    with open(xml_file, "r", encoding="utf8") as handle:
        tree = BeautifulSoup(handle.read(), 'xml')
    grouped = {}
    for utterance in tree.find_all('u'):
        grouped.setdefault(utterance.get('who'), set()).add(utterance.get('xml:id'))
    return grouped


def get_speech_ids_by_speaker(xml_documents: list[str]):
    """Group speech ids by speaker across many XML files, parsing in parallel.

    Every file is handled by ``get_speech_ids_by_speaker_worker`` in a process
    pool with one process per CPU; the per-file results are merged into a
    single dict mapping speaker to the union of that speaker's speech ids.
    """
    merged = {}
    with mp.Pool(mp.cpu_count()) as pool:
        per_file = pool.imap(get_speech_ids_by_speaker_worker, xml_documents)
        # tqdm wraps the lazy result iterator to show per-file progress.
        for partial in tqdm(per_file, total=len(xml_documents)):
            for speaker, speech_ids in partial.items():
                merged.setdefault(speaker, set()).update(speech_ids)
    return merged

#### TF-IDF per speaker for language

In [None]:
# Build one TF-IDF job per corpus, with one cluster per speaker.
tf_idf_jobs = []
for parlamint in parlamints:
    language = parlamint[0]['language']
    tf_idf_output_dir = artefacts / 'by_speaker' / language

    # Check the pickle cache first: scanning every XML file for speakers is
    # the expensive part and is pointless when the job was already saved.
    job = TFIDFJob.from_pickle(tf_idf_output_dir)
    if job:
        tf_idf_jobs.append(job)
        continue

    print('discovering speech ids by speaker for', language)
    speech_ids_by_speaker = get_speech_ids_by_speaker([d['xml_path'] for d in parlamint])
    speech_ids_by_speaker = list(speech_ids_by_speaker.items())
    tf_idf_jobs.append(
        TFIDFJob(
            tf_idf_output_dir=tf_idf_output_dir,
            speech_cluster_labels=[i[0] for i in speech_ids_by_speaker],
            speech_clusters=[i[1] for i in speech_ids_by_speaker],
        )
    )
    # The pickle is written inside the output directory, so it must exist.
    tf_idf_output_dir.mkdir(parents=True, exist_ok=True)
    tf_idf_jobs[-1].to_pickle()

In [55]:
parl_base = Path('data/annotated/ParlaMint-GB.TEI.ana')

# (cluster label, glob pattern relative to parl_base) for each hand-picked
# cluster; the files matched by a pattern supply that cluster's speech ids.
cluster_specs = [
    ('03-04 - 0.769', '2019/*03-04*commons*'),
    ('01-15 - 0.758', '2018/*01-15*commons*'),
    ('11-25 - 0.749', '2020/*11-25*commons*'),
    ('02-05 - 0.000', '2016/*02-05*commons*'),
    ('11-20 - 0.006', '2015/*11-20*commons*'),
    ('09-11 - 0.015', '2020/*09-11*commons*'),
]

job = TFIDFJob(
    tf_idf_output_dir=Path('artefacts/tf_idf/0001-test'),
    speech_cluster_labels=[label for label, _ in cluster_specs],
    speech_clusters=[
        discover_speech_ids(list(parl_base.glob(pattern)))
        for _, pattern in cluster_specs
    ],
)
# Replaces any previously built job list with this single job.
tf_idf_jobs = [job]

### Setup

In [56]:
# Lookup table from speech id to the XML file containing that speech.
speech2xml = pd.read_feather('artefacts/mappings/speech2xml.feather').set_index('speech_id')
for job in tf_idf_jobs:
    job.tf_idf_output_dir.mkdir(parents=True, exist_ok=True)

    # For every cluster: the cluster itself plus the set of XML files that
    # contain at least one of its speeches.
    job.corpus_documents = [
        (cluster, set(speech2xml.loc[list(cluster), 'xml_path'].unique()))
        for cluster in job.speech_clusters
    ]

    # Flattened into one work item per (cluster, file) pair so the items can
    # be handed to a process pool.
    job.mp_corpus_documents = [
        [cluster_index, cluster, xml_path]
        for cluster_index, (cluster, xml_paths) in enumerate(job.corpus_documents)
        for xml_path in xml_paths
    ]

### TF-IDF

In [57]:
def get_document_lemmas(document):
    """Count lemmas of one cluster's speeches inside a single XML file.

    ``document`` is a ``(cluster_index, speech_ids, xml_file)`` triple. For
    every speech id that occurs as the ``xml:id`` of a ``<u>`` element in the
    file, the ``lemma`` attributes of its ``<w>`` elements are counted.
    Lemmas that are missing or not purely alphabetic are skipped; the rest
    are lower-cased.

    Returns ``(cluster_index, counts)`` where ``counts`` maps lemma to its
    number of occurrences.
    """
    cluster_index, wanted_ids, xml_path = document
    with open(xml_path, "r", encoding="utf8") as handle:
        tree = BeautifulSoup(handle.read(), 'xml')
    speeches_by_id = {utterance.get('xml:id'): utterance for utterance in tree.find_all('u')}

    counts = defaultdict(int)
    for speech_id in wanted_ids:
        speech = speeches_by_id.get(speech_id)
        if not speech:
            continue
        for token in speech.find_all('w'):
            lemma = token.get('lemma')
            if lemma and lemma.isalpha():
                counts[lemma.lower()] += 1
    return cluster_index, dict(counts)


# `df` holds the summary table shown at the end of the cell. It is only built
# when there is exactly one job and stays None otherwise.
df = None
for job in tf_idf_jobs:
    cluster_count = len(job.speech_clusters)
    tf_per_cluster_id = {}  # cluster index -> {lemma: term frequency}
    df_per_lemma = {}  # lemma -> set of cluster indices containing it
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        for cluster_i, result in tqdm(pool.imap_unordered(get_document_lemmas, job.mp_corpus_documents),
                                      total=len(job.mp_corpus_documents)):
            cluster_tf = tf_per_cluster_id.setdefault(cluster_i, {})
            for lemma, count in result.items():
                cluster_tf[lemma] = cluster_tf.get(lemma, 0) + count
                df_per_lemma.setdefault(lemma, set()).add(cluster_i)

    # Document frequency = number of clusters in which the lemma occurs.
    df_per_lemma = {lemma: len(clusters) for lemma, clusters in df_per_lemma.items()}
    # A cluster that produced no work items has no entry; treat it as empty
    # instead of failing with a KeyError.
    tf_per_cluster = [tf_per_cluster_id.get(i, {}) for i in range(cluster_count)]
    print('Calculating TF-IDF')
    idf_per_cluster = []  # despite the name, holds {lemma: tf-idf} per cluster
    for tfs in tf_per_cluster:
        tf_idf = {}
        # default=0 keeps empty clusters from raising; the loop below does
        # not run for them, so the zero is never used as a divisor.
        max_tf = max(tfs.values(), default=0)
        for lemma, tf in tfs.items():
            # Named doc_freq (not `df`) so the summary table is not clobbered.
            doc_freq = df_per_lemma[lemma]
            idf = math.log(cluster_count / doc_freq) if doc_freq else 0  # IDF
            # Term frequency rescaled into [0.5, 1] relative to the cluster's
            # most frequent lemma.
            tf_idf[lemma] = (0.5 + 0.5 * tf / max_tf) * idf
        idf_per_cluster.append(tf_idf)
    print('Saving')
    with open(job.tf_idf_output_dir / 'tf_df_values.json', 'w') as f:
        json.dump(idf_per_cluster, f)

    if len(tf_idf_jobs) == 1:
        # Two columns per cluster: lemmas sorted by descending score, and the
        # matching scores. Columns of different length are padded by pandas.
        data = {}
        for label, scores in zip(job.speech_cluster_labels, idf_per_cluster):
            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            data[label] = tuple(lemma for lemma, _ in ranked)
            data[f'{label}_scores'] = tuple(score for _, score in ranked)
        df = pd.DataFrame({k: pd.Series(v) for k, v in data.items()})
df

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00,  1.02it/s]


Calculating TF-IDF
Saving


Unnamed: 0,03-04 - 0.769,03-04 - 0.769_scores,01-15 - 0.758,01-15 - 0.758_scores,11-25 - 0.749,11-25 - 0.749_scores,02-05 - 0.000,02-05 - 0.000_scores,11-20 - 0.006,11-20 - 0.006_scores,09-11 - 0.015,09-11 - 0.015_scores
0,eurotunnel,0.900714,carillion,0.924636,cptpp,0.901609,riot,0.937423,cpr,0.928249,internship,0.903606
1,reoffending,0.899971,satellite,0.921464,icgs,0.900081,ccrc,0.922973,bma,0.911113,mcmorrin,0.903270
2,uprate,0.899599,spaceport,0.911526,uplift,0.899508,harry,0.902804,headteacher,0.908256,mini,0.899911
3,reoffend,0.897925,xanax,0.908566,harassment,0.899508,tottenham,0.902804,defibrillator,0.907939,demutualisation,0.899239
4,transformative,0.897925,zoe,0.906452,vaccine,0.898935,miscarriage,0.902804,cardiac,0.906035,issuance,0.899239
...,...,...,...,...,...,...,...,...,...,...,...,...
4380,,,,,walk,0.000000,,,,,,
4381,,,,,wholly,0.000000,,,,,,
4382,,,,,occur,0.000000,,,,,,
4383,,,,,maintenance,0.000000,,,,,,


In [None]:
# Legacy

In [40]:
import math


def get_sentence_scores(args):
    """Compute per-word TF-IDF scores for every sentence of one XML file.

    ``args`` is a ``(lemma_frequencies_path, document_count, xml_file)``
    triple, packed into one argument so the function can be used with
    ``Pool.imap``. The first item is the path of a JSON file mapping lemma to
    document frequency.

    Term frequencies are counted per speaker (the ``who`` attribute of a
    ``<u>`` element) within this file; ``<u>`` elements without ``who`` are
    ignored. Only lemmas that are present and purely alphabetic are used,
    lower-cased.

    Returns a dict mapping the ``xml:id`` of each ``<s>`` element to the list
    of scores of its lemmas, in order of appearance.

    Raises ``KeyError`` when a lemma is missing from the frequency file.
    """
    lemma_frequencies_path, document_count, xml_file = args
    with open(lemma_frequencies_path, 'r', encoding='utf8') as f:
        lemma_frequencies = json.load(f)

    with open(xml_file, "r", encoding="utf8") as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'xml')

    # Group sentences by speaker in a single pass over the document rather
    # than re-scanning it once per speaker.
    sentences_by_speaker = defaultdict(list)
    for u_element in soup.find_all('u'):
        speaker_id = u_element.get('who')
        if speaker_id:
            sentences_by_speaker[speaker_id].extend(u_element.find_all('s'))

    scores: dict[str, list[float]] = {}
    for sentences in sentences_by_speaker.values():
        speaker_tf = {}
        speaker_sentences = {}
        for sentence in sentences:
            sentence_id = sentence.get('xml:id')
            speaker_sentences[sentence_id] = []
            for word in sentence.find_all('w'):
                lemma = word.get('lemma')
                if not lemma or not lemma.isalpha():
                    continue
                lemma = lemma.lower()
                speaker_tf[lemma] = speaker_tf.get(lemma, 0) + 1
                speaker_sentences[sentence_id].append(lemma)

        # Scores need the speaker's complete term frequencies, hence the
        # second pass once counting is finished.
        for sentence_id, lemmas in speaker_sentences.items():
            sentence_scores = []
            for lemma in lemmas:
                tf = speaker_tf[lemma]
                df = lemma_frequencies[lemma]
                idf = math.log(document_count / df) if df else 0  # IDF
                sentence_scores.append(tf * idf)  # TF-IDF
            scores[sentence_id] = sentence_scores

    return scores


def process_parlamint(parlamint):
    """Score all sentences of one corpus and write them to a JSON file.

    ``parlamint`` is a list of document records with ``'language'`` and
    ``'xml_path'`` entries. Each document is scored by ``get_sentence_scores``
    in a process pool; the merged result is written to
    ``<language>_sentence_scores.json`` under ``artefacts``, where the
    language is taken from the first record.
    """
    language = parlamint[0]['language']
    print('processing', language)
    # One task per document: frequency file, corpus size, document path.
    tasks = [
        (artefacts / f'{doc["language"]}_lemma_frequencies.json', len(parlamint), doc['xml_path'])
        for doc in parlamint
    ]
    sentences = {}
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        for file_scores in tqdm(pool.imap_unordered(get_sentence_scores, tasks), total=len(parlamint)):
            sentences.update(file_scores)
    with open(artefacts / f'{language}_sentence_scores.json', 'w') as f:
        json.dump(sentences, f)


# Score every corpus in turn; each call writes that corpus's sentence-score JSON.
for parlamint in parlamints:
    process_parlamint(parlamint)

processing GB


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 2209/2209 [49:09<00:00,  1.34s/it]


processing ES-GA


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [03:05<00:00,  1.63it/s]


processing HU


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 515/515 [07:02<00:00,  1.22it/s]


processing UA


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1091/1091 [04:25<00:00,  4.11it/s]


In [41]:
import numpy as np

# Turn the per-word scores into one weight per sentence: the mean score of the
# sentence divided by the largest single score in the corpus.
# Note that the slice skips the first entry of `parlamints`.
for parlamint in parlamints[1:]:
    language = parlamint[0]['language']
    print('processing', language)
    with open(artefacts / f'{language}_sentence_scores.json', 'r') as f:
        sentence_scores = json.load(f)
    # default=0 covers a corpus in which no sentence has any score.
    max_score = max((s for ss in sentence_scores.values() for s in ss), default=0)
    print('max_score', max_score)
    sentece_weights = {}
    for sentence_id, scores in sentence_scores.items():
        if scores and max_score:
            sentece_weights[sentence_id] = float(np.mean(scores) / max_score)
        else:
            # A sentence without scored lemmas would give mean([]) == NaN,
            # which is not valid JSON and cannot be ranked; a zero maximum
            # would divide by zero. Both cases get weight 0.
            sentece_weights[sentence_id] = 0.0

    with open(artefacts / f'{language}_sentence_weights.json', 'w') as f:
        json.dump(sentece_weights, f)

processing GB
max_score 686.7005062402968
processing ES-GA
max_score 183.85466095245155
processing HU
max_score 255.82151674556403
processing UA
max_score 672.2312987938257


In [39]:
import heapq


def print_ids(ids, corpus_dir='data/annotated/ParlaMint-SI.TEI.ana'):
    """Print the text of the sentences with the given ids, one per line.

    The XML file of a sentence is derived from its id: the part before
    ``.seg`` is the file name, and the year directory is read from the
    segment after the first underscore. Parsed files are cached for the
    duration of the call, so each file is read at most once.

    Args:
        ids: iterable of sentence ``xml:id`` strings.
        corpus_dir: directory containing the per-year XML folders. Defaults
            to the ParlaMint-SI corpus; pass another directory for ids that
            belong to a different corpus.
    """
    def get_path(sentence_id):
        year = sentence_id.split('_')[1].split('-')[0]
        return f'{corpus_dir}/{year}/{sentence_id.split(".seg")[0]}.xml'

    soups = {}
    for sid in ids:
        xml_path = get_path(sid)
        if xml_path not in soups:
            # Explicit encoding, like every other XML read in this file.
            with open(xml_path, 'r', encoding='utf8') as f:
                contents = f.read()
            soups[xml_path] = BeautifulSoup(contents, 'xml')
        soup = soups[xml_path]
        sentence = soup.find('s', {'xml:id': sid})
        print(' '.join([w.get_text() for w in sentence.find_all('w')]))


# Show the 50 highest- and 50 lowest-weighted sentences of the first corpus.
for parlamint in parlamints[:1]:
    print('processing', parlamint[0]['language'])
    weights_path = artefacts / f'{parlamint[0]["language"]}_sentence_weights.json'
    with open(weights_path, 'r') as f:
        sentece_weights = json.load(f)
    # Iterating the dict yields sentence ids; they are ranked by their weight.
    top_50 = heapq.nlargest(50, sentece_weights, key=sentece_weights.get)
    print_ids(top_50)
    print('--------')
    bottom_50 = heapq.nsmallest(50, sentece_weights, key=sentece_weights.get)
    print_ids(bottom_50)


processing SI
243 ventilatorjev
453 ventilatorjev
Merilniki slanosti
404 ventilatorji 13. marca
Prehajamo na podprogram Štipendije
Podjetje Lastinski je prišlo po merilnike slanosti k Darsu
Multinacionalka je prodala žilno opornico recimo Mark Medicalu S. p.
Oklepnike
Ceca
Prehajamo na podprogram Dolgotrajna oskrba
Prehajamo na podprogram Univerzitetne knjižnice
Prehajamo na podprogram Kakovost zraka
Lastniki so bili Mark Medical S. p.
Prva pogodba med Darsom in podjetjem Lastinski
To se pravi podjetje Lastinski je vrnilo merilnike slanosti
Palestina
Palestina
Burka
Nazaj k ventilatorjem
Mark Medical Slovenija je v lastniški družbi KB 1909
Tako pride do posla podjetje Lastinski
Število ventilatorjev
Število ventilatorjev
Prehajamo na podprogram Podpora raziskovalni infrastrukturi
Prehajamo na podprogram Podpora raziskovalni infrastrukturi
Pomožnega motorja
Prehajamo na podprogram Kreiranje delovnih mest
Prehajamo na podprogram Kreiranje delovnih mest
Poglejte kaj je pa firma Lastinski 

In [None]:
# Show the 50 highest- and 50 lowest-weighted sentences of every corpus.
# NOTE(review): print_ids is called without a corpus location here; verify
# that ids from non-SI corpora resolve to the right files.
for parlamint in parlamints:
    print('processing', parlamint[0]['language'])
    weights_path = artefacts / f'{parlamint[0]["language"]}_sentence_weights.json'
    with open(weights_path, 'r') as f:
        sentece_weights = json.load(f)
    # Iterating the dict yields sentence ids; they are ranked by their weight.
    top_50 = heapq.nlargest(50, sentece_weights, key=sentece_weights.get)
    print_ids(top_50)
    print('--------')
    bottom_50 = heapq.nsmallest(50, sentece_weights, key=sentece_weights.get)
    print_ids(bottom_50)
