In [1]:
import sys

import pandas as pd
import json
from tqdm import tqdm
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
import stopwordsiso as stopwords
import pickle


class Languages:
    """Country codes identifying the parliamentary corpora used in this notebook."""
    si, gb, hu, ua = 'SI', 'GB', 'HU', 'UA'
    # Every code above, in the order the notebook iterates over them.
    all = [si, gb, hu, ua]

In [2]:
# One sentence-level frame per language, keyed by the language code.
sentences = {}
for language in Languages.all:
    print('loading', language)
    feather_path = f'artefacts/pandas_frames/{language}_parlamint_sentences.feather'
    sentences[language] = pd.read_feather(feather_path)

# A single metadata frame stored alongside the per-language frames.
metadata = pd.read_feather('artefacts/pandas_frames/parlamint_metadata.feather')

loading SI
loading GB
loading HU
loading UA


In [3]:
artefacts_pd_frames = Path('artefacts/pandas_frames')

# speeches[language] is whatever object was pickled into speech2lemmas_<language>.pkl;
# later cells treat it as a mapping from speech ID to lemma text.
speeches = {}
for language in tqdm(sentences):
    pickle_path = artefacts_pd_frames / f'speech2lemmas_{language}.pkl'
    with pickle_path.open('rb') as f:
        speeches[language] = pickle.load(f)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:18<00:00,  4.56s/it]


In [37]:
artefacts_base = Path('artefacts/bojan')
# CSV file to read for each language.
files = {
    Languages.si: 'ParlaMint-SI_with_sentiment.csv',
    Languages.ua: 'ParlaMint-UA_with_sentiment.csv',
    Languages.hu: 'ParlaMint-HU_with_sentiment.csv',
    Languages.gb: 'ParlaMint-GB-commons_with_sentiment.csv',
}
# Speeches must be strictly longer than this many characters to be kept.
min_speech_length = 200

# filtered_speeches[language] -> frame of the speeches that pass every filter below.
# filtered_speech_ids        -> the ID values of all kept speeches across all languages.
filtered_speeches = {}
filtered_speech_ids = set()
for language, csv_file in files.items():
    # NOTE(review): the recorded run shows a warning pointing at this read_csv call;
    # its text was not captured -- if it is a DtypeWarning, pass explicit dtypes here.
    lang_df = pd.read_csv(artefacts_base / csv_file)
    print('loaded', language, 'len =', len(lang_df))

    # Character count of each speech; non-string cells (e.g. NaN) count as 0.
    speech_length = lang_df['speech'].map(
        lambda text: len(text) if isinstance(text, str) else 0
    )
    # Keep only regular-role MPs with a sufficiently long speech.
    keep = (
        (lang_df['Speaker_role'] == 'Regular')
        & (lang_df['Speaker_MP'] == 'MP')
        & (speech_length > min_speech_length)
    )
    # .assign() returns a new frame, so the extra column is never written into a
    # slice of the frame that was read from disk (no chained-assignment warning).
    lang_df = lang_df.loc[keep].assign(speech_length=speech_length[keep])

    filtered_speech_ids.update(lang_df.ID)
    filtered_speeches[language] = lang_df

loaded SI len = 311354
loaded UA len = 195685


  lang_df = pd.read_csv(artefacts_base / csv_file)


loaded HU len = 104521
loaded GB len = 472782


In [54]:
def get_speech_ids(speech2lemmas: dict[str, str], keywords: set[str]):
    """Yield the IDs of filtered speeches whose lemma text contains a keyword.

    Matching is done on whole, space-delimited tokens: a keyword matches when it
    appears in the lower-cased lemma text surrounded by single spaces (the text
    is padded with one space on each side so the first and last token can match).
    Multi-word keywords match as a phrase.

    Args:
        speech2lemmas: Maps a speech ID to the lemma text of that speech, given
            as one string (expected to be space-separated lemmas).
        keywords: Words or phrases to look for.  They are stripped of
            surrounding whitespace and lower-cased before matching, so matching
            is case-insensitive; blank keywords are ignored.

    Yields:
        The speech ID with every '.ana' removed, once per matching entry.
        Entries whose cleaned ID is not in the module-level
        ``filtered_speech_ids`` are skipped.
    """
    # Normalise and pad the keywords once, instead of once per speech.
    normalised = {keyword.strip().lower() for keyword in keywords}
    patterns = [f' {keyword} ' for keyword in normalised if keyword]

    for raw_id, lemmas in speech2lemmas.items():
        speech_id = raw_id.replace('.ana', '')
        if speech_id not in filtered_speech_ids:
            continue
        padded = f' {lemmas.lower()} '
        if any(pattern in padded for pattern in patterns):
            yield speech_id


def get_corpuses(topic_by_language: dict[str, set[str]]):
    """Collect, per language, the speech IDs that match that language's keywords.

    Iterates over the module-level ``speeches`` mapping and looks up the keyword
    set for each of its languages in ``topic_by_language`` (a missing language
    raises ``KeyError``).

    Returns:
        A dict mapping each language in ``speeches`` to the list of IDs produced
        by ``get_speech_ids`` for that language.
    """
    return {
        language: list(get_speech_ids(speech2lemmas, topic_by_language[language]))
        for language, speech2lemmas in speeches.items()
    }


def print_corpuses(corpuses):
    """Print, for each language, the corpus size and its first ten entries.

    Every language is followed by a 40-dash separator line.
    """
    separator = '-' * 40
    for lang in corpuses:
        ids = corpuses[lang]
        print(f'{lang}:', len(ids))
        print(ids[:10])
        print(separator)


def save_corpuses(topic, corpuses):
    artefacts_dir = Path('artefacts/by_topic')
    artefacts_dir.mkdir(exist_ok=True)
    df = pd.DataFrame.from_dict({
        'common': set([t for l in corpuses.values() for t in l]),
        **corpuses
    }, orient='index').transpose()
    df.to_feather(artefacts_dir / f'{topic}.feather')
    return df


In [76]:
import pandas as pd

# Keyword spreadsheet, fetched as CSV over the network.
url = 'https://docs.google.com/spreadsheets/d/1v7Kv1kVPWhBwGMDI6g2-pfanTfKuys8mmp1L0sMsMsM/export?format=csv&gid=0'
df = pd.read_csv(url)

# topics[theme][language label] -> list of lower-cased keywords.
topics = {}
for column in df.columns:
    # Skip columns whose header contains '_tr' and columns with a blank header.
    if '_tr' in column or not column.strip():
        continue
    # The parsing below expects headers shaped like '<prefix>: <theme> (<language>)'.
    theme, lang = column.split(' (')
    theme = theme.split(': ')[1].lower()
    lang = lang.rstrip(')')
    # Strip surrounding whitespace: a keyword with a stray leading/trailing space
    # could never match a space-delimited token.  Non-string cells (e.g. NaN from
    # shorter columns) and cells that are blank after stripping are dropped.
    cleaned = (cell.strip().lower() for cell in df[column].tolist() if type(cell) is str)
    topics.setdefault(theme, {})[lang] = [keyword for keyword in cleaned if keyword]

print(topics.keys())
print(topics['healthcare'].keys())

dict_keys(['eu', 'war', 'healthcare', 'gender'])
dict_keys(['UK', 'Hungary', 'Slovenian', 'Ukrainian'])


In [77]:
import pandas as pd

# One summary row per topic: matched-speech count and share for each language.
df_data = []
# Language labels used in the keyword sheet -> language codes used everywhere else.
languages = {
    'UK': Languages.gb,
    'Hungary': Languages.hu,
    'Slovenian': Languages.si,
    'Ukrainian': Languages.ua,
}
topic_dfs = {}
for topic in tqdm(topics):
    keywords = {
        languages[language]: set(topics[topic][language])
        for language in topics[topic]
    }
    topic_df = save_corpuses(topic, get_corpuses(keywords))
    topic_dfs[topic] = topic_df

    # Non-null entries per column, i.e. the number of matched speeches.
    topic_df_count = topic_df.count().to_dict()
    row_dict = {'topic': topic}
    for lang, count in topic_df_count.items():
        # Only the per-language columns are reported (this leaves out 'common').
        if lang in Languages.all:
            percentage = 100 * count / len(filtered_speeches[lang])
            row_dict[f'{lang}_count'] = count
            row_dict[f'{lang}_percentage'] = percentage
    df_data.append(row_dict)

df = pd.DataFrame(df_data)
df

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:31<00:00, 22.77s/it]


Unnamed: 0,topic,SI_count,SI_percentage,GB_count,GB_percentage,HU_count,HU_percentage,UA_count,UA_percentage
0,eu,13943,11.287137,38938,10.258667,5933,13.969861,5863,8.361977
1,war,8686,7.03149,15039,3.962199,2141,5.041206,7101,10.127647
2,healthcare,13802,11.172994,45022,11.861567,4583,10.791147,4546,6.483634
3,gender,943,0.763377,3740,0.985346,121,0.284907,122,0.174


### Save the per-topic speeches to CSV

In [78]:
# Write one CSV per (topic, language) with the filtered speeches of that language
# whose ID is listed in the topic's 'common' column.
topic_csv_dir = Path('artefacts/by_topic/topic_csvs')
# Create the output directory up front; to_csv does not create missing directories.
topic_csv_dir.mkdir(parents=True, exist_ok=True)
for topic in tqdm(topics):
    topic_df = topic_dfs[topic]
    for language, language_speeches in filtered_speeches.items():
        topic_speeches = language_speeches[language_speeches.ID.isin(topic_df.common)]
        # 'Unnamed: 0' is dropped so it is not written to the exported file.
        topic_speeches.drop('Unnamed: 0', axis=1).to_csv(
            topic_csv_dir / f'{topic}_{language}.csv',
            index=False,
        )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:23<00:00,  5.99s/it]


In [53]:
# Write the filtered speeches of every language to its own CSV file.
filtered_dir = Path('artefacts/bojan/filtered')
# Create the output directory up front; to_csv does not create missing directories.
filtered_dir.mkdir(parents=True, exist_ok=True)
# The loop variable is deliberately not called `df`, so running this cell does not
# overwrite the notebook-level `df` table.
for language, language_speeches in tqdm(filtered_speeches.items()):
    language_speeches.drop('Unnamed: 0', axis=1).to_csv(
        filtered_dir / f'{language}_filtered.csv',
        index=False,
    )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:46<00:00, 11.75s/it]
