In [5]:
import pandas as pd
import json
from tqdm import tqdm
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
import stopwordsiso as stopwords
import pickle


class Languages:
    """Namespace of the language codes used as keys throughout the notebook."""

    # Individual codes; these strings are interpolated into artefact file names.
    si, gb, hu, ua = 'SI', 'GB', 'HU', 'UA'
    # Every code, in the order the per-language loops iterate.
    all = [si, gb, hu, ua]

In [6]:
# One sentence-level frame per language code, plus the shared metadata frame.
sentences = {}
for language in Languages.all:
    print('loading', language)
    sentences_path = f'artefacts/pandas_frames/{language}_parlamint_sentences.feather'
    sentences[language] = pd.read_feather(sentences_path)
metadata = pd.read_feather('artefacts/pandas_frames/parlamint_metadata.feather')

loading SI
loading GB
loading HU
loading UA


In [7]:
# Directory holding the pickled speech2lemmas mappings (one file per language).
artefacts_pd_frames = Path('artefacts/pandas_frames')
speeches = {}
for language in tqdm(sentences):
    pickle_path = artefacts_pd_frames / f'speech2lemmas_{language}.pkl'
    # NOTE: unpickling can execute arbitrary code; only load artefacts
    # produced by this project.
    with pickle_path.open('rb') as handle:
        speeches[language] = pickle.load(handle)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:17<00:00,  4.45s/it]


In [64]:
# Per-language CSV exports that decide which speeches are eligible for the
# topic corpora built further down.
artefacts_base = Path('artefacts/bojan')
files = {
    Languages.si: 'ParlaMint-SI_with_sentiment.csv',
    Languages.ua: 'ParlaMint-UA_with_sentiment.csv',
    Languages.hu: 'ParlaMint-HU_with_sentiment.csv',
    Languages.gb: 'ParlaMint-GB-commons_with_sentiment.csv',
}
# Speeches whose text is not longer than this many characters are dropped.
MIN_SPEECH_LENGTH = 200

# language code -> filtered frame, and the union of the surviving speech IDs.
filtered_speeches = {}
filtered_speech_ids = set()
for language, csv_file in files.items():
    # low_memory=False lets pandas infer each column's dtype from the whole
    # file rather than chunk by chunk, which is what produces mixed-type
    # DtypeWarnings on large CSVs.
    lang_df = pd.read_csv(artefacts_base / csv_file, low_memory=False)
    print('loaded', language, 'len =', len(lang_df))
    # Keep only speeches given by regular speakers who are MPs.
    is_regular_mp = (lang_df['Speaker_role'] == 'Regular') & (lang_df['Speaker_MP'] == 'MP')
    # .assign() returns a new frame, so the helper column is never written
    # into a slice of the raw frame (no SettingWithCopyWarning).
    lang_df = lang_df.loc[is_regular_mp].assign(
        # Non-string cells (e.g. missing speeches) count as length 0.
        speech_length=lambda frame: frame['speech'].apply(
            lambda text: len(text) if isinstance(text, str) else 0
        )
    )
    lang_df = lang_df[lang_df['speech_length'] > MIN_SPEECH_LENGTH]
    filtered_speech_ids.update(lang_df.ID)
    filtered_speeches[language] = lang_df

loading SI len = 311354
loading UA len = 195685


  lang_df = pd.read_csv(artefacts_base / csv_file)


loading HU len = 104521
loading GB len = 472782


In [125]:
def get_speech_ids(speech2lemmas: dict[str, str], keywords: set[str], allowed_ids=None):
    """Yield the IDs of eligible speeches whose lemma text mentions a keyword.

    Args:
        speech2lemmas: Maps a speech ID to its lemma text (a single string;
            it is lower-cased before matching). Every '.ana' occurrence is
            stripped from the ID before it is checked and yielded.
        keywords: Keywords to look for. They are compared as given, so they
            should already be lower-case. Matching is plain substring
            containment, so a keyword also matches inside a longer word.
        allowed_ids: Optional collection of speech IDs that may be yielded.
            When omitted, the notebook-level ``filtered_speech_ids`` set is
            used, which is the original behaviour.

    Yields:
        Each matching speech ID once, in the iteration order of
        ``speech2lemmas``.
    """
    if allowed_ids is None:
        # Fall back to the set built by the CSV filtering cell.
        allowed_ids = filtered_speech_ids
    for speech_id, lemmas in speech2lemmas.items():
        speech_id = speech_id.replace('.ana', '')
        if speech_id not in allowed_ids:
            continue
        lemmas = lemmas.lower()
        # any() stops at the first hit, so a speech is yielded at most once.
        if any(keyword in lemmas for keyword in keywords):
            yield speech_id


def get_corpuses(topic_by_language: dict[str, set[str]]):
    """Collect, per language, the IDs of speeches matching that language's keywords.

    Args:
        topic_by_language: Maps a language code to the keyword set to search
            for; it must have an entry for every key of the notebook-level
            ``speeches`` dict.

    Returns:
        A dict mapping each language code in ``speeches`` to the list of
        matching speech IDs produced by ``get_speech_ids``.
    """
    return {
        lang_code: list(get_speech_ids(lemma_index, topic_by_language[lang_code]))
        for lang_code, lemma_index in speeches.items()
    }


def print_corpuses(corpuses):
    """Print a short preview of each corpus.

    For every key in ``corpuses`` this prints the key with the corpus size,
    the first ten entries, and a 40-dash separator line.
    """
    separator = '-' * 40
    for code in corpuses:
        speech_ids = corpuses[code]
        print(f'{code}:', len(speech_ids))
        print(speech_ids[:10])
        print(separator)


def save_corpuses(topic, corpuses):
    """Write one topic's per-language speech IDs to a feather file.

    Args:
        topic: Topic name; used as the feather file's stem.
        corpuses: Maps a language code to its list of matching speech IDs.

    Returns:
        The frame that was written: one column per language plus a 'common'
        column holding every distinct ID across all languages. Columns
        shorter than the longest one are padded with missing values.
    """
    artefacts_dir = Path('artefacts/by_topic')
    # parents=True so a fresh checkout without 'artefacts/' does not fail.
    artefacts_dir.mkdir(parents=True, exist_ok=True)
    # dict.fromkeys de-duplicates while keeping first-seen order; a bare set
    # would make the row order of 'common' differ between kernel runs.
    common_ids = list(dict.fromkeys(
        speech_id for speech_ids in corpuses.values() for speech_id in speech_ids
    ))
    # orient='index' + transpose lets the columns have different lengths.
    df = pd.DataFrame.from_dict({
        'common': common_ids,
        **corpuses
    }, orient='index').transpose()
    df.to_feather(artefacts_dir / f'{topic}.feather')
    return df


In [129]:
import pandas as pd

# Keyword spreadsheet, exported as CSV: one column per (theme, language) pair.
url = 'https://docs.google.com/spreadsheets/d/1v7Kv1kVPWhBwGMDI6g2-pfanTfKuys8mmp1L0sMsMsM/export?format=csv&gid=0'
df = pd.read_csv(url)

# theme -> language label -> lower-cased keywords, in sheet order.
topics = {}
for column in df.columns:
    # Split the header at ' (' into the theme part and the language part.
    heading, lang = column.split(' (')
    # The theme name is whatever follows the first ': ' in the theme part.
    theme = heading.split(': ')[1].lower()
    lang = lang.rstrip(')')
    # Empty cells are read as non-string values and are skipped here.
    topics.setdefault(theme, {})[lang] = [
        cell.lower() for cell in df[column].tolist() if type(cell) is str and cell
    ]

print(topics.keys())
print(topics['healthcare'].keys())
print(topics['war']['Slovenian'])

dict_keys(['eu', 'war', 'healthcare', 'gender'])
dict_keys(['UK', 'Hungary', 'Slovenian', 'Ukrainian'])
['bataljon', 'bojen ladja', 'bojen letalo', 'brezpiloten', 'garda', 'genocid', 'genocid', 'invazija', 'kontingent', 'ljudski armada', 'mornarica', 'naborniški', 'oborožitev', 'obramben', 'oklepnik', 'orožje', 'pehota', 'poveljstvo', 'reparacija', 'rezervist', 'služenje vojaški', 'služenje vojen', 'strelen orožje', 'strelivo', 'tank', 'vojak', 'vojaški', 'vojaškoindustrijski', 'vojašnica', 'vojen', 'vojen taborišče', 'vojen ujetnik', 'vojna', 'vojska']


In [130]:
import pandas as pd

# One summary row per topic: matched-speech count and share for each language.
df_data = []
# Spreadsheet language labels -> corpus language codes.
languages = {
    'UK': Languages.gb,
    'Hungary': Languages.hu,
    'Slovenian': Languages.si,
    'Ukrainian': Languages.ua,
}
for topic in tqdm(topics):
    row_dict = {'topic': topic}
    keywords = {}
    for language, terms in topics[topic].items():
        keywords[languages[language]] = set(terms)
    # NOTE: despite the name, `healthcare` holds the frame of whichever topic
    # is being processed; the name is kept in case later cells refer to it.
    healthcare = save_corpuses(topic, get_corpuses(keywords))
    # count() ignores the padding values, giving the number of IDs per column.
    healthcare_count = healthcare.count().to_dict()
    for lang, count in healthcare_count.items():
        # Only language columns are reported; the 'common' column is skipped.
        if lang in Languages.all:
            percentage = 100 * count / len(filtered_speeches[lang])
            row_dict[f'{lang}_count'] = count
            row_dict[f'{lang}_percentage'] = percentage
    df_data.append(row_dict)

df = pd.DataFrame(df_data)
df

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:01<00:00, 15.39s/it]


Unnamed: 0,topic,SI_count,SI_percentage,GB_count,GB_percentage,HU_count,HU_percentage,UA_count,UA_percentage
0,eu,21072,17.058204,64688,17.042802,8017,18.876854,9019,12.863153
1,war,19141,15.495021,90611,23.872516,4971,11.704733,7045,10.047779
2,healthcare,18776,15.199547,59433,15.658311,8044,18.940429,6027,8.595878
3,gender,1312,1.06209,3943,1.038829,101,0.237815,0,0.0
