## MDS Thesis
#### 03. GELECTRA prediction results

<br>
<hr style="opacity: 0.5">

### Setup

In [1]:
# load libraries
import os
import pandas as pd
import numpy as np
import glob
import re

In [2]:
# project root directory; set the MDS_THESIS_ROOT environment variable to run the
# notebook on another machine without editing this cell (default is unchanged)
ROOT_DIR = os.environ.get("MDS_THESIS_ROOT", "/Users/varvarailyina/hertie/mds_thesis")

In [3]:
# collect the paths of every per-batch prediction CSV (unsorted at this point)
batch_dir = os.path.join(ROOT_DIR, "data", "out", "prediction_batches")
all_files = glob.glob(os.path.join(batch_dir, "predictions_batch_*.csv"))

In [4]:
# read the pickled sentence-level dataframe from data/out
sentences_path = os.path.join(ROOT_DIR, "data", "out", "df_sentences.pkl")
df_sentences = pd.read_pickle(sentences_path)

-- *Clean and combine all prediction results*

In [5]:
def _batch_index(path):
    """Return the integer between the last "_" and the first following "." of the file name."""
    tail = os.path.basename(path).rpartition("_")[2]
    return int(tail.partition(".")[0])


# order the batch files numerically (so that e.g. batch 10 follows batch 9, not batch 1)
all_files = sorted(all_files, key=_batch_index)

# read every batch and stack them into one frame with a fresh 0..n-1 index
batch_frames = []
for batch_path in all_files:
    batch_frames.append(pd.read_csv(batch_path))
predictions = pd.concat(batch_frames, ignore_index=True)

In [6]:
# persist the combined predictions alongside the batch files
predictions_path = os.path.join(ROOT_DIR, "data", "out", "prediction_batches", "predictions_all.pkl")
predictions.to_pickle(predictions_path)

# to reuse the saved frame instead of re-reading the batch CSVs, uncomment:
#predictions = pd.read_pickle(predictions_path)

<hr style="opacity: 0.25">

### Merge predicted emotions back onto data

-- *Merge predictions onto sentence-level dataframe*

In [7]:
# the frames are combined row-by-row afterwards, so they must have the same number of rows;
# raise explicitly (asserts are skipped under `python -O`) and report both lengths
if len(predictions) != len(df_sentences):
    raise ValueError(
        f"Mismatch: {len(predictions)} prediction rows vs. "
        f"{len(df_sentences)} sentence rows."
    )

# give both frames a fresh 0..n-1 index so rows line up for the row-wise merge
df_sentences = df_sentences.reset_index(drop=True)
predictions = predictions.reset_index(drop=True)

In [8]:
# place the prediction columns next to the sentence-level metadata;
# pd.concat aligns on the index, so this relies on both frames sharing the same index
df_all = pd.concat((df_sentences, predictions), axis="columns")

-- *Clean merged df*

In [9]:
# drop nulls first so none of the string filters below ever sees a missing sentence
df_all = df_all.dropna(subset=['sentence'])
sentences = df_all['sentence']

# keep only sentences with more than 3 characters (after stripping) and more than 2 words
long_enough = (sentences.str.strip().str.len() > 3) & (sentences.str.split().str.len() > 2)

# flag noise: symbol-only strings (e.g. ".", "..."), filenames/URLs, and email addresses;
# `na=False` on every pattern so a non-string value counts as "no match" instead of breaking `~`
is_noise = (
    sentences.str.match(r'^[\W_]+$', na=False)
    | sentences.str.contains(r'\.pdf$|\.docx?$|\.csv$|http[s]?://|www\.', case=False, na=False)
    | sentences.str.contains(r'\S+@\S+', na=False)
)

# apply all filters in a single pass
df_all = df_all[long_enough & ~is_noise]

# drop duplicate sentences (the first occurrence is kept) and reset the index after filtering
# NOTE(review): duplicates are removed across the whole frame, not per press release —
# confirm this is intended, since sentence counts are later computed per press release
df_all = df_all.drop_duplicates(subset=['sentence']).reset_index(drop=True)

In [22]:
# write the cleaned, merged sentence-level frame to data/out
df_all_path = os.path.join(ROOT_DIR, "data", "out", "df_all.pkl")
df_all.to_pickle(df_all_path)

<hr style="opacity: 0.25">

### Summarize emotions

In [11]:
# show the column names available in `df_all`
print(list(df_all.columns))

['country_name', 'party', 'party_name', 'family_name', 'date', 'month', 'calendar_week', 'issue_mono', 'issue_label', 'header', 'text', 'n_words', 'sentence', 'anger', 'fear', 'disgust', 'sadness', 'joy', 'enthusiasm', 'pride', 'hope']


-- *Extract top emotion per sentence*

In [12]:
# define emotion columns
emotion_cols = ['anger', 'fear', 'disgust', 'sadness', 'joy', 'enthusiasm', 'pride', 'hope']

# per sentence, list every emotion whose score is at least 0.5 (possibly none);
# one vectorized comparison replaces the row-wise `apply`, which builds a Series per row
above_threshold = (df_all[emotion_cols] >= 0.5).to_numpy()
df_all['emotions'] = pd.Series(
    [[emotion for emotion, hit in zip(emotion_cols, row) if hit] for row in above_threshold],
    index=df_all.index,
    dtype=object,
)

# single highest-scoring emotion per sentence (idxmax returns the first column on ties)
df_all['top_emotion'] = df_all[emotion_cols].idxmax(axis=1)

-- *Aggregate emotions per press release*

In [13]:
# columns that together identify one press release in the aggregation below
group_cols = [
    'party',
    'date',
    'month',
    'issue_mono',
    'issue_label',
    'text',
]

# emotion score columns to aggregate (same list as used for the top-emotion step)
emotion_cols = [
    'anger',
    'fear',
    'disgust',
    'sadness',
    'joy',
    'enthusiasm',
    'pride',
    'hope',
]

In [14]:
# cast every emotion score column to float64 before averaging
emotion_scores = df_all[emotion_cols]
df_all[emotion_cols] = emotion_scores.astype("float64")

In [15]:
# one row per press release: average each emotion score over the release's sentences
# NOTE(review): groupby's default `dropna=True` excludes rows with a missing value in any
# grouping column — confirm no press release lacks e.g. an issue label
per_release = df_all.groupby(group_cols)[emotion_cols]
df_clean = per_release.agg("mean")
df_clean = df_clean.reset_index()

-- *Include number of sentences per press release*

In [16]:
# number of (remaining) sentences in each press release
df_counts = (
    df_all.groupby(group_cols)
    .size()
    .rename('n_sentences')
    .reset_index()
)

# attach the counts to the aggregated frame (inner join on the grouping columns)
df_clean = pd.merge(df_clean, df_counts, on=group_cols)

-- *Normalize emotions*

In [17]:
# divide each press release's mean emotion score by its number of sentences
# NOTE(review): the emotion columns in `df_clean` already hold per-sentence means (from the
# groupby aggregation), so `_norm` equals mean / n_sentences rather than a plain per-sentence
# average — confirm this second division is intended
sentence_counts = df_clean['n_sentences']
for emotion in emotion_cols:
    df_clean[f'{emotion}_norm'] = df_clean[emotion] / sentence_counts

-- *Calculate emotion intensity and shares*

In [18]:
# total emotion weight: sum of the mean emotion scores of each press release
emotion_total = df_clean[emotion_cols].sum(axis=1)
df_clean['total_emotion_mentions'] = emotion_total

# emotion intensity: that total divided by the release's number of sentences
# NOTE(review): the summed scores are already per-sentence means, so this divides by the
# sentence count a second time and shrinks with document length — confirm that a high
# `emotion_intensity` still means "many emotions" as intended
df_clean['emotion_intensity'] = emotion_total / df_clean['n_sentences']

In [19]:
# express each emotion as a percentage (2 decimals) of the release's total emotion weight;
# releases whose total is not positive get 0.0
total_weight = df_clean['total_emotion_mentions']
has_weight = total_weight > 0
for emotion in emotion_cols:
    pct = np.round(df_clean[emotion] / total_weight * 100, 2)
    df_clean[f'{emotion}_share'] = np.where(has_weight, pct, 0.0)

# each `_share` value is the percentage share of that emotion within one press release

In [21]:
# write the press-release-level frame `df_clean` to data/out
df_clean_path = os.path.join(ROOT_DIR, "data", "out", "df_clean.pkl")
df_clean.to_pickle(df_clean_path)