## MDS Thesis
#### 02. Use pre-trained GELECTRA model

<br>
<hr style="opacity: 0.5">

### Setup

In [5]:
# load libraries
import os
import pandas as pd
import numpy as np
import pickle
import transformers

from helper.inferencing import Inferencer

In [2]:
# set wd
# The project folder below is the default, but it only exists on the original
# author's machine. Set the MDS_THESIS_WD environment variable to point at your own
# copy of `03_electra` instead of editing this cell; all later paths are relative to it.
os.chdir(os.environ.get("MDS_THESIS_WD", "/Users/varvarailyina/hertie/mds_thesis/03_electra"))

In [4]:
# read the press release table (on sentence level)
df_sentences = pd.read_csv(
    '../data/out/df_sentences.csv',
    engine='python',
)

# unpickle the sentence text data
# (pickle can execute arbitrary code on load, so only open files you created yourself)
with open('../data/out/sentences.pkl', mode='rb') as pkl_file:
    sentences = pickle.load(pkl_file)

<hr style="opacity: 0.25">

### Apply GELECTRA model

-- *model will classify emotions: anger, fear, disgust, sadness, joy, enthusiasm, pride, hope*

In [90]:
# initialize model
# `Inferencer` is imported from the project-local `helper.inferencing` module;
# presumably it loads the pre-trained GELECTRA emotion classifier — TODO confirm there
predictor = Inferencer()

-- *TRY ON A TEST BATCH*

In [42]:
# try on a test batch: the first 1000 entries of `sentences`
test_docs = sentences[:1000]
# NOTE: `predictions` is reassigned by the full run and by the CSV load in later cells
predictions = predictor.predict_dataframe(test_docs)
# this took 290 minutes (~4.8 hours)

-- *RUN FOR ALL*

In [91]:
# run for all
# Inference is very slow (hours), and the following cell loads `predictions` from
# '../data/out/predictions.csv' anyway. So reuse that file when it already exists and
# only run the model when no saved predictions are available. Delete the CSV to force
# a fresh model run.
if os.path.exists('../data/out/predictions.csv'):
    predictions = pd.read_csv('../data/out/predictions.csv')
else:
    predictions = predictor.predict_dataframe(sentences)

In [40]:
# save predictions to .csv
# Written only when no saved file exists yet: a fresh run then persists its results
# before reloading them, while an existing file is never overwritten (e.g. by a
# smaller test batch that happens to be in memory).
if not os.path.exists('../data/out/predictions.csv'):
    predictions.to_csv('../data/out/predictions.csv', index=False)

# load predictions (always continue from the on-disk version)
predictions = pd.read_csv('../data/out/predictions.csv')

<hr style="opacity: 0.25">

### Merge predicted emotions back onto data

-- *Merge predictions onto sentence-level dataframe*

In [41]:
# The two frames are combined by row position, not by a key: row i of `predictions`
# is taken to describe row i of `df_sentences`. If `predictions` were longer than
# `df_sentences`, the concat below would silently pad the metadata with NaN rows,
# so fail loudly instead.
if len(predictions) > len(df_sentences):
    raise ValueError(
        f"predictions has {len(predictions)} rows but df_sentences only has "
        f"{len(df_sentences)}; cannot align them row-wise"
    )

# get the first len(predictions) rows from `df_sentences`
# (covers both a full run and a run on a leading test batch only)
df_subset = df_sentences.iloc[:len(predictions)].copy()

# reset index on both so they align row-wise
df_subset = df_subset.reset_index(drop=True)
predictions = predictions.reset_index(drop=True)

# merge data (column-wise concatenation on the shared 0..n-1 index)
df_all = pd.concat([df_subset, predictions], axis=1)

In [42]:
# drop duplicated second 'text' column
# `columns.duplicated()` flags every repeat of a column name after its first
# occurrence, so negating the mask keeps the first copy of each column
df_all = df_all.loc[:, ~df_all.columns.duplicated()]

In [43]:
# names of the emotion columns used in the cells below
emotion_cols = [
    'anger',
    'fear',
    'disgust',
    'sadness',
    'joy',
    'enthusiasm',
    'pride',
    'hope',
]

In [44]:
# extract emotions per row: list the emotion columns whose value equals 1.0
# The comparison is done once for the whole frame and the lists are built from the
# resulting boolean array, which avoids a slow row-by-row `apply(axis=1)`.
df_all['emotions'] = [
    [emotion for emotion, flagged in zip(emotion_cols, row_flags) if flagged]
    for row_flags in df_all[emotion_cols].eq(1.0).to_numpy()
]

<hr style="opacity: 0.25">

### Summarize emotions

-- *Aggregate emotions per press release*

In [45]:
# look at columns in `df_all`
# (printed as a plain list to check that the merged frame holds the press release
# metadata, the emotion columns and the derived `emotions` column)
print(df_all.columns.tolist())

['country_name', 'parlgov_id', 'party', 'party_name', 'party_name_english', 'family_name', 'date', 'month', 'month_start', 'month_end', 'calendar_week', 'week_start', 'week_end', 'header', 'issue_multi', 'issue_mono', 'issue', 'issue_coder2', 'position', 'position_coder2', 'cv_sample', 'issue_ridge', 'issue_super', 'text', 'sentence', 'anger', 'fear', 'disgust', 'sadness', 'joy', 'enthusiasm', 'pride', 'hope', 'emotions']


In [46]:
# columns that together identify one press release (used as grouping keys below)
group_cols = [
    'party',
    'date',
    'month',
    'issue_mono',
    'text',
]

# cast the emotion columns to float so they can be summed across sentences
df_all = df_all.astype({emotion: float for emotion in emotion_cols})

In [54]:
# aggregate to one row per press release: sum each emotion column over its sentences
# NOTE(review): groupby drops rows whose key columns contain NaN by default — confirm
# that none of the `group_cols` has missing values
df_clean = df_all.groupby(group_cols, as_index=False)[emotion_cols].sum()

-- *Include number of sentences per press release*

In [55]:
# number of sentence rows in each press release group
df_counts = (
    df_all
    .groupby(group_cols)
    .size()
    .rename('n_sentences')
    .reset_index()
)

# attach the sentence counts to the aggregated emotions
df_clean = pd.merge(df_clean, df_counts, on=group_cols)

-- *Normalize emotions*

In [56]:
# add one `<emotion>_norm` column per emotion: summed emotion count divided by the
# number of sentences in the press release
df_clean = df_clean.assign(**{
    emotion + '_norm': df_clean[emotion] / df_clean['n_sentences']
    for emotion in emotion_cols
})

-- *Calculate emotion density and shares*

In [57]:
# total emotion weight = sum over all emotion columns;
# emotion density = that total divided by the number of sentences
df_clean = df_clean.assign(
    total_emotion_mentions=lambda frame: frame[emotion_cols].sum(axis=1),
    emotion_density=lambda frame: frame['total_emotion_mentions'] / frame['n_sentences'],
)

# a high emotion_density means the press release contains many emotion mentions
# relative to its length (it does not say which emotions)

In [58]:
# add one `<emotion>_share` column per emotion: the emotion's percentage of all
# emotion mentions in the press release, rounded to 2 decimals; press releases
# without any emotion mention get 0.0 instead of a division-by-zero NaN
df_clean = df_clean.assign(**{
    emotion + '_share': (
        (df_clean[emotion] / df_clean['total_emotion_mentions'] * 100)
        .round(2)
        .where(df_clean['total_emotion_mentions'] > 0, 0.0)
    )
    for emotion in emotion_cols
})

# each `_share` value is that emotion's share (in percent) of all emotion mentions
# within one press release

In [59]:
# save `df_clean`
# (one row per `group_cols` combination, i.e. per press release; the index is not written)
df_clean.to_csv('../data/out/df_clean.csv', index=False)