## MDS Thesis
#### 02. Use pre-trained German ELECTRA model (`german-nlp-group/electra-base-german-uncased`)

<br>
<hr style="opacity: 0.5">

### Setup

In [8]:
# load libraries
import pandas as pd
import numpy as np
import os
import torch
import pickle

from transformers import ElectraTokenizer, ElectraModel
from datasets import Dataset, load_from_disk
from sklearn.decomposition import PCA

In [9]:
# check wd: the relative paths used below (`../data/out/...`) are resolved against it
os.getcwd()

# uncomment to change into the scripts folder if the kernel was started elsewhere
#os.chdir("/Users/varvarailyina/hertie/mds_thesis/scripts/")

'/Users/varvarailyina/hertie/mds_thesis/scripts'

In [10]:
# load data: the sample of documents as a pandas data frame
# (loading the full `df_de` frame is currently disabled)
#df_de = pd.read_csv("../data/out/df_de.csv")
df_sample = pd.read_csv("../data/out/df_sample.csv")

# load dataset: a Hugging Face `Dataset` previously saved to disk; its `text` field is tokenized below
# NOTE(review): presumably built from the same rows, in the same order, as `df_sample` —
# the scores are later attached to `df_sample` by position, so confirm
dataset = load_from_disk("../data/out/dataset/")

In [11]:
# checkpoint shared by the tokenizer and the encoder
checkpoint = 'german-nlp-group/electra-base-german-uncased'

# load the matching tokenizer and pre-trained model weights
tokenizer = ElectraTokenizer.from_pretrained(checkpoint)
model = ElectraModel.from_pretrained(checkpoint)

# switch to evaluation mode (kept as the last expression so the module summary is displayed)
model.eval()

ElectraModel(
  (embeddings): ElectraEmbeddings(
    (word_embeddings): Embedding(32767, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): ElectraEncoder(
    (layer): ModuleList(
      (0-11): 12 x ElectraLayer(
        (attention): ElectraAttention(
          (self): ElectraSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): ElectraSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0

<hr style="opacity: 0.5">

### Tokenization and embeddings

-- *Tokenize data*

In [12]:
# keep index as id
# The guard makes this cell idempotent: without it, re-running the cell would
# insert a fresh `index` column and rename it, producing a second `id` column.
if 'id' not in df_sample.columns:
    df_sample = df_sample.reset_index().rename(columns={'index': 'id'})

print(df_sample.columns)

Index(['id', 'country_name', 'parlgov_id', 'party', 'party_name',
       'party_name_english', 'family_name', 'date', 'month', 'month_start',
       'month_end', 'calendar_week', 'week_start', 'week_end', 'header',
       'issue_multi', 'issue_mono', 'issue', 'issue_coder2', 'position',
       'position_coder2', 'cv_sample', 'issue_ridge', 'issue_super', 'text'],
      dtype='object')


In [13]:
#def tokenize_function(text):
#    return tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors='pt')

# apply tokenization and keep 'id'
#df_sample['tokenized_data'] = df_sample['text'].apply(tokenize_function)
#df_sample['id'] = df_sample.index

In [14]:
# define function to tokenize data
def tokenize_function(examples):
    """Tokenize the `text` field of the examples handed over by `Dataset.map`.

    Every text is padded or truncated to a fixed length of 512 tokens and the
    result is requested as PyTorch tensors.

    Args:
        examples: mapping with a 'text' entry holding the raw text(s).

    Returns:
        Whatever the module-level `tokenizer` returns for these settings.
    """
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors='pt',  # 'pt' = PyTorch tensors
    )
    texts = examples['text']
    return tokenizer(texts, **tokenizer_options)

# run function over the whole dataset; `batched=True` passes several examples to each call
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 8800/8800 [00:47<00:00, 184.46 examples/s]


In [141]:
# inspect the tokenized dataset (features and row count)
# NOTE(review): the recorded output (In [141], 100 rows) comes from a different run than the
# 8800-example map above — re-run this cell after a full run to refresh it
tokenized_dataset

Dataset({
    features: ['text', 'id', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

-- *Extract embeddings*

In [15]:
# define function to extract embeddings
def extract_embeddings(batch):
    """Compute one mean-pooled embedding per document in a batch.

    The pooling is weighted by the attention mask, so padding positions do not
    contribute to the average. A plain mean over the sequence axis would mix in
    the hidden states of every padding token, which matters here because all
    texts are padded to the same fixed length.

    Args:
        batch: mapping with 'input_ids' and 'attention_mask' entries
            (assumed to be (n_documents, sequence_length) — as produced by
            the tokenization step).

    Returns:
        dict with a single key 'embeddings' holding a numpy array with one
        row per document.
    """
    # `Dataset.map` hands over plain lists; `as_tensor` also accepts existing tensors
    input_ids = torch.as_tensor(batch['input_ids'])
    attention_mask = torch.as_tensor(batch['attention_mask'])

    # forward pass, no gradient calculation
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    token_states = outputs.last_hidden_state

    # add a trailing axis so the mask broadcasts over the hidden dimension
    mask = attention_mask.unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)

    # number of real tokens per document; clamp guards against division by zero
    token_counts = mask.sum(dim=1).clamp(min=1)
    pooled = summed / token_counts

    # return embeddings; convert tensors to numpy arrays
    return {'embeddings': pooled.detach().cpu().numpy()}

# run function
# Explicit small batch size: with the `Dataset.map` default of 1000 rows per batch,
# 1000 x 512 tokens would go through the model in a single forward pass.
embeddings = tokenized_dataset.map(extract_embeddings, batched=True, batch_size=32)

Map: 100%|██████████| 8800/8800 [2:16:10<00:00,  1.08 examples/s]  


In [17]:
# persist the embeddings as a pickle so the slow extraction step need not be repeated
with open('../data/out/embeddings.pkl', mode='wb') as pkl_file:
    pickle.dump(embeddings, pkl_file)

In [None]:
# display the result of the embedding step (this cell has no recorded output)
embeddings

<hr style="opacity: 0.5">

### Emotional intensity score

-- *Reduce dimensionality*

In [18]:
# stack the per-document embedding vectors into a single 2-D array, one row per document
# (iterating over `embeddings` presumably yields one example at a time, not a batch)
document_vectors = [row['embeddings'] for row in embeddings]
all_embeddings = np.vstack(document_vectors)

In [19]:
# reduce dimensionality to 10 dimensions
# Fixed seed: depending on data shape and library version PCA may use the randomized
# SVD solver, in which case the components (and the scores derived from them) would
# otherwise differ between runs.
pca = PCA(n_components=10, random_state=42)
reduced_embeddings = pca.fit_transform(all_embeddings)

In [20]:
# create scores: Euclidean (L2) norm of each row of the PCA-reduced embeddings
scores = np.linalg.norm(reduced_embeddings, axis=1)

# result: a single score for each press release, indicating its intensity based on the
# PCA-reduced embeddings (one value per row of `reduced_embeddings`)

In [21]:
# wrap the scores in a one-column frame that shares the index of `df_sample`
# (assumes `scores` is in the same row order as `df_sample` — TODO confirm)
df_scores = pd.Series(scores, index=df_sample.index, name='score').to_frame()

# attach the score column to the original data via the shared index
df_clean = df_sample.join(df_scores)

In [22]:
# save as .csv
#df_clean.to_csv("../data/out/df_clean.csv", index=False)

# load `df_clean` data
#df_clean = pd.read_csv("../data/out/df_clean.csv")

-- *Top / bottom scores*

In [23]:
# ten highest-scoring rows (sorted from highest to lowest score)
df_top10 = df_clean.sort_values('score', ascending=False).iloc[:10]

# ten lowest-scoring rows (sorted from lowest to highest score)
df_bottom10 = df_clean.sort_values('score').iloc[:10]