## MDS Thesis
#### 05 Fine-tune the German ELECTRA model

<br>
<hr style="opacity: 0.5">

### Setup

In [24]:
# load libraries
import pandas as pd
import os
import torch

from transformers import ElectraTokenizer, ElectraModel

In [16]:
# check wd
os.getcwd()

# load labels data
df_partypress = pd.read_csv("../data/in/partypress/csv/partypress.csv")
#data = data[['text', 'label']]

# load text data
df_texts = pd.read_csv("../data/in/partypress/csv/partypress_texts.csv")

In [30]:
# load tokenizer and model
tokenizer = ElectraTokenizer.from_pretrained('german-nlp-group/electra-base-german-uncased')
model = ElectraModel.from_pretrained('german-nlp-group/electra-base-german-uncased')

# set to evaluation mode
model.eval()

ElectraModel(
  (embeddings): ElectraEmbeddings(
    (word_embeddings): Embedding(32767, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): ElectraEncoder(
    (layer): ModuleList(
      (0-11): 12 x ElectraLayer(
        (attention): ElectraAttention(
          (self): ElectraSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): ElectraSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0

<hr style="opacity: 0.5">

### Pre-process data

In [17]:
print(df_partypress.index.is_unique)
print(df_texts.index.is_unique)

True
True


In [18]:
# set merging index to be 'id'
df_partypress.set_index('id', inplace=True)
df_texts.set_index('id', inplace=True)

In [19]:
# merge datasets based on id and country_name
df = df_partypress.merge(df_texts, on=['id', 'country_name'], how='inner')

In [12]:
# check var names
print(df.columns)

Index(['country_name', 'parlgov_id', 'party', 'party_name',
       'party_name_english', 'family_name', 'date', 'month', 'month_start',
       'month_end', 'calendar_week', 'week_start', 'week_end', 'header',
       'issue_multi', 'issue_mono', 'issue', 'issue_coder2', 'position',
       'position_coder2', 'cv_sample', 'issue_ridge', 'issue_super', 'text'],
      dtype='object')


In [31]:
# filter for germany
df_de = df[df['country_name'] == 'germany']

# pre-process texts df
texts = df_de['text'].dropna()
text_df = texts.to_frame()

# convert df to dataset
#text_dataset = Dataset.from_pandas(text_df)
dataset = Dataset.from_pandas(text_df)

<hr style="opacity: 0.5">

### Tokenization and embeddings

-- *Tokenize data*

In [32]:
# define function to tokenize data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512, return_tensors='pt')  # return_tensors='pt' for PyTorch

# run function
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 44950/44950 [02:03<00:00, 363.44 examples/s]


-- *Extract embeddings*

In [34]:
# define function to extract embeddings
def extract_embeddings(batch):
    # Ensure inputs are tensors; convert if necessary
    input_ids = torch.tensor(batch['input_ids'])
    attention_mask = torch.tensor(batch['attention_mask'])

    # Forward pass, no gradient calculation
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    # Return embeddings; convert tensors to numpy arrays if necessary
    return {'embeddings': outputs.last_hidden_state.mean(dim=1).detach().cpu().numpy()}

# run function
embeddings = tokenized_dataset.map(extract_embeddings, batched=True)

In [33]:
## OLD

# define function to extract embeddings
def extract_embeddings(batch):
    inputs = {k: batch[k] for k in ['input_ids', 'attention_mask']}
    with torch.no_grad():
        outputs = model(**inputs)
    return {'embeddings': outputs.last_hidden_state.mean(dim=1).numpy()}

# run function
embeddings = tokenized_dataset.map(extract_embeddings, batched=True)

Map:   0%|          | 0/44950 [00:00<?, ? examples/s]


AttributeError: 'list' object has no attribute 'size'