# Extracting hidden-state representations from the Hungarian BERT model (huBERT)

In [None]:
from google.colab import drive

# Expose Google Drive inside the Colab runtime; the CSV loaded later lives under this mount.
drive_root = '/content/drive'
drive.mount(drive_root)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Tab-separated token table stored on the mounted Drive.
file_path = '/content/drive/My Drive/Colab Notebooks/Experiments_NLP/raw_input.csv'
df = pd.read_csv(file_path, sep='\t')

# Show the first rows to confirm that the columns were parsed.
preview = df.head()
print(preview)

   Unnamed: 0        Word       Lemma    POS Case Number  Person Tense Mood  \
0           0           A           a    DET  NaN    NaN     NaN   NaN  NaN   
1           1  csatkarika  csatkarika   NOUN  Nom   Sing     NaN   NaN  NaN   
2           2    felirata     felirat   NOUN  Nom   Sing     NaN   NaN  NaN   
3           3           a           a    DET  NaN    NaN     NaN   NaN  NaN   
4           4           "           "  PUNCT  NaN    NaN     NaN   NaN  NaN   

  Voice  Form-Lemma Difference                   Subtokens  \
0   NaN                      0                       ['A']   
1   NaN                      0  ['csat', '##kar', '##ika']   
2   NaN                      1          ['felirat', '##a']   
3   NaN                      0                       ['a']   
4   NaN                      0                       ['"']   

         Subtoken IDs  Subtoken Start Index  Subtoken End Index  \
0              [2038]                     0                   0   
1  [3487, 7268, 29

In [None]:

import torch
import numpy as np
import pickle
from transformers import BertTokenizer, BertModel

# Tokenizer and encoder come from the same checkpoint; name it once.
MODEL_NAME = "SZTAKI-HLT/hubert-base-cc"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# output_hidden_states=True makes the forward pass return the hidden states
# that the extraction function reads; the model is placed on the GPU right away.
model = BertModel.from_pretrained(MODEL_NAME, output_hidden_states=True).to('cuda')

# Switch to evaluation mode before running inference.
model.eval()


def extract_bert_hidden_representations_by_class(df, output_file="representations_class.pkl", model=model, tokenizer=tokenizer, class_label='Accusative', marker_column='Case'):
    """Collect hidden states of the target token for rows belonging to one class.

    A reproducible 50% sample of ``df`` is drawn (``random_state=42``). For
    every sampled row whose ``marker_column`` equals ``class_label``, the row's
    sentence is run through ``model`` and the vector at the position of the
    word's last subtoken is taken from every entry of ``outputs.hidden_states``.

    Args:
        df: DataFrame with one row per token. The columns read here are
            'Sentence', 'Word', 'Lemma', 'Case', 'Number', 'POS',
            'Subtoken End Index' and ``marker_column``.
        output_file: Path of a pickle file opened in append mode; one record
            is dumped per kept token, so repeated calls keep growing the file.
        model: Encoder called as ``model(**encoded_input)``; it must return an
            object with a ``hidden_states`` attribute.
        tokenizer: Tokenizer matching ``model``; it must expose ``pad_token_id``.
        class_label: Value that ``row[marker_column]`` must equal for the row
            to be kept. NOTE(review): the label flags below compare 'Case'
            against abbreviations such as 'Acc', so the default 'Accusative'
            probably matches no row -- pass the abbreviation explicitly.
        marker_column: Column compared against ``class_label``.

    Returns:
        list[dict]: One record per kept token, in sampling order, holding the
        word, lemma, sentence, the per-layer vectors (NumPy arrays) and 0/1
        flags for the morphological classes.
    """
    # Follow the device the model already lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    # Use half of the dataset; the fixed seed keeps the sample reproducible.
    df_half = df.sample(frac=0.5, random_state=42)

    representations = []  # Everything that is also written to output_file

    for _, row in df_half.iterrows():
        # Filter before touching the model: rows of other classes are never
        # stored, so tokenizing and encoding them would be wasted work.
        if not (row[marker_column] == class_label):
            continue

        sentence = row['Sentence']
        encoded_input = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512, add_special_tokens=True)
        encoded_input = {key: value.to(device) for key, value in encoded_input.items()}

        with torch.no_grad():
            outputs = model(**encoded_input)
        all_hidden_states = outputs.hidden_states

        # Number of real subtokens: non-padding positions minus the two
        # special tokens added around the sentence.
        tokenized_length = int((encoded_input['input_ids'] != tokenizer.pad_token_id).sum(dim=1).item()) - 2

        # 0-based position of the word's last subtoken among the real subtokens.
        subtoken_end_idx = int(row['Subtoken End Index'])

        # Valid positions are 0 .. tokenized_length - 1. Using ">=" (not ">")
        # matters: an index equal to tokenized_length would otherwise pass and
        # the "+ 1" below would read the trailing special token instead.
        if subtoken_end_idx >= tokenized_length:
            print(f"Skipping token '{row['Word']}' in sentence '{row['Sentence']}' because subtoken_end_idx exceeds tokenized length.")
            torch.cuda.empty_cache()  # Clear GPU memory
            continue

        # "+ 1" skips the special token placed at the start of the sequence.
        token_position = subtoken_end_idx + 1
        token_representations_all_layers = [
            layer_output[0, token_position, :].cpu().numpy()
            for layer_output in all_hidden_states
        ]

        representation = {
            'Word': row['Word'],
            'Lemma': row['Lemma'],
            'Sentence': row['Sentence'],
            'Hidden Representations (All Layers)': token_representations_all_layers,
            'Accusative': int(row['Case'] == 'Acc'),
            'Genitive': int(row['Case'] == 'Gen'),
            'Dative': int(row['Case'] == 'Dat'),
            'Sublative': int(row['Case'] == 'Sbl'),
            'CausalFinal': int(row['Case'] == 'Cau'),
            'Translative': int(row['Case'] == 'Tra'),
            'Plural': int(row['Number'] == 'Plur'),
            'VerbConjugation': int(row['POS'] == 'VERB')
        }

        # Append immediately so partial results survive an interrupted run.
        with open(output_file, 'ab') as f:
            pickle.dump(representation, f)

        representations.append(representation)

        print(f"Processed and saved representation for token '{row['Word']}' in sentence: '{row['Sentence']}'")

        # Clear GPU memory after each processed sentence
        torch.cuda.empty_cache()

    return representations





In [None]:
# Report how many tokens were collected per class: the count first, then its label.
for class_tag, collected in (
    ('gen', representations_gen),
    ('acc', representations_acc),
    ('dat', representations_dat),
    ('subl', representations_subl),
    ('tran', representations_tra),
    ('plural', representations_plur),
    ('conj', representations_conj),
    ('cau', representations_cau),
):
    print(len(collected))
    print(class_tag)

154
gen
1023
acc
142
dat
312
subl
49
tran
1617
plural
1905
conj
75
cau


In [None]:
import pickle
from google.colab import files

# Function to save a representation using pickle and download
def save_and_download_representation(rep, filename):
    """Pickle ``rep`` to ``<filename>.pkl`` and hand that file to ``files.download``.

    Args:
        rep: Object to serialise with ``pickle.dump``.
        filename: Output name without extension; '.pkl' is appended.
    """
    pickle_path = f'{filename}.pkl'
    with open(pickle_path, 'wb') as sink:
        pickle.dump(rep, sink)
    files.download(pickle_path)

# Write every collected list to its own pickle and trigger the browser download.
for target_name, collected in [
    ('representations_gen', representations_gen),
    ('representations_acc', representations_acc),
    ('representations_dat', representations_dat),
    ('representations_subl', representations_subl),
    ('representations_cau', representations_cau),
    ('representations_tra', representations_tra),
    ('representations_plur', representations_plur),
    ('representations_conj', representations_conj),
]:
    save_and_download_representation(collected, target_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:

# Arguments that are identical for every extraction run.
# Every run appends to the same pickle file.
shared_kwargs = dict(output_file="representations_class.pkl", model=model, tokenizer=tokenizer)

# Case-marked classes
representations_gen = extract_bert_hidden_representations_by_class(df, class_label='Gen', marker_column='Case', **shared_kwargs)
representations_acc = extract_bert_hidden_representations_by_class(df, class_label='Acc', marker_column='Case', **shared_kwargs)
representations_dat = extract_bert_hidden_representations_by_class(df, class_label='Dat', marker_column='Case', **shared_kwargs)
representations_subl = extract_bert_hidden_representations_by_class(df, class_label='Sbl', marker_column='Case', **shared_kwargs)
representations_cau = extract_bert_hidden_representations_by_class(df, class_label='Cau', marker_column='Case', **shared_kwargs)
representations_tra = extract_bert_hidden_representations_by_class(df, class_label='Tra', marker_column='Case', **shared_kwargs)

# Classes selected through other columns
representations_plur = extract_bert_hidden_representations_by_class(df, class_label='Plur', marker_column='Number', **shared_kwargs)
representations_conj = extract_bert_hidden_representations_by_class(df, class_label='VERB', marker_column='POS', **shared_kwargs)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
A tiltólistára kerülésre egy új kislemez lett a válasz .	kerülésre	2	Sub
Miley nemrég költözött el Tennessee-ből és most Malibu-ban hódít tehetségével .	tehetségével	9	Ins
ez a kérdés bizonyára napirendre kerül majd a pénzügyi terv felülvizsgálatakor .	felülvizsgálatakor	10	Tem
Egy , az alapítás óta a szervezetnél lévő tag a fennálló helyzetet a Jobbik 2010 előtti kommunikációjához hasonlította .	szervezetnél	6	Ade
Miért és mivel küzd ma egy vidéki egyetem , és hol a jövő a hallgatók számára a fővárostól egy aránylag messzi távolságban .	távolságban	21	Ine
Egy effektet , a lábdobot vagy más hangzást kiválasztani és keverni csak minőségi felszereléssel lehet .	hangzást	7	Acc
Különösen veszélyeztetettek az alkohol- és drogfogyasztók , de a számítógép- vagy játékgép-függő fiatalok , valamint a nagyon szoros párkapcsolatba bevonódó személyek egyaránt .	párkapcsolatba	18	Ill
Egyébként a 99%-ot nem a cigányság összlakossághoz v