### Representation of Contextualized Embeddings
1. Each position in the last hidden state corresponds to the embedding of a token in the input sequence.
2. These embeddings are contextualized, meaning they encode information about the token itself as well as its relationships and dependencies with other tokens in the sequence.
3. For example, the word "bank" receives a different embedding in "river bank" than in "financial bank", because the model incorporates the surrounding words when computing each token's representation in the last hidden state.

In [None]:
import torch
import pandas as pd
from utils import load_model, load_review_data, configure_environment

# Select the compute device via the project helper. The recorded output of this cell shows
# a seed being set and "Device set to cpu" despite the "cuda" request — presumably the helper
# seeds the RNGs and falls back to CPU when CUDA is unavailable; confirm in utils.
configure_environment(device="cuda")
# HerBERT encoder + tokenizer; used below to turn a text into an embedding vector.
bert, bert_tokenizer, device = load_model(model_name="allegro/herbert-base-cased")
# papuGaPT2 loaded with causal=True; used below to score the log-likelihood of a text.
# NOTE: `device` is rebound here, so afterwards it holds whatever this second call returned.
papuga, papuga_tokenizer, device = load_model(model_name="flax-community/papuGaPT2", causal=True)
# Review data from the project helper; later cells read its `text` and `label` columns.
reviews_df = load_review_data()

Seed set to 8610


Device set to cpu


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
def representation(txt):
    """Return the HerBERT last-hidden-state vector of the first token of ``txt``.

    The text is tokenized with ``bert_tokenizer``, run through ``bert``, and the
    hidden vector at batch index 0, sequence position 0 is returned as the
    fixed-size representation of the whole text.

    Args:
        txt: A single text string to encode.

    Returns:
        1-D NumPy array with the hidden vector at position 0
        (768 values for the checkpoint loaded in this notebook).
    """
    # truncation=True caps the input at the tokenizer's configured maximum length so an
    # over-long review cannot overflow the model's position embeddings.
    encoded = bert_tokenizer(txt, return_tensors='pt', truncation=True)
    # Place the ids on whatever device the model lives on; feeding CPU ids to a
    # model on another device raises a device-mismatch error.
    input_ids = encoded['input_ids'].to(bert.device)
    # Inference only: skip autograd bookkeeping instead of building a graph and detaching.
    with torch.no_grad():
        output = bert(input_ids=input_ids)
    return output.last_hidden_state[0, 0, :].cpu().numpy()

def log_probs_from_logits(logits, labels):
    """Log-probability assigned to each label under a softmax over the last axis.

    Args:
        logits: Tensor of unnormalized scores with shape ``(..., num_classes)``.
        labels: Integer (int64) tensor of class indices with shape
            ``logits.shape[:-1]``.

    Returns:
        Tensor shaped like ``labels`` whose entries are
        ``log_softmax(logits, dim=-1)`` evaluated at the corresponding label.
    """
    logp = torch.nn.functional.log_softmax(logits, dim=-1)
    # Address the class axis as -1 rather than a hard-coded 2, so unbatched
    # (seq, vocab) inputs work as well as the batched (batch, seq, vocab) case.
    return torch.gather(logp, -1, labels.unsqueeze(-1)).squeeze(-1)

def sentence_prob(sentence_txt, suffix=" Opinia jest pozytywna."):
    """Total log-probability of ``sentence_txt + suffix`` under the causal LM ``papuga``.

    Despite the name, the result is a natural-log probability (<= 0), summed over
    every token id of the concatenated text except the first one, which is only
    used as context and is never scored itself.

    Args:
        sentence_txt: Text to score.
        suffix: Text appended to ``sentence_txt`` before scoring. Defaults to the
            Polish statement meaning "The opinion is positive."

    Returns:
        Python float: the sum of the per-token log-probabilities.
    """
    encoded = papuga_tokenizer(sentence_txt + suffix, return_tensors='pt')
    # Place the ids on whatever device the model lives on; feeding CPU ids to a
    # model on another device raises a device-mismatch error.
    input_ids = encoded['input_ids'].to(papuga.device)
    with torch.no_grad():
        logits = papuga(input_ids=input_ids).logits
        # The logits at position i predict token i + 1, so drop the last logit
        # and the first label to align predictions with their targets.
        token_log_probs = log_probs_from_logits(logits[:, :-1, :], input_ids[:, 1:])
    return token_log_probs.sum().item()

# Smoke check on one short sentence: displays the log-probability score and the embedding shape.
sentence_prob("ala ma kota"), representation("ala ma kota").shape

(-38.80006408691406, (768,))

In [3]:
def extract_features(df):
    """Return a copy of ``df`` extended with language-model features of its ``text`` column.

    Added columns:
      * ``features.bert.<i>`` — one column per component of ``representation(text)``.
      * ``features.papuga.probability`` — ``sentence_prob(text)``.

    Finally every column name is split on ``'.'`` and the parts become the levels
    of a column MultiIndex. The input frame is not modified.
    """
    # Expand each embedding vector into its own row of columns, then namespace them.
    bert_columns = df.text.apply(representation).apply(pd.Series).add_prefix('features.bert.')

    featured = df.copy().join(bert_columns)
    featured["features.papuga.probability"] = df.text.map(sentence_prob)

    # 'features.bert.0' -> ('features', 'bert', '0'); undotted names yield shorter tuples.
    featured.columns = pd.MultiIndex.from_tuples([name.split('.') for name in featured.columns])
    return featured

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for testing; the fixed random_state makes the shuffled split repeatable.
train_df, test_df = train_test_split(reviews_df, test_size=0.2, random_state=42, shuffle=True)

# Compute the BERT embedding and papuGaPT2 score for every review text in each split.
train_features_df = extract_features(train_df)
test_features_df = extract_features(test_df)


In [5]:
# Inspect the extracted training features: label, text, features.bert.* and features.papuga.probability.
train_features_df

Unnamed: 0_level_0,label,text,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features
Unnamed: 0_level_1,NaN,NaN,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,papuga
Unnamed: 0_level_2,NaN,NaN,0,1,2,3,4,5,6,7,...,759,760,761,762,763,764,765,766,767,probability
3,True,"Wypożyczalnia samochodów w firmie hotelowej, r...",-0.397389,-0.312147,0.056186,0.142620,-0.497406,1.163807,-0.103312,-0.083231,...,0.506825,-0.065348,0.255681,0.152758,-0.118751,0.277989,-0.122502,-0.009789,-0.320787,-62.358536
18,True,"Polecam ten hotel - świetny widok, super dojazd.",-0.208168,-0.021141,0.051543,0.126063,0.110358,0.305206,0.052856,-0.283542,...,0.468360,-0.024745,0.465612,0.056653,-0.000556,0.065137,0.244588,0.093530,0.404718,-57.420876
202,False,"Apartamenty znajdują się w budynkach, które po...",-0.213159,-0.214663,0.228404,0.289841,-0.148720,0.278599,0.069608,-0.246748,...,-0.038890,-0.050254,0.229985,0.074232,-0.100421,0.301730,0.017435,0.059403,0.427140,-82.344879
250,False,POZDRAWIAM PANI DOKTOR ; - /,-0.132632,0.047073,0.012956,0.115169,-0.293542,1.047164,-0.104890,-0.061957,...,0.773239,-0.040484,0.440073,0.010412,0.204703,-0.235704,-0.174118,0.060988,0.276662,-67.415474
274,False,Pokoje nie są sprzątane.,0.028177,0.019898,0.050209,0.224215,0.238290,0.050711,-0.197451,-0.415037,...,-0.318681,0.055497,0.346750,0.063330,0.027019,0.344931,-0.301065,-0.050593,0.266963,-32.901093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,True,Wielokrotnie pomagał moim dzieciom Oldze i Jus...,-0.077440,-0.058589,-0.010326,0.023113,-0.568084,0.369323,-0.064523,0.011742,...,0.334429,0.117695,0.189385,0.391590,-0.150924,0.213030,0.150356,0.086259,-0.618287,-136.673615
106,True,"Jedyny lekarz, który faktycznie szukał rozwiązań.",-0.095759,0.063175,0.011771,-0.064605,-0.393593,0.514520,0.074060,0.070047,...,0.189531,-0.031423,0.299835,0.107138,-0.126691,0.284013,-0.187929,0.192120,0.066619,-53.901554
270,False,Kolejny to problem z talerzami i sztućcami i s...,0.064061,0.143702,-0.038295,0.140301,0.332156,-0.135974,-0.097481,-0.152080,...,0.267881,0.035912,0.428588,0.278874,-0.135661,-0.123172,-0.252835,0.089340,-0.238518,-158.545105
348,False,"Znajomi zaproponowali, żebym zostawił bagaże i...",0.201128,-0.082920,0.134153,-0.005217,0.176424,0.418421,-0.101599,-0.272516,...,-0.588318,0.112465,0.249527,0.143351,-0.177011,0.529219,-0.156516,0.140669,-0.058572,-253.174667


In [6]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the BERT embedding columns only;
# the features.papuga.probability column is not used here.
train_X = train_features_df.features.bert.values
train_y = train_features_df.label.squeeze()
test_X = test_features_df.features.bert.values
test_y = test_features_df.label.squeeze()

clf = LogisticRegression(random_state=0)
clf.fit(train_X, train_y)

print('Train accuracy:', clf.score(train_X, train_y))
print('Test accuracy:', clf.score(test_X, test_y))

# Figures noted by the author, presumably from an earlier run — they do not match
# the output recorded for this cell:
#Train accuracy: 0.9348939283101683
#Test accuracy: 0.8715697036223929

Train accuracy: 1.0
Test accuracy: 0.7875
