In [1]:
import pandas as pd
import random
from dataclasses import dataclass
from utils import load_model, load_review_data, configure_environment, logistic_regression, augment_data

# Environment setup via the project helper in utils. The recorded cell output
# reports "Seed set to 36786" and "Device set to cuda" — presumably printed by
# this call and/or load_model; confirm in utils.
configure_environment()
# Load the HerBERT checkpoint; the result is unpacked as model, tokenizer and device.
bert, bert_tokenizer, device = load_model(model_name="allegro/herbert-base-cased")
# Review dataset; later cells read its `text` and `label` columns.
reviews_df = load_review_data()

Seed set to 36786


Device set to cuda


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
def representation(txt):
    """Embed a text as the final-layer hidden state of its first token.

    Uses the module-level ``bert_tokenizer`` and ``bert`` objects.

    Args:
        txt: Text to encode.

    Returns:
        1-D NumPy array holding ``last_hidden_state[0, 0, :]`` for ``txt``
        (batch item 0, token position 0).
    """
    import torch  # local import: the notebook does not import torch at the top

    # Truncate to the tokenizer's configured maximum length so that an
    # over-long review cannot exceed the model's position embeddings.
    input_ids = bert_tokenizer(txt, return_tensors='pt', truncation=True)['input_ids']
    # Keep the ids on the same device as the model weights (no-op on CPU).
    input_ids = input_ids.to(bert.device)
    # Inference only: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        output = bert(input_ids=input_ids)
    return output.last_hidden_state.detach().cpu().numpy()[0,0,:]

def extract_features(df):
	"""Return a copy of ``df`` extended with one column per embedding dimension.

	Every value of ``df.text`` is passed through ``representation``; the
	resulting vectors are expanded into columns named ``features.bert.<i>``
	and joined onto a copy of ``df``. All column names are then split on
	``'.'`` and turned into a ``pd.MultiIndex``, so the embedding block is
	reachable as ``result.features``.

	Args:
		df: Frame with a ``text`` column.

	Returns:
		New DataFrame with MultiIndex columns; ``df`` itself is not modified.
	"""
	embeddings = df.text.apply(representation)
	feature_columns = embeddings.apply(pd.Series).add_prefix('features.bert.')
	combined = df.copy().join(feature_columns)
	# 'label' -> ['label'], 'features.bert.0' -> ['features', 'bert', '0']
	split_names = [name.split('.') for name in combined.columns]
	combined.columns = pd.MultiIndex.from_tuples(split_names)
	return combined

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for evaluation; rows are shuffled before splitting.
train_df, test_df = train_test_split(
    reviews_df,
    test_size=0.2,
    shuffle=True,
)

# Embed each split independently (train first, then test).
train_features_df, test_features_df = map(extract_features, (train_df, test_df))

In [4]:
# Display the training split together with its embedding columns.
train_features_df

Unnamed: 0_level_0,label,text,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features
Unnamed: 0_level_1,NaN,NaN,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert
Unnamed: 0_level_2,NaN,NaN,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
176,True,Zupełnie inne podejście miała Pani doktor na n...,-0.228186,0.106508,0.143653,0.100886,-0.583679,-0.448140,-0.050301,0.119306,...,-0.090929,-0.336753,0.001660,0.124735,0.266845,-0.022654,0.047777,0.132901,-0.025131,0.244655
256,False,Poprawcie to.,-0.083961,-0.134343,0.200194,0.088947,-0.261745,0.336625,-0.156988,-0.372877,...,0.118508,-0.274321,-0.071802,0.471191,0.334924,-0.040434,0.023334,0.185660,0.049927,0.074076
366,False,Niestety miałam nieprzyjemność zetknąć się z n...,0.046734,0.106501,-0.024241,0.028417,0.041316,0.191452,-0.029730,0.263827,...,0.001247,0.085410,0.080240,0.271734,0.384899,-0.104438,0.341161,-0.155862,0.273268,0.220608
95,True,Ogólnie polecam : ],-0.452650,0.011029,-0.036654,0.044954,-0.013617,0.378267,-0.142742,-0.131144,...,0.108367,0.364120,0.036408,-0.041373,0.354578,0.112237,0.489756,-0.057231,0.091037,0.545532
126,True,"Na zakończenie leczenia rzeczowa, przyjacielsk...",0.034715,-0.198251,0.032767,-0.072684,-0.170697,0.444355,-0.032781,-0.273924,...,-0.227748,0.206771,0.021383,0.229205,0.276767,-0.081775,-0.120907,-0.530573,-0.074006,0.388413
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254,False,Wydałam ok 1300zł i na szczęście znalazłam inn...,-0.033959,0.197749,0.025394,-0.165185,0.178361,-0.091009,-0.075289,-0.277863,...,0.104002,-0.579222,0.105540,0.188452,0.335603,-0.143326,-0.336182,-0.226790,0.236538,-0.020417
220,False,"Dowiedziałem się, że jeśli szczelina będzie mi...",-0.265117,0.099175,0.174028,-0.124155,0.046409,0.390323,-0.000872,-0.289533,...,0.097107,-0.050891,0.082499,0.277035,0.410913,-0.267114,-0.416551,-0.333245,0.116270,-0.105064
305,False,Wino rozcieczone woda Jedzenie monotonne.,0.227190,0.083490,-0.156883,-0.065452,0.176190,0.157179,-0.175360,-0.171695,...,0.038006,0.348740,-0.010934,0.109342,0.058157,0.005926,0.517455,-0.784506,0.005514,-0.672104
291,False,"Na moja uwagę, iż katar ostatnio trwał długo i...",0.288273,0.063857,0.017946,-0.096863,0.004496,0.331096,0.026300,-0.232358,...,0.090488,0.050552,0.075482,0.193699,0.258856,-0.076792,0.039414,-0.574275,-0.037756,0.222950


In [5]:
# Baseline: run the project's logistic_regression helper on the un-augmented
# embeddings; the recorded output is a dict of 'train' and 'test' scores.
# NOTE(review): the recorded output shows the solver stopping at its iteration
# limit; scaling the features or raising max_iter (inside utils) may be needed.
logistic_regression(
	x_train=train_features_df.features.values,
	y_train=train_features_df.label.values.squeeze(),
	x_test=test_features_df.features.values,
	y_test=test_features_df.label.values.squeeze(),
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'train': 1.0, 'test': 0.775}

### Contextual Word Embeddings

In [6]:
from transformers import pipeline

# Module-level fill-mask pipeline used by ContextualWordReplacement.
# NOTE(review): "bert-base-uncased" is an English checkpoint, while the reviews
# shown in this notebook's outputs are Polish and the features come from
# HerBERT; recorded augmentations such as ". to." suggest poor substitutions.
# Consider a Polish or multilingual masked LM (its mask token may differ from
# "[MASK]", so check the masking code when switching).
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

@dataclass
class ContextualWordReplacement:
    """Augmentation that swaps one random word for a masked-LM prediction.

    Instances are callable on a DataFrame row (a ``pd.Series`` with a ``text``
    entry), which is how they are passed to ``augment_data`` as ``augmentation``.
    """

    def contextual_word_replacement(self, text):
        """Return ``text`` with one whitespace-separated word replaced.

        A word position is drawn with ``random.randint``, that word is swapped
        for the pipeline's mask token, and the top prediction of the
        module-level ``fill_mask`` pipeline is written in its place.

        Args:
            text: Input sentence.

        Returns:
            The words re-joined with single spaces after the substitution, or
            ``text`` unchanged when it contains no words.
        """
        words = text.split()
        if not words:
            # Nothing to mask; random.randint(0, -1) would raise ValueError.
            return text
        # Ask the tokenizer for its mask token instead of hard-coding "[MASK]",
        # so the pipeline can be pointed at a checkpoint with a different token.
        mask_token = fill_mask.tokenizer.mask_token
        word_to_replace_idx = random.randint(0, len(words) - 1)
        masked_words = words.copy()
        masked_words[word_to_replace_idx] = mask_token
        predictions = fill_mask(" ".join(masked_words))
        # Predictions are used in the order returned; take the first one.
        words[word_to_replace_idx] = predictions[0]['token_str']
        return " ".join(words)

    def __call__(self, row: pd.Series) -> pd.Series:
        """Return a copy of ``row`` whose ``text`` entry has been augmented.

        The input row is left untouched so the caller's data is not mutated.
        """
        augmented = row.copy()
        augmented["text"] = self.contextual_word_replacement(row["text"])
        return augmented

2024-12-07 22:57:33.556543: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-07 22:57:33.569589: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-07 22:57:33.573088: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-07 22:57:33.584162: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
BertForMaskedLM has generative capabilities, as `prep

#### K=1

In [7]:
# Augment the training split with contextual word replacement, K=1.
# The recorded output shows each original row followed by an augmented variant
# (presumably K is the number of augmented copies per row — confirm in utils.augment_data).
augmented_train_K1_df = augment_data(train_df, augmentation=ContextualWordReplacement(), K=1)
augmented_train_K1_df

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,label,text
176,True,Zupełnie inne podejście miała Pani doktor na n...
176,True,Zupełnie inne podejście miała Pani doktor na n...
256,False,Poprawcie to.
256,False,. to.
366,False,Niestety miałam nieprzyjemność zetknąć się z n...
...,...,...
305,False,Wino rozcieczone woda Jedzenie .
291,False,"Na moja uwagę, iż katar ostatnio trwał długo i..."
291,False,"Na moja uwagę, iż katar ostatnio trwał długo i..."
365,False,Nieprofesjonalna obsługa w barze ( pozdrowieni...


In [8]:
# Re-embed the K=1 augmented training set and evaluate against the same,
# un-augmented test features used for the baseline.
augmented_train_K1_features_df = extract_features(augmented_train_K1_df)
# NOTE(review): the recorded output again shows the solver hitting its iteration limit.
logistic_regression(
	x_train=augmented_train_K1_features_df.features.values,
	y_train=augmented_train_K1_features_df.label.values.squeeze(),
	x_test=test_features_df.features.values,
	y_test=test_features_df.label.values.squeeze(),
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'train': 0.996875, 'test': 0.7375}

#### K=3

In [9]:
# Same experiment as K=1, but with K=3 passed to augment_data.
augmented_train_K3_df = augment_data(train_df, augmentation=ContextualWordReplacement(), K=3)
augmented_train_K3_features_df = extract_features(augmented_train_K3_df)
# Evaluated on the un-augmented test features.
# NOTE(review): the recorded output shows the solver hitting its iteration limit.
logistic_regression(
	x_train=augmented_train_K3_features_df.features.values,
	y_train=augmented_train_K3_features_df.label.values.squeeze(),
	x_test=test_features_df.features.values,
	y_test=test_features_df.label.values.squeeze(),
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'train': 1.0, 'test': 0.675}

### Back Translation

In [10]:
from deep_translator import GoogleTranslator

@dataclass
class BackTranslation:
    """Augmentation that round-trips text through a pivot language.

    Instances are callable on a DataFrame row (a ``pd.Series`` with a ``text``
    entry), which is how they are passed to ``augment_data`` as ``augmentation``.

    Attributes:
        language: Pivot language code handed to ``GoogleTranslator``.
        source_language: Language code of the input text; the round trip
            translates back into this language.
    """

    # Both attributes are annotated so @dataclass registers them as fields;
    # an un-annotated ``language = "en"`` is only a class attribute and
    # ``BackTranslation(language="de")`` would raise TypeError.
    language: str = "en"
    source_language: str = "pl"

    def back_translation(self, text):
        """Translate ``text`` to ``language`` and back to ``source_language``.

        Args:
            text: Text in ``source_language``.

        Returns:
            The back-translated text, or ``text`` unchanged when it is an
            empty / whitespace-only string (no translator is created then).
        """
        if isinstance(text, str) and not text.strip():
            # Nothing to translate; skip the two remote calls.
            return text
        front = GoogleTranslator(source=self.source_language, target=self.language)
        back = GoogleTranslator(source=self.language, target=self.source_language)
        return back.translate(front.translate(text))

    def __call__(self, row: pd.Series) -> pd.Series:
        """Return a copy of ``row`` whose ``text`` entry has been back-translated.

        The input row is left untouched so the caller's data is not mutated.
        """
        augmented = row.copy()
        augmented["text"] = self.back_translation(row["text"])
        return augmented


In [11]:
%%time
# Augment the training split with back translation, K=1.
# NOTE(review): this rebinds augmented_train_K1_df, which an earlier cell
# already assigned for contextual word replacement; a distinct name would make
# out-of-order re-runs safer.
# NOTE(review): the recorded wall time for this cell is about 9 minutes;
# consider caching the augmented frame.
augmented_train_K1_df = augment_data(train_df, augmentation=BackTranslation(), K=1)
augmented_train_K1_df

CPU times: user 5.08 s, sys: 470 ms, total: 5.55 s
Wall time: 9min 1s


Unnamed: 0,label,text
176,True,Zupełnie inne podejście miała Pani doktor na n...
176,True,Zupełnie inne podejście miała lekarka pełniąca...
256,False,Poprawcie to.
256,False,Proszę to poprawić.
366,False,Niestety miałam nieprzyjemność zetknąć się z n...
...,...,...
305,False,Wino rozcieńczone wodą. Jedzenie monotonne.
291,False,"Na moja uwagę, iż katar ostatnio trwał długo i..."
291,False,"Gdy zwróciłam jej uwagę, że przeziębienie trwa..."
365,False,Nieprofesjonalna obsługa w barze ( pozdrowieni...


In [12]:
# Re-embed the back-translated training set and evaluate against the same,
# un-augmented test features used for the baseline.
# NOTE(review): augmented_train_K1_features_df is also assigned by the
# contextual-replacement experiment; the value depends on which cell ran last.
augmented_train_K1_features_df = extract_features(augmented_train_K1_df)
logistic_regression(
	x_train=augmented_train_K1_features_df.features.values,
	y_train=augmented_train_K1_features_df.label.values.squeeze(),
	x_test=test_features_df.features.values,
	y_test=test_features_df.label.values.squeeze(),
)

{'train': 1.0, 'test': 0.75}