### Setup: load HerBERT and the review data (the gensim word2vec model used in the augmentation section is available on [GitHub](https://github.com/sdadas/polish-nlp-resources/releases/download/v1.0/word2vec.zip))

In [1]:
import random
from dataclasses import dataclass

import numpy as np
import pandas as pd
import torch
from gensim.models import KeyedVectors

from utils import load_model, load_review_data, configure_environment, logistic_regression, augment_data

# Project helper from utils. The recorded run prints "Seed set to 42712" and
# "Device set to cuda" during this cell — presumably from this call and/or
# load_model; confirm in utils. Keep it first so later random draws are seeded.
configure_environment()
# Load the HerBERT encoder, its tokenizer, and the device chosen by load_model.
bert, bert_tokenizer, device = load_model(model_name="allegro/herbert-base-cased")
# Review dataset; later cells read its `text` and `label` columns.
reviews_df = load_review_data()

Seed set to 42712


Device set to cuda


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
def representation(txt):
    """Embed a text as the encoder's hidden state at the first token position.

    Args:
        txt: The raw text to encode.

    Returns:
        A 1-D NumPy array taken from ``last_hidden_state[0, 0, :]``, i.e. the
        final-layer vector of the first token of the single input sequence.
    """
    # truncation=True clips the input to the tokenizer's configured maximum
    # length (when one is set) instead of letting a very long review overflow
    # the encoder's position embeddings.
    encoded = bert_tokenizer(txt, return_tensors='pt', truncation=True)
    # Keep the ids on the same device as the model; this is a no-op when they
    # already match. Assumes `bert` exposes `.device` as Hugging Face models do.
    input_ids = encoded['input_ids'].to(bert.device)
    # Pure inference: skip autograd bookkeeping so no graph is built per call.
    with torch.no_grad():
        output = bert(input_ids=input_ids)
    return output.last_hidden_state.detach().cpu().numpy()[0, 0, :]

def extract_features(df):
	"""Return a copy of `df` with one embedding column per BERT dimension.

	Each row's `text` is passed through `representation`; the resulting vector
	is appended as columns named ``features.bert.<i>``. All column names are
	then split on '.' and turned into a MultiIndex, so the embeddings are
	reachable as ``result.features`` while `label` / `text` stay top-level.

	Rows are matched by position, not by index label. An index-based join
	produces a cartesian product when labels repeat (as they do once augmented
	copies share their source row's label), duplicating rows and pairing texts
	with the wrong embeddings.

	Args:
		df: DataFrame with a `text` column; the index may contain duplicates.

	Returns:
		A new DataFrame with the same rows, order and index as `df`.
	"""
	embeddings = [representation(text) for text in df["text"]]
	# np.stack cannot handle an empty list, so fall back to an empty frame.
	features = pd.DataFrame(np.stack(embeddings)) if embeddings else pd.DataFrame()
	features = features.add_prefix('features.bert.')
	# Both sides carry a fresh RangeIndex here, so concat aligns row i with row i.
	combined = pd.concat([df.reset_index(drop=True), features], axis=1)
	combined.index = df.index
	combined.columns = pd.MultiIndex.from_tuples([col.split('.') for col in combined.columns])
	return combined

In [3]:
from sklearn.model_selection import train_test_split
# Shuffled 80/20 split. No random_state is passed, so the split draws from
# NumPy's global RNG — presumably seeded by configure_environment(); verify.
train_df, test_df = train_test_split(reviews_df, test_size=0.2, shuffle=True)
# Embed each split once; test_features_df is reused by the later evaluation cells.
train_features_df = extract_features(train_df)
test_features_df = extract_features(test_df)

In [4]:
# Display the training frame: `label`, `text`, and the (features, bert, <i>) embedding columns.
train_features_df

Unnamed: 0_level_0,label,text,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features
Unnamed: 0_level_1,NaN,NaN,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert
Unnamed: 0_level_2,NaN,NaN,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
112,True,Jestem 2 dni po zabiegu i naprawde widac popra...,-0.166789,0.055120,0.040738,0.062562,0.038650,-0.191555,0.001619,0.030428,...,0.061630,0.284108,0.106787,0.163743,0.153379,-0.102755,-0.086485,0.057876,0.175473,-0.210865
365,False,Nieprofesjonalna obsługa w barze ( pozdrowieni...,0.221364,-0.029767,-0.108641,0.176262,0.096322,0.067469,-0.090967,-0.078779,...,0.011472,-0.204341,0.049946,0.091792,0.351829,-0.085705,0.089164,-0.852266,0.197400,-0.039202
339,False,"Pytania podczas laborki potrafią byc ciężkie, ...",-0.076753,-0.078704,-0.138457,0.265353,-0.016553,0.161608,0.042638,-0.196528,...,0.122220,0.571785,-0.126486,0.054753,0.216552,0.133295,-0.279513,-0.523560,0.094154,-0.447300
329,False,Po otwarciu drzwi pokoju uderzał nieprzyjemny ...,-0.204082,-0.040308,-0.048479,0.083144,-0.079823,0.166468,0.070079,0.083498,...,0.059195,-0.102907,-0.007124,0.212242,0.292041,-0.137885,-0.428366,0.086692,-0.039449,0.570365
286,False,Brak lobby i miejsc do wypoczynku.,0.198214,-0.231528,-0.159114,-0.029486,0.636929,0.149423,-0.089068,0.492712,...,0.048487,-0.893390,-0.119268,0.101796,0.220019,-0.299325,-0.456732,0.276496,-0.105577,-0.123156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,True,Pokoje te są zlokalizowane na piętrze pensjona...,0.043725,-0.020467,0.043737,0.219762,0.223317,0.273472,-0.226220,-0.408401,...,0.183351,-0.326955,-0.013456,0.384032,0.095777,-0.041340,0.287313,-0.191112,0.084220,0.213388
97,True,Wprowadza atmosferę spokoju i rzeczowości.,0.135019,-0.009740,-0.152579,0.189626,0.323342,0.124945,-0.287877,-0.365972,...,-0.216853,0.231571,-0.034450,-0.047848,0.212989,-0.165746,-0.020718,-0.154120,0.285610,-0.186035
355,False,to sa te trafne zaściankowe diagnozy.,0.190267,0.058012,-0.012369,0.163534,-0.248350,-0.255285,-0.112946,-0.038393,...,-0.028444,0.193244,-0.025528,0.276538,0.256483,-0.142741,-0.084024,-0.085960,-0.028197,-0.391466
388,False,Na pewno nikomu nie polecam tego hotelu!,-0.278896,-0.066436,0.084914,0.145937,0.137390,0.357342,0.008929,-0.064014,...,-0.101406,-0.207535,-0.029110,0.360082,0.195425,0.043408,-0.282126,0.020250,-0.107082,0.115247


In [5]:
# Baseline: fit on the un-augmented training embeddings, score on the held-out split.
# `.features.values` is the 2-D embedding matrix; `.label.values.squeeze()` flattens
# the single label column to 1-D.
# NOTE(review): the recorded run hit the solver's iteration limit (convergence
# warning in the output) — consider a higher max_iter or feature scaling inside
# utils.logistic_regression; confirm what that helper exposes.
logistic_regression(
	x_train=train_features_df.features.values,
	y_train=train_features_df.label.values.squeeze(),
	x_test=test_features_df.features.values,
	y_test=test_features_df.label.values.squeeze(),
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'train': 0.99375, 'test': 0.8125}

### Word2Vec Augmentation

In [6]:
@dataclass
class Word2Vec:
	"""Text augmentation that randomly swaps words for word2vec neighbours.

	Calling an instance on a row returns a copy of that row whose `text` has
	each whitespace-separated token replaced, with probability `noise`, by a
	neighbour sampled from `word2vec.similar_by_word(token)`. Tokens that are
	not in the vocabulary are kept as they are.

	Attributes:
		noise: Probability of replacing an in-vocabulary token. Settable via
			the constructor, e.g. ``Word2Vec(noise=0.3)``; defaults to 0.5.
		word2vec: Keyed vectors loaded once when the class is defined and
			shared by every instance.
	"""

	# Annotated so @dataclass registers it as a real field; without the
	# annotation the decorator generates no constructor parameter for it.
	noise: float = 0.5
	# Deliberately left un-annotated: stays a plain class attribute rather than
	# a per-instance dataclass field, so the vectors are loaded only once.
	word2vec = KeyedVectors.load("private/word2vec_100_3_polish.bin")

	def softmax_sample(self, data):
		"""Pick one item from `data`, weighted by the softmax of its score.

		Args:
			data: Non-empty iterable of ``(item, score)`` pairs.

		Returns:
			One of the original items, unchanged.
		"""
		# Extract items and values
		items, values = zip(*data)
		scores = np.asarray(values, dtype=float)
		# Shifting by the max leaves the probabilities unchanged but keeps
		# exp() from overflowing on large scores.
		exp_values = np.exp(scores - scores.max())
		probabilities = exp_values / exp_values.sum()
		# Sample an index rather than the item, so the item comes back as-is
		# (a plain str stays a plain str instead of a NumPy string scalar).
		chosen_index = np.random.choice(len(items), p=probabilities)
		return items[chosen_index]

	def _augment_word(self, word):
		"""Return a sampled neighbour of `word`, or `word` itself if it is kept."""
		# Membership is tested first so random.random() is only consumed for
		# in-vocabulary tokens.
		if word in self.word2vec and random.random() < self.noise:
			return self.softmax_sample(self.word2vec.similar_by_word(word))
		return word

	def __call__(self, row: pd.Series) -> pd.Series:
		"""Return an augmented copy of `row`; the caller's row is not modified.

		Args:
			row: A Series with a string `text` entry.

		Returns:
			A copy of `row` with `text` rebuilt from the (possibly replaced)
			tokens, joined by single spaces.
		"""
		augmented = row.copy()
		augmented["text"] = " ".join(self._augment_word(w) for w in row.text.split())
		return augmented

In [7]:
# Augment the training split only (the test split stays untouched).
# K=1 is forwarded to augment_data — presumably the number of augmented copies
# per source row; confirm in utils.
augmented_train_K1_df = augment_data(train_df, augmentation=Word2Vec(), K=1)
# Show the result; in the recorded output augmented rows share the index label of their source row.
augmented_train_K1_df

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


Unnamed: 0,label,text
112,True,Jestem 2 dni po zabiegu i naprawde widac popra...
112,True,Jestem 8 dni po zabiegu on naprawde widac popr...
365,False,Nieprofesjonalna obsługa w barze ( pozdrowieni...
365,False,Nieprofesjonalna automatyczny w barze oraz poz...
339,False,"Pytania podczas laborki potrafią byc ciężkie, ..."
...,...,...
355,False,to sa te trafne zaściankowe diagnozy.
388,False,Na pewno nikomu nie polecam tego hotelu!
388,False,Na szczęście nikomu jednak polecam tego hotelu!
282,False,Wydałem 150zl plus dojazd.


In [8]:
# Re-embed the augmented training set, then fit and score against the same
# held-out embeddings as the baseline so the two results are comparable.
augmented_train_K1_features_df = extract_features(augmented_train_K1_df)
# NOTE(review): the recorded run again stopped at the solver's iteration limit
# (convergence warning in the output) — confirm whether utils.logistic_regression
# exposes max_iter or scaling.
logistic_regression(
	x_train=augmented_train_K1_features_df.features.values,
	y_train=augmented_train_K1_features_df.label.values.squeeze(),
	x_test=test_features_df.features.values,
	y_test=test_features_df.label.values.squeeze(),
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'train': 1.0, 'test': 0.75}