In [1]:
import pandas as pd
import random
from dataclasses import dataclass
from utils import load_model, load_review_data, configure_environment, logistic_regression, augment_data

configure_environment()
bert, bert_tokenizer, device = load_model(model_name="allegro/herbert-base-cased")
reviews_df = load_review_data()

Seed set to 51749


Device set to cuda


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
def representation(txt):
    input_ids = bert_tokenizer(txt, return_tensors='pt')['input_ids']
    output = bert(input_ids=input_ids)
    return output.last_hidden_state.detach().cpu().numpy()[0,0,:]

def extract_features(df):
	df = df.copy().join(df.text.apply(representation).apply(pd.Series).add_prefix('features.bert.'))
	df.columns = pd.MultiIndex.from_tuples([col.split('.') for col in df.columns])
	return df

In [3]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(reviews_df, test_size=0.2, shuffle=True)
train_features_df = extract_features(train_df)
test_features_df = extract_features(test_df)

In [4]:
train_features_df

Unnamed: 0_level_0,label,text,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features
Unnamed: 0_level_1,NaN,NaN,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert
Unnamed: 0_level_2,NaN,NaN,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
35,True,"Działał mi bardzo ładnie, nie krzaczył się jak...",-0.250587,0.044173,0.118709,-0.100504,-0.102707,0.191547,-0.151845,-0.021836,...,0.096086,0.132005,-0.145207,0.288063,0.490282,-0.244557,0.085787,0.105476,0.391839,-0.558862
386,False,Badanie trwało dosłownie moment - dr nie zleci...,-0.016805,-0.081440,-0.128962,-0.088329,0.157753,-0.417212,-0.205676,0.286694,...,0.116096,-0.218194,0.193157,0.208422,0.171440,-0.022037,0.489000,0.153495,0.202705,0.037559
90,True,Jedzenie w porządku.,-0.006820,0.101888,0.068190,0.186785,0.019013,0.020581,-0.102092,-0.380311,...,0.189698,0.185457,0.002853,0.328520,0.299032,-0.085792,0.366902,-0.392819,-0.074393,0.072027
231,False,Hotel nie powinien mieć ich 5.,-0.303546,0.026752,-0.012962,0.091690,-0.090015,-0.206873,-0.112513,0.086325,...,-0.188722,-0.029026,0.038694,0.351946,0.246647,-0.070808,0.175360,0.032608,-0.004586,0.487502
177,True,Gdy się chce czegoś u niej nauczyć - nie ma pr...,-0.089130,0.024326,0.075258,0.295325,-0.572678,0.337507,0.094997,0.325756,...,-0.170109,-0.244512,-0.042971,0.043389,0.325687,-0.141715,0.124149,-0.277762,-0.142268,0.192734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,True,Polecam gorąco!,-0.207426,-0.170036,-0.033940,0.257174,-0.018600,0.456678,-0.199067,0.100050,...,-0.032488,0.116933,0.040579,0.180119,0.098833,0.139046,-0.238715,0.148369,-0.048894,0.753203
396,False,Jedyny minus to bardzo wysokie ceny jak na Pol...,-0.316551,-0.028436,-0.057008,0.002780,-0.106427,0.071419,-0.113765,0.704119,...,0.132339,0.131630,-0.077902,0.242636,0.183911,-0.331103,0.148920,-0.680917,0.067205,0.059378
194,True,"Obsługa bardzo miła, pokoje ładne i czyste.",0.274798,0.026573,0.125020,0.033270,0.343656,-0.028196,-0.310885,-0.744858,...,0.059801,0.102245,-0.145936,0.250979,0.109129,-0.181861,-0.032184,-0.619223,0.095814,0.403707
48,True,"Używam go do panasonica lumix lx7, aparat nies...",-0.195864,0.010850,0.068848,0.221020,-0.086630,0.287352,0.023075,0.294634,...,0.020690,-0.323337,-0.012109,0.190511,0.277214,-0.298410,0.181176,-0.327683,0.009478,-0.107572


In [5]:
logistic_regression(
	x_train=train_features_df.features.values,
	y_train=train_features_df.label.values.squeeze(),
	x_test=test_features_df.features.values,
	y_test=test_features_df.label.values.squeeze(),
)

{'train': 0.990625, 'test': 0.775}

### Capitalization augmentation

In [6]:
@dataclass
class RandomCapitilization:
    max_capitalization=0.3
    min_words=2
    def random_capitalize(self, text):
        words = text.split()
        num_words_to_capitalize = max(1, int(len(words) * self.max_capitalization))
        indices_to_capitalize = random.sample(range(len(words)), min(num_words_to_capitalize, self.min_words))
        
        for i in indices_to_capitalize:
            words[i] = words[i].upper()
        
        return " ".join(words)
    
    def __call__(self, row: pd.Series) -> pd.Series:
        row.text = self.random_capitalize(row.text)
        return row

#### K = 1

In [7]:
augmented_train_K1_df = augment_data(train_df, augmentation=RandomCapitilization(), K=3)
augmented_train_K1_df

Unnamed: 0,label,text
35,True,"Działał mi bardzo ładnie, nie krzaczył się jak..."
35,True,"DZIAŁAŁ mi bardzo ładnie, nie krzaczył SIĘ jak..."
35,True,"Działał MI bardzo ładnie, nie krzaczył się jak..."
35,True,"Działał MI bardzo ładnie, nie krzaczył się jak..."
386,False,Badanie trwało dosłownie moment - dr nie zleci...
...,...,...
48,True,"UŻYWAM go do panasonica lumix lx7, APARAT nies..."
215,False,Niestety ale trąbi tu chór rozanielonych użytk...
215,False,NIESTETY ale trąbi tu chór rozanielonych UŻYTK...
215,False,Niestety ale TRĄBI tu chór rozanielonych użytk...


In [8]:
augmented_train_K1_features_df = extract_features(augmented_train_K1_df)
logistic_regression(
	x_train=augmented_train_K1_features_df.features.values,
	y_train=augmented_train_K1_features_df.label.values.squeeze(),
	x_test=test_features_df.features.values,
	y_test=test_features_df.label.values.squeeze(),
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'train': 1.0, 'test': 0.8}

#### K = 3

In [9]:
augmented_train_K3_df = augment_data(train_df, augmentation=RandomCapitilization(), K=3)
augmented_train_K3_features_df = extract_features(augmented_train_K3_df)
logistic_regression(
	x_train=augmented_train_K3_features_df.features.values,
	y_train=augmented_train_K3_features_df.label.values.squeeze(),
	x_test=test_features_df.features.values,
	y_test=test_features_df.label.values.squeeze(),
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'train': 1.0, 'test': 0.8375}

### Character swap augmentation

In [10]:
@dataclass
class RandomCharacterSwaps:
    swaps = 0.1
    def character_swap(self, text):
        words = text.split()
        word_idx = random.randint(0, len(words)-1)
        word = words[word_idx]
        
        # Swap two random characters in the word
        if len(word) > 1:
            idx1, idx2 = random.sample(range(len(word)), 2)
            word = list(word)
            word[idx1], word[idx2] = word[idx2], word[idx1]
            words[word_idx] = "".join(word)
        
        return " ".join(words)
        
    def __call__(self, row: pd.Series) -> pd.Series:
        swaps = int(self.swaps * len(row.text))
        for _ in range(swaps):
            row.text = self.character_swap(row.text)
        return row

#### K=1

In [11]:
augmented_train_K1_df = augment_data(train_df, augmentation=RandomCharacterSwaps(), K=1)
augmented_train_K1_df

Unnamed: 0,label,text
35,True,"Działał mi bardzo ładnie, nie krzaczył się jak..."
35,True,"Działał mi bardzo ładnie, nei krzaczył się jak..."
386,False,Badanie trwało dosłownie moment - dr nie zleci...
386,False,Badanie trwało dosłownie moment - rd ine clize...
90,True,Jedzenie w porządku.
...,...,...
194,True,"Obsługa azrdbo miła, pokoje ładne i czyste."
48,True,"Używam go do panasonica lumix lx7, aparat nies..."
48,True,"Używam go do aanisonpca limux lx7, aparat nies..."
215,False,Niestety ale trąbi tu chór rozanielonych użytk...


In [12]:
augmented_train_K1_features_df = extract_features(augmented_train_K1_df)
logistic_regression(
	x_train=augmented_train_K1_features_df.features.values,
	y_train=augmented_train_K1_features_df.label.values.squeeze(),
	x_test=test_features_df.features.values,
	y_test=test_features_df.label.values.squeeze(),
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'train': 0.996875, 'test': 0.7875}

#### K=3

In [13]:
augmented_train_K3_df = augment_data(train_df, augmentation=RandomCharacterSwaps(), K=3)
augmented_train_K3_features_df = extract_features(augmented_train_K3_df)
logistic_regression(
	x_train=augmented_train_K3_features_df.features.values,
	y_train=augmented_train_K3_features_df.label.values.squeeze(),
	x_test=test_features_df.features.values,
	y_test=test_features_df.label.values.squeeze(),
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'train': 0.99140625, 'test': 0.8}