In [53]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import categories

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
# (Assign device = 'cpu' after this block to force CPU execution.)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Categories are kept only when they occur more than THRESHOLD times.
THRESHOLD = 100

# Checkpoint identifiers handed to `from_pretrained`, keyed by a short model name.
model_names = {
    "herbert-klej-cased-v1": dict(
        tokenizer="allegro/herbert-klej-cased-tokenizer-v1",
        model="allegro/herbert-klej-cased-v1",
    ),
    "herbert-base-cased": dict(
        tokenizer="allegro/herbert-base-cased",
        model="allegro/herbert-base-cased",
    ),
    "herbert-large-cased": dict(
        tokenizer="allegro/herbert-large-cased",
        model="allegro/herbert-large-cased",
    ),
}

# Load the base-cased tokenizer/encoder pair and place the encoder on `device`.
tokenizer = AutoTokenizer.from_pretrained(
    model_names["herbert-base-cased"]["tokenizer"]
)
herbert = AutoModel.from_pretrained(
    model_names["herbert-base-cased"]["model"]
)
herbert = herbert.to(device)

@torch.no_grad()
def herbert_forward(data, batch_size=256):
    """Embed texts with the module-level HerBERT model.

    The texts are processed in consecutive chunks of ``batch_size``; each chunk
    is tokenized (padded to its longest member), run through ``herbert`` and the
    model's ``pooler_output`` is collected on the CPU.

    Args:
        data: Sized, sliceable collection of strings (e.g. a list).
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A CPU tensor with one ``pooler_output`` row per input text, in input
        order. For empty ``data`` an empty tensor of shape
        ``(0, herbert.config.hidden_size)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # torch.cat([]) raises, so answer the empty case explicitly.
    if len(data) == 0:
        return torch.empty((0, herbert.config.hidden_size))

    embeddings = []
    for i in tqdm(range(0, len(data), batch_size)):
        batch = list(data[i:i + batch_size])
        tokens = tokenizer(
            batch,
            padding="longest",
            add_special_tokens=True,
            return_tensors="pt",
        )

        # Always follow `device`; when it is 'cpu' this is a no-op.
        tokens = tokens.to(device)

        # Move results off the accelerator right away to keep its memory free.
        embeddings.append(herbert(**tokens)['pooler_output'].cpu())
    return torch.cat(embeddings)


import xgboost as xgb

# 'gpu_hist' needs a CUDA device; mirror the CPU fallback already used for
# `device` so the regressor can still be trained on a machine without a GPU.
# reg_alpha is the L1 regularization weight.
model = xgb.XGBRegressor(
    tree_method='gpu_hist' if device == 'cuda' else 'hist',
    reg_alpha=1.0,
)

def train(batch_size=256, validate=False):
    """Fit the module-level ``model`` on rows of ``places.csv.gz``.

    Rows are kept when their language is ``'pl'`` and their category occurs
    more than ``THRESHOLD`` times in the whole file. Features are the HerBERT
    embedding of ``query``, the category id from ``categories.cat_id`` and the
    audit latitude/longitude; the target is ``position``.

    Args:
        batch_size: Batch size forwarded to ``herbert_forward``.
        validate: When true, print the mean squared error of the fitted model
            on the same rows it was trained on (an in-sample error).
    """
    places = pd.read_csv('places.csv.gz')

    # Build both masks on the full frame and combine them. This selects the
    # same rows as chained indexing but without pandas' reindexing warning.
    is_polish = places['language'] == 'pl'
    is_frequent = places['category'].map(places['category'].value_counts()) > THRESHOLD
    places = places[is_polish & is_frequent].reset_index(drop=True)

    X = pd.DataFrame(
        herbert_forward(list(places['query']), batch_size=batch_size).numpy()
    )
    X['category'] = places['category'].map(categories.cat_id)
    X['audit_latitude'] = places['audit_latitude']
    X['audit_longitude'] = places['audit_longitude']

    # Categories missing from categories.cat_id map to NaN. Give them a
    # dedicated id (presumably one past the known ids -- confirm against the
    # `categories` module). The result must be assigned: fillna is not in-place.
    # Missing coordinates are left as NaN for XGBoost's own missing-value handling.
    X['category'] = X['category'].fillna(len(categories.id_cat))

    y = places['position']
    model.fit(X, y)
    if validate:
        # Evaluated on the training rows, so this is not a held-out score.
        y_pred = model.predict(X)
        print(mean_squared_error(y, y_pred))




Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.sso.sso_relationship.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [50]:
from sklearn.preprocessing import OneHotEncoder

# Newer scikit-learn spells the dense-output switch `sparse_output`; older
# releases only know `sparse`. Try the current name first, then fall back.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

columns = ['language', 'category', 'query', 'position', 'audit_latitude', 'audit_longitude']
places = pd.read_csv('places.csv.gz')[columns].dropna()

# Keep Polish rows whose category occurs more than THRESHOLD times. Both masks
# are built on the same frame and combined, which selects the same rows as the
# chained indexing did but without pandas' "Boolean Series key will be
# reindexed" warning.
is_polish = places['language'] == 'pl'
is_frequent = places['category'].map(places['category'].value_counts()) > THRESHOLD
# reset_index() gives `places` a 0..n-1 index that lines up with X below
# (the previous labels are kept in an 'index' column).
places = places[is_polish & is_frequent].reset_index()

# HerBERT embedding of every query, one row per place.
X = pd.DataFrame(herbert_forward(list(places['query'])).numpy())

# One-hot encode the category; the encoder's learned labels become column names.
category_values = places['category'].values.reshape(-1, 1)
encoder.fit(category_values)
cat_df = pd.DataFrame(
    encoder.transform(category_values),
    columns=encoder.categories_[0].tolist(),
)

X = pd.concat([X, cat_df], axis=1)
X['audit_latitude'] = places['audit_latitude']
X['audit_longitude'] = places['audit_longitude']
y = places['position']


  places = places[places['language'] == 'pl'][places['category'].map(places['category'].value_counts()) > THRESHOLD]
100%|██████████| 1761/1761 [00:39<00:00, 44.68it/s]


In [54]:
# Fit the regressor on the feature matrix and targets built in the cell above.
validate = True
model.fit(X, y)

# The error below is computed on the very rows the model was fitted on,
# i.e. it is an in-sample (training) MSE rather than a held-out score.
if validate:
    y_pred = model.predict(X)
    print(mean_squared_error(y_true=y, y_pred=y_pred))

30.007705074173813


In [52]:
# Inspect the assembled feature matrix: embedding columns, one-hot category
# columns, then audit_latitude / audit_longitude.
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Wywóz odpadów komunalnych,Zakwaterowanie z wyżywieniem we własnym zakresie,Zakład fotograficzny,Zarządzanie odpadami,Złomowanie samochodów,Złomowisko,Ślusarz,Świece i znicze,audit_latitude,audit_longitude
0,-0.055108,0.039794,-0.305659,0.999815,-0.201768,-0.035143,0.998513,-0.998995,-0.999557,0.969775,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.779579,20.519410
1,0.090466,0.068071,0.162139,0.999078,0.080206,0.233418,0.998603,-0.997329,-0.998630,0.895384,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.865837,19.673775
2,0.090466,0.068071,0.162139,0.999078,0.080206,0.233418,0.998603,-0.997329,-0.998630,0.895384,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.865837,19.673775
3,0.090466,0.068071,0.162139,0.999078,0.080206,0.233418,0.998603,-0.997329,-0.998630,0.895384,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.800270,19.744841
4,-0.281585,0.308234,-0.055315,0.999063,-0.303568,-0.214196,0.934511,0.002287,-0.717030,0.917391,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.744031,19.544995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450646,0.066055,0.160120,0.034533,0.999801,0.039807,0.023200,0.999580,-0.999846,-0.999935,0.973768,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.266122,22.631937
450647,0.066055,0.160120,0.034533,0.999801,0.039807,0.023200,0.999580,-0.999846,-0.999935,0.973768,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.266122,22.631937
450648,0.066055,0.160120,0.034533,0.999801,0.039807,0.023200,0.999580,-0.999846,-0.999935,0.973768,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.166808,22.580159
450649,0.066055,0.160120,0.034533,0.999801,0.039807,0.023200,0.999580,-0.999846,-0.999935,0.973768,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.166808,22.580159


In [58]:
import numpy as np
def make_query(query_msg, category, latitude, longitude):
    """Run the fitted ``model`` on a single hand-written example.

    The feature row is laid out like the training matrix: HerBERT embedding of
    ``query_msg``, one-hot encoding of ``category`` from the fitted ``encoder``,
    then ``audit_latitude`` and ``audit_longitude``.

    Returns whatever ``model.predict`` yields for that one-row frame.
    """
    embedding_part = pd.DataFrame(herbert_forward([query_msg]).numpy())

    one_hot = encoder.transform(np.array([[category]]))
    category_part = pd.DataFrame(one_hot, columns=encoder.categories_[0].tolist())

    features = pd.concat([embedding_part, category_part], axis=1)
    # Coordinates go last, latitude before longitude.
    features['audit_latitude'] = latitude
    features['audit_longitude'] = longitude

    prediction = model.predict(features)
    return prediction

In [56]:
# Show the category labels learned by the fitted one-hot encoder; the same
# labels are used as the one-hot column names of the feature matrix.
encoder.categories_[0]

array(['Adwokat', 'Adwokat rozwodowy', 'Adwokat sądowy',
       'Agencja interaktywna', 'Agencja marketingowa',
       'Agencja nieruchomości', 'Agencja reklamowa',
       'Agencja ubezpieczeniowa', 'Agroturystyka', 'Apartamenty wczasowe',
       'Artykuły metalowe', 'Automatyka', 'Bar', 'Bistro',
       'Biuro nieruchomości', 'Biuro podatkowe', 'Biuro rachunkowe',
       'Blacharstwo samochodowe', 'Brukarstwo', 'Budowa domów',
       'Budownictwo mieszkalne', 'Catering',
       'Catering food and drink supplier', 'Centrum biznesowe',
       'Centrum medyczne', 'Centrum paintballowe', 'Centrum szkoleniowe',
       'Chirurg plastyczny', 'Czyszczenie dywanów', 'Czyszczenie kominów',
       'Czyszczenie samochodów', 'Czyszczenie tapicerki', 'Dekarz',
       'Depilacja laserowa', 'Deweloper', 'Didżej', 'Dietetyk',
       'Dorabianie kluczy', 'Doradca finansowy', 'Doradca marketingowy',
       'Doradca podatkowy', 'Doradztwo kredytowe',
       'Dostawca alarmów samochodowych', 'Dostawca bal

In [60]:
# Example prediction: query text 'kebab rynek', category 'Restauracja',
# followed by latitude and longitude.
make_query('kebab rynek', 'Restauracja', 51.109426, 17.031529)

100%|██████████| 1/1 [00:00<00:00, 32.00it/s]


array([10.454274], dtype=float32)