In [None]:
!wget dataset_ozon.tsv https://raw.githubusercontent.com/ustera/brand_ner/main/dataset_ozon.tsv

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import spacy
from collections import Counter
from sklearn.model_selection import train_test_split
from spacy.tokens import DocBin
from re import sub
from string import punctuation
tqdm.pandas()

In [None]:
!python3 -m spacy download ru_core_news_lg

In [None]:
# Read the TSV from the working directory, which is where the wget cell saves
# it, instead of assuming the notebook runs under /content.
ds = pd.read_csv('dataset_ozon.tsv', sep = '\t')
# 'Unnamed: 0' is presumably a row index that was saved into the file; only
# the 'name' and 'brand' columns are used below.
ds = ds.drop(columns = ['Unnamed: 0'])

In [None]:
ds

Unnamed: 0,name,brand
0,"Игровая консоль PlayStation 5, белый",PlayStation
1,Золотой Шелк Маска для волос гиалуроновая кисл...,Золотой Шелк
2,"Игровая консоль Microsoft Xbox Series X, черный",Microsoft
3,L'Oreal Paris Infaillible 24H Fresh Wear Пудра...,L'Oreal Paris
4,Maybelline New York Lifter Gloss Блеск для губ...,Maybelline New York
...,...,...
81774,"Erborian BB Family BB карандаш, светлый, 3 г",Erborian
81775,Комплект модулей сменных фильтрующих Аквафор Р...,Аквафор
81776,Набор менструальных чаш Satisfyer Feel secure ...,Satisfyer
81777,28048 Подстилка для надувных и каркасных бассе...,Intex


In [None]:
def _pad_punctuation(text):
  """Surround punctuation with spaces, collapse whitespace runs, and strip the ends.

  Steps, in order:
    1. every punctuation character (plus one optional following space) is
       replaced by the same character with a space on either side;
    2. any run of two or more whitespace characters becomes a single space;
    3. leading and trailing whitespace is removed.
  """
  # NOTE(review): inside the character class the `\` coming from
  # `punctuation` escapes the `]` that follows it, so a literal backslash is
  # not itself matched by this pattern.
  padded = sub(f'([{punctuation}]) ?', r" \1 ", text)
  collapsed = sub(r"\s{2,}", " ", padded)
  return collapsed.strip()


# Applied to every cell of the frame, i.e. to both the titles and the brands.
ds = ds.applymap(_pad_punctuation)

In [None]:
ds

Unnamed: 0,name,brand
0,"Игровая консоль PlayStation 5 , белый",PlayStation
1,Золотой Шелк Маска для волос гиалуроновая кисл...,Золотой Шелк
2,"Игровая консоль Microsoft Xbox Series X , черный",Microsoft
3,L ' Oreal Paris Infaillible 24H Fresh Wear Пуд...,L ' Oreal Paris
4,Maybelline New York Lifter Gloss Блеск для губ...,Maybelline New York
...,...,...
81774,"Erborian BB Family BB карандаш , светлый , 3 г",Erborian
81775,Комплект модулей сменных фильтрующих Аквафор Р...,Аквафор
81776,Набор менструальных чаш Satisfyer Feel secure ...,Satisfyer
81777,28048 Подстилка для надувных и каркасных бассе...,Intex


In [None]:
# Load the Russian pipeline through its package module (the package is
# installed by the `spacy download` cell above). `nlp` is the module-level
# pipeline that make_tags uses to tokenize titles and brands.
import ru_core_news_lg
nlp = ru_core_news_lg.load()

In [None]:
def _brand_label(i, name_splt, brand_splt):
  """Return the IOB2 label for title token `i`: 'B-BRAND', 'I-BRAND' or 'O'.

  The token is compared (case-insensitively) with each brand token in order;
  the first brand token that yields a decision wins.

  Args:
    i: a token of `name_splt`.
    name_splt: the tokenized product title.
    brand_splt: the tokenized brand name.
  """
  for j in brand_splt:
    if i.lower_ != j.lower_:
      continue
    if j.i == 0:
      # For multi-word brand names, the word found only starts the brand if
      # the rest of the title continues with the rest of the brand.
      if len(brand_splt) == 1:
        return 'B-BRAND'
      if name_splt[i.i:].text.lower().startswith(brand_splt[j.i:].text.lower()):
        return 'B-BRAND'
    elif (
        name_splt[:i.i].text.lower().endswith(brand_splt[:j.i].text.lower())
        and
        name_splt[i.i:].text.lower().startswith(brand_splt[j.i:].text.lower())
      ):
      # The title text before the token ends with the preceding brand words
      # and the text from the token on continues the brand.
      return 'I-BRAND'
    # Brand names may repeat a word (e.g. "Pure Paw Paw"): only give up on
    # this token when no later brand word could still match it.
    if (j.i == len(brand_splt)-1) or (i.text not in brand_splt[j.i+1:].text.split()):
      return 'O'
  # No brand token decided the label, so the token is outside the brand.
  return 'O'


def make_tags(name, brand):
  """Append one `[sent_id, token, label]` row per title token to the global `tags`.

  Uses the module-level `nlp` pipeline to tokenize both strings and the
  module-level counter `ind` as the sentence id; `ind` is incremented once per
  call.

  Every token receives exactly one row. The label is decided per token,
  independently of whatever was appended for the previous title, and tokens
  that merely occur as a substring of the brand (without being one of its
  words) are labelled 'O' rather than being left out.

  Args:
    name: product title text.
    brand: brand name text.

  Returns:
    The global `tags` list (the same list object on every call).
  """
  global tags
  global ind
  name_splt = nlp(name)
  brand_splt = nlp(brand)
  for i in name_splt:
    tags.append([ind, i, _brand_label(i, name_splt, brand_splt)])
  ind += 1
  return tags

In [None]:
# Reset the module-level accumulators that make_tags appends to / increments,
# then tag a single example title whose brand repeats a word.
tags = []
ind = 0
print(make_tags("Pure Paw Paw бальзам с ароматом клубники", "Pure Paw Paw"))

[[0, Pure, 'B-BRAND'], [0, Paw, 'I-BRAND'], [0, Paw, 'I-BRAND'], [0, бальзам, 'O'], [0, с, 'O'], [0, ароматом, 'O'], [0, клубники, 'O']]


In [None]:
# Start from empty accumulators so the rows of the demo call above (or of a
# previous run of this cell) are not mixed into the dataset tags.
tags = []
ind = 0
# make_tags only reads token text and offsets, so the listed components are disabled.
nlp = spacy.load('ru_core_news_lg', disable=['ner', 'parser', 'tok2vec', 'attribute_ruler', 'morphologizer'])
# make_tags returns the shared `tags` list on every call; a plain loop avoids
# building (and echoing as cell output) a Series that holds one reference to
# that list per row. Rows are visited in frame order, so `ind` equals the
# row position.
for name, brand in tqdm(zip(ds['name'], ds['brand']), total=len(ds)):
  make_tags(name, brand)

In [None]:
ds_IOB2 = pd.DataFrame(tags, columns = ['sent_id', 'token', 'label'])

In [None]:
ds_IOB2

Unnamed: 0,sent_id,token,label
0,0,Игровая,O
1,0,консоль,O
2,0,PlayStation,B-BRAND
3,0,5,O
4,0,",",O
...,...,...,...
1073593,81777,см,O
1073594,81778,Трусы,O
1073595,81778,женские,O
1073596,81778,befree,B-BRAND


Before we start training spaCy, we need to convert the data into a format that spaCy can understand. For that, we need the start and end character offsets of the BRAND span in every sentence.

In [None]:
# Map sent_id -> {'brand': brand text, 'ind': [start, end]} where start/end
# are the character offsets of the tagged brand inside the title.
brand_indices = {}
sents = ds_IOB2.groupby('sent_id')

cnt = 0      # sentences whose located span length differs from the brand length
failed = []  # the sent_ids of those sentences
for sent_id, sent in tqdm(sents):
  tokens = sent['token'].tolist()
  labels = sent['label'].tolist()
  last = len(labels) - 1
  start = end = -1
  for pos, (token, label) in enumerate(zip(tokens, labels)):
    kind = label[0]
    if kind not in ('B', 'I'):
      continue
    if kind == 'B':
      # a brand begins at this token's character offset
      start = token.idx
    # The span closes on a B/I token that is the last token of the sentence
    # or is followed by an 'O' token (covers one-word and multi-word brands).
    if pos == last or labels[pos + 1][0] == 'O':
      end = token.idx + len(token)
  if start == -1 or end == -1:
    # no complete brand span was tagged in this sentence
    continue
  brand = ds.iloc[sent_id]['brand']
  if len(brand) == end - start:
    brand_indices[sent_id] = {'brand': brand, 'ind': [start, end]}
  else:
    # the located span does not have the brand's length; record it as failed
    cnt += 1
    failed.append(sent_id)

100%|██████████| 81779/81779 [00:28<00:00, 2853.18it/s]


In [None]:
# number of sentences whose located span length did not match the brand's length
# (sentences in which no brand span was found at all are skipped by the loop
# above and are not included in this count)
cnt

30

In [None]:
nlp = spacy.load('ru_core_news_lg', disable=['ner', 'parser', 'tok2vec', 'attribute_ruler', 'morphologizer'])

In [None]:
# Fixed seed so the 80/20 partition (and therefore the saved train/test
# corpora and the reported scores) is the same on every run.
train, test = train_test_split(ds, test_size=0.2, random_state=42)
print(train.shape, test.shape)
train

(65423, 2) (16356, 2)


Unnamed: 0,name,brand
70299,"СпивакЪ . Бельди Ягодное , 100 г",СпивакЪ
64831,Машина Кабриолет Barbie,Barbie
14909,Портативная игровая консоль MyPads 4 . 3 - дюй...,MyPads
17522,Электрический штопор и набор аксессуаров для в...,Xiaomi
58139,Туфли T . TACCARDI,T . TACCARDI
...,...,...
15305,Магнит плакат мотиватор на холодильник Правила...,Notta & Belle
51381,Десертный соус топпинг Bounty Dessert Sauce ( ...,Bounty
74644,Кулер для воды AEL 85C LD white / black,AEL
75912,LEGO NINJAGO Конструктор Дракон Джея 70602,LEGO


Now, for every sentence in _train_ and _test_, let's turn its stored start/end offsets into a BRAND entity span and collect the resulting documents in a `DocBin`.

In [None]:
# Collect one Doc per training title that has a located brand span.
db = spacy.tokens.DocBin()
train_id = train.index

for idx in tqdm(train_id):
  # titles without an entry in brand_indices are left out of the corpus
  if idx in brand_indices:
    # label-based scalar lookup instead of scanning the whole index with a
    # boolean mask for every row
    doc = nlp(train.at[idx, 'name'])
    start, end = brand_indices[idx]['ind']
    # start/end are the character offsets stored in brand_indices
    span = doc.char_span(start, end, label='BRAND')
    doc.ents = [span]
    db.add(doc)

100%|██████████| 65423/65423 [03:51<00:00, 282.45it/s]


In [None]:
db.to_disk('./train.spacy')

In [None]:
# Collect one Doc per test title that has a located brand span.
db = spacy.tokens.DocBin()
test_id = test.index

for idx in tqdm(test_id):
  # titles without an entry in brand_indices are left out of the corpus
  if idx in brand_indices:
    # label-based scalar lookup instead of scanning the whole index with a
    # boolean mask for every row
    doc = nlp(test.at[idx, 'name'])
    start, end = brand_indices[idx]['ind']
    # start/end are the character offsets stored in brand_indices
    span = doc.char_span(start, end, label='BRAND')
    doc.ents = [span]
    db.add(doc)

100%|██████████| 16356/16356 [00:55<00:00, 295.68it/s]


In [None]:
db.to_disk('./test.spacy')

.spacy is the format that spacy will look for when training its model. We're almost ready to start training.

Let's create custom embeddings with gensim.

In [None]:
!pip install gensim --upgrade

import gensim

In [None]:
# Whitespace-split every title into a list of words; these word lists are the
# training sentences passed to Word2Vec below.
titles = ds['name'].tolist()
sequences = [title.split() for title in titles]

In [None]:
model = gensim.models.Word2Vec(sequences, min_count=2, vector_size=200, epochs=10)

In [None]:
model.wv.most_similar('детская')

[('Кроватка', 0.7732728719711304),
 ('Горка', 0.7556986212730408),
 ('Шапочка', 0.7531970143318176),
 ('кроватка', 0.7494092583656311),
 ('Каталка', 0.748365581035614),
 ('Koala', 0.7479047775268555),
 ('RH301', 0.7405040860176086),
 ('Детская', 0.7387740015983582),
 ('KIDS', 0.733460545539856),
 ('Itikka', 0.7307717204093933)]

In [None]:
model.wv.most_similar('кроватка')

[('Woodlines', 0.89400315284729),
 ('Кроватка', 0.890350341796875),
 ('Bonne', 0.8860921263694763),
 ('приставная', 0.8818628787994385),
 ('Menthol', 0.8629642724990845),
 ('VDK', 0.8626813888549805),
 ('ванна', 0.8592925667762756),
 ('Boom', 0.8585990071296692),
 ('Buggy', 0.8571462035179138),
 ('Clouds', 0.8517149686813354)]

In [None]:
model.wv.similarity('Apple','Samsung')

0.77490693

In [None]:
model.wv.save_word2vec_format('embeddings.txt', binary=False)

NOW we're ready to start training spacy. Initialize vectors.

In [None]:
!python -m spacy init vectors ru embeddings.txt spacy_embeddings --name title_vectors

2023-03-23 17:46:09.945834: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-23 17:46:09.946049: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-23 17:46:12.262772: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;4mℹ Creating blank nlp object for language 'ru'[0m
[2023-03-23 17:46:13,265] [INFO] Reading vectors from embeddings.txt
42772it [00:02, 18372.05it/s]
[2023-03-23 17:46:15,599] [INFO] Loaded vectors

In [None]:
# Load the vectors-only pipeline, attach an untrained NER component and save
# it back to the same folder.
nlp = spacy.load('spacy_embeddings')
# The pipeline is written back to the folder it was loaded from, so on a
# re-run 'ner' is already present; adding it a second time would fail.
if 'ner' not in nlp.pipe_names:
  nlp.add_pipe('ner')
nlp.to_disk('spacy_embeddings')

At this point, spacy creates a 'spacy_embeddings' folder in the working directory -- find the config.cfg file in it and tweak some arguments, if you want to. We changed the default learning rate to 0.0001.

In [None]:
!python -m spacy train spacy_embeddings/config.cfg --output spacy_model --paths.train train.spacy --paths.dev test.spacy

2023-03-23 18:46:52.878486: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-23 18:46:52.878608: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-23 18:46:54.904014: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;4mℹ Saving to output directory: spacy_model[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-03-23 18:46:55,724] [INFO] Set up nlp object from config
[2023-03-23 18:46:55,741] [INFO] Pipeline: ['ner']
[2023-

In [None]:
fin = spacy.load('spacy_model/model-best')

In [None]:
test_sentences = ["Крем для рук Чистая линия",
                  "Крем для рук чистая линия",
                  "Чистая линия крем для рук",

                  "Блендер '6 в 1' Kitfort КТ-3078, 2 скорости и импульсный режим, для взбивания, смешивания и колки льда",
                  "Ноутбук Lenovo ThinkPad E14 Gen 2 (20TA002JRT)",
                  "Organic Shop Organic Hand Soap мыло для рук",
                  "Minimal!st Серьги",
                  "Шоколадные конфеты Snickers Minis Белый, 2.9 кг / белый шоколад, нуга, карамель, арахис, шоколад", 

                  "Сок Добрый Апельсин",
                  "кровать-тумба раскладная \"Карина\", цвет орех Удачная Мебель",
                  "Дезодорант мужской Деоника Антиперспирант PROpharma Active men, спрей - 150 мл",
                  "Умная колонка Яндекс Станция Мини без часов с голосовым помощником Алиса, черный оникс, 10Вт",
                  "Ортопедическая подушка с эффектом памяти 60х40х11/13 см \"Просто Подушка\" №9 валики мягкая",
                  "Кольцо из золота с фианитами яхонт Ювелирный Арт. 240091" ,

                  "Пастила Белёвская классическая без сахара Зелёная Линия, 100г",
                  "Пицца 4 сыра Маркет Перекрёсток, 350г",
                  
                  "Туфли Мэри Джейн Marco Tozzi, размер 41, черный",
                  "Туфли Mary Jane Marco Tozzi, размер 41, черный",
                  "Туфли Мэри Джейн Марко Тоцци, размер 41, черный",
                  "Туфли MARY JANE MARCO TOZZI, размер 41, черный",
                  "Туфли MARY JANE Marco Tozzi, размер 41, черный",
                  "Туфли Mary Jane MARCO TOZZI, размер 41, черный",]
                  
# Print each raw sentence followed by the entities the trained model finds in
# its normalized form.
for sent in test_sentences:
  print(sent)
  # Same normalization as the dataset received: pad punctuation with spaces,
  # collapse whitespace runs, strip the ends.
  padded = sub(f'([{punctuation}]) ?', r" \1 ", sent)
  normalized = sub(r"\s{2,}", " ", padded).strip()
  for ent in fin(normalized).ents:
    print(ent.text," : ", ent.label_)
  print('\n')

Крем для рук Чистая линия
Чистая линия  :  BRAND


Крем для рук чистая линия


Чистая линия крем для рук
Чистая линия  :  BRAND


Блендер '6 в 1' Kitfort КТ-3078, 2 скорости и импульсный режим, для взбивания, смешивания и колки льда
Kitfort  :  BRAND


Ноутбук Lenovo ThinkPad E14 Gen 2 (20TA002JRT)
Lenovo  :  BRAND


Organic Shop Organic Hand Soap мыло для рук
Organic Shop  :  BRAND


Minimal!st Серьги
Minimal ! st  :  BRAND


Шоколадные конфеты Snickers Minis Белый, 2.9 кг / белый шоколад, нуга, карамель, арахис, шоколад
Snickers  :  BRAND


Сок Добрый Апельсин


кровать-тумба раскладная "Карина", цвет орех Удачная Мебель
Удачная Мебель  :  BRAND


Дезодорант мужской Деоника Антиперспирант PROpharma Active men, спрей - 150 мл
Деоника  :  BRAND


Умная колонка Яндекс Станция Мини без часов с голосовым помощником Алиса, черный оникс, 10Вт
Яндекс  :  BRAND


Ортопедическая подушка с эффектом памяти 60х40х11/13 см "Просто Подушка" №9 валики мягкая


Кольцо из золота с фианитами яхонт Ювел

In [None]:
!zip -r /content/spacy_model.zip /content/spacy_model

  adding: content/spacy_model/ (stored 0%)
  adding: content/spacy_model/model-last/ (stored 0%)
  adding: content/spacy_model/model-last/meta.json (deflated 54%)
  adding: content/spacy_model/model-last/vocab/ (stored 0%)
  adding: content/spacy_model/model-last/vocab/lookups.bin (stored 0%)
  adding: content/spacy_model/model-last/vocab/vectors.cfg (stored 0%)
  adding: content/spacy_model/model-last/vocab/vectors (deflated 45%)
  adding: content/spacy_model/model-last/vocab/strings.json (deflated 86%)
  adding: content/spacy_model/model-last/vocab/key2row (stored 0%)
  adding: content/spacy_model/model-last/config.cfg (deflated 59%)
  adding: content/spacy_model/model-last/tokenizer (deflated 84%)
  adding: content/spacy_model/model-last/ner/ (stored 0%)
  adding: content/spacy_model/model-last/ner/model (deflated 8%)
  adding: content/spacy_model/model-last/ner/moves (deflated 42%)
  adding: content/spacy_model/model-last/ner/cfg (deflated 33%)
  adding: content/spacy_model/model-b