In [1]:
!pip install torch



In [2]:
!pip install datasets transformers==4.28.0



In [3]:
!pip install datasets



In [4]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
import torch

from transformers import BertTokenizer, AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

from torch.nn.utils.rnn import pad_sequence
# from torch.utils.data import Dataset, DataLoader
from datasets import Dataset

# torch.set_float32_matmul_precision("high")


In [5]:
print(torch.__version__)

2.0.1


In [14]:
TRAIN_DATASET_PATH = "train_supervised_dataset.csv"
TRAIN_UNSUPERVISED_DATASET_PATH = "train_unsupervised_dataset.csv"
TEST_DATASET_PATH = "test_dataset.csv"
TOKENIZER_PATH = "./tokenizer"
MODEL_PRETRAINED = "DeepPavlov/rubert-base-cased"
VAL_SPLIT_SIZE = 0.01
BATCH_SIZE = 512
NUM_WORKERS = 5


In [9]:
! pip install seqeval
from seqeval.metrics.sequence_labeling import get_entities



## Tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PRETRAINED)

In [10]:
#BIO- taggs (beginning,inside, outside)
index_to_tag = ["O", "B-GOOD", "I-GOOD", "B-BRAND", "I-BRAND", "PAD"]
tag_to_index = {tag: index for index, tag in enumerate(index_to_tag)}

In [11]:
def test_tokenize_function(df):
    """
    Tokenize without labeling
    """
    return tokenizer(df['tokens'], truncation=True, is_split_into_words=True, padding="max_length")


In [12]:
def tokenize_labeling_function(df):
    """
    Tokenize inputs and labels
    
    Returns:
            A dictionary containing 'input_ids', 'token_type_ids','attention_mask','labels'
    """
    tokens = df['tokens']
    tokenized_inputs = tokenizer(tokens, truncation=True, is_split_into_words=True, padding="max_length")
    tokens_tokenized = tokenizer.tokenize(tokens, truncation=True, is_split_into_words=True)
    goods_tokenized = tokenizer.tokenize(df['good'].lower().split(','), is_split_into_words=True) 
    brands_tokenized = tokenizer.tokenize(df['brand'].lower().split(','), is_split_into_words=True) 

    tags = apply_bio_tagging(tokens_tokenized, goods_tokenized , brands_tokenized) 
          
    labels = [tag_to_index[tag] for tag in tags]
    labels.insert(0, -100)
    labels.insert(len(labels), -100)

    tokenized_inputs['labels'] = labels

    return tokenized_inputs


def apply_bio_tagging(tokens, good, brand):
    """
    Build BIO tags(beginning,inside, outside) based on the receipt tokens and brands and goods tokens
    
    """
    tags = ['O'] * len(tokens)

    for i, token in enumerate(tokens):
        if len(good) > 0 and tokens[i:i + len(good)] == good:
            tags[i] = "B-GOOD"
            for j in range(i + 1, i + len(good)):
                tags[j] = "I-GOOD"
        if len(brand) > 0 and tokens[i:i + len(brand)] == brand:
            tags[i] = "B-BRAND"
            for j in range(i + 1, i + len(brand)):
                tags[j] = "I-BRAND"
                
    return tags

# Dataset

In [18]:
train_df = pd.read_csv(TRAIN_DATASET_PATH).fillna("")
test_df = pd.read_csv(TEST_DATASET_PATH).fillna("")

In [19]:


train_df["tokens"] = train_df["name"].str.lower().str.split()

test_df["tokens"] = test_df["name"].str.lower().str.split()
train_df, val_df = train_test_split(train_df[['name', 'good', 'brand', 'tokens']], test_size=VAL_SPLIT_SIZE)

train_df = Dataset.from_pandas(train_df)
val_df = Dataset.from_pandas(val_df)
test_df=Dataset.from_pandas(test_df)

train_df, val_df, test_df

(Dataset({
     features: ['name', 'good', 'brand', 'tokens', '__index_level_0__'],
     num_rows: 24750
 }),
 Dataset({
     features: ['name', 'good', 'brand', 'tokens', '__index_level_0__'],
     num_rows: 250
 }),
 Dataset({
     features: ['id', 'name', 'tokens'],
     num_rows: 5000
 }))

In [20]:
train_df

Dataset({
    features: ['name', 'good', 'brand', 'tokens', '__index_level_0__'],
    num_rows: 24750
})

In [21]:
len(train_df), len(val_df), len(test_df)

(24750, 250, 5000)

In [25]:
train_tokenized = train_df.map(tokenize_labeling_function )
val_tokenized = val_df.map(tokenize_labeling_function)

test_tokenized = test_df.map(test_tokenize_function)


Map:   0%|          | 0/24750 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [26]:
train_tokenized = train_tokenized.remove_columns(['name', 'good', 'brand', 'tokens', '__index_level_0__'])
val_tokenized = val_tokenized.remove_columns(['name', 'good', 'brand', 'tokens', '__index_level_0__'])
train_tokenized, val_tokenized

(Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     num_rows: 24750
 }),
 Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     num_rows: 250
 }))

# Fine-tuning the model

In [27]:
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_PRETRAINED,
    num_labels = len(index_to_tag),
    output_attentions = False,
    output_hidden_states = False)

model.config.id2label = dict(enumerate(index_to_tag))
model.config.label2id = {v: k for k,v in model.config.id2label.items()}

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initializ

In [28]:
batch_size =16
args = TrainingArguments(
    "ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
#     load_best_model_at_end= True,
    weight_decay=0.01,
    save_strategy="no",
    report_to="none",
    use_mps_device=True

)

In [29]:
from transformers import DataCollatorForTokenClassification

data_collator=DataCollatorForTokenClassification(tokenizer)


In [30]:
!pip install seqeval


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [31]:
!pip install evaluate

import evaluate

metric = evaluate.load("seqeval")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [39]:
def compute_metrics(p):
    logits, labels = p
    predictions = np.argmax(logits, axis = -1)
        # Remove ignored index (special tokens)
    true_predictions = [
        [index_to_tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [index_to_tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recal": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }



In [40]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics

)

In [41]:
for param in model.parameters():
    param.requires_grad = True

In [42]:
import logging
from transformers.trainer import logger as noisy_logger
noisy_logger.setLevel(logging.WARNING)

In [43]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recal,F1,Accuracy
1,0.0608,0.0545,0.851117,0.929539,0.888601,0.957345
2,0.0391,0.047765,0.895288,0.926829,0.910786,0.965422
3,0.0284,0.052875,0.883249,0.943089,0.912189,0.965169


TrainOutput(global_step=4641, training_loss=0.04973919477423726, metrics={'train_runtime': 2743.1201, 'train_samples_per_second': 27.068, 'train_steps_per_second': 1.692, 'total_flos': 1224454343379120.0, 'train_loss': 0.04973919477423726, 'epoch': 3.0})

In [44]:
trainer.evaluate()

{'eval_loss': 0.052874885499477386,
 'eval_precision': 0.883248730964467,
 'eval_recal': 0.943089430894309,
 'eval_f1': 0.9121887287024902,
 'eval_accuracy': 0.9651691065118627,
 'eval_runtime': 0.6793,
 'eval_samples_per_second': 368.05,
 'eval_steps_per_second': 23.555,
 'epoch': 3.0}

In [45]:
# Save model
save_directory = './pt_save_pretrained'
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

## Prediction

In [46]:
test_tokenized

Dataset({
    features: ['id', 'name', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5000
})

In [47]:
output = trainer.predict(test_tokenized)

In [48]:
from seqeval.metrics.sequence_labeling import get_entities

In [49]:
predictions =  np.argmax(output.predictions, axis = -1)
predictions

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 3, ..., 0, 0, 0],
       [0, 1, 2, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]])

In [50]:
def get_predictions(tokens_sequence, prediction):
 #   tokens_sequence = [token.replace("#", "") for token in tokens_sequence ]

    for i in range(len(prediction)):
        if (prediction[i] == 2 and prediction[i-1]==0):
            prediction [i-1]=1 
    for i in range(len(prediction)):    
        if (prediction[i] == 4 and prediction[i-1]==0):
            prediction [i-1]=3
    # 
    tags = [index_to_tag[i] for i in prediction ]
    #get positon of tag
    entities = get_entities(tags)

    #concatenate begining and inside part of words, delate "##"
    goods_pred = ','.join([' '.join(tokens_sequence[start-1:finish]) for t, start, finish in entities if t == "GOOD"])
    brands_pred = ','.join([' '.join(tokens_sequence[start-1:finish]) for t, start, finish in entities if t == "BRAND"])
    goods_pred = goods_pred.replace(" ##", "")
    brands_pred = brands_pred.replace(" ##", "")

    return goods_pred, brands_pred

In [51]:
goods, brands = [], []
#result = []
for tokens_sequence, prediction in zip(test_df['tokens'], predictions):

    tokens_tokenized = tokenizer.tokenize(tokens_sequence, truncation=True, is_split_into_words=True)
    prediction = prediction[:len(tokens_tokenized)]

    good, brand = get_predictions(tokens_tokenized, prediction)

    goods.append(good)
    brands.append(brand)


In [52]:

submission = pd.DataFrame(data={"tokens":test_df['tokens'], "id": range(len(goods)), "good": goods, "brand": brands})
submission.head(70)

Unnamed: 0,tokens,id,good,brand
0,"[469-210, ермак, клей, универсальный,, 15мл,, ...",0,клей,ермак
1,"[торт, сладушка, зимняя, вишня, 700г]",1,торт,сладушка
2,"[смеситель, ""calorie"", 1023, а06, д/кухни]",2,смеситель,calorie
3,"[лимон, 50гр, бар]",3,лимон,
4,"[коньяк, сараджишвили, 5, лет, 0,5л, грузия]",4,коньяк,сараджишвили
...,...,...,...,...
65,"[3390150569975, набор, (дезодорант, рол.75мл+гел]",65,дезодорант,
66,"[болт, нержав., м4х25, уп.10шт.]",66,болт,
67,"[п/н, angel, love, №6, шт.]",67,,angel love
68,"[1.аксессуар, туфли, 0wh144z8, 8681384400314]",68,туфли,


In [53]:
submission.to_csv("submission.csv", index=False)

     1