In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

from transformers.trainer_callback import EarlyStoppingCallback

import torch
from torch.utils.data import Dataset, DataLoader


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [None]:
# Load the train / validation / test splits from CSV.
# The paths are relative, so they resolve against the notebook's working directory.
df = pd.read_csv('../../data/multilabel_data/publication2/train.csv')
df_val = pd.read_csv('../../data/multilabel_data/publication2/val.csv')
df_test = pd.read_csv('../../data/multilabel_data/publication2/test.csv')


In [None]:
df.head()


Unnamed: 0,text,offline_crime,online_crime,drugs,gambling,pornography,prostitution,slavery,suicide,terrorism,weapons,body_shaming,health_shaming,politics,racism,religion,sexual_minorities,sexism,social_injustice
0,Убийства и мы все знаем что убийца там ☝️,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,...а потом граждане возмущаются что ктото кое ...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"Да преступление не тяжкое, могут под домашний ...",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Не льсти себе: Вот моя бывшая вообще мило пост...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Стать правителем и посадить их всех в тюрьму. ...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Every column after the first one ('text') is a topic label column.
necessary_columns = df.columns[1:].tolist()
necessary_columns


['offline_crime',
 'online_crime',
 'drugs',
 'gambling',
 'pornography',
 'prostitution',
 'slavery',
 'suicide',
 'terrorism',
 'weapons',
 'body_shaming',
 'health_shaming',
 'politics',
 'racism',
 'religion',
 'sexual_minorities',
 'sexism',
 'social_injustice']

In [None]:
def get_labels(dataframe, columns=None):
    """Build one comma-joined label string per row of ``dataframe``.

    For every row, the names of the label columns whose value equals 1 are
    joined with ``','`` in column order. A row with no label column set to 1
    is labelled ``"none"``.

    Args:
        dataframe: pandas DataFrame containing the label columns.
        columns: iterable of label column names to inspect. Defaults to the
            notebook-level ``necessary_columns`` when omitted.

    Returns:
        A list of strings, one per row, in row order.
    """
    if columns is None:
        columns = necessary_columns
    columns = list(columns)

    # One vectorised comparison for the whole frame instead of a Python-level
    # lookup per (row, column) pair via iterrows().
    is_set = dataframe[columns].to_numpy() == 1

    labels = []
    for row_flags in is_set:
        active = [name for name, flag in zip(columns, row_flags) if flag]
        labels.append(','.join(active) if active else "none")
    return labels
# One comma-joined label string per row, for each of the three splits.
train_labels = get_labels(df)
val_labels = get_labels(df_val)
test_labels = get_labels(df_test)


In [None]:
len(set(test_labels))


197

In [None]:
# Two-column frames per split: the raw text and its comma-joined label string.
df_train_adjusted = pd.DataFrame(dict(text=df['text'].tolist(), labels=train_labels))
df_val_adjusted = pd.DataFrame(dict(text=df_val['text'].tolist(), labels=val_labels))
df_test_adjusted = pd.DataFrame(dict(text=df_test['text'].tolist(), labels=test_labels))


In [None]:
# Give every distinct label combination a contiguous integer class id.
# 'none' is pinned to id 0; the remaining ids follow first-seen order across
# the train, test and validation label lists (in that order).
# NOTE(review): ids are also allocated for combinations that occur only in
# test/val, i.e. classes with no training examples — confirm this is intended.
mapping = {'none': 0}

for label in train_labels + test_labels + val_labels:
    # len(mapping) is evaluated before the insert, so an unseen combination
    # receives the next free id and a known one is left untouched.
    mapping.setdefault(label, len(mapping))


In [None]:
len(mapping)


393

In [None]:
# Translate each label string into its integer class id (KeyError on an unknown combination).
df_train_adjusted['class'] = [mapping[combo] for combo in df_train_adjusted['labels']]
df_test_adjusted['class'] = [mapping[combo] for combo in df_test_adjusted['labels']]
df_val_adjusted['class'] = [mapping[combo] for combo in df_val_adjusted['labels']]


In [None]:
df_val_adjusted


Unnamed: 0,text,labels,class
0,Требуют забрать заявление об увольнении с угрозой,offline_crime,1
1,"да расстреляют на месте, за сопротивление",offline_crime,1
2,Белорусы в форме омона бьют белорусов в штатском,offline_crime,1
3,"Одного нужно захватить, чтобы узнать мотивы со...",offline_crime,1
4,сел в тюрьму за кражу глазурованного сырка,offline_crime,1
...,...,...,...
687,"Бляяяяяя, он опять порвался, что совок унижают",none,0
688,Кормилица И в пир и в мир и в добрые люди,none,0
689,"Очень интересная статья, про развертки даже я ...",none,0
690,Бан блять кроль за такую хуйню СХС КРОЛ,none,0


In [None]:
# Validation label strings as a plain list.
# NOTE(review): not referenced again in the visible cells — possibly dead code.
labels_val = df_val_adjusted['labels'].tolist()


In [None]:
# Plain Python lists per split: texts (x_*) and integer class ids (y_*).
x_train, y_train = df_train_adjusted['text'].tolist(), df_train_adjusted['class'].tolist()
x_test, y_test = df_test_adjusted['text'].tolist(), df_test_adjusted['class'].tolist()
x_val, y_val = df_val_adjusted['text'].tolist(), df_val_adjusted['class'].tolist()


In [None]:
class UnsafeData(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Each item is a dict holding every field returned by the tokenizer as a
    ``torch.long`` tensor, plus a ``'labels'`` entry with the target for that
    index (also ``torch.long``).
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        """Store the inputs; nothing is tokenized until an item is requested.

        Args:
            texts: indexable collection of input texts.
            targets: indexable collection of targets, parallel to ``texts``.
            tokenizer: callable invoked as
                ``tokenizer(text, truncation=True, max_length=..., padding='max_length')``
                and returning a mapping of field name to values.
            max_len: length passed to the tokenizer as ``max_length``.
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.targets = targets

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize the text at ``index`` and return it together with its label."""
        encoded = self.tokenizer(
            self.texts[index],
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
        )

        item = {}
        for field, values in encoded.items():
            item[field] = torch.tensor(values).long()
        item['labels'] = torch.tensor(self.targets[index]).long()
        return item


In [None]:
# Identifier of the pretrained checkpoint passed to `from_pretrained` for both tokenizer and model.
model_name = 'DeepPavlov/rubert-base-cased-conversational'


In [None]:
# Load the tokenizer and a BERT sequence classifier from the same checkpoint.
# The classifier gets one output class per entry in `mapping`, i.e. one class
# per distinct label combination rather than one per topic.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels = len(mapping))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!nvidia-smi


Fri Mar 19 12:36:46 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  Off  | 00000000:1A:00.0 Off |                  N/A |
| 27%   27C    P8    11W / 260W |      3MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:1B:00.0 Off |                  N/A |
| 27%   26C    P8     9W / 260W |      3MiB / 11019MiB |      0%      Default |
|       

In [None]:
# Prefer GPU index 3 (the original choice) when the host exposes it; otherwise
# fall back to the first GPU, and to CPU when CUDA is unavailable, so the
# following `model.to(device)` does not fail on smaller machines.
# NOTE(review): Trainer also performs its own device placement — confirm this
# manual placement is still needed.
device = torch.device(
    "cuda:3" if torch.cuda.device_count() > 3
    else "cuda:0" if torch.cuda.is_available()
    else "cpu"
)


In [None]:
# Move the model to the selected device; the trailing ';' suppresses the cell's output.
model.to(device);


In [None]:
# Wrap each split in an UnsafeData dataset. With max_len = 60 every text is
# truncated or padded to 60 tokens when an item is fetched.
train_dataset = UnsafeData(x_train, y_train, tokenizer, max_len = 60)
test_dataset = UnsafeData(x_test, y_test, tokenizer, max_len = 60)
val_dataset = UnsafeData(x_val, y_val, tokenizer, max_len = 60)


In [None]:
len(train_dataset), len(test_dataset), len(val_dataset)


(31130, 1481, 692)

In [None]:
val_dataset[10]


{'input_ids': tensor([  101,  2270, 47970,   994,   846,  2181,   132,   458,  2396,  7370,
          1536,  1967,   838,  3005,   132,  1235,   322, 19121,   322, 28114,
           846,  2181,   132, 75832,   371,   801,  5827,   130,  1064,   802,
          7134,   322, 37442,   846,  2181,  1981,  6080,   132,  1190,  4302,
           340, 11728,  1143,  2838,  1088, 11757,   132,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor(1)}

In [None]:
# Names of the individual topics used for the multi-hot evaluation vectors:
# the label columns in their original order, followed by 'none'.
topics_list = necessary_columns + ['none']
topics_list


['offline_crime',
 'online_crime',
 'drugs',
 'gambling',
 'pornography',
 'prostitution',
 'slavery',
 'suicide',
 'terrorism',
 'weapons',
 'body_shaming',
 'health_shaming',
 'politics',
 'racism',
 'religion',
 'sexual_minorities',
 'sexism',
 'social_injustice',
 'none']

In [None]:
# Inverse of `mapping`: integer class id -> comma-joined label combination.
target_vaiables_id2topic_dict = {class_id: combo for combo, class_id in mapping.items()}


In [67]:
import json
# Persist the class-id -> label-combination lookup next to the notebook.
# NOTE: JSON object keys are always strings, so the integer ids are written as
# strings and must be converted back to int after loading this file.
with open("id2topic.json","w") as f:
    json.dump(target_vaiables_id2topic_dict, f, indent = 2)
    

In [None]:
len(target_vaiables_id2topic_dict)


393

In [None]:
def adjust_multilabel(y, is_pred = False, id2topic = None, topics = None):
    """Expand class ids (or per-class score rows) into multi-hot topic vectors.

    Each class id stands for a comma-joined combination of topic names. The
    result has one 0/1 list per sample, with a 1 at the position of every
    topic contained in that sample's combination.

    Args:
        y: iterable of class ids when ``is_pred`` is false, or an iterable of
            per-class score rows when ``is_pred`` is true (the argmax of each
            row is taken as the class id).
        is_pred: whether ``y`` holds score rows rather than class ids.
        id2topic: mapping from class id to comma-joined topic names. Defaults
            to the notebook-level ``target_vaiables_id2topic_dict``.
        topics: ordered list of topic names defining the vector layout.
            Defaults to the notebook-level ``topics_list``.

    Returns:
        A list of lists of ints (0/1), each of length ``len(topics)``.

    Raises:
        KeyError: if a class id is missing from ``id2topic``.
        ValueError: if a combination mentions a name that is not in ``topics``.
    """
    if id2topic is None:
        id2topic = target_vaiables_id2topic_dict
    if topics is None:
        topics = topics_list

    # Build the name -> position lookup once rather than scanning the list for
    # every tag; setdefault keeps the first occurrence, matching list.index.
    position_of = {}
    for position, topic in enumerate(topics):
        position_of.setdefault(topic, position)

    # Vector width follows the topic list instead of a hard-coded 19.
    width = len(topics)

    y_adjusted = []
    for y_c in y:
        class_id = np.argmax(y_c) if is_pred else y_c
        multi_hot = [0] * width
        for tag in id2topic[class_id].split(","):
            if tag not in position_of:
                # Same exception type that list.index raised before.
                raise ValueError(f"{tag!r} is not in list")
            multi_hot[position_of[tag]] = 1
        y_adjusted.append(multi_hot)
    return y_adjusted


In [None]:
def compute_metrics(pred):
    """Score predictions at topic level rather than at combination-class level.

    Both the gold class ids (``pred.label_ids``) and the predicted score rows
    (``pred.predictions``) are expanded to multi-hot topic vectors with
    ``adjust_multilabel`` before scoring.

    Args:
        pred: object exposing ``label_ids`` and ``predictions``.

    Returns:
        Dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. Precision, recall and F1 use weighted averaging with
        ``zero_division = 0``; accuracy is computed by ``accuracy_score`` on
        the multi-hot vectors.
    """
    y_true = adjust_multilabel(pred.label_ids, is_pred = False)
    y_pred = adjust_multilabel(pred.predictions, is_pred = True)

    scores = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division = 0)
    precision, recall, f1 = scores[0], scores[1], scores[2]
    acc = accuracy_score(y_true, y_pred)

    return dict(accuracy=acc, f1=f1, precision=precision, recall=recall)


In [None]:
# Training configuration: evaluate and checkpoint every 500 steps, and reload
# the checkpoint with the highest 'f1' (as reported by compute_metrics) at the end.
# NOTE(review): logging_steps (600) is not aligned with eval_steps (500), which is
# why the first evaluation row shows "No log" for the training loss — consider
# using the same interval for both.
# NOTE(review): output_dir is an absolute path under the filesystem root —
# confirm this is intended rather than a relative project path.
training_args = TrainingArguments(
    output_dir='/multi_model/publ',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps = 600,
    evaluation_strategy = 'steps',
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_steps = 500,
    eval_steps = 500,
    metric_for_best_model  = 'f1',
    greater_is_better = True,
    load_best_model_at_end = True    
)


In [None]:
import gc
def cleanup():
    """Run Python garbage collection, then release PyTorch's cached CUDA memory."""
    gc.collect()
    torch.cuda.empty_cache()
    
cleanup()


In [None]:
# Trainer that fine-tunes on the training split and evaluates on the
# validation split, reporting the topic-level scores from compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)


In [None]:
# Stop training early once the monitored metric has failed to improve for 4 evaluations in a row.
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=4))


In [None]:
# Run fine-tuning; evaluation metrics are reported at every eval step.
trainer.train()


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
500,No log,2.580844,0.504335,0.618515,0.79688,0.546571,2.8623,241.767
1000,2.443700,2.351733,0.510116,0.661653,0.737713,0.609007,3.1813,217.523
1500,0.553800,2.403322,0.531792,0.680419,0.745736,0.634596,3.1793,217.66
2000,0.299900,2.488272,0.534682,0.690131,0.74224,0.653019,3.1869,217.138




TrainOutput(global_step=2440, training_loss=0.863366361328813, metrics={'train_runtime': 2959.4311, 'train_samples_per_second': 0.824, 'total_flos': 19965548168676000, 'epoch': 10.0})

In [None]:
# Score the model currently held by the trainer on the validation split.
trainer.evaluate()




{'eval_loss': 2.488271951675415,
 'eval_accuracy': 0.5346820809248555,
 'eval_f1': 0.690130843519044,
 'eval_precision': 0.7422397670469699,
 'eval_recall': 0.6530194472876152,
 'eval_runtime': 3.3282,
 'eval_samples_per_second': 207.922,
 'epoch': 10.0}

In [None]:
# Save the trained model into the 'multi-class' directory (relative to the working directory).
trainer.save_model('multi-class')


Оценка на val_dataset

In [None]:
pred = trainer.predict(val_dataset)




In [None]:
pr = pred.predictions


In [None]:
len(df),len(df_test), len(df_val), len(adjust_multilabel(y_val, is_pred = False))


(31130, 1481, 692, 692)

In [None]:
# Per-topic precision / recall / F1 on the validation split. Gold class ids and
# predicted score rows are both expanded to multi-hot topic vectors first.
print(classification_report(adjust_multilabel(y_val, is_pred = False), adjust_multilabel(pr, is_pred = True),
                           target_names=topics_list, zero_division = 0))


                   precision    recall  f1-score   support

    offline_crime       0.64      0.54      0.58        52
     online_crime       0.46      0.43      0.44        14
            drugs       0.88      0.88      0.88        41
         gambling       0.50      0.50      0.50         2
      pornography       0.77      0.68      0.72        87
     prostitution       0.87      0.80      0.84        41
          slavery       0.72      0.87      0.79        15
          suicide       0.50      0.67      0.57         3
        terrorism       0.50      0.39      0.44        18
          weapons       0.90      0.94      0.92        65
     body_shaming       0.86      0.67      0.75        48
   health_shaming       0.86      0.65      0.74        49
         politics       0.73      0.56      0.63       109
           racism       0.82      0.59      0.69        86
         religion       0.90      0.80      0.84        44
sexual_minorities       0.69      0.55      0.61       

Оценка на test_dataset

In [None]:
pred2 = trainer.predict(test_dataset)




In [None]:
pr2 = pred2.predictions


In [None]:
# Per-topic precision / recall / F1 on the held-out test split, computed the
# same way as for the validation split.
print(classification_report(adjust_multilabel(y_test, is_pred = False), adjust_multilabel(pr2, is_pred = True),
                           target_names=topics_list, zero_division = 0))


                   precision    recall  f1-score   support

    offline_crime       0.65      0.55      0.60       132
     online_crime       0.50      0.46      0.48        37
            drugs       0.87      0.90      0.88        87
         gambling       0.50      0.67      0.57         6
      pornography       0.73      0.59      0.65       204
     prostitution       0.75      0.69      0.72        91
          slavery       0.72      0.72      0.73        40
          suicide       0.33      0.29      0.31         7
        terrorism       0.68      0.57      0.62        47
          weapons       0.89      0.83      0.86       138
     body_shaming       0.90      0.67      0.77       109
   health_shaming       0.84      0.55      0.66       108
         politics       0.68      0.54      0.60       241
           racism       0.81      0.59      0.68       204
         religion       0.94      0.72      0.81       102
sexual_minorities       0.69      0.46      0.55       

In [None]:
import os
# Directory the final model, tokenizer and TensorFlow weights are exported to.
# Listing it first shows what is already there before anything is written.
path = "../../../../../russian-sensitive-topics"
os.listdir(path)


['.git', '.gitattributes']

In [None]:
# Export the fine-tuned model into the export directory.
trainer.save_model(path)


In [None]:
# Save the tokenizer files alongside the model so the directory is self-contained.
tokenizer.save_pretrained(path)


('../../../../../russian-sensitive-topics/tokenizer_config.json',
 '../../../../../russian-sensitive-topics/special_tokens_map.json',
 '../../../../../russian-sensitive-topics/vocab.txt',
 '../../../../../russian-sensitive-topics/added_tokens.json')

In [None]:
from transformers import TFBertForSequenceClassification


In [None]:
# Re-load the exported PyTorch checkpoint as a TensorFlow model; from_pt=True
# asks the loader to convert the PyTorch weights.
tf_model = TFBertForSequenceClassification.from_pretrained(path, from_pt=True)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [None]:
# Write the TensorFlow weights into the same export directory as the PyTorch model.
tf_model.save_pretrained(path)
