In [1]:
import os
import random
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, GPT2ForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutputWithPast

from tqdm.notebook import tqdm

In [2]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [3]:
os.environ['TOKENIZERS_PARALLELISM'] = 'False'
tokenizer = AutoTokenizer.from_pretrained('sberbank-ai/rugpt3medium_based_on_gpt2')
tokenizer.add_special_tokens({'pad_token': '<pad>'})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0

In [32]:
df = pd.read_csv("train_dataset_train.csv")
test = pd.read_csv("test_dataset_test.csv").dropna()

In [5]:
X_train, X_val, y_train, y_val = train_test_split(df['name'], df['groups'], test_size=0.1, random_state=0)

In [6]:
category2id = {cat:i for i, cat in enumerate(df['groups'].unique())}

In [7]:
len(category2id)

9

In [8]:
device = 'cuda'
maxl = 64
batch_size = 16

In [9]:
X_train  = [
            tokenizer.encode(q, max_length=maxl, padding='max_length', truncation=True, pad_to_max_length=True) 
            for q in tqdm(X_train.tolist())
]
X_train = [i if i else [0] * maxl for i in X_train]
X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train.map(category2id).tolist())
train_data = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(
    train_data,
    sampler=RandomSampler(train_data),
    batch_size=batch_size,
    num_workers=12
    #pin_memory=True
)

  0%|          | 0/592257 [00:00<?, ?it/s]

In [10]:
X_val  = [
          tokenizer.encode(q, max_length=maxl, padding='max_length', truncation=True, pad_to_max_length=True) 
          for q in tqdm(X_val.tolist())
]
X_val = [i if i else [0] * maxl for i in X_val]
X_val = torch.tensor(X_val)
y_val = torch.tensor(y_val.map(category2id).tolist())
validation_data = TensorDataset(X_val, y_val)
validation_dataloader = DataLoader(
    validation_data,
    sampler=SequentialSampler(validation_data),
    batch_size=batch_size,
    num_workers=12,
    #pin_memory=True
)

  0%|          | 0/65807 [00:00<?, ?it/s]

In [11]:
class_weight = compute_class_weight('balanced', classes=range(max(y_train)+1), y=y_train.tolist())

In [12]:
class WeightedGPT2(GPT2ForSequenceClassification):

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        assert (
            self.config.pad_token_id is not None or batch_size == 1
        ), "Cannot handle batch sizes > 1 if no padding token is defined."
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
            else:
                sequence_lengths = -1
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[range(batch_size), sequence_lengths]

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1))
            else:
                loss_fct = CrossEntropyLoss(weight=torch.tensor(class_weight, dtype=torch.float).to(device))
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


In [13]:
model = WeightedGPT2.from_pretrained('sberbank-ai/rugpt3medium_based_on_gpt2', num_labels=len(category2id))
model.to(device)
model.config.pad_token_id = 0

Some weights of the model checkpoint at sberbank-ai/rugpt3medium_based_on_gpt2 were not used when initializing WeightedGPT2: ['lm_head.weight']
- This IS expected if you are initializing WeightedGPT2 from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WeightedGPT2 from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WeightedGPT2 were not initialized from the model checkpoint at sberbank-ai/rugpt3medium_based_on_gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-6)



In [15]:
for _ in range(5):
    model.train()
    train_loss = 0
    
    for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
      
        optimizer.zero_grad()
      
        loss = model(b_input_ids.long(), token_type_ids=None, labels=b_labels)
        loss[0].backward()
        
        optimizer.step()
        
        train_loss += loss[0].item()
      
    print("Loss на обучающей выборке: {0:.5f}".format(train_loss / len(train_dataloader)))
    
    model.eval()

    valid_preds, valid_labels = [], []

    for batch in tqdm(validation_dataloader): 
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch

        with torch.no_grad():
            logits = model(b_input_ids.long(), token_type_ids=None)

        logits = logits[0].detach().cpu()
        label_ids = b_labels.to('cpu').numpy()

        batch_preds = torch.softmax(logits, axis=1).argmax(axis=1).numpy()
        batch_labels = label_ids
        valid_preds.extend(batch_preds)
        valid_labels.extend(batch_labels)

    valid_preds = np.array(valid_preds)
    print("recall: " + str(recall_score(valid_labels, valid_preds, average='macro')))

  0%|          | 0/37017 [00:00<?, ?it/s]

Loss на обучающей выборке: 0.13687


  0%|          | 0/4113 [00:00<?, ?it/s]

recall: 0.9888309145676758


  0%|          | 0/37017 [00:00<?, ?it/s]

Loss на обучающей выборке: 0.03754


  0%|          | 0/4113 [00:00<?, ?it/s]

recall: 0.9931102535999377


  0%|          | 0/37017 [00:00<?, ?it/s]

Loss на обучающей выборке: 0.02392


  0%|          | 0/4113 [00:00<?, ?it/s]

recall: 0.9938060067452731


  0%|          | 0/37017 [00:00<?, ?it/s]

Loss на обучающей выборке: 0.01693


  0%|          | 0/4113 [00:00<?, ?it/s]

recall: 0.9946216618921676


  0%|          | 0/37017 [00:00<?, ?it/s]

Loss на обучающей выборке: 0.01320


  0%|          | 0/4113 [00:00<?, ?it/s]

recall: 0.9955456193304504


In [16]:
categories = [i for i in category2id.keys()]

In [17]:
print(classification_report([categories[i] for i in valid_labels], [categories[i] for i in valid_preds]))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2107
           1       0.94      1.00      0.97      1673
           2       0.93      1.00      0.96       397
           3       0.99      0.99      0.99      2084
           4       0.98      1.00      0.99      2763
           6       0.89      0.99      0.94      1054
           7       0.89      0.99      0.94      1176
           9       0.99      1.00      0.99      4287
          10       1.00      0.99      0.99     50266

    accuracy                           0.99     65807
   macro avg       0.96      1.00      0.98     65807
weighted avg       0.99      0.99      0.99     65807



In [33]:
test_texts = test['name']

In [34]:
X_test  = [
           tokenizer.encode(q, max_length=maxl, padding='max_length', truncation=True, pad_to_max_length=True) 
           for q in tqdm(test_texts)
]
X_test = [i if i else [0] * maxl for i in X_test]
X_test = torch.tensor(X_test)
test_data = TensorDataset(X_test)
test_dataloader = DataLoader(
    test_data,
    sampler=SequentialSampler(test_data),
    batch_size=batch_size,
    num_workers=4,
    pin_memory=True
)

  0%|          | 0/282227 [00:00<?, ?it/s]

In [35]:
#test_preds = np.zeros((test.shape[0], 11))

In [36]:
categories = np.array([i for i in category2id.keys()])

In [37]:
model.eval()
preds = []
for batch in tqdm(test_dataloader):   
    batch = batch[0]
    b_input_ids = batch.to(device)
    
    with torch.no_grad():
        logits = model(b_input_ids.long(), token_type_ids=None)

    logits = logits[0].detach().cpu()
    
    batch_preds = categories[logits.argmax(axis=1).numpy()]
    
    preds.extend(batch_preds)
    
preds = np.array(preds)

  0%|          | 0/17640 [00:00<?, ?it/s]

In [39]:
sample_submission = pd.DataFrame({'id': test['id'], 'groups': preds})
sample_submission.to_csv('submission.csv', index=None)

In [40]:
sample_submission['groups'].value_counts()

10    213649
9      17994
4      11951
0       9388
3       9065
1       7886
7       5733
6       4827
2       1734
Name: groups, dtype: int64

In [42]:
sample_submission

Unnamed: 0,id,groups
0,10529,10
1,10530,10
2,10531,10
3,10532,0
4,10533,10
...,...,...
282222,292751,10
282223,292752,7
282224,292753,10
282225,292754,10
