
- из данных убраны дубли

In [1]:
TRAIN_CSV = f"./datasets/train_clean.csv"
SMALL_CSV = f"./cache/train.csv"
SCORING_CSV = f"./datasets/test.csv"

USE_SMALL = True

max_title = 36          # max 103, 3 сигмы 34 + 2
max_abstract = 460      # max 1096, 3 сигмы 457 + 3

In [2]:
import torch
from tqdm import tqdm

SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Датасеты

In [3]:
# !pip install datasets
import datasets

In [4]:
arxiv_dataset = datasets.Dataset.from_csv(SMALL_CSV if USE_SMALL else TRAIN_CSV)

Using custom data configuration default-04a3546adcf83ebb
Reusing dataset csv (/home/user1/.cache/huggingface/datasets/csv/default-04a3546adcf83ebb/0.0.0)


In [5]:
test_size = 0.2 if USE_SMALL else 0.02
arxiv_dataset = arxiv_dataset.train_test_split(test_size=test_size)
pass

In [6]:
len(arxiv_dataset["train"]), len(arxiv_dataset["test"]), arxiv_dataset["train"][0].keys()

(800, 200, dict_keys(['abstract', 'title']))

In [7]:
scoring_dataset = datasets.Dataset.from_csv(SCORING_CSV)
len(scoring_dataset), scoring_dataset[0].keys()

Using custom data configuration default-2e0a9ad90b647d2d
Reusing dataset csv (/home/user1/.cache/huggingface/datasets/csv/default-2e0a9ad90b647d2d/0.0.0)


(1000, dict_keys(['abstract']))

## Токенайзер

In [9]:
from transformers import AutoTokenizer

# google/bert_uncased_L-8_H-512_A-8     # medium
# tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-4_H-512_A-8")   # small
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-8_H-512_A-8")   # medium

Downloading: 100%|██████████| 383/383 [00:00<00:00, 265kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 456kB/s]  


In [10]:
tokenizer("some long long  long  long  long  long text", max_length=5, truncation=True)

{'input_ids': [101, 2070, 2146, 2146, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [11]:
prefix = "summarize: "

def preprocess_function(examples):
    """ Длины последовательностей max = 1096 / 103. Берем какбе 3 сигмы:
        - max_length=457 + 4
        - max_length=34 + 2
    """

    srcs = [prefix + doc for doc in examples["abstract"]]
    model_inputs = tokenizer(srcs, max_length=max_abstract, truncation=True) # max_length includes special tokens

    with tokenizer.as_target_tokenizer():
        trgs = tokenizer(examples["title"], max_length=max_title, truncation=True)

    model_inputs["labels"] = trgs["input_ids"]
    return model_inputs

In [12]:
tokenized_arxiv = arxiv_dataset.map(preprocess_function, batched=True)

100%|██████████| 1/1 [00:00<00:00,  2.64ba/s]
100%|██████████| 1/1 [00:00<00:00, 12.87ba/s]


In [13]:
tokenized_arxiv["train"][0].keys(), tokenized_arxiv["train"][0]["abstract"], tokenized_arxiv["train"][0]["title"]

(dict_keys(['abstract', 'title', 'input_ids', 'token_type_ids', 'attention_mask', 'labels']),
 'asynchrony, overlaps and delays in sensory-motor signals introduce ambiguity as to which stimuli, actions, and rewards are causally related. only the repetition of reward episodes helps distinguish true cause-effect relationships from coincidental occurrences. in the model proposed here, a novel plasticity rule employs short and long-term changes to evaluate hypotheses on cause-effect relationships. transient weights represent hypotheses that are consolidated in long-term memory only when they consistently predict or cause future rewards. the main objective of the model is to preserve existing network topologies when learning with ambiguous information flows. learning is also improved by biasing the exploration of the stimulus-response space towards actions that in the past occurred before rewards. the model indicates under which conditions beliefs can be consolidated in long-term memory, it

# Модель

In [14]:
from transformers import EncoderDecoderModel

# google bert-small https://huggingface.co/google/bert_uncased_L-4_H-512_A-8
model = EncoderDecoderModel.from_encoder_decoder_pretrained('google/bert_uncased_L-8_H-512_A-8', 'google/bert_uncased_L-8_H-512_A-8') # medium
model.to(device)

Downloading: 100%|██████████| 159M/159M [00:20<00:00, 8.08MB/s] 
Some weights of the model checkpoint at google/bert_uncased_L-8_H-512_A-8 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at go

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_af

При обучении тагрет в модель подается кроме последнего токена, при валидации/генрации - кроме первого.

Поэтому для генерации сид-фраза должна начинаться с того токена, на котором модель обучалась, в данном случае это первый токен берта `[CLS]`

In [15]:
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

In [16]:
from helpers.utils import count_parameters

count_parameters(model)

+---------------------------------------------------------------------+-----+-------------+------------+---------------+----------+
|                           Modules/Tensors                           | GPU |    Shape    | Parameters |      Type     | DataMem  |
+---------------------------------------------------------------------+-----+-------------+------------+---------------+----------+
|              encoder.embeddings.word_embeddings.weight              |  +  | 30522 x 512 |  15627264  | torch.float32 | 62509056 |
|            encoder.embeddings.position_embeddings.weight            |  +  |  512 x 512  |   262144   | torch.float32 | 1048576  |
|           encoder.embeddings.token_type_embeddings.weight           |  +  |   2 x 512   |    1024    | torch.float32 |   4096   |
|                 encoder.embeddings.LayerNorm.weight                 |  +  |     512     |    512     | torch.float32 |   2048   |
|                  encoder.embeddings.LayerNorm.bias                  |  +  

91191098

In [17]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Обучение

In [18]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(   # gpu 6 GB
    output_dir="./bert2bert-base-results",
    optim="adamw_torch",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    ignore_data_skip=True,                  # disable ignore unused field in data warnings
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,          # eff.batch = grd_acc * batch = 32
    weight_decay=0.01,
    logging_steps=1000,
    save_steps=1000,
    save_total_limit=3,
    num_train_epochs=2*8,                   # eff.epochs = epoch / grd_acc = 2
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_arxiv["train"],
    eval_dataset=tokenized_arxiv["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Using amp half precision backend


In [20]:
tqdm._instances.clear()

trainer.train(resume_from_checkpoint=True)

The following columns in the training set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 800
  Num Epochs = 16
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 8
  Total optimization steps = 400
  6%|▋         | 25/400 [00:27<06:47,  1.09s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                
 

{'eval_loss': 5.713404655456543, 'eval_runtime': 2.4418, 'eval_samples_per_second': 81.906, 'eval_steps_per_second': 20.476, 'epoch': 1.0}


 12%|█▎        | 50/400 [00:55<05:56,  1.02s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                
 12%|█▎        | 50/400 [00:58<05:56,  1.02s/it]

{'eval_loss': 5.164654731750488, 'eval_runtime': 2.3511, 'eval_samples_per_second': 85.065, 'eval_steps_per_second': 21.266, 'epoch': 2.0}


 19%|█▉        | 75/400 [01:24<05:39,  1.05s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                
 19%|█▉        | 75/400 [01:27<05:39,  1.05s/it]

{'eval_loss': 4.90974235534668, 'eval_runtime': 2.3386, 'eval_samples_per_second': 85.521, 'eval_steps_per_second': 21.38, 'epoch': 3.0}


 25%|██▌       | 100/400 [01:55<06:02,  1.21s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
 25%|██▌       | 100/400 [01:58<06:02,  1.21s/it]

{'eval_loss': 4.8107171058654785, 'eval_runtime': 2.6813, 'eval_samples_per_second': 74.59, 'eval_steps_per_second': 18.648, 'epoch': 4.0}


 31%|███▏      | 125/400 [02:28<05:01,  1.10s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
 31%|███▏      | 125/400 [02:30<05:01,  1.10s/it]

{'eval_loss': 4.702428817749023, 'eval_runtime': 2.4794, 'eval_samples_per_second': 80.666, 'eval_steps_per_second': 20.166, 'epoch': 5.0}


 38%|███▊      | 150/400 [02:57<04:33,  1.09s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
 38%|███▊      | 150/400 [03:00<04:33,  1.09s/it]

{'eval_loss': 4.663840293884277, 'eval_runtime': 2.4499, 'eval_samples_per_second': 81.635, 'eval_steps_per_second': 20.409, 'epoch': 6.0}


 44%|████▍     | 175/400 [03:27<03:56,  1.05s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
 44%|████▍     | 175/400 [03:29<03:56,  1.05s/it]

{'eval_loss': 4.597473621368408, 'eval_runtime': 2.6777, 'eval_samples_per_second': 74.691, 'eval_steps_per_second': 18.673, 'epoch': 7.0}


 50%|█████     | 200/400 [04:00<03:49,  1.15s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
 50%|█████     | 200/400 [04:03<03:49,  1.15s/it]

{'eval_loss': 4.59846305847168, 'eval_runtime': 2.5076, 'eval_samples_per_second': 79.758, 'eval_steps_per_second': 19.94, 'epoch': 8.0}


 56%|█████▋    | 225/400 [04:32<03:10,  1.09s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
 56%|█████▋    | 225/400 [04:35<03:10,  1.09s/it]

{'eval_loss': 4.5791802406311035, 'eval_runtime': 2.3903, 'eval_samples_per_second': 83.672, 'eval_steps_per_second': 20.918, 'epoch': 9.0}


 62%|██████▎   | 250/400 [05:02<02:41,  1.08s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
 62%|██████▎   | 250/400 [05:05<02:41,  1.08s/it]

{'eval_loss': 4.547262191772461, 'eval_runtime': 2.59, 'eval_samples_per_second': 77.221, 'eval_steps_per_second': 19.305, 'epoch': 10.0}


 69%|██████▉   | 275/400 [05:32<02:25,  1.17s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
 69%|██████▉   | 275/400 [05:35<02:25,  1.17s/it]

{'eval_loss': 4.557379245758057, 'eval_runtime': 2.534, 'eval_samples_per_second': 78.927, 'eval_steps_per_second': 19.732, 'epoch': 11.0}


 75%|███████▌  | 300/400 [06:01<01:45,  1.05s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
 75%|███████▌  | 300/400 [06:04<01:45,  1.05s/it]

{'eval_loss': 4.554116725921631, 'eval_runtime': 2.4232, 'eval_samples_per_second': 82.535, 'eval_steps_per_second': 20.634, 'epoch': 12.0}


 81%|████████▏ | 325/400 [06:31<01:21,  1.09s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
 81%|████████▏ | 325/400 [06:34<01:21,  1.09s/it]

{'eval_loss': 4.545895576477051, 'eval_runtime': 2.6332, 'eval_samples_per_second': 75.954, 'eval_steps_per_second': 18.989, 'epoch': 13.0}


 88%|████████▊ | 350/400 [07:04<01:03,  1.28s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
 88%|████████▊ | 350/400 [07:07<01:03,  1.28s/it]

{'eval_loss': 4.5565714836120605, 'eval_runtime': 3.0192, 'eval_samples_per_second': 66.242, 'eval_steps_per_second': 16.561, 'epoch': 14.0}


 94%|█████████▍| 375/400 [07:36<00:27,  1.09s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
 94%|█████████▍| 375/400 [07:39<00:27,  1.09s/it]

{'eval_loss': 4.547601699829102, 'eval_runtime': 2.6015, 'eval_samples_per_second': 76.878, 'eval_steps_per_second': 19.219, 'epoch': 15.0}


100%|██████████| 400/400 [08:06<00:00,  1.06s/it]The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids, abstract, title. If token_type_ids, abstract, title are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
                                                 
100%|██████████| 400/400 [08:08<00:00,  1.06s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 400/400 [08:08<00:00,  1.22s/it]

{'eval_loss': 4.549394607543945, 'eval_runtime': 2.4135, 'eval_samples_per_second': 82.866, 'eval_steps_per_second': 20.717, 'epoch': 16.0}
{'train_runtime': 488.8834, 'train_samples_per_second': 26.182, 'train_steps_per_second': 0.818, 'train_loss': 4.19384521484375, 'epoch': 16.0}





TrainOutput(global_step=400, training_loss=4.19384521484375, metrics={'train_runtime': 488.8834, 'train_samples_per_second': 26.182, 'train_steps_per_second': 0.818, 'train_loss': 4.19384521484375, 'epoch': 16.0})

# Генерация

In [21]:
def generate(example):
    input_ids = tokenizer(prefix + example["abstract"], 
                        max_length=max_abstract, 
                        truncation=True, 
                        return_tensors="pt").input_ids  # Batch size 1
    outputs = model.generate(input_ids.to(device))
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [22]:
n = 10
arxiv_dataset["test"][n]["abstract"], arxiv_dataset["test"][n]["title"], generate(arxiv_dataset["test"][n])

('we prove that the 3-state potts antiferromagnet on the diced lattice (dual of the kagome lattice) has entropically-driven long-range order at low temperatures (including zero). we then present monte carlo simulations, using a cluster algorithm, of the 3-state and 4-state models. the 3-state model has a phase transition to the high-temperature disordered phase at v = e^j - 1 = -0.860599 +- 0.000004 that appears to be in the universality class of the 3-state potts ferromagnet. the 4-state model is disordered throughout the physical region, including at zero temperature.',
 'phase transition in the 3-state potts antiferromagnet on the diced   lattice',
 'a non - non - non - - covariance model of the non - covar')

In [23]:
n = 42
arxiv_dataset["test"][n]["abstract"], arxiv_dataset["test"][n]["title"], generate(arxiv_dataset["test"][n])

('we employ conditional tsallis q entropies to study the separability of symmetric one parameter w and ghz multiqubit mixed states. the strongest limitation on separability is realized in the limit q-->infinity, and is found to be much superior to the condition obtained using the von neumann conditional entropy (q=1 case). except for the example of two qubit and three qubit symmetric states of ghz family, the $q$-conditional entropy method leads to sufficient - but not necessary - conditions on separability.',
 'separability of a family of one parameter w and ghz multiqubit states   using abe-rajagopal q-conditional entropy approach',
 'a multi - semiparable multivariate multiplebility and the multiplebility of')

# BLEU-score

Самоделки:
- 0.02457 (словарь 6152, по 5 эпох по 5r-4, 1e-3, min.val.loss = 3.875) 
- **0.19204** (словарь 60 тыс. ~15 эпох с шагом 5e-4 -> 5e-5, min.val.loss = 2.289)
- 0.12601 (словарь 84 тыс. много разных эпох, сходится плохо, min.val.loss = 3.305)
- 0.10644 (BPE, словарь 16 тыс., много разных эпох, сходится плохо, min.val.loss = 3.8)

T5-small
- BLEU-score: **0.044...** 1% тюнинг
- BLEU-score: **0.16563** (3 эпохи - 2,5 часа RTX2060 6Gb)

T5-base
- BLEU-score: **0.07422** (без обучения)
- обучение не тянет...

BART-base
- BLEU-score: **0.17743** 1% тюнинг
- BLEU-score: **0.17984** (1.43 эпохи - 2,5 часа RTX2060 6Gb)
- BLEU-score: **0.19266** (2 эпохи)

bert2bert-small (4 слоя, 8 голов внимания, 512 эмбеддинг)
- BLEU-score: **0.00524** 1% тюнинг
- BLEU-score: **0.04807** (2 эпохи) чето не то....
 

In [24]:
from torchtext.data.metrics import bleu_score

tqdm._instances.clear()

candidates = []
references = []
for example in tqdm(tokenized_arxiv["test"]):
    candidates.append(generate(example).split())
    references.append([example["title"].split()])

score = bleu_score(candidates, references, max_n=3, weights=[1/3]*3)

print('BLEU-score: {0:.5f}'.format(score))

 36%|███▌      | 72/200 [00:23<00:41,  3.07it/s]


KeyboardInterrupt: 

### Stepik score

In [None]:
SUBMISSION_NAME = "bert2bert-small" if USE_SMALL else "bert2bert-small-tune"

Генерация заголовков для тестовых данных

In [None]:
tqdm._instances.clear()

abstracts = []
titles = []

for example in tqdm(scoring_dataset):
    abstracts.append(example["abstract"])
    titles.append(generate(example))

Получилось, например

In [None]:
abstracts[1], titles[1]

Записываем полученные заголовки в файл формата `<abstract>,<title>`:

In [None]:
import pandas as pd

submission_df = pd.DataFrame({'abstract': abstracts, 'title': titles})
submission_df.to_csv(f"./submission/predicted_titles_{SUBMISSION_NAME}.csv", index=False)

In [None]:
submission_df["title"].apply(lambda x: len(str(x).split())).describe()[["mean","std", "max"]]

С помощью скрипта `generate_csv` приводим файл `submission_prediction.csv` в формат, необходимый для отправки:

In [None]:
from helpers.create_submission import generate_csv

generate_csv(input_file=f"./submission/predicted_titles_{SUBMISSION_NAME}.csv", 
             output_file=f'./submission/submission_{SUBMISSION_NAME}.csv', 
             voc_file=f'./datasets/vocs.pkl')

# С учетом

In [None]:
import pandas as pd
import numpy as np

train_df = pd.read_csv("./datasets/train.csv")
submission_df = pd.read_csv(f"./submission/predicted_titles_{SUBMISSION_NAME}.csv")

intersect_idx = np.intersect1d(submission_df["abstract"].str.lower(), train_df["abstract"].str.lower(), return_indices=True)

submission_df.loc[intersect_idx[1], 'title'] = train_df.loc[intersect_idx[2], 'title'].values

In [None]:
from helpers.create_submission import generate_csv

submission_df.to_csv(f"./submission/predicted_titles_{SUBMISSION_NAME}_fake.csv", index=False)

generate_csv(input_file=f"./submission/predicted_titles_{SUBMISSION_NAME}_fake.csv", 
             output_file=f'./submission/submission_{SUBMISSION_NAME}_fake.csv', 
             voc_file=f'./datasets/vocs.pkl')

In [None]:
f'./submission/submission_{SUBMISSION_NAME}_fake.csv'

T5-small:
- **Score: 0.26174** 1% tuning
- **Score: 0.34497** tuning 3 эпохи
- **Score: 0.51810** + добавление правильных меток из трейна

T5-base:
- **Score: 0.20510** w/o tuning
- для обучения с имеющейся длиной последовательности не хватает памяти GPU

BART-base
- **Score: 0.33851** 1% tuning
- **Score: 0.39536** tuning 1,5 эпохи
- **Score: 0.54804** + добавление правильных меток из трейна
- **Score: 0.56782** 2 эпохи с накопление градиента (вот и в топ-10)
- ... дальше не интересно