На основе туториала `transformers`


- из данных убраны дубли

In [24]:
# Input CSV locations (plain literals: nothing is interpolated into them).
TRAIN_CSV = "./datasets/train_clean.csv"
SMALL_CSV = "./cache/train.csv"
SCORING_CSV = "./datasets/test.csv"

# When True, SMALL_CSV is loaded instead of TRAIN_CSV.
USE_SMALL = False

In [None]:
import torch
from tqdm import tqdm

# Fixed seed shared by the torch CPU and CUDA random generators.
SEED = 1234

torch.backends.cudnn.deterministic = True
torch.cuda.manual_seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Датасеты

- пакет huggingface datasets

In [None]:
# !pip install datasets
import datasets

In [None]:
# Load the training data from CSV: SMALL_CSV when USE_SMALL is set, otherwise TRAIN_CSV.
arxiv_dataset = datasets.Dataset.from_csv(SMALL_CSV if USE_SMALL else TRAIN_CSV)

In [None]:
# Hold out 20% of the data when USE_SMALL is set, otherwise 2%.
# The split is seeded with SEED so that the same examples end up in the
# held-out set on every run; seeding torch alone does not control the
# shuffle performed by `datasets`.
test_size = 0.2 if USE_SMALL else 0.02
arxiv_dataset = arxiv_dataset.train_test_split(test_size=test_size, seed=SEED)

In [None]:
# Show the sizes of the train/test splits and the column names of the first training example.
len(arxiv_dataset["train"]), len(arxiv_dataset["test"]), arxiv_dataset["train"][0].keys()

In [None]:
# Load the scoring CSV, then show its size and the column names of its first row.
scoring_dataset = datasets.Dataset.from_csv(SCORING_CSV)
len(scoring_dataset), scoring_dataset[0].keys()

## Токенайзер

In [None]:
from transformers import AutoTokenizer

# Alternative checkpoint, kept for switching back:
# tokenizer = AutoTokenizer.from_pretrained("t5-base")   # 1.3 MB
# NOTE(review): the size note on the active line is identical to the t5-base one — presumably copied; confirm.
tokenizer = AutoTokenizer.from_pretrained("t5-small")   # 1.3 MB

In [None]:
# Task prefix prepended to every abstract before tokenization.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of abstracts (inputs) and titles (labels).

    Abstracts are truncated to 1024 tokens and titles to 128 tokens. The
    original author's note gives maxima of 1096 / 103, i.e. almost no
    sequence is actually cut by these limits.

    Args:
        examples: batch with "abstract" and "title" columns.

    Returns:
        The tokenizer output for the prefixed abstracts, with the title
        token ids stored under the "labels" key.
    """
    # NOTE(review): `as_target_tokenizer` is reported as deprecated in newer
    # transformers releases (in favour of `text_target=`) — confirm against
    # the installed version before upgrading.
    batch = tokenizer(
        [prefix + abstract for abstract in examples["abstract"]],
        max_length=1024,
        truncation=True,
    )

    with tokenizer.as_target_tokenizer():
        title_encoding = tokenizer(examples["title"], max_length=128, truncation=True)

    batch["labels"] = title_encoding["input_ids"]
    return batch

In [None]:
# Apply preprocess_function to the dataset in batches.
tokenized_arxiv = arxiv_dataset.map(preprocess_function, batched=True)

In [None]:
# Inspect the first tokenized training example: its keys plus the raw abstract and title.
tokenized_arxiv["train"][0].keys(), tokenized_arxiv["train"][0]["abstract"], tokenized_arxiv["train"][0]["title"]

# Модель

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the seq2seq model from the same checkpoint as the tokenizer. The
# notebook's tokenizer is a T5 one; pairing it with "facebook/bart-base"
# would feed the model token ids from a different vocabulary.
model = AutoModelForSeq2SeqLM.from_pretrained(tokenizer.name_or_path)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator built from the tokenizer and the model; it is handed to the trainer as `data_collator`.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Обучение

In [None]:
# Fine-tuning configuration: 3 epochs, batch size 8 for training and
# evaluation, evaluation once per epoch, logging and checkpointing every
# 1000 steps with at most 3 checkpoints kept.
# NOTE(review): output_dir says "T5-base" while the alternative in the
# trailing comment says "T5-small" — confirm it matches the loaded checkpoint.
training_args = Seq2SeqTrainingArguments(
    output_dir="./T5-base-results",#"./T5-small-results",
    optim="adamw_torch",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # gradient_accumulation_steps=4,
    weight_decay=0.01,
    logging_steps=1000,
    save_steps=1000,
    save_total_limit=3,
    num_train_epochs=3,
    # Mixed precision is only requested when CUDA is present: the notebook
    # falls back to the CPU otherwise, where fp16 training is not usable.
    fp16=torch.cuda.is_available(),
)

# Trainer wired to the tokenized train/test splits, the tokenizer and the
# seq2seq data collator.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_arxiv["train"],
    eval_dataset=tokenized_arxiv["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

3 эпохи: 2,5 часа (RTX2060 6GB)

In [None]:
# Clear tqdm's registry of progress-bar instances — presumably to get rid of
# stale bars left by an interrupted run (TODO confirm).
tqdm._instances.clear()

# Run fine-tuning with the configured trainer.
trainer.train()

# Генерация

In [None]:
def generate(example):
    """Generate a title for a single example with the current model.

    Args:
        example: mapping with an "abstract" string.

    Returns:
        The first generated sequence decoded to text, with special tokens
        removed.
    """
    input_ids = tokenizer(
        prefix + example["abstract"],
        max_length=1024,
        truncation=True,
        return_tensors="pt",
    ).input_ids  # batch of size 1
    # Send the input to the device the model is actually on instead of the
    # global `device`: the two differ when the model has not been moved
    # (e.g. when generating with a freshly loaded, untrained model).
    outputs = model.generate(input_ids.to(model.device))
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Compare the abstract, the reference title and the generated title for held-out example n.
n = 10
arxiv_dataset["test"][n]["abstract"], arxiv_dataset["test"][n]["title"], generate(arxiv_dataset["test"][n])

In [None]:
# Same comparison (abstract, reference title, generated title) for another held-out example.
n = 42
arxiv_dataset["test"][n]["abstract"], arxiv_dataset["test"][n]["title"], generate(arxiv_dataset["test"][n])

# BLEU-score

T5-small
- BLEU-score: **0.16563** (3 эпохи)

Самоделки:
- 0.02457 (словарь 6152, по 5 эпох по 5e-4, 1e-3, min.val.loss = 3.875)
- **0.19204** (словарь 60 тыс. ~15 эпох с шагом 5e-4 -> 5e-5, min.val.loss = 2.289)
- 0.12601 (словарь 84 тыс. много разных эпох, сходится плохо, min.val.loss = 3.305)
- 0.10644 (BPE, словарь 16 тыс., много разных эпох, сходится плохо, min.val.loss = 3.8)

T5-base
- BLEU-score: 0.07422 (без обучения)

In [None]:
from torchtext.data.metrics import bleu_score

tqdm._instances.clear()

# Whitespace-split reference titles (one reference per example) and
# generated titles for every example of the held-out split.
references, candidates = [], []
for example in tqdm(tokenized_arxiv["test"]):
    references.append([example["title"].split()])
    generated_title = generate(example)
    candidates.append(generated_title.split())

# BLEU over 1- to 3-grams with equal weights.
score = bleu_score(candidates, references, max_n=3, weights=[1 / 3] * 3)

print(f"BLEU-score: {score:.5f}")

### Делаем submission

In [26]:
# Tag used in the names of the submission files.
# NOTE(review): the tag is chosen by USE_SMALL, which selects the input CSV rather than the model checkpoint — confirm the naming.
SUBMISSION_NAME = "T5" if USE_SMALL else "T5-base"

Генерация заголовков для тестовых данных

In [None]:
tqdm._instances.clear()

# Abstracts of the scoring set and the titles generated for them, kept in
# the same order.
abstracts, titles = [], []

for example in tqdm(scoring_dataset):
    abstracts.append(example["abstract"])
    generated_title = generate(example)
    titles.append(generated_title)

Получилось, например

In [None]:
abstracts[1], titles[1]

Записываем полученные заголовки в файл формата `<abstract>,<title>`:

In [None]:
import pandas as pd

# Put the scoring abstracts and generated titles into a two-column table and save it as CSV without the index.
submission_df = pd.DataFrame({'abstract': abstracts, 'title': titles})
submission_df.to_csv(f"./submission/predicted_titles_{SUBMISSION_NAME}.csv", index=False)

In [None]:
# Word-count statistics (mean, std, max) of the generated titles.
submission_df["title"].apply(lambda x: len(str(x).split())).describe()[["mean","std", "max"]]

С помощью скрипта `generate_csv` приводим файл `predicted_titles_{SUBMISSION_NAME}.csv` в формат, необходимый для отправки:

In [None]:
from helpers.create_submission import generate_csv

# Convert the predicted-titles CSV into the submission file with the
# project's `generate_csv` helper.
generate_csv(
    input_file=f"./submission/predicted_titles_{SUBMISSION_NAME}.csv",
    output_file=f"./submission/submission_{SUBMISSION_NAME}.csv",
    voc_file="./datasets/vocs.pkl",
)

Сравним с примером (Score: 0.097), просто чтобы увидеть, что посчиталось что-то похожее.

In [None]:
# Sanity check: compare the mean of the "Predict" column in our submission with that of the sample submission.
sample_df = pd.read_csv(f'./datasets/sample_submission.csv')
df = pd.read_csv(f'./submission/submission_{SUBMISSION_NAME}.csv')

df["Predict"].mean(), sample_df["Predict"].mean()

# С учетом совпадений с обучающей выборкой

In [27]:
import pandas as pd
import numpy as np

train_df = pd.read_csv("./datasets/train.csv")
# NOTE(review): this input name is hard-coded and does not follow SUBMISSION_NAME — confirm it is intended.
submission_df = pd.read_csv("./submission/predicted_titles_T5tune.csv")

# Lookup from lower-cased training abstract to its title. Missing abstracts
# are skipped and, for repeated abstracts, the first occurrence wins.
train_keys = train_df["abstract"].str.lower()
usable = train_keys.notna() & ~train_keys.duplicated(keep="first")
known_titles = pd.Series(
    train_df.loc[usable, "title"].to_numpy(),
    index=train_keys[usable].to_numpy(),
)

# Replace the generated title of every scoring abstract that also occurs in
# the training data. The lookup is done row by row, so abstracts repeated in
# the scoring file are all replaced, not only their first occurrence.
matched_titles = submission_df["abstract"].str.lower().map(known_titles)
submission_df["title"] = matched_titles.fillna(submission_df["title"])

submission_df.to_csv(f"./submission/predicted_titles_{SUBMISSION_NAME}_fake.csv", index=False)

In [28]:
from helpers.create_submission import generate_csv

# Convert the CSV with training-set titles substituted in into the
# submission file, using the project's `generate_csv` helper.
generate_csv(
    input_file=f"./submission/predicted_titles_{SUBMISSION_NAME}_fake.csv",
    output_file=f"./submission/submission_{SUBMISSION_NAME}_fake.csv",
    voc_file="./datasets/vocs.pkl",
)

T5-small:
- **Score: 0.26174** w/o tuning
- **Score: 0.34497** tuning
- **Score: 0.51810** + добавление правильных меток из трейна

T5-base:
- **Score: 0.20510** w/o tuning
- для обучения с имеющейся длиной последовательности не хватает памяти GPU