In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/title-generation/sample_submission.csv
/kaggle/input/title-generation/vocs.pkl
/kaggle/input/title-generation/train.csv
/kaggle/input/title-generation/test.csv


In [2]:
import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [3]:
# Short name (also used as the output-file suffix) -> Hugging Face Hub model id.
model_names = {
    'bert2bert-base-arxiv-titlegen': 'Callidior/bert2bert-base-arxiv-titlegen',
    't5-large-arxiv-abstract-title': 'pbmstrk/t5-large-arxiv-abstract-title',
    't5-small': 't5-small'
              }

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Collapse every run of whitespace (including newlines) into a single space and
# trim both ends. Raw strings are used so that `\s` is not parsed as an
# (invalid) string escape sequence.
WHITESPACE_HANDLER = lambda k: re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', k.strip()))

In [4]:
 def inference(text, model, tokenizer, out_max_length=27, num_beams=4):
    """Generate a title for a single abstract with beam search.

    Parameters:
        text (str): source text; whitespace is normalised with
            WHITESPACE_HANDLER before tokenisation.
        model: seq2seq model exposing ``generate``; expected to already live
            on the module-level ``device``.
        tokenizer: tokenizer matching ``model``.
        out_max_length (int): maximum length of the generated output, in tokens.
        num_beams (int): beam width passed to ``generate``.

    Returns:
        str: the decoded best beam, with special tokens removed.
    """
    # A single sequence needs no padding: padding it to 512 tokens only adds
    # positions that have to be masked out again. The input is still truncated
    # to 512 tokens.
    encoded = tokenizer(
        [WHITESPACE_HANDLER(text)],
        return_tensors="pt",
        truncation=True,
        max_length=512,
        return_attention_mask=True
    )

    # Pass the attention mask explicitly instead of relying on `generate`
    # inferring it from the pad-token id.
    output_ids = model.generate(
        input_ids=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        max_length=out_max_length, #in tokens
        no_repeat_ngram_size=2,
        num_beams=num_beams
    )[0]  # keep the first (best) returned sequence

    summary = tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return summary

In [5]:
# Load the competition train/test tables.
train = pd.read_csv("/kaggle/input/title-generation/train.csv")
test = pd.read_csv("/kaggle/input/title-generation/test.csv")
# train contains duplicated abstracts, so they are dropped first; otherwise the
# left join below could multiply test rows. Test abstracts that also occur in
# train get their known title, all others get NaN.
cross = pd.merge(
    test,
    train.drop_duplicates(subset=['abstract']),
    how='left',
    on='abstract',
)
cross

Unnamed: 0,abstract,title
0,Most sequence transformation models use recurr...,
1,The doc2vec approach was introduced as an exte...,
2,LSTM models can vary greatly depending on sequ...,
3,A joint learning process of alignment and tran...,
4,Current unsupervised image-to-image translatio...,
...,...,...
995,subsystem codes are the most versatile class o...,
996,we study dirac-harmonic maps from degenerating...,
997,in this note we study kloosterman sums twisted...,
998,we obtain the rate of growth of long strange s...,"long strange segments, ruin probabilities and ..."


In [6]:
# Suffixes of the per-model output files; reused later to build the submissions.
predicted_titles_filename_suffixes = []
OUT_MAX_LENGTH = 20  # maximum generated title length, in tokens
for model_name in model_names:
    print(model_name)
    predicted_titles_filename_suffixes.append(f"_{model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_names[model_name])
    model = AutoModelForSeq2SeqLM.from_pretrained(model_names[model_name])
    model.to(device)
    predicted_titles = cross.copy()
    #predicted_titles = predicted_titles.sample(5) #only for test on errors

    # Keep the title when it is already known from train; generate one only for
    # rows where it is missing. `pd.notna` is used instead of an identity check
    # against `np.NaN`: identity only matches the exact np.nan singleton, and
    # the `np.NaN` alias no longer exists in NumPy 2.0.
    predicted_titles['title'] = predicted_titles.apply(
        lambda x: x.title if pd.notna(x.title)
        else inference(x.abstract, model, tokenizer, out_max_length=OUT_MAX_LENGTH).lower(),
        axis=1)

    # Temporary workaround (original author's note): generate_csv does not
    # accept one-word titles, so those are replaced with a filler title.
    predicted_titles['title'] = predicted_titles.title.map(lambda x: x if len(x.split()) > 1 else 'of the and in for a on with to model')

    predicted_titles.to_csv(f'predicted_titles{predicted_titles_filename_suffixes[-1]}.csv', index=False)

bert2bert-base-arxiv-titlegen


Downloading:   0%|          | 0.00/300 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/944M [00:00<?, ?B/s]

t5-large-arxiv-abstract-title


Downloading:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.75G [00:00<?, ?B/s]

t5-small


Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [7]:
# List the working directory to check that the per-model prediction CSVs were written.
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
__notebook__.ipynb
predicted_titles_bert2bert-base-arxiv-titlegen.csv
predicted_titles_t5-large-arxiv-abstract-title.csv
predicted_titles_t5-small.csv


### Делаем submission в Kaggle

In [8]:
import string
from nltk.util import ngrams
import numpy as np
import pandas as pd
import pickle


def generate_csv(input_file='predicted_titles.csv',
                 output_file='submission.csv',
                 voc_file='/kaggle/input/title-generation/vocs.pkl'):
    '''
    Generates file in format required for submitting result to Kaggle

    For every row of the input file the title is lower-cased, stripped of
    punctuation and split into words; the words plus their "_"-joined bigrams
    and trigrams are matched against that row's vocabulary. One "Id,Predict"
    line is written per vocabulary slot (1 if the token occurs in the title,
    0 otherwise), with Id running continuously across all rows.

    A missing title (an empty cell is read back as NaN) is treated as an empty
    title, i.e. all of its vocabulary slots are written as 0.

    Parameters:
        input_file (str) : path to csv file with your predicted titles.
                           Should have two fields: abstract and title
        output_file (str) : path to output submission file
        voc_file (str) : path to voc.pkl file. It is indexed by the row index
                         of input_file and each entry maps a token to its
                         position in that row's vector (positions are assumed
                         to lie in range(len(entry)))
    '''
    data = pd.read_csv(input_file)
    # NOTE: pickle.load can execute arbitrary code - only use a trusted voc file.
    with open(voc_file, 'rb') as voc_handle:
        vocs = pickle.load(voc_handle)

    # Translation table that deletes all ASCII punctuation; built once.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Open the output once instead of re-opening it in append mode per row.
    with open(output_file, 'w') as res_file:
        res_file.write('Id,Predict\n')

        output_idx = 0
        for row_idx, row in data.iterrows():
            title = row['title']
            title = '' if pd.isna(title) else str(title)
            trg = title.translate(strip_punctuation).lower().split()
            trg.extend(['_'.join(ngram) for ngram in list(ngrams(trg, 2)) + list(ngrams(trg, 3))])

            VOCAB_stoi = vocs[row_idx]
            trg_vec = np.zeros(len(VOCAB_stoi))

            # Flag every vocabulary token that occurs in the title.
            for word in set(VOCAB_stoi.keys()).intersection(trg):
                trg_vec[VOCAB_stoi[word]] = 1

            for is_word in trg_vec:
                res_file.write('{0},{1}\n'.format(output_idx, int(is_word)))
                output_idx += 1

In [9]:
# Convert each model's predicted-titles CSV into its own Kaggle submission file.
for suffix in predicted_titles_filename_suffixes:
    generate_csv(f'predicted_titles{suffix}.csv', f'submission{suffix}.csv')

In [10]:
# List the working directory again to check that one submission CSV per model was written.
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
__notebook__.ipynb
predicted_titles_bert2bert-base-arxiv-titlegen.csv
predicted_titles_t5-large-arxiv-abstract-title.csv
predicted_titles_t5-small.csv
submission_bert2bert-base-arxiv-titlegen.csv
submission_t5-large-arxiv-abstract-title.csv
submission_t5-small.csv
