[Кто-то заметил](https://www.kaggle.com/code/kashnitsky/arxiv-title-generation-dumb-baseline/notebook), что в тестовой выборке много дублей из тренировочной, да и внутри самой тренировочной выборки записи тоже дублируются

In [1]:
import collections
import pickle
import string
from pathlib import Path

import numpy as np
import pandas as pd
from nltk.util import ngrams
from torchtext.data.metrics import bleu_score
from tqdm.notebook import tqdm

from helpers.create_submission import generate_csv

In [2]:
# Read the train and test splits from the local datasets folder
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./datasets/train.csv", "./datasets/test.csv")
)

# Row counts of both splits, shown as the cell output
(len(train_df), len(test_df))

(135000, 1000)

Убрать дубликаты

In [3]:
# Drop fully identical rows and renumber the index 0..n-1;
# reassigning (instead of inplace=True) keeps the step explicit
train_df = train_df.drop_duplicates(ignore_index=True)
len(train_df)

105603

Найти пересечения

In [4]:
# Compare abstracts case-insensitively
test_abstracts = test_df['abstract'].str.lower()
train_abstracts = train_df['abstract'].str.lower()

# With return_indices=True the result is a 3-tuple:
# (common values, positions in test_abstracts, positions in train_abstracts)
intersect_idx = np.intersect1d(
    test_abstracts,
    train_abstracts,
    return_indices=True,
)

# Number of distinct abstracts shared by both splits
len(intersect_idx[0])

430

В качестве $H_0$ для заголовка — первое предложение резюме

In [5]:
def extract_first_sentence(text, max_words=40):
    """Return the leading sentence of ``text``, truncated to ``max_words`` words.

    The "sentence" is everything before the first ``'.'`` of the stripped
    text (the whole text if there is no period). Words are whitespace
    separated and re-joined with single spaces.
    """
    head, _, _ = text.strip().partition('.')
    tokens = head.split()
    return " ".join(tokens[:max_words])

# Baseline prediction: the first sentence of each test abstract serves as its title
baseline_titles = test_df['abstract'].apply(extract_first_sentence)
submission_df = pd.DataFrame({
    "abstract": test_df['abstract'],
    "title": baseline_titles,
})

Заменить заголовки для пересечения тестовой и тренировочной выборки на настоящие заголовки

In [6]:
# Look up the true title for every test abstract that also occurs in train.
# np.intersect1d(..., return_indices=True) reports only the FIRST instance of each
# value, so a test abstract that appears more than once would keep its baseline
# title on every repeat. A key -> title mapping covers all occurrences and does
# not depend on the frames having a default 0..n-1 index.
train_title_by_abstract = (
    train_df.assign(abstract_key=train_df['abstract'].str.lower())
            .dropna(subset=['abstract_key'])
            # keep the first train row per abstract, as intersect1d would pick
            .drop_duplicates(subset='abstract_key', keep='first')
            .set_index('abstract_key')['title']
)
leaked_titles = submission_df['abstract'].str.lower().map(train_title_by_abstract)

# Overwrite only rows with a known title; all others keep the first-sentence baseline
has_known_title = leaked_titles.notna()
submission_df.loc[has_known_title, 'title'] = leaked_titles[has_known_title]

Оценка такой $H_0$
- чистой

In [7]:
# Fixed seed: without it every re-run scores a different random sample,
# so the BLEU value shown below the cell could not be reproduced
sample_df = train_df.sample(1000, random_state=42)
sample_df["candidate"] = sample_df['abstract'].apply(extract_first_sentence)

# bleu_score takes tokenized candidates and, per candidate, a list of tokenized references
candidat_corpus = [c.split() for c in sample_df["candidate"].tolist()]
ref_corpus = [[r.split()] for r in sample_df["title"].tolist()]

# BLEU over 1- to 3-grams with equal weights
bleu_score(candidat_corpus, ref_corpus, max_n=3, weights=[1/3]*3)

0.0734657347202301

- с заменой

In [8]:
# Fixed seed so the sampled rows and the resulting score are reproducible
sample_df = train_df.sample(1000, random_state=42)
sample_df["candidate"] = sample_df['abstract'].apply(extract_first_sentence)

# Simulate the leak: give the true title to the same share of rows as the share of
# test abstracts found in train (derived from the intersection, not hard-coded)
leak_share = len(intersect_idx[0]) / len(test_df)
n_leaked = round(leak_share * len(sample_df))
bug_idx = sample_df["candidate"].sample(n_leaked, random_state=42).index
sample_df.loc[bug_idx, "candidate"] = sample_df.loc[bug_idx, "title"].values

# bleu_score takes tokenized candidates and, per candidate, a list of tokenized references
candidat_corpus = [c.split() for c in sample_df["candidate"].tolist()]
ref_corpus = [[r.split()] for r in sample_df["title"].tolist()]

# BLEU over 1- to 3-grams with equal weights
bleu_score(candidat_corpus, ref_corpus, max_n=3, weights=[1/3]*3)

0.29631006717681885

submission

In [9]:
PREFIX = "bug"

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing (no-op when it is already there)
Path(f"./submission/{PREFIX}_submission").mkdir(parents=True, exist_ok=True)

# Dump the predicted titles to CSV
submission_df.to_csv(f"./submission/{PREFIX}_submission/predicted_titles.csv", index=False)

# Hand the predictions and the vocabulary file to the project helper, which writes submission.csv
generate_csv(input_file=f'./submission/{PREFIX}_submission/predicted_titles.csv', 
             output_file=f'./submission/{PREFIX}_submission/submission.csv', 
             voc_file=f'./datasets/vocs.pkl')

# **Другой вариант**
- заменить на последовательность самых частотных токенов

In [14]:
# Abstracts that occur verbatim in both the test and the train split
test_samples_from_train = set(test_df['abstract']) & set(train_df['abstract'])

# Train rows whose abstract also appears in the test split
leaked_rows = train_df['abstract'].isin(test_samples_from_train)
wtf_df = train_df.loc[leaked_rows]

wtf_df.describe()

Unnamed: 0,abstract,title
count,433,433
unique,430,433
top,"discussion of ""instrumental variables: an econ...",some contra-arguments for the use of stable di...
freq,3,1


In [15]:
# Most frequent abstract among the leaked train rows
# (despite its name, the variable holds an abstract, not a title)
bugged_title = wtf_df['abstract'].mode()[0]
# https://arxiv.org/pdf/1410.0163.pdf
# Show every train row that carries this abstract
wtf_df.loc[wtf_df['abstract'].eq(bugged_title)]

Unnamed: 0,abstract,title
13758,"discussion of ""instrumental variables: an econ...",ace bounds; sems with equilibrium conditions
24872,"discussion of ""instrumental variables: an econ...","think globally, act globally: an epidemiologis..."
101295,"discussion of ""instrumental variables: an econ...",causal graphs: addressing the confounding prob...


In [16]:
# Drop the rows of the abstract found in the previous step
wtf_df = wtf_df.loc[wtf_df['abstract'].ne(bugged_title)]

# Most frequent abstract among the remaining rows
uncertain_title = wtf_df['abstract'].mode()[0]

# Show every row that carries it
wtf_df.loc[wtf_df['abstract'].eq(uncertain_title)]

Unnamed: 0,abstract,title
11430,"to appear to mcmc handbook, s. p. brooks, a. g...",reversible jump markov chain monte carlo
90696,"to appear to mcmc handbook, s. p. brooks, a. g...",likelihood-free markov chain monte carlo


In [17]:
# Remove the rows of the second ambiguous abstract and renumber the index from 0
unambiguous_rows = wtf_df['abstract'].ne(uncertain_title)
wtf_df = wtf_df.loc[unambiguous_rows].reset_index(drop=True)

In [18]:
# Tokenize every training title once; the tokens feed both the length
# statistic and the word-frequency count below
title_tokens = [title.split() for title in train_df['title'].values]
words = [token for tokens in title_tokens for token in tokens]

mean_title_length = np.mean([len(tokens) for tokens in title_tokens])
print('Mean title length is', mean_title_length)

# Filler title: the k most frequent title tokens, with k = rounded mean title length.
# most_common(k) selects the top k directly instead of sorting the whole vocabulary.
most_frequently_words = collections.Counter(words).most_common(round(mean_title_length))
nan_fill_value = ' '.join(word for word, _ in most_frequently_words)
print('BEST TITLE EVER:')
nan_fill_value

Mean title length is 9.523820346012897
BEST TITLE EVER:


'of the and in for a on with to model'

In [22]:
# Attach the known train title to each test abstract. validate='many_to_one' makes
# pandas raise if an abstract maps to several rows of wtf_df, which would otherwise
# silently add rows to the submission.
submission_df = pd.merge(test_df, wtf_df, on='abstract', how='left', validate='many_to_one')

# Use the filler only for missing titles; fillna on the whole frame would also
# overwrite NaN values in every other column
submission_df['title'] = submission_df['title'].fillna(nan_fill_value)

Оценка такой $H_0$
- чистой (фигня, bleu это не проведет, F1-меру - наверно)

In [23]:
# Fixed seed so the evaluated sample is the same on every re-run
sample_df = train_df.sample(1000, random_state=42)
# Every row gets the same constant candidate title
sample_df["candidate"] = nan_fill_value

# bleu_score takes tokenized candidates and, per candidate, a list of tokenized references
candidat_corpus = [c.split() for c in sample_df["candidate"].tolist()]
ref_corpus = [[r.split()] for r in sample_df["title"].tolist()]

# BLEU over 1- to 3-grams with equal weights
bleu_score(candidat_corpus, ref_corpus, max_n=3, weights=[1/3]*3)

0.0

In [26]:
# Show the first tokenized candidate next to its reference list
(candidat_corpus[0], ref_corpus[0])

(['of', 'the', 'and', 'in', 'for', 'a', 'on', 'with', 'to', 'model'],
 [['search', 'for', 'rare', 'b-meson', 'decays', 'at', 'cdf']])

In [24]:
PREFIX = "bug"

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing (no-op when it is already there)
Path(f"./submission/{PREFIX}_submission").mkdir(parents=True, exist_ok=True)

# Dump the second variant of the predicted titles to CSV
submission_df.to_csv(f"./submission/{PREFIX}_submission/predicted_titles2.csv", index=False)

# Hand the predictions and the vocabulary file to the project helper, which writes submission2.csv
generate_csv(input_file=f'./submission/{PREFIX}_submission/predicted_titles2.csv', 
             output_file=f'./submission/{PREFIX}_submission/submission2.csv', 
             voc_file=f'./datasets/vocs.pkl')