In [1]:
import re
import json
import pandas as pd
from tqdm.auto import tqdm
from nltk.translate.bleu_score import sentence_bleu

In [2]:
# Location of the arXiv metadata snapshot, relative to the notebook's
# working directory; read line by line in get_metadata().
data_file = "../arxiv-metadata-oai-snapshot.json"

In [3]:
import json

In [4]:
def get_metadata(path=None):
    """Lazily yield the raw lines of the metadata snapshot.

    Args:
        path: Optional path of the file to read. When omitted (the default),
            the module-level ``data_file`` is used.

    Yields:
        str: one line of the file at a time, line terminator included.
    """
    if path is None:
        path = data_file
    # Decode explicitly as UTF-8 (the encoding JSON text is exchanged in)
    # instead of the platform default, which differs between systems and can
    # mis-decode or raise on non-ASCII characters.
    with open(path, 'r', encoding='utf-8') as f:
        yield from f

In [5]:
# The 40 arXiv Computer Science (cs.*) category codes used to select papers
# for training, e.g. cs.AI (Artificial Intelligence), cs.CV (Computer Vision
# and Pattern Recognition) and cs.LG (Machine Learning).
paper_categories = {
    "cs.AI", "cs.AR", "cs.CC", "cs.CE", "cs.CG", "cs.CL", "cs.CR", "cs.CV",
    "cs.CY", "cs.DB", "cs.DC", "cs.DL", "cs.DM", "cs.DS", "cs.ET", "cs.FL",
    "cs.GL", "cs.GR", "cs.GT", "cs.HC", "cs.IR", "cs.IT", "cs.LG", "cs.LO",
    "cs.MA", "cs.MM", "cs.MS", "cs.NA", "cs.NE", "cs.NI", "cs.OH", "cs.OS",
    "cs.PF", "cs.PL", "cs.RO", "cs.SC", "cs.SD", "cs.SE", "cs.SI", "cs.SY",
}

In [6]:
def build_dataset(categories=paper_categories):
    """Collect titles and abstracts of papers in the requested categories.

    Streams the records produced by ``get_metadata()`` (one JSON object per
    line) and keeps a paper when both conditions hold:

    * it is tagged with at least one code from ``categories``;
    * its ``journal-ref`` field is a string whose last four characters parse
      as an integer (papers without such a reference are skipped).

    Args:
        categories: Iterable of category codes to keep. Defaults to the
            module-level ``paper_categories``.

    Returns:
        pandas.DataFrame with columns ``title`` and ``abstract``. In both
        columns every run of whitespace, line breaks included, is collapsed
        to a single space.
    """
    wanted = set(categories)
    titles = []
    abstracts = []

    for line in tqdm(get_metadata()):
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        paper_dict = json.loads(line)

        # 'categories' is split on whitespace; a missing field is treated as
        # "no categories" rather than crashing on None.
        listed = (paper_dict.get('categories') or '').split()
        if wanted.isdisjoint(listed):
            continue

        # Same filter the original applied implicitly through a swallowed
        # exception: the journal reference must end in four characters that
        # int() accepts.
        journal_ref = paper_dict.get('journal-ref')
        if not isinstance(journal_ref, str):
            continue
        try:
            int(journal_ref[-4:])
        except ValueError:
            continue

        # Validate both fields before appending either one, so the two lists
        # can never end up with different lengths.
        title = paper_dict.get('title')
        abstract = paper_dict.get('abstract')
        if not isinstance(title, str) or not isinstance(abstract, str):
            continue

        # Collapse whitespace in one step. Line breaks become a space, so the
        # words on either side of a wrapped line are not glued together.
        titles.append(re.sub(r'\s+', ' ', title))
        abstracts.append(re.sub(r'\s+', ' ', abstract))

    return pd.DataFrame({'title': titles, 'abstract': abstracts})

In [7]:
# Build the title/abstract frame for the configured category set.
papers = build_dataset(categories=paper_categories)

0it [00:00, ?it/s]

In [8]:
# Arrange the frame as (input, label) pairs: the abstract becomes
# `source_text` and the title becomes `target_text`.
# NOTE(review): presumably these are the column names simplet5 expects - confirm.
papers = papers[['abstract', 'title']].rename(
    columns={'abstract': 'source_text', 'title': 'target_text'}
)

# Prepend a task prefix to every input so the kind of task being performed
# on the data is explicit, in this case --> "summarize".
papers = papers.assign(source_text="summarize: " + papers['source_text'])

In [9]:
# Display the prefixed model inputs (pandas truncates the printed Series).
papers.loc[:, 'source_text']

0        summarize:  Sparse Code Division Multiple Acce...
1        summarize:  Given a multiple-input multiple-ou...
2        summarize:  This paper discusses the benefits ...
3        summarize:  Given a bipartite graph $G = (V_1,...
4        summarize:  Honeypots are more and more used t...
                               ...                        
40778    summarize:  Is the universe computable? If so,...
40779    summarize:  Quantum key distribution is the mo...
40780    summarize:  We consider the design of self-tes...
40781    summarize:  If mutually mistrustful parties A ...
40782    summarize:  Based on a calculation of neural d...
Name: source_text, Length: 40783, dtype: object

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the papers for evaluation; the fixed random_state makes
# the split repeatable.
train_df, test_df = train_test_split(
    papers,
    test_size=0.1,
    random_state=42,
)

In [None]:
# simplet5 provides the SimpleT5 wrapper used for fine-tuning and prediction.
from simplet5 import SimpleT5

# Instantiate the wrapper, then load the "t5-base" checkpoint of model type "t5".
model = SimpleT5()
model.from_pretrained("t5", "t5-base")

# Fine-tuning call, currently commented out in this cell:
# model.train(
#     train_df=train_df,
#     eval_df=test_df,
#     source_max_token_len=512,
#     target_max_token_len=128,
#     max_epochs=100,
#     batch_size=38,
#     use_gpu=True,
#     dataloader_num_workers=32,
# )

Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Missing logger folder: /home/seclab/Project/woohyuk2/bbbbbbbbbbbbbbbb/lightning_logs

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


Training: 0it [00:00, ?it/s]

In [None]:
# Load a saved checkpoint directory (the name records epoch and losses)
# onto the GPU.
model.load_model(
    "outputs/simplet5-epoch-5-train-loss-1.3866-val-loss-1.7207",
    use_gpu=True,
)

# Quick check on a placeholder input; as the last expression of the cell,
# the prediction is shown as the cell output.
model.predict("summarize:  some text you want to test it on")

In [None]:
# Compare generated and actual titles on a few held-out abstracts.
# A fixed random_state makes the selection repeatable across runs, and the
# sample size is capped so a test_df with fewer than 10 rows does not raise.
sample_abstracts = test_df.sample(min(10, len(test_df)), random_state=42)

for _, abstract in sample_abstracts.iterrows():
    print("===== Abstract =====")
    print(abstract['source_text'])
    # predict() returns a sequence of candidates; only the first is shown.
    summary = model.predict(abstract['source_text'])[0]
    print("\n===== Actual Title =====")
    print(abstract['target_text'])
    print("\n===== Generated Title =====")
    print(summary)
    print("\n +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")

In [None]:
!pip install pyqtwebengine
!pip install pyqt5
!pip install simplet5
!pip install --upgrade simplet5