In [None]:
# Print the NVIDIA driver/GPU status of the current runtime (shell command).
!nvidia-smi

In [None]:
# Clone the peer-data repository under /content, switch into it and fetch its
# git submodules. All relative paths used later (e.g. "data/original") are
# resolved against the directory selected by the last %cd.
# NOTE(review): /content is presumably the Colab workspace root — adjust when running elsewhere.
%cd /content/
!git clone https://github.com/westphal-jan/peer-data
%cd /content/peer-data
# !git checkout huggingface
!git submodule update --init --recursive

In [None]:
# !pip install pytorch-lightning wandb python-dotenv catalyst sentence-transformers numpy requests nlpaug sentencepiece nltk
# !pip install wandb nltk

In [None]:
import os
import torch
import json
import glob
from pathlib import Path
from tqdm import tqdm
# from sklearn.model_selection import train_test_split
# from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback, TrainerState, TrainerControl, AdamW
# import wandb
# from datetime import datetime
# import pickle
import numpy as np
from torch import nn, optim
# import nlpaug.augmenter.word as naw
from torch.utils.data import DataLoader
# from copy import copy
from datetime import datetime
# from nltk.corpus import stopwords
# nltk.download("stopwords")
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from collections import defaultdict, Counter

In [None]:
class PaperDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs per-sample encodings with their labels.

    `encodings` maps a feature name to an indexable collection of per-sample
    values; `labels` is an indexable collection whose length defines the size
    of the dataset.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is taken from the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Each encoding is turned into a float tensor; the label tensor keeps
        # whatever dtype torch infers for it.
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx]).float()
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
def raw_read_dataset(data_dir: Path, num_texts=None):
    """Load the parsed JSON content of every `*.json` file directly inside `data_dir`.

    Args:
        data_dir: Directory that holds the JSON files.
        num_texts: Optional cap on the number of files to read; `None` reads all.

    Returns:
        A list with one parsed JSON object per file, ordered by file path.
    """
    # glob gives no ordering guarantee, so sort the paths: this keeps the
    # result (and any `num_texts` subset taken from it) identical across runs
    # and machines.
    file_paths = sorted(glob.glob(f"{data_dir}/*.json"))
    if num_texts is not None:
        file_paths = file_paths[:num_texts]
    raws = []
    for file_path in tqdm(file_paths):
        with open(file_path, encoding="utf-8") as f:
            raws.append(json.load(f))
    return raws

def read_dataset(data_dir: Path, num_texts=None, text_key=""):
    """Read abstracts, section lists and acceptance labels from the JSON files in `data_dir`.

    Args:
        data_dir: Directory that holds one JSON file per submission.
        num_texts: Optional cap on the number of files to read; `None` reads all.
        text_key: Currently unused; kept so existing callers keep working.

    Returns:
        Three parallel lists `(abstracts, sections, labels)`, ordered by file
        path: the abstract, the section list (an empty list when the file has
        no sections) and the acceptance flag converted to `int`.
    """
    # glob gives no ordering guarantee, so sort the paths: index-based splits
    # built on the returned lists are only reproducible if the order is stable.
    file_paths = sorted(glob.glob(f"{data_dir}/*.json"))
    if num_texts is not None:
        file_paths = file_paths[:num_texts]
    abstracts = []
    sections = []
    labels = []
    for file_path in tqdm(file_paths):
        with open(file_path, encoding="utf-8") as f:
            paper_json = json.load(f)
        review = paper_json["review"]
        # Missing/empty section metadata is normalised to an empty list.
        paper_sections = paper_json["pdf"]["metadata"]["sections"] or []

        abstracts.append(review["abstract"])
        labels.append(int(review["accepted"]))
        sections.append(paper_sections)
    return abstracts, sections, labels

In [None]:
# Load the raw JSON of every submission found under "data/original".
# The path is relative, so it depends on the current working directory.
dataset = "data/original"
data_dir = Path(dataset)
raw_submissions = raw_read_dataset(data_dir)

In [None]:
# Pull the abstract and the parsed section list out of every raw submission.
abstracts = [submission["review"]["abstract"] for submission in raw_submissions]
sections = [submission["pdf"]["metadata"]["sections"] for submission in raw_submissions]

# Only the last expression of a cell is displayed, so the count goes last.
len(raw_submissions)

In [None]:
# Word count per abstract. `str.split()` without an argument splits on any run
# of whitespace, so repeated spaces or newlines do not inflate the count and an
# empty abstract counts as 0 words (`split(" ")` would report 1).
words_abstract = [len(abstract.split()) for abstract in abstracts]
print(min(words_abstract), max(words_abstract))

n_bins = 30
plt.figure()
# Uniform weights of 1/N turn the bar heights into fractions of all abstracts,
# which the PercentFormatter below renders as percentages.
plt.hist(words_abstract, weights=np.ones(len(words_abstract)) / len(words_abstract), bins=n_bins)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.title("Number of words in abstracts")
plt.xlabel("Number of words")
plt.ylabel("Occurrences")
plt.show()
# plt.savefig(f"abstract_words.png", dpi=300)

In [None]:
# Positions (within `raw_submissions`) of the accepted and of the rejected papers.
accepted = [pos for pos, sub in enumerate(raw_submissions) if sub["review"]["accepted"]]
not_accepted = [pos for pos, sub in enumerate(raw_submissions) if not sub["review"]["accepted"]]
print(len(accepted), len(not_accepted))

In [None]:
# Select the section lists of accepted / rejected papers by indexing an
# object-dtype array of all section lists with the index lists built above.
accepted_sections, rejected_sections = (
    np.array(sections, dtype=object)[indices] for indices in (accepted, not_accepted)
)
print(accepted_sections[0])

In [None]:
def get_length_to_occurrence(sections, percent=True):
    """Count how many papers have each number of sections.

    Args:
        sections: One entry per paper; each entry is a sized collection of
            sections, or a falsy value (e.g. `None`) which counts as 0 sections.
        percent: When true, report every count as a fraction of the number of
            papers, so the values sum to 1.

    Returns:
        `(count, total_sections)`: `count` maps a section count to the number
        (or fraction) of papers with that many sections, keys in ascending
        order; `total_sections` is the sum of all per-paper section counts.
    """
    num_sections = [len(paper_sections) if paper_sections else 0 for paper_sections in sections]
    total_sections = sum(num_sections)
    count = dict(sorted(Counter(num_sections).items()))
    if percent:
        # Normalise by the number of papers, not by the number of sections:
        # `count` tallies papers, and dividing by `total_sections` neither sums
        # to 1 nor survives the case where every paper has 0 sections.
        num_papers = len(num_sections)
        count = {length: occurrences / num_papers for length, occurrences in count.items()}
    return count, total_sections

percent = True
accepted_count, num_accepted_sections = get_length_to_occurrence(accepted_sections, percent)
rejected_count, num_rejected_sections = get_length_to_occurrence(rejected_sections, percent)
# Report the section totals of both groups side by side.
print(num_accepted_sections, num_rejected_sections)

# One line per group: x = number of sections in a paper, y = how often it occurs.
plt.figure()
plt.plot(list(accepted_count.keys()), list(accepted_count.values()), label="Accepted")
plt.plot(list(rejected_count.keys()), list(rejected_count.values()), label="Rejected")
if percent:
    # Values are fractions in [0, 1]; show them as percentages.
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.title("Number of sections")
plt.xlabel("Number of sections")
plt.ylabel("Occurrences")
plt.legend()
plt.show()

In [None]:
# num_sections = list(map(lambda x: len(x) if x else 0, sections))
# print(max(num_sections))
# print(sorted(num_sections, reverse=True)[:100])
# num_usable = num_sections.count(0)
# print(num_usable, len(raw_submissions) - num_usable)

# counts = defaultdict(int)
# for s in num_sections:
#     counts[s] += 1
# counts = {k: counts[k] for k in sorted(counts)}
# print(counts)

# n_bins = 3*len(set(num_sections))
# plt.figure()
# plt.hist(num_sections, weights=np.ones(len(raw_submissions)) / len(raw_submissions), bins=n_bins)
# # plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
# plt.title(f"Number of sections")
# plt.xlabel("Number of sections")
# plt.ylabel("Occurrences")
# plt.show()

In [None]:
dataset = "data/original"
data_dir = Path(dataset)

# Parse the submissions, then wrap each list in a NumPy array so that it can be
# indexed with a list of positions later on.
abstracts, sections, labels = read_dataset(data_dir)
abstracts = np.array(abstracts)
# Section lists are stored with dtype=object.
sections = np.array(sections, dtype=object)
labels = np.array(labels)

# num_accepted = len(list(filter(lambda x: x == 1, labels)))
# num_not_accepted = len(list(filter(lambda x: x == 0, labels)))

# print(num_accepted, num_not_accepted)
# label_weight = num_not_accepted / np.array([num_not_accepted, num_accepted])

# # Get random index split for train/val/test.
# idx = list(range(len(texts)))
# # Get constant split across runs
# rnd = np.random.RandomState(42)
# rnd.shuffle(idx)
# total_len = len(idx)
# train_len, val_len = int(0.8*total_len), int(0.1*total_len)
# train_idx = idx[:train_len]
# val_idx = idx[train_len:(train_len + val_len)]
# test_idx = idx[(train_len + val_len):]

# train_texts, train_labels = texts[train_idx], labels[train_idx]
# val_texts, val_labels = texts[val_idx], labels[val_idx]
# text_texts, test_labels = texts[test_idx], labels[test_idx]

In [None]:
# Paper-level class balance (labels are 1 = accepted, 0 = not accepted).
num_accepted = int((labels == 1).sum())
num_not_accepted = int((labels == 0).sum())

print(num_accepted, num_not_accepted)

# Get random index split for train/val/test.
# The split is taken over the loaded abstracts; `sections` and `labels` are
# parallel arrays, so the same indices select the matching rows in all three.
idx = list(range(len(abstracts)))
# Get constant split across runs
rnd = np.random.RandomState(42)
rnd.shuffle(idx)
total_len = len(idx)
# 80% train, 10% validation, remainder test.
train_len, val_len = int(0.8*total_len), int(0.1*total_len)
train_idx = idx[:train_len]
val_idx = idx[train_len:(train_len + val_len)]
test_idx = idx[(train_len + val_len):]

train_abstracts, train_sections, train_labels = abstracts[train_idx], sections[train_idx], labels[train_idx]
val_abstracts, val_sections, val_labels = abstracts[val_idx], sections[val_idx], labels[val_idx]
test_abstracts, test_sections, test_labels = abstracts[test_idx], sections[test_idx], labels[test_idx]

In [None]:
# Display the section list of the first paper in the training split.
train_sections[0]

In [None]:
def extract_sections(sections, labels):
    """Flatten per-paper section lists into section texts with matching labels.

    Args:
        sections: One entry per paper; each entry is a sequence of dicts with a
            "text" key, or an empty sequence / `None` for a paper without sections.
        labels: One label per paper, aligned with `sections`.

    Returns:
        `(all_sections, all_labels)`: two parallel lists in which every section
        text is paired with the label of the paper it belongs to.
    """
    all_sections = []
    all_labels = []
    for paper_sections, label in zip(sections, labels):
        # Skip papers without sections. The check has to look at this paper's
        # own entry; testing the whole `sections` collection never triggers
        # inside the loop and lets a `None` entry crash below.
        if paper_sections is None or len(paper_sections) == 0:
            continue
        section_texts = [section["text"] for section in paper_sections]
        all_sections.extend(section_texts)
        all_labels.extend([label] * len(section_texts))
    return all_sections, all_labels

# Flatten the train/validation papers into one (section text, label) pair per section.
flattened_train_sections, flattened_train_labels = extract_sections(train_sections, train_labels)
flattened_val_sections, flattened_val_labels = extract_sections(val_sections, val_labels)
print(len(flattened_train_sections), len(flattened_val_sections))

# Section-level class counts of the training split.
num_accepted = flattened_train_labels.count(1)
num_rejected = flattened_train_labels.count(0)
print(num_accepted, num_rejected)

# Per-class weights indexed by label: [1.0, num_rejected / num_accepted].
label_weight = num_rejected / np.array([num_rejected, num_accepted])

In [None]:
# Display the per-class weights computed from the flattened training labels.
label_weight

In [None]:
# Tokenize the train/validation texts and wrap them, together with the labels,
# in PaperDataset instances under the "tokens" feature name.
# NOTE(review): `tokenize`, `token_to_id`, `train_texts` and `val_texts` are not
# defined in any cell above — on a fresh kernel this cell raises NameError.
# NOTE(review): the split cell produces `train_abstracts` / `val_abstracts` and the
# flattening cell produces `flattened_*_sections` with `flattened_*_labels`; confirm
# which texts are meant here and that the labels passed below have the same length.
train_encodings = list(map(lambda x: tokenize(x, token_to_id, True), train_texts))
val_encodings = list(map(lambda x: tokenize(x, token_to_id, True), val_texts))

train_dataset = PaperDataset({"tokens": train_encodings}, train_labels)
val_dataset = PaperDataset({"tokens": val_encodings}, val_labels)

In [None]:
# Display the first training sample (dict with "tokens" and "labels" tensors).
train_dataset[0]