In [None]:
# Report the GPU(s) visible to this runtime before any training is started.
!nvidia-smi

In [None]:
# Clone the `reduced-data` branch of the peer-data repository under /content and switch into it.
%cd /content/
!git clone -b reduced-data https://github.com/westphal-jan/peer-data
%cd /content/peer-data
# !git checkout huggingface
# Fetch the repository's submodules (recursively) as well.
!git submodule update --init --recursive

In [None]:
# !pip install pytorch-lightning wandb python-dotenv catalyst sentence-transformers numpy requests
!pip install wandb transformers nltk pytorch-lightning

In [None]:
import os
import torch
import json
import glob
from pathlib import Path
from tqdm import tqdm
# from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback, TrainerState, TrainerControl, AdamW, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoModelWithLMHead, T5ForConditionalGeneration
import wandb
from datetime import datetime
import pickle
import numpy as np
# import nlpaug.augmenter.word as naw
from torch.utils.data import DataLoader, WeightedRandomSampler
from torch import nn, optim
import copy
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from collections import defaultdict, Counter
from nltk.corpus import stopwords
import nltk
nltk.download("stopwords")

In [None]:
class PaperDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing per-example features with a label.

    Args:
        encodings: mapping from feature name to an indexable collection holding one
            entry per example (anything exposing ``.items()``).
        labels: one label per example; its length defines the dataset length.

    Each item is a dict with one tensor per feature name plus a float32 ``'labels'``
    tensor. Floating-point features are returned as float32; all other features keep
    their dtype, so integer token ids / masks remain usable as embedding indices
    instead of being turned into floats.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_feature_tensor(value):
        """Convert one feature value to a tensor, normalising only float data to float32."""
        tensor = torch.tensor(value)
        return tensor.float() if tensor.is_floating_point() else tensor

    def __getitem__(self, idx):
        item = {key: self._to_feature_tensor(val[idx]) for key, val in self.encodings.items()}
        # Labels stay float32, exactly as before.
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)

def raw_read_dataset(data_dir: Path, num_texts=None):
    """Load the parsed content of every ``*.json`` file directly inside ``data_dir``.

    Args:
        data_dir: directory to scan (not recursive).
        num_texts: if not None, only the first ``num_texts`` files are read.

    Returns:
        A list with one parsed JSON object per file, ordered by file path.
    """
    # glob returns files in an OS-dependent order; sorting makes both the result order and
    # the subset selected by `num_texts` reproducible.
    file_paths = sorted(glob.glob(f"{data_dir}/*.json"))
    if num_texts is not None:
        file_paths = file_paths[:num_texts]
    raws = []
    for file_path in tqdm(file_paths):
        with open(file_path) as f:
            raws.append(json.load(f))
    return raws

def read_dataset(data_dirs, num_texts=None, restrict_file=None):
    """Read abstracts, section lists and acceptance labels from paper JSON files.

    Args:
        data_dirs: a single directory (str / path-like) or an iterable of directories;
            every ``*.json`` file directly inside each of them is considered.
        num_texts: if not None, only the first ``num_texts`` files (after filtering)
            are read.
        restrict_file: optional path to a text file with one file name per line; only
            JSON files whose base name appears in it are kept.

    Returns:
        ``(abstracts, sections, labels)`` - three lists aligned by index. ``sections``
        holds a list per paper (empty when the JSON has no sections) and ``labels``
        holds ``int(accepted)``.

    Raises:
        KeyError: if a JSON file lacks ``review.accepted``, ``review.abstract`` or
            ``pdf.metadata.sections``.
    """
    if isinstance(data_dirs, (str, os.PathLike)):
        data_dirs = [data_dirs]

    file_paths = []
    for data_dir in data_dirs:
        # glob order is OS-dependent; sort so results (and `num_texts` subsets) are reproducible.
        file_paths.extend(sorted(glob.glob(f"{data_dir}/*.json")))

    if restrict_file:
        with open(restrict_file, "r") as f:
            # A set gives O(1) membership checks instead of scanning a list per path.
            allowed_names = set(f.read().splitlines())
        file_paths = [p for p in file_paths if os.path.basename(p) in allowed_names]

    if num_texts is not None:
        file_paths = file_paths[:num_texts]

    abstracts = []
    sections = []
    labels = []
    for file_path in tqdm(file_paths):
        with open(file_path) as f:
            paper_json = json.load(f)
        accepted = paper_json["review"]["accepted"]
        abstract = paper_json["review"]["abstract"]
        _sections = paper_json["pdf"]["metadata"]["sections"]

        abstracts.append(abstract)
        labels.append(int(accepted))
        # Papers without parsed sections are kept, with an empty section list.
        sections.append(_sections if _sections else [])
    return abstracts, sections, labels

In [None]:
# Directory with the original (non-augmented) paper JSON files.
data_dir = "data/original"
# Back-translation directories; currently NOT part of `data_dirs` (the `+ augmented` suffix below is commented out).
augmented = ["data/back-translations-train-accepted", "data/back-translations-train-rejected"]
data_dirs = [data_dir]# + augmented
#8121, 122194
# Load each split via its file-name list; the abstracts (first return value) are discarded.
_, train_sections, train_labels = read_dataset(data_dirs, restrict_file="data/train.txt")
_, val_sections, val_labels = read_dataset(data_dir, restrict_file="data/val.txt")
_, test_sections, test_labels = read_dataset(data_dir, restrict_file="data/test.txt")

In [None]:
# def label_distribution(labels):
#     num_rejected, num_accepted = labels.count(0), labels.count(1)
#     print(num_rejected, num_rejected / len(labels), num_accepted, num_accepted / len(labels))

# label_distribution(train_labels)
# label_distribution(val_labels)
# label_distribution(test_labels)

# print(len(np.nonzero(list(map(lambda x: len(x), train_sections)))[0]))
# print(len(np.nonzero(list(map(lambda x: len(x), val_sections)))[0]))
# print(len(np.nonzero(list(map(lambda x: len(x), test_sections)))[0]))

In [None]:
# num_sections = list(map(lambda x: len(x), train_sections))
# num_sections = sorted(num_sections)
# len(num_sections)
# q = 0.95
# index = int(len(num_sections) * q)
# percentile = num_sections[index]
# print(q, percentile, index, len(num_sections) - index)
# print(num_sections[index:])

In [None]:
def extract_sections(sections, labels):
    """Flatten per-submission section lists into one list of section texts.

    Args:
        sections: one list per submission; each entry is a mapping with a ``"text"`` key.
        labels: one label per submission, aligned with ``sections``.

    Returns:
        ``(all_sections, all_labels, assignments)`` where ``all_sections`` holds every
        section text, ``all_labels`` repeats the submission label once per section, and
        ``assignments`` holds exactly one half-open ``(start, end)`` index range into the
        flattened lists per submission.

        A submission without sections contributes no texts and gets an empty range
        (``start == end``). It is deliberately not skipped: ``assignments`` must stay
        index-aligned with the per-submission labels, which ``compute_original_metrics``
        asserts.
    """
    all_sections = []
    all_labels = []
    assignments = []
    section_counter = 0
    for _sections, label in zip(sections, labels):
        texts = [section["text"] for section in _sections]
        all_sections.extend(texts)
        all_labels.extend([label] * len(texts))

        # Create mapping from original submission to flattened sections
        new_section_counter = section_counter + len(texts)
        assignments.append((section_counter, new_section_counter))
        section_counter = new_section_counter
    return all_sections, all_labels, assignments

# Flatten every split into per-section texts/labels. The train assignments are discarded;
# the val/test assignments are kept so section-level outputs can be mapped back to submissions.
flattened_train_sections, flattened_train_labels, _ = extract_sections(train_sections, train_labels)
flattened_val_sections, flattened_val_labels, val_assignemnts = extract_sections(val_sections, val_labels)
flattened_test_sections, flattened_test_labels, test_assignemnts = extract_sections(test_sections, test_labels)
print(len(flattened_train_sections), len(flattened_val_sections), len(flattened_test_sections))

# Class counts over the flattened training sections (label 1 = accepted, 0 = rejected).
num_accepted, num_rejected = flattened_train_labels.count(1), flattened_train_labels.count(0)
print(num_accepted, num_rejected)
# Per-class weight indexed by label: index 0 (rejected) is 1.0, index 1 (accepted) is num_rejected / num_accepted.
label_weight = num_rejected / np.array([num_rejected, num_accepted])
print(label_weight)

In [None]:
# Tokenizer matching the checkpoint name 'sentence-transformers/paraphrase-TinyBERT-L6-v2'.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-TinyBERT-L6-v2')
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
# tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
# Every section is truncated/padded to exactly 512 tokens.
train_encodings = tokenizer(flattened_train_sections, truncation=True, padding="max_length", max_length=512)
val_encodings = tokenizer(flattened_val_sections, truncation=True, padding="max_length", max_length=512)
test_encodings = tokenizer(flattened_test_sections, truncation=True, padding="max_length", max_length=512)

# print(np.array(train_encodings["input_ids"]).shape)

# Wrap the encodings together with the per-section labels.
train_dataset = PaperDataset(train_encodings, flattened_train_labels)
val_dataset = PaperDataset(val_encodings, flattened_val_labels)
test_dataset = PaperDataset(test_encodings, flattened_test_labels)

In [None]:
# special_characters = set(['-', '\'', '.', ',', '!','"','#','$','%','&','(',')','*','+','/',':',';','<','=','>','@','[','\\',']','^','`','{','|','}','~','\t'])
# english_stopwords = set(stopwords.words('english'))

# def tokenize(text: str, token_to_id=None, encode=False):
#     for c in special_characters:
#         text = text.replace(c, ' ')
#     text = text.lower()
#     tokens = [t for t in text.split(' ') if t]
#     tokens = [t for t in tokens if t.isalpha()]
#     tokens = [t for t in tokens if not t in english_stopwords]
#     if token_to_id:
#         tokens = [t for t in tokens if t in token_to_id]
#         if encode:
#             tokens = [token_to_id[t] for t in tokens]
#             onehot = np.zeros(len(token_to_id))
#             onehot[tokens] = 1
#             return onehot
#     return tokens

# train_tokens = list(map(tokenize, flattened_train_sections))
# flattened_tokens = [item for sublist in train_tokens for item in sublist]
# vocab = sorted(list(set(flattened_tokens)))
# token_to_id = {t: i for i, t in enumerate(vocab)}
# val_tokens = list(map(lambda x: tokenize(x, token_to_id), flattened_val_sections))
# test_tokens = list(map(lambda x: tokenize(x, token_to_id), flattened_test_sections))
# print("Vocab Size:", len(vocab))

In [None]:
# Optional bag-of-words features. `tokenize` and `token_to_id` are only defined in the
# commented-out vocabulary cell of this notebook, so running this code unconditionally raises
# NameError on a fresh kernel; it would also replace the encodings/datasets built with
# `tokenizer(...)`. It is therefore disabled by default.
# To use it, re-enable the vocabulary cell first and set USE_BOW_FEATURES = True.
USE_BOW_FEATURES = False

if USE_BOW_FEATURES:
    train_encodings = list(map(lambda x: tokenize(x, token_to_id, True), flattened_train_sections))
    val_encodings = list(map(lambda x: tokenize(x, token_to_id, True), flattened_val_sections))
    test_encodings = list(map(lambda x: tokenize(x, token_to_id, True), flattened_test_sections))

    train_dataset = PaperDataset({"tokens": train_encodings}, flattened_train_labels)
    val_dataset = PaperDataset({"tokens": val_encodings}, flattened_val_labels)
    test_dataset = PaperDataset({"tokens": test_encodings}, flattened_test_labels)

In [None]:
# One sampling weight per training section, looked up from its class label.
samples_weight = torch.Tensor([label_weight[label] for label in flattened_train_labels])

# Each epoch draws as many samples as there are training sections, proportionally to the weights.
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

In [None]:
def compute_metrics(logits, labels, prefix="eval"):
    """Compute binary-classification metrics from raw model outputs.

    Args:
        logits: model outputs, one entry per sample. Either a single score per sample
            (shape ``(N,)`` or ``(N, 1)``), where a score ``>= 0`` predicts the positive
            class, or one score per class (shape ``(N, C)`` with ``C > 1``), where the
            highest-scoring class is predicted and class index 1 counts as positive.
            NaN scores never predict the positive class.
        labels: ground-truth labels (1 = positive, 0 = negative), one per sample.
        prefix: prepended to every returned key as ``"{prefix}/..."``.

    Returns:
        Dict of plain Python numbers: accuracy, precision, recall, f1 and mcc under
        ``metric/*`` and the confusion-matrix counts under ``classification/*``.
        Ratios whose denominator is zero are reported as 0.0.

    Raises:
        ValueError: if the number of predictions does not match the number of labels.
    """
    scores = np.asarray(logits, dtype=float)
    actual = np.asarray(labels)

    if scores.ndim > 1 and scores.shape[-1] > 1:
        # One score per class: predict the arg-max class instead of thresholding each column.
        predicted_positive = scores.argmax(axis=-1) == 1
    else:
        if scores.ndim > 1:
            scores = scores.squeeze(-1)
        predicted_positive = scores >= 0

    if predicted_positive.shape != actual.shape:
        raise ValueError(
            f"Got predictions of shape {predicted_positive.shape} for labels of shape {actual.shape}"
        )

    actual_positive = actual == 1
    actual_negative = actual == 0

    # Convert to Python ints: the MCC denominator below is a product of four counts and
    # silently overflows int64 for large evaluation sets.
    tp = int((predicted_positive & actual_positive).sum())
    fp = int((predicted_positive & actual_negative).sum())
    fn = int((~predicted_positive & actual_positive).sum())
    tn = int((~predicted_positive & actual_negative).sum())

    total = tp + tn + fp + fn
    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    accuracy = (tp + tn) / total if total > 0 else 0.0
    # MCC is undefined when any marginal count is zero; report 0.0 in that case.
    mcc_denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    mcc = (tp * tn - fp * fn) / mcc_denominator if mcc_denominator > 0 else 0.0
    p = tp + fn
    n = tn + fp

    metrics = {"metric/accuracy": accuracy, "metric/precision": precision, "metric/recall": recall, "metric/f1": f1, "metric/mcc": mcc,
            "classification/tp": tp, "classification/fp": fp, "classification/fn": fn, "classification/tn": tn, "classification/n": n, "classification/p": p}
    metrics = {f"{prefix}/{key}": metric for key, metric in metrics.items()}
    return metrics

def compute_original_metrics(logits, original_labels, assignments, prefix="eval"):
    """Aggregate per-section logits per submission and compute submission-level metrics.

    Args:
        logits: per-section model outputs in flattened order.
        original_labels: one label per submission.
        assignments: one half-open ``(start, end)`` range into ``logits`` per submission,
            aligned with ``original_labels``.
        prefix: forwarded to ``compute_metrics``.

    Returns:
        The dict produced by ``compute_metrics`` for the per-submission mean logits.

    Raises:
        AssertionError: if ``original_labels`` and ``assignments`` differ in length.
    """
    assert len(original_labels) == len(assignments)
    all_logits = np.asarray(logits, dtype=float)
    new_logits = []
    for start_idx, end_idx in assignments:
        _assignment = all_logits[start_idx:end_idx]
        if len(_assignment) == 0:
            # A submission without sections has nothing to average. Emit a NaN placeholder
            # with the per-section shape so the aggregated logits stay rectangular (the
            # mean of an empty slice would be a bare scalar NaN plus a RuntimeWarning).
            reduced_logits = np.full(all_logits.shape[1:], np.nan)
        else:
            # TODO: Sum could also be possible but should only change confidence and not the outcome
            reduced_logits = _assignment.mean(axis=0)
        new_logits.append(reduced_logits.tolist())
    return compute_metrics(new_logits, original_labels, prefix)

# Smoke test for compute_metrics: two samples, each with two logit values.
logits = [[1.0, 2.5], [1.3, 0.7]]
labels = [0, 1]

result = compute_metrics(logits, labels)
print(result)

# Smoke test for compute_original_metrics: four section-level rows grouped into two submissions
# (rows 0-1 and rows 2-3), scored against the same two labels.
logits = [[1.0, 2.5], [1.3, 0.7], [1.0, 2.5], [1.0, 2.5]]
assignments = [(0, 2), (2, 4)]
result = compute_original_metrics(logits, labels, assignments)
print(result)

In [None]:
# class MyModel(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.base_model = T5ForConditionalGeneration.from_pretrained("t5-base")
#         self.classifier = nn.Linear(768, 2, bias=False)
    
#     def forward(self, x):
#         print(x)
#         emb = self.base_model(**x)
#         print(emb.shape)
#         y = self.classifier(emb)
#         print(y.shape)
#         return y

# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# model = MyModel()
# model.to(device)

# loader = DataLoader(train_dataset, batch_size=4, shuffle=False)
# for _batch in loader:
#     inputs = {key: val.to(device) for key, val in _batch.items()}
#     labels = inputs.pop("labels")
#     y = model(inputs)
#     break

In [None]:
# from klib.misc import kdict

# class BowClassifier(nn.Module):
#     def __init__(self, input_size, output_size=1):
#         super().__init__()
#         self.input_size = input_size
#         self.output_size = output_size

#         self.classifier = nn.Linear(self.input_size, self.output_size, bias=False)

#     def forward(self, tokens):
#         y = self.classifier(tokens)
#         y = y.squeeze()
#         return kdict(logits=y)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# model = BowClassifier(len(vocab)).float()
model = AutoModelForSequenceClassification.from_pretrained('sentence-transformers/paraphrase-TinyBERT-L6-v2', num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained('allenai/scibert_scivocab_uncased', num_labels=2)
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=2)
model.to(device)

train_batch_size, val_batch_size = 64, 64
# train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
# Training batches are drawn through the weighted sampler instead of plain shuffling.
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=sampler)
val_loader = DataLoader(val_dataset, batch_size=val_batch_size, shuffle=False)

# optimizer = AdamW(model.parameters(), lr=5e-5)
# NOTE(review): lr=0.001 is far above the 5e-5 of the commented-out optimizer line - confirm
# this is intended for fine-tuning the pretrained model selected above.
optimizer = optim.AdamW(model.parameters(), lr=0.001)

# The active model is created with num_labels=2, i.e. it emits two logits per sample, so the
# loss is cross-entropy over class indices.
# _label_weight = torch.from_numpy(label_weight).float().to(device)
# loss_func = torch.nn.CrossEntropyLoss(_label_weight, reduction="sum")
loss_func = torch.nn.CrossEntropyLoss()

# Single-logit alternative (e.g. for the commented-out BowClassifier). BCEWithLogitsLoss needs
# logits and targets of identical shape, so it cannot be combined with the two-logit head above;
# previously `loss_func` was overwritten with it, which fails on the first batch.
pos_weight = torch.tensor(label_weight[1]).float().to(device)
weighted_loss_func = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
# loss_func = nn.BCEWithLogitsLoss()

num_epochs = 3

wandb_logging = False

run_name = datetime.now().strftime('%d-%m-%Y_%H_%M_%S')
print("Run name:", run_name)
if wandb_logging:
    wandb.login()
    wandb.init(entity="paper-judging", project="huggingface", name=run_name)

output_dir=f'results/{run_name}'
os.makedirs(output_dir)
# logging_steps, eval_steps = 200, 2000
logging_steps, eval_steps = 400, 2000
# logging_steps, eval_steps = 5, 10
_steps = 0

def run_model(_model, _batch):
    """Run one batch through ``_model`` and compute the loss with the global ``loss_func``.

    Args:
        _model: callable accepting the batch's features as keyword arguments and
            returning an object with a ``logits`` attribute.
        _batch: dict of tensors containing a ``"labels"`` entry plus the model inputs.

    Returns:
        ``(loss, logits)`` for the batch.
    """
    inputs = {key: val.to(device) for key, val in _batch.items()}
    labels = inputs.pop("labels")
    if isinstance(loss_func, nn.CrossEntropyLoss):
        # The dataset yields float labels; cross-entropy expects integer class indices.
        labels = labels.long()
    outputs = _model(**inputs)

    logits = outputs.logits
    loss = loss_func(logits, labels)
    return loss, logits

# Best-so-far bookkeeping: lowest validation loss, and the model/metrics with the highest
# submission-level validation F1.
min_val_loss, best_model, best_metrics, best_loss_epoch = None, None, None, None
best_f1, best_epoch = None, None
for epoch in range(num_epochs):
    print(f"Epoch: {epoch + 1}/{num_epochs}")
    train_losses = []
    for batch in tqdm(train_loader, position=0, leave=True):
        # Evaluation below switches to eval mode, so training mode is re-enabled every step.
        model.train()
        optimizer.zero_grad()
        train_loss, _ = run_model(model, batch)
        train_loss.backward()
        optimizer.step()
        # Keep a plain float: storing the loss tensor itself would keep device tensors (and
        # their autograd history) alive until the next logging step.
        train_losses.append(train_loss.item())

        metrics = {}
        if _steps % logging_steps == 0:
            # Mean training loss since the previous logging step.
            _train_loss = sum(train_losses) / len(train_losses)
            metrics.update({"train/loss": _train_loss, "steps": _steps})
            train_losses = []

        if _steps % eval_steps == 0:
            print("EVAL")
            model.eval()
            val_losses = []
            logits = []
            with torch.no_grad():
                for val_batch in val_loader:
                    _val_loss, _logits = run_model(model, val_batch)
                    val_losses.append(_val_loss.item())
                    logits.extend(_logits.tolist())

            val_loss = sum(val_losses) / len(val_losses)
            # Section-level metrics and metrics aggregated back to whole submissions.
            _metrics = compute_metrics(logits, flattened_val_labels, "eval/flattened")
            _original_metrics = compute_original_metrics(logits, val_labels, val_assignemnts, "eval")
            metrics["eval/loss"] = val_loss
            metrics.update(_metrics)
            metrics.update(_original_metrics)

            # The first evaluation is recorded as well, so the epoch of the best loss is
            # known even if no later evaluation improves on it.
            if min_val_loss is None or val_loss < min_val_loss:
                min_val_loss = val_loss
                best_loss_epoch = epoch

            # Model selection uses the submission-level F1. The metrics and epoch are stored
            # together with the model so they always describe the same checkpoint (including
            # the case where the very first evaluation stays the best one).
            f1 = metrics["eval/metric/f1"]
            if best_f1 is None or f1 > best_f1:
                best_f1 = f1
                best_model = copy.deepcopy(model)
                best_metrics = metrics
                best_epoch = epoch

        if metrics:
            print(metrics)
            if wandb_logging:
                wandb.log(metrics)

        _steps += 1

# Directory intended for the model snapshot; the save call below is currently commented out.
snapshot_dir = f"results/{run_name}/network-snapshot-latest"
# best_model.save_pretrained(snapshot_dir)

print(f"Best val metrics during training epoch {best_epoch}:")
print(best_metrics)
test_loader = DataLoader(test_dataset, batch_size=val_batch_size, shuffle=False)
best_model.eval()

# Re-evaluate the selected model copy on the validation and test splits without tracking gradients.
with torch.no_grad():
    logits = []
    for val_batch in val_loader:
        _, _logits = run_model(best_model, val_batch)
        logits.extend(_logits.tolist())
    # Section-level ("flattened") metrics, then metrics aggregated per submission.
    metrics = compute_metrics(logits, flattened_val_labels, "eval/flattened")
    original_metrics = compute_original_metrics(logits, val_labels, val_assignemnts, "eval")
    print("Val metrics:")
    print(metrics)
    print(original_metrics)

    logits = []
    for test_batch in test_loader:
        _, _logits = run_model(best_model, test_batch)
        logits.extend(_logits.tolist())
    metrics = compute_metrics(logits, flattened_test_labels, "test/flattened")
    original_metrics = compute_original_metrics(logits, test_labels, test_assignemnts, "test")
    print("Test metrics:")
    print(metrics)
    print(original_metrics)

if wandb_logging:
    # NOTE(review): with save_pretrained commented out above, nothing in this cell writes to
    # snapshot_dir, so this glob presumably matches no files - confirm.
    wandb.save(f"{snapshot_dir}/*", os.path.dirname(snapshot_dir))
    wandb.finish()

In [None]:
# wandb.finish()

In [None]:
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
# model.to(device)

In [None]:
# val_batch_size = 16
# val_loader = DataLoader(val_dataset, batch_size=val_batch_size, shuffle=False)
# _label_weight = torch.from_numpy(label_weight).float().to(device)
# loss_func = torch.nn.CrossEntropyLoss(_label_weight, reduction="sum")

# model.eval()
# val_losses = []
# logits = []
# with torch.no_grad():
#     for val_batch in tqdm(val_loader):
#         inputs = {key: val.to(device) for key, val in val_batch.items()}
#         _labels = inputs.pop("labels")
#         _outputs = model(**inputs)

#         _logits = _outputs.logits
#         _val_loss = loss_func(_logits, _labels)
#         val_losses.append(_val_loss.item())
#         logits.extend(_logits.tolist())

# val_loss = sum(val_losses) / len(val_losses)
# metrics = compute_metrics((logits, val_labels))

In [None]:
# snapshot_dir = f"results/{run_name}/network-snapshot-latest"
# model.save_pretrained(snapshot_dir)

# if wandb_logging:
#     wandb.save(f"{snapshot_dir}/*", os.path.dirname(snapshot_dir))
#     wandb.finish()

In [None]:
# !pip install wandb
# import wandb
# wandb.login()
# wandb.finish()

In [None]:
# from transformers import AutoTokenizer, AutoModelWithLMHead

# t5_tokenizer = AutoTokenizer.from_pretrained("t5-base")

# t5_model = AutoModelWithLMHead.from_pretrained("t5-base").to(device)

In [None]:
# input = "That's great"
# input_enc = t5_tokenizer(input, truncation=True, padding=True, return_tensors="pt")
# output = t5_model(**input_enc.to(device))
# print(output)
# print(torch.softmax(output.logits, dim=1))

In [None]:
# model_path = f"network-snapshot-latest-279194.pt"
# model = BowClassifier(len(vocab))
# model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# def find_word(word, tokens, labels):
#     total = [0, 0]
#     classes = [0, 0]
#     for i, _tokens in enumerate(tokens):
#         _tokens = set(_tokens)
#         label = labels[i]
#         if word in _tokens:
#             classes[label] += 1
#         total[label] += 1
#     # print(f"Not accepted: {classes[0]}/{total[0]} ({classes[0]/total[0]}), Accepted: {classes[1]}/{total[1]} ({classes[1]/total[1]})")
#     return classes[0]/total[0], classes[1]/total[1]

# def analyze(params, k, tokens, labels):
#     val, ind = params.topk(k)
#     for i in ind:
#         word = vocab[i]
#         rejected, accepted = find_word(word, tokens, labels)
#         n = 4
#         print(f"{word} & {np.round(params[i].item(), 3)} & {np.round(rejected*100, n)} & {np.round(accepted*100, n)}")

# tokens = train_tokens + val_tokens + test_tokens
# print(len(train_tokens), len(val_tokens), len(test_tokens))
# labels = flattened_train_labels + flattened_val_labels + flattened_test_labels
# print(len(flattened_train_labels), len(flattened_val_labels), len(flattened_test_labels))
# params = list(model.parameters())[0][0]
# k = 8
# print(len(tokens), len(labels))

# print("Positive:")
# analyze(params, k, tokens, labels)
# print()

# print("Negative:")
# analyze(-params, k, tokens, labels)
# print()

# print("Unnecessary:")
# val, ind = (-(params.abs())).topk(k)
# for i in ind:
#     print(vocab[i], params[i].item())

In [None]:
# paper_abstract = """Generative adversarial networks (GANs) have shown
# outstanding performance on a wide range of problems in
# computer vision, graphics, and machine learning, but often require numerous training data and heavy computational resources. To tackle this issue, several methods introduce a transfer learning technique in GAN training. They,
# however, are either prone to overfitting or limited to learning small distribution shifts. In this paper, we show that
# simple fine-tuning of GANs with frozen lower layers of
# the discriminator performs surprisingly well. This simple
# baseline, FreezeD, significantly outperforms previous techniques used in both unconditional and conditional GANs.
# We demonstrate the consistent effect using StyleGAN and
# SNGAN-projection architectures on several datasets of Animal Face, Anime Face, Oxford Flower, CUB-200-2011, and
# Caltech-256 datasets. The code and results are available at
# https://github.com/sangwoomo/FreezeD."""

In [None]:
# input = ["It's incredibly bad", paper_abstract, "hello", "darkness"]
# input_enc = tokenizer(input, truncation=True, padding=True, return_tensors="pt")
# output = model(**input_enc.to(device))
# print(output)
# # print(torch.softmax(output.logits, dim=1))

In [None]:
# prediction = output.logits.argmax(dim=1)
# actual = prediction
# n = tp = fp = fn = tn = 0
# tp += (prediction == 1) & (actual == 1)
# fp += (prediction == 1) & (actual == 0)
# fn += (prediction == 0) & (actual == 1)
# tn += (prediction == 0) & (actual == 0)

# LIME


In [None]:
# !pip install lime transformers

In [None]:
# import numpy as np
# import lime
# import torch
# import torch.nn.functional as F
# from lime.lime_text import LimeTextExplainer

# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
# model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
# class_names = ['positive','negative', 'neutral']

# def predictor(texts):
#     print(len(texts))
#     outputs = model(**tokenizer(texts, return_tensors="pt", padding=True))
#     probas = F.softmax(outputs.logits).detach().numpy()
#     return probas

# explainer = LimeTextExplainer(class_names=class_names)

# str_to_predict = "surprising increase in revenue in spite of decrease in market share"
# exp = explainer.explain_instance(str_to_predict, predictor, num_features=20, num_samples=100)
# exp.show_in_notebook(text=str_to_predict)