In [None]:
# Report the NVIDIA GPU (if any) that is visible to this runtime.
!nvidia-smi

In [None]:
# Clone the `reduced-data` branch of the peer-data repository into /content
# and make the checkout the working directory; later cells use paths that are
# relative to it (e.g. "data/original").
%cd /content/
!git clone -b reduced-data https://github.com/westphal-jan/peer-data
%cd /content/peer-data
# !git checkout huggingface
# Fetch the repository's git submodules, including nested ones.
!git submodule update --init --recursive

In [None]:
# !pip install pytorch-lightning wandb python-dotenv catalyst sentence-transformers numpy requests nlpaug sentencepiece nltk
!pip install wandb nltk

In [None]:
import os
import torch
import json
import glob
from pathlib import Path
from tqdm import tqdm
# from sklearn.model_selection import train_test_split
# from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback, TrainerState, TrainerControl, AdamW
import wandb
# from datetime import datetime
# import pickle
import numpy as np
from torch import nn, optim
# import nlpaug.augmenter.word as naw
from torch.utils.data import DataLoader, WeightedRandomSampler
# from copy import copy
import copy
from datetime import datetime
from nltk.corpus import stopwords
import nltk
nltk.download("stopwords")

In [None]:
class PaperDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-computed feature sequences and their labels.

    `encodings` maps a feature name to an indexable collection with one entry
    per sample; `labels` is an indexable collection of the same length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Every feature is converted to a float tensor; the label keeps the
        # dtype torch infers for it and is stored under the 'labels' key.
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx]).float()
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
def read_dataset(data_dir: Path, num_texts=None, restrict_file=None):
    """Load abstracts and acceptance labels from the JSON files in `data_dir`.

    Args:
        data_dir: Directory whose `*.json` files each hold one paper; every file
            must provide `["review"]["abstract"]` and `["review"]["accepted"]`.
        num_texts: If not None, only the first `num_texts` files (in sorted
            path order, after the optional name filter) are read.
        restrict_file: Optional path to a text file listing one file name per
            line; only JSON files whose base name is listed are read.

    Returns:
        A `(texts, labels)` tuple of equally long lists: the abstracts and the
        acceptance flags converted to `int`.
    """
    # glob returns paths in arbitrary, filesystem-dependent order; sorting makes
    # the sample order (and the `num_texts` subset) reproducible across runs.
    file_paths = sorted(glob.glob(f"{data_dir}/*.json"))
    if restrict_file:
        with open(restrict_file, "r") as f:
            # A set gives O(1) membership tests for the filter below.
            filter_file_names = set(f.read().splitlines())
        # basename() handles the platform's path separator, unlike split("/").
        file_paths = [p for p in file_paths if os.path.basename(p) in filter_file_names]

    if num_texts is not None:
        file_paths = file_paths[:num_texts]

    texts = []
    labels = []
    for file_path in tqdm(file_paths):
        with open(file_path, encoding="utf-8") as f:
            paper_json = json.load(f)
        accepted = paper_json["review"]["accepted"]
        abstract = paper_json["review"]["abstract"]

        texts.append(abstract)
        labels.append(int(accepted))
    return texts, labels

In [None]:
# dataset = "data/original"

# data_dir = Path(dataset)
# texts, labels = read_dataset(data_dir)
# texts, labels = np.array(texts), np.array(labels)

# num_accepted = len(list(filter(lambda x: x == 1, labels)))
# num_not_accepted = len(list(filter(lambda x: x == 0, labels)))

# print(num_accepted, num_not_accepted)
# label_weight = num_not_accepted / np.array([num_not_accepted, num_accepted])

# # Get random index split for train/val/test.
# idx = list(range(len(texts)))
# # Get constant split across runs
# rnd = np.random.RandomState(42)
# rnd.shuffle(idx)
# total_len = len(idx)
# train_len, val_len = int(0.8*total_len), int(0.1*total_len)
# train_idx = idx[:train_len]
# val_idx = idx[train_len:(train_len + val_len)]
# test_idx = idx[(train_len + val_len):]

# train_texts, train_labels = texts[train_idx], labels[train_idx]
# val_texts, val_labels = texts[val_idx], labels[val_idx]
# test_texts, test_labels = texts[test_idx], labels[test_idx]

In [None]:
# Read the three predefined splits; each split file lists the JSON file names
# that belong to it.
data_dir = Path("data/original")

train_texts, train_labels = read_dataset(data_dir, restrict_file="data/train.txt")
val_texts, val_labels = read_dataset(data_dir, restrict_file="data/val.txt")
test_texts, test_labels = read_dataset(data_dir, restrict_file="data/test.txt")
# val_texts, val_labels = read_dataset(data_dir, restrict_file="data/test.txt")
# test_texts, test_labels = read_dataset(data_dir, restrict_file="data/val.txt")

# Class counts are taken from the training split only.
num_accepted = sum(1 for label in train_labels if label == 1)
num_rejected = sum(1 for label in train_labels if label == 0)

print(num_accepted, num_rejected)
# Per-class weight relative to the rejected class:
# index 0 -> rejected / rejected, index 1 -> rejected / accepted.
label_weight = num_rejected / np.array([num_rejected, num_accepted])

In [None]:
def label_distribution(labels):
    """Print count and fraction of rejected (0) and accepted (1) labels.

    `labels` must support `.count()` and `len()` (e.g. a list of ints). The
    four values are printed on one line: rejected count, rejected fraction,
    accepted count, accepted fraction.
    """
    rejected_count = labels.count(0)
    accepted_count = labels.count(1)
    total = len(labels)
    print(rejected_count, rejected_count / total, accepted_count, accepted_count / total)

# Print the class balance of each split, in train / val / test order.
for split_labels in (train_labels, val_labels, test_labels):
    label_distribution(split_labels)

In [None]:
# Weight of the accepted class (num_rejected / num_accepted on the training split).
label_weight[1]

In [None]:
# Characters that `tokenize` replaces with a space before splitting.
special_characters = {
    '-', '\'', '.', ',', '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', '/',
    ':', ';', '<', '=', '>', '@', '[', '\\', ']', '^', '`', '{', '|', '}', '~',
    '\t',
}
# NLTK's English stop-word list, as a set for fast membership tests.
english_stopwords = set(stopwords.words('english'))

def tokenize(text: str, token_to_id=None, encode=False):
    """Split `text` into lower-cased tokens, optionally encoded as a bag of words.

    Characters in `special_characters` are replaced by spaces, the text is
    lower-cased and split on whitespace, and tokens contained in
    `english_stopwords` are dropped.

    Args:
        text: Raw text to tokenize.
        token_to_id: Optional mapping from token to vocabulary index. When
            given (even if empty), tokens missing from it are dropped.
        encode: When True and `token_to_id` is given, return a multi-hot vector
            instead of the token list. Ignored when `token_to_id` is None.

    Returns:
        A list of token strings, or - when encoding - a float numpy array of
        length `len(token_to_id)` holding 1 at the index of every token present.
    """
    for c in special_characters:
        text = text.replace(c, ' ')
    # split() without an argument splits on any whitespace run, so tokens are no
    # longer glued together across line breaks (split(' ') only cut at spaces).
    tokens = text.lower().split()
    # tokens = [t for t in tokens if t.isalpha()]
    tokens = [t for t in tokens if t not in english_stopwords]
    # `is not None` so that an empty vocabulary still filters and encodes.
    if token_to_id is not None:
        tokens = [t for t in tokens if t in token_to_id]
        if encode:
            token_ids = np.array([token_to_id[t] for t in tokens], dtype=np.intp)
            onehot = np.zeros(len(token_to_id))
            onehot[token_ids] = 1
            return onehot
    return tokens

# The vocabulary is built from the training abstracts only; validation and
# test tokens are filtered down to that vocabulary.
train_tokens = [tokenize(text) for text in train_texts]
flattened_tokens = [token for doc_tokens in train_tokens for token in doc_tokens]
vocab = sorted(set(flattened_tokens))
token_to_id = {token: index for index, token in enumerate(vocab)}
val_tokens = [tokenize(text, token_to_id) for text in val_texts]
test_tokens = [tokenize(text, token_to_id) for text in test_texts]
print("Vocab Size:", len(vocab))

In [None]:
# Encode every abstract as a vector over the training vocabulary.
train_encodings = [tokenize(text, token_to_id, True) for text in train_texts]
val_encodings = [tokenize(text, token_to_id, True) for text in val_texts]
test_encodings = [tokenize(text, token_to_id, True) for text in test_texts]

# Wrap encodings and labels; the feature is exposed under the "tokens" key.
train_dataset = PaperDataset({"tokens": train_encodings}, train_labels)
val_dataset = PaperDataset({"tokens": val_encodings}, val_labels)
test_dataset = PaperDataset({"tokens": test_encodings}, test_labels)

In [None]:
# One sampling weight per training example, looked up from its class weight.
samples_weight = torch.Tensor([label_weight[label] for label in train_labels])

# Draws as many samples per epoch as there are training examples.
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

In [None]:
def compute_metrics(eval_pred, prefix="eval"):
    """Compute binary classification metrics from raw logits.

    Args:
        eval_pred: A `(logits, labels)` pair of equally long 1-D sequences.
            A logit >= 0 counts as a positive prediction; labels are 0 or 1.
        prefix: String prepended (followed by "/") to every metric key.

    Returns:
        Dict mapping "<prefix>/metric/..." to precision, recall, f1, accuracy
        and mcc (floats) and "<prefix>/classification/..." to the confusion
        counts tp, fp, fn, tn plus the number of negatives `n` and positives
        `p` (ints). Any ratio whose denominator is zero is reported as 0.0.

    Raises:
        ValueError: If logits and labels do not have the same shape.
    """
    logits, labels = eval_pred
    # predictions = np.argmax(logits, axis=1)
    predictions = np.asarray(logits) >= 0
    actual = np.asarray(labels)
    if predictions.shape != actual.shape:
        raise ValueError(
            f"logits and labels must have the same shape, got {predictions.shape} and {actual.shape}"
        )

    # Plain Python ints: no int64 overflow in the MCC product below and the
    # resulting dict contains only native (JSON-serialisable) numbers.
    tp = int(((predictions == 1) & (actual == 1)).sum())
    fp = int(((predictions == 1) & (actual == 0)).sum())
    fn = int(((predictions == 0) & (actual == 1)).sum())
    tn = int(((predictions == 0) & (actual == 0)).sum())

    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total > 0 else 0.0
    # Guard the denominator explicitly instead of producing NaN (plus a
    # RuntimeWarning) and patching it afterwards.
    mcc_denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    mcc = (tp * tn - fp * fn) / mcc_denominator if mcc_denominator > 0 else 0.0
    p = tp + fn
    n = tn + fp
    metrics = {
        "metric/precision": precision,
        "metric/recall": recall,
        "metric/f1": f1,
        "metric/accuracy": accuracy,
        # MCC used to be computed but was never reported.
        "metric/mcc": mcc,
        "classification/tp": tp,
        "classification/fp": fp,
        "classification/fn": fn,
        "classification/tn": tn,
        "classification/n": n,
        "classification/p": p,
    }
    metrics = {f"{prefix}/{key}": metric for key, metric in metrics.items()}
    # "augmentation/train": train_dataset.num_augmentations, "augmentation/val": val_dataset.num_augmentations
    return metrics

# Smoke test: two logits of 0 (both count as positive predictions) against one
# negative and one positive label.
logits, labels = [0, 0], [0, 1]

result = compute_metrics((logits, labels))
print(result)

# my_callback = MyCallback()

In [None]:
class BowClassifier(nn.Module):
    """Linear classifier without bias on top of a bag-of-words vector.

    Args:
        input_size: Length of the input feature vector.
        output_size: Number of output logits per sample (default 1).
    """

    def __init__(self, input_size, output_size=1):
        super().__init__()
        self.input_size, self.output_size = input_size, output_size

        # No bias term: the output is a pure weighted sum of the inputs.
        self.classifier = nn.Linear(input_size, output_size, bias=False)

    def forward(self, x):
        """Return the raw logits produced by the linear layer for `x`."""
        return self.classifier(x)

In [None]:
# Training configuration: device, model, data loaders, optimizer, loss
# functions, run naming / logging, and the step counters used by the loop.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One input feature per vocabulary entry, a single output logit.
model = BowClassifier(len(vocab))
model.to(device)

train_batch_size, val_batch_size = 64, 64
# use_sampler: draw training batches through the weighted sampler instead of shuffling.
# is_unweighted: use plain BCE for both the training and the validation loss.
use_sampler = False
is_unweighted = False
if use_sampler:
    train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=sampler)
else:
    train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=val_batch_size, shuffle=False)

# optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.AdamW(model.parameters(), lr=0.001)
# _label_weight = torch.from_numpy(label_weight).float().to(device)
# loss_func = torch.nn.CrossEntropyLoss(_label_weight, reduction="sum")
# The positive (accepted) class is weighted by label_weight[1].
pos_weight = torch.tensor(label_weight[1]).float().to(device)
weighted_loss_func = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
# `loss_func` is the training loss; `weighted_loss_func` is what the training
# loop passes to run_model for the validation loss. With the sampler enabled
# training is unweighted while validation stays weighted; with is_unweighted
# both names refer to the same plain BCE loss.
if use_sampler or is_unweighted:
    print("Use unweighted BCE")
    weighted_name = "unweighted"
    loss_func = nn.BCEWithLogitsLoss()
    if is_unweighted:
        weighted_loss_func = loss_func
else:
    print("Use weighted BCE")
    weighted_name = "weighted"
    loss_func = weighted_loss_func

# Set to True to log metrics to Weights & Biases.
wandb_logging = False

# The run name encodes the start time and the main hyper-parameters.
run_name = datetime.now().strftime('%d-%m-%Y_%H_%M_%S')
optimizer_name = "adam" if isinstance(optimizer, optim.AdamW) else "sgd"
oversampling_name = "-oversampling" if use_sampler else ""
num_train_epochs = 20
run_name += f"-{optimizer_name}-e{num_train_epochs}-b{train_batch_size}-{weighted_name}{oversampling_name}-remove_stopwords"
print("Run name:", run_name)
if wandb_logging:
    wandb.login()
    wandb.init(entity="paper-judging", project="final-evaluation-bow", name=run_name)

# NOTE(review): os.makedirs without exist_ok raises if the directory already exists.
output_dir=f'results/{run_name}'
os.makedirs(output_dir)

# logging_steps, eval_steps = 10, 50
# logging_steps, eval_steps = 20, 200
# logging_steps, eval_steps = 5, 10
# Log the training loss every 50 steps; evaluate once per len(train_loader)
# steps, i.e. once per pass over the training loader.
logging_steps, eval_steps = 50, len(train_loader)
# Step counter advanced by the training loop.
_steps = 0

def run_model(_model, _batch, _loss_func=loss_func):
    """Run `_model` on one batch and return `(loss, logits)`.

    `_batch` is a dict of tensors containing at least "labels" and "tokens".
    All tensors are moved to `device`; the model output is squeezed along
    dim 1 before the loss is computed against the float labels. The default
    loss is whatever `loss_func` referred to when this function was defined.
    """
    on_device = {name: tensor.to(device) for name, tensor in _batch.items()}
    targets = on_device.pop("labels").float()
    features = on_device.pop("tokens")

    logits = _model(features).squeeze(dim=1)
    return _loss_func(logits, targets), logits

# Training loop with periodic logging and validation.
#
# `_steps` counts completed optimisation steps, so with
# eval_steps == len(train_loader) the model is validated at the end of every
# epoch (including the last one) rather than after the first batch of each epoch.
# The checkpoint with the highest validation F1 is kept in `best_model`; the
# very first evaluation counts as a candidate, so `best_model` is set as soon
# as one evaluation has run.
min_val_loss, best_model, best_metrics, best_loss_epoch = None, None, None, None
best_f1, best_epoch = None, None
for epoch in range(num_train_epochs):
    print(f"Epoch: {epoch + 1}/{num_train_epochs}")
    train_losses = []
    for batch in tqdm(train_loader, position=0):
        # Evaluation below switches to eval mode, so re-enable train mode per step.
        model.train()
        optimizer.zero_grad()
        train_loss, _ = run_model(model, batch)
        train_loss.backward()
        optimizer.step()
        # Store a plain float: keeping the tensor would keep its autograd graph alive.
        train_losses.append(train_loss.item())
        _steps += 1

        metrics = {}
        if _steps % logging_steps == 0:
            # Mean training loss since the previous log entry.
            _train_loss = sum(train_losses) / len(train_losses)
            metrics.update({"train/loss": _train_loss, "steps": _steps})
            train_losses = []

        if _steps % eval_steps == 0:
            model.eval()
            val_losses = []
            logits = []
            with torch.no_grad():
                for val_batch in val_loader:
                    _val_loss, _logits = run_model(model, val_batch, weighted_loss_func)
                    val_losses.append(_val_loss.item())
                    logits.extend(_logits.tolist())

            val_loss = sum(val_losses) / len(val_losses)
            _metrics = compute_metrics((logits, val_labels))
            metrics["eval/loss"] = val_loss
            metrics.update(_metrics)

            if min_val_loss is None or val_loss < min_val_loss:
                min_val_loss = val_loss
                best_loss_epoch = epoch

            f1 = metrics["eval/metric/f1"]
            if best_f1 is None or f1 > best_f1:
                best_f1 = f1
                best_model = copy.deepcopy(model)
                best_metrics = dict(metrics)
                best_epoch = epoch

        if metrics:
            print(metrics)
            if wandb_logging:
                wandb.log(metrics)

# Final evaluation of the best checkpoint on the validation and test splits.
if best_model is None:
    # No checkpoint was recorded during training (e.g. validation F1 never
    # improved on its first value); evaluate the final weights instead of
    # failing on `None.eval()`.
    print("No best checkpoint recorded during training; evaluating the final model instead.")
    best_model = model

print(f"Best val metrics during training epoch {best_epoch}:")
print(best_metrics)
test_loader = DataLoader(test_dataset, batch_size=val_batch_size, shuffle=False)
best_model.eval()

with torch.no_grad():
    # Same procedure for both splits: collect the logits batch by batch, then
    # compute the metrics against that split's labels.
    for title, loader, split_labels in (
        ("Val metrics:", val_loader, val_labels),
        ("Test metrics:", test_loader, test_labels),
    ):
        logits = []
        for eval_batch in loader:
            _, _logits = run_model(best_model, eval_batch)
            logits.extend(_logits.tolist())
        metrics = compute_metrics((logits, split_labels))
        print(title)
        print(metrics)

# snapshot_dir = f"results/{run_name}/network-snapshot-latest"
# model.save_pretrained(snapshot_dir)

if wandb_logging:
    # wandb.save(f"{snapshot_dir}/*", os.path.dirname(snapshot_dir))
    wandb.finish()

In [None]:
# wandb.finish()

In [None]:
# First row of the model's first parameter tensor: one weight per vocabulary entry.
params = list(model.parameters())[0][0]
k = 20
# Each section ranks the vocabulary by a different score and prints the k
# highest-scoring words together with their actual weight.
sections = (
    ("Positive:", params),
    ("Negative:", -params),
    ("Unnecessary:", -(params.abs())),
)
for section_number, (title, scores) in enumerate(sections):
    if section_number > 0:
        print()
    print(title)
    ind = scores.topk(k).indices
    for i in ind:
        print(vocab[i], params[i].item())

In [None]:
def find_word(word, tokens, labels):
    """Return the share of rejected and of accepted documents containing `word`.

    Args:
        word: Token to look for.
        tokens: Sequence of token collections, one per document.
        labels: Sequence of 0 (rejected) / 1 (accepted) labels, aligned with `tokens`.

    Returns:
        `(rejected_rate, accepted_rate)`: for each class, the fraction of its
        documents that contain `word`; 0.0 for a class without documents.
    """
    total = [0, 0]
    classes = [0, 0]
    # A separate loop name keeps the `tokens` parameter intact.
    for i, doc_tokens in enumerate(tokens):
        label = labels[i]
        if word in doc_tokens:
            classes[label] += 1
        total[label] += 1
    # print(f"Not accepted: {classes[0]}/{total[0]} ({classes[0]/total[0]}), Accepted: {classes[1]}/{total[1]} ({classes[1]/total[1]})")
    # A class without samples yields 0.0 instead of a ZeroDivisionError.
    rejected_rate = classes[0] / total[0] if total[0] else 0.0
    accepted_rate = classes[1] / total[1] if total[1] else 0.0
    return rejected_rate, accepted_rate

def analyze(params, k, tokens, labels):
    """Print one ` & `-separated row for each of the `k` largest entries of `params`.

    A row holds the vocabulary word, its value in `params` rounded to three
    decimals, and the percentage of rejected and of accepted documents that
    contain the word (as reported by `find_word`), rounded to two decimals.
    """
    n = 2
    for i in params.topk(k).indices:
        word = vocab[i]
        rejected, accepted = find_word(word, tokens, labels)
        weight = np.round(params[i].item(), 3)
        rejected_pct = np.round(rejected*100, n)
        accepted_pct = np.round(accepted*100, n)
        print(f"{word} & {weight} & {rejected_pct} & {accepted_pct}")

# Word statistics are computed over all three splits combined.
tokens = train_tokens + val_tokens + test_tokens
labels = train_labels + val_labels + test_labels
params = list(model.parameters())[0][0]
k = 8

# For the "Negative:" table the negated weights are passed in, so the values
# printed by analyze are the negated weights.
for heading, weights in (("Positive:", params), ("Negative:", -params)):
    print(heading)
    analyze(weights, k, tokens, labels)
    print()

# Words whose weights have the smallest magnitude.
print("Unnecessary:")
ind = (-(params.abs())).topk(k).indices
for i in ind:
    print(vocab[i], params[i].item())

In [None]:
def find_word(word, all_tokens, labels):
    """Print and return how often `word` occurs in rejected and accepted documents.

    Args:
        word: Token to look for.
        all_tokens: Sequence of token collections, one per document.
        labels: Sequence of 0 (not accepted) / 1 (accepted) labels, aligned
            with `all_tokens`.

    Returns:
        `(rejected_rate, accepted_rate)`, the per-class fraction of documents
        containing `word` (0.0 for a class without documents). Returning the
        rates keeps this redefinition compatible with `analyze`, which unpacks
        the result of `find_word`.
    """
    total = [0, 0]
    classes = [0, 0]
    for i, tokens in enumerate(all_tokens):
        label = labels[i]
        if word in tokens:
            classes[label] += 1
        total[label] += 1
    # A class without samples yields 0.0 instead of a ZeroDivisionError.
    rejected_rate = classes[0] / total[0] if total[0] else 0.0
    accepted_rate = classes[1] / total[1] if total[1] else 0.0
    print(f"Not accepted: {classes[0]}/{total[0]} ({rejected_rate}), Accepted: {classes[1]}/{total[1]} ({accepted_rate})")
    return rejected_rate, accepted_rate

# Per-split statistics for a single word, followed by up to ten training
# abstracts that contain it.
word = "theoretical"
for split_title, split_tokens, split_labels in (
    ("Train:", train_tokens, train_labels),
    ("Val:", val_tokens, val_labels),
    ("Test:", test_tokens, test_labels),
):
    print(split_title)
    find_word(word, split_tokens, split_labels)

n = 10
for tokens, text in zip(train_tokens, train_texts):
    if word not in tokens:
        continue
    print(text)
    n -= 1
    if n == 0:
        break

In [None]:
# Example abstract used as an ad-hoc input for the trained model below.
# The text contains hard line breaks, so tokens can be separated by newlines.
paper_abstract = """Generative adversarial networks (GANs) have shown
outstanding performance on a wide range of problems in
computer vision, graphics, and machine learning, but often require numerous training data and heavy computational resources. To tackle this issue, several methods introduce a transfer learning technique in GAN training. They,
however, are either prone to overfitting or limited to learning small distribution shifts. In this paper, we show that
simple fine-tuning of GANs with frozen lower layers of
the discriminator performs surprisingly well. This simple
baseline, FreezeD, significantly outperforms previous techniques used in both unconditional and conditional GANs.
We demonstrate the consistent effect using StyleGAN and
SNGAN-projection architectures on several datasets of Animal Face, Anime Face, Oxford Flower, CUB-200-2011, and
Caltech-256 datasets. The code and results are available at
https://github.com/sangwoomo/FreezeD."""

In [None]:
# Ad-hoc inference: score a few hand-written texts, the example abstract and
# one validation abstract with the trained model.
v_idx = 9
# Named `sample_texts` rather than `input` so the builtin input() is not shadowed.
sample_texts = ["It's incredibly bad", paper_abstract, "hello", "darkness", val_texts[v_idx]]
print(val_labels[v_idx])
print(val_texts[v_idx])
print(tokenize(val_texts[v_idx]))
# Stack the per-text vectors into one (num_texts, vocab_size) array first;
# building a tensor from a list of separate ndarrays is slow.
encoded = np.stack([tokenize(text, token_to_id, True) for text in sample_texts])
# The inputs must live on the same device as the model weights, otherwise the
# forward pass fails when the model was moved to the GPU.
encoded = torch.from_numpy(encoded).float().to(next(model.parameters()).device)
# print(encoded)
model.eval()
with torch.no_grad():
    output = model(encoded)
print(output)
# print(torch.softmax(output.logits, dim=1))
# print(torch.softmax(output.logits, dim=1))

In [None]:
# prediction = output.logits.argmax(dim=1)
# actual = prediction
# n = tp = fp = fn = tn = 0
# tp += (prediction == 1) & (actual == 1)
# fp += (prediction == 1) & (actual == 0)
# fn += (prediction == 0) & (actual == 1)
# tn += (prediction == 0) & (actual == 0)

In [None]:
# !python huggingface_train.py

# LIME


In [None]:
import numpy as np
import lime
import torch
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# LIME explanation of a FinBERT sentiment prediction.
# Dedicated names keep this cell from overwriting the bag-of-words `model`
# that earlier cells rely on.
finbert_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
finbert_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
# NOTE(review): the order is assumed to match the checkpoint's label indices -
# confirm against finbert_model.config.id2label.
class_names = ['positive','negative', 'neutral']

def predictor(texts):
    """Return class probabilities, shape (len(texts), num_classes), for LIME.

    `texts` is the collection of perturbed strings LIME passes to the
    classifier function; the result is a numpy array with one row of softmax
    probabilities per input text.
    """
    encoded_batch = finbert_tokenizer(list(texts), return_tensors="pt", padding=True, truncation=True)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = finbert_model(**encoded_batch)
    # Softmax over the class dimension; omitting `dim` is deprecated.
    probas = F.softmax(outputs.logits, dim=1).numpy()
    return probas

explainer = LimeTextExplainer(class_names=class_names)

str_to_predict = "surprising increase in revenue in spite of decrease in market share"
exp = explainer.explain_instance(str_to_predict, predictor, num_features=20, num_samples=2000)
exp.show_in_notebook(text=str_to_predict)