In [1]:
import pickle
from pathlib import Path
import numpy as np
import scipy as sp
import pandas as pd
import glob

import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import random

from tqdm import tqdm

# Index of the CUDA device to use when a GPU is present.
gpu = "0"

# Prefer the selected GPU; fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:" + gpu)
else:
    device = torch.device("cpu")

In [2]:
# glob returns matches in an arbitrary, filesystem-dependent order. Sort them so
# the rows are loaded in the same order on every machine; the seeded
# train/test split further down depends on that order.
ig_inference_results = sorted(glob.glob("results/NEW*_4_*.pickle"))
new_inference_results = sorted(glob.glob("results/NEW*_5_*.pickle"))

In [3]:
def load_result_columns(paths, columns):
    """Concatenate selected columns from several pickled result files.

    Args:
        paths: Iterable of paths to pickle files. Each file must unpickle to a
            mapping that holds one sequence per requested column.
        columns: Names of the entries to collect from every file.

    Returns:
        A DataFrame with one column per requested name, rows appended in the
        order the files are visited.

    Raises:
        KeyError: If a file does not contain one of the requested columns.
    """
    collected = {column: [] for column in columns}
    for path in paths:
        # SECURITY: unpickling can execute arbitrary code. Only load result
        # files that were produced by a trusted run.
        with open(path, "rb") as infile:
            results = pickle.load(infile)
        for column in columns:
            collected[column].extend(results[column])
    return pd.DataFrame(collected)


# Results of the first run: attributions plus the correctness label.
old = load_result_columns(ig_inference_results, ["question", "attributes_first", "correct"])

# Results of the second run: the two additional attribution columns.
new = load_result_columns(new_inference_results, ["question", "saliency", "input_x_gradient"])

In [4]:
# Attach the columns of `new` to the rows of `old`, matching on `question`.
# NOTE(review): this is a left join - a question missing from `new` gets NaN
# in `saliency` / `input_x_gradient`, and a question that appears more than
# once on either side multiplies rows. Confirm neither happens in the data.
md = pd.merge(old, new, how='left', on='question')

In [5]:
# Features come from the `attributes_first` column, labels from `correct`.
X, y = md['attributes_first'].tolist(), md['correct'].tolist()

# Model Training (`attributes_first` features)

In [None]:
# Optimisation hyperparameters for the GRU classifier.
num_epochs = 250        # number of passes of the training loop
learning_rate = 0.0001  # AdamW learning rate
weight_decay = 0.01     # AdamW weight decay

class RNNHallucinationClassifier(torch.nn.Module):
    """Unidirectional stacked GRU over a scalar sequence with a 2-logit linear head.

    Args:
        dropout: Dropout probability applied between stacked GRU layers.
        hidden_dim: Size of the GRU hidden state (default 128).
        num_layers: Number of stacked GRU layers (default 4).
    """

    def __init__(self, dropout=0.25, hidden_dim=128, num_layers=4):
        super().__init__()
        # Inter-layer dropout is meaningless for a single layer and makes
        # PyTorch emit a warning, so it is disabled in that case.
        gru_dropout = dropout if num_layers > 1 else 0.0
        self.gru = torch.nn.GRU(1, hidden_dim, num_layers, dropout=gru_dropout, batch_first=True, bidirectional=False)
        self.linear = torch.nn.Linear(hidden_dim, 2)

    def forward(self, seq):
        """Return the two class logits for the final time step.

        Args:
            seq: Float tensor of shape (batch, seq_len, 1); the GRU is built
                with input_size=1 and batch_first=True.

        Returns:
            A 1-D tensor of 2 logits taken from the last time step of the
            LAST batch element. Callers are expected to pass a batch of one.
        """
        gru_out, _ = self.gru(seq)
        # Select the final hidden state first, then project it. This avoids
        # running the linear head over every time step only to drop the rest.
        return self.linear(gru_out[-1, -1, :])
    
# Seed torch before building the model so that weight initialisation and the
# training that follows start from a fixed RNG state, whatever ran earlier in
# the kernel. The value matches the random_state of the split below.
torch.manual_seed(21)

rnn_model = RNNHallucinationClassifier().to(device)
optimizer = torch.optim.AdamW(rnn_model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Hold out 20% of the examples for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=21)

In [None]:
rnn_model.train()

# The training data does not change between epochs, so build the tensors and
# move them to the device once rather than on every iteration.
# Each sequence becomes a (1, seq_len, 1) float tensor, i.e. a batch of one.
train_sequences = [
    torch.tensor(seq).view(1, -1, 1).to(torch.float).to(device) for seq in X_train
]
train_labels = torch.tensor(y_train).to(torch.long).to(device)

# Each epoch is a single full-batch optimizer step over the whole training set.
for step in tqdm(range(num_epochs)):
    optimizer.zero_grad()
    preds = torch.stack([rnn_model(seq) for seq in train_sequences])
    loss = torch.nn.functional.cross_entropy(preds, train_labels)
    loss.backward()
    optimizer.step()

In [11]:
# Pickle the whole trained module (architecture and weights) to disk.
torch.save(obj=rnn_model, f='IG_RNN_classifier.pth')

# Results (`attributes_first` features)

In [6]:
# SECURITY: weights_only=False unpickles arbitrary Python objects. Only load
# checkpoints that you created yourself.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can still be loaded on a CPU-only machine.
IG_rnn_model = torch.load('IG_RNN_classifier.pth', map_location=device, weights_only=False).to(device)

In [8]:
classifier_results = {}

# Evaluation mode turns off the GRU's inter-layer dropout.
IG_rnn_model.eval()

# No gradients are needed for inference; disabling autograd avoids building
# and keeping a graph for every test sequence.
with torch.no_grad():
    preds = torch.stack([IG_rnn_model(torch.tensor(i).view(1, -1, 1).to(torch.float).to(device)) for i in X_test])
    preds = torch.nn.functional.softmax(preds, dim=1)

# Column 1 holds the probability of class 1; threshold it at 0.5 for accuracy.
prediction_classes = (preds[:,1]>0.5).type(torch.long).cpu()
classifier_results['attribution_rnn_roc'] = roc_auc_score(y_test, preds[:,1].cpu().numpy())
classifier_results['attribution_rnn_acc'] = (prediction_classes.numpy()==y_test).mean()

In [9]:
# Display the ROC-AUC and accuracy stored in classifier_results by the evaluation cell.
classifier_results

{'attribution_rnn_roc': np.float64(0.6109626503245111),
 'attribution_rnn_acc': np.float64(0.5972222222222222)}

# Saliency

In [6]:
# Features come from the `saliency` column, labels from `correct`.
X, y = md['saliency'].tolist(), md['correct'].tolist()

In [7]:
# Optimisation hyperparameters for the GRU classifier.
num_epochs = 250        # number of passes of the training loop
learning_rate = 0.0001  # AdamW learning rate
weight_decay = 0.01     # AdamW weight decay

class RNNHallucinationClassifier(torch.nn.Module):
    """Unidirectional stacked GRU over a scalar sequence with a 2-logit linear head.

    Args:
        dropout: Dropout probability applied between stacked GRU layers.
        hidden_dim: Size of the GRU hidden state (default 128).
        num_layers: Number of stacked GRU layers (default 4).
    """

    def __init__(self, dropout=0.25, hidden_dim=128, num_layers=4):
        super().__init__()
        # Inter-layer dropout is meaningless for a single layer and makes
        # PyTorch emit a warning, so it is disabled in that case.
        gru_dropout = dropout if num_layers > 1 else 0.0
        self.gru = torch.nn.GRU(1, hidden_dim, num_layers, dropout=gru_dropout, batch_first=True, bidirectional=False)
        self.linear = torch.nn.Linear(hidden_dim, 2)

    def forward(self, seq):
        """Return the two class logits for the final time step.

        Args:
            seq: Float tensor of shape (batch, seq_len, 1); the GRU is built
                with input_size=1 and batch_first=True.

        Returns:
            A 1-D tensor of 2 logits taken from the last time step of the
            LAST batch element. Callers are expected to pass a batch of one.
        """
        gru_out, _ = self.gru(seq)
        # Select the final hidden state first, then project it. This avoids
        # running the linear head over every time step only to drop the rest.
        return self.linear(gru_out[-1, -1, :])
    
# Seed torch before building the model so that weight initialisation and the
# training that follows start from a fixed RNG state, whatever ran earlier in
# the kernel. The value matches the random_state of the split below.
torch.manual_seed(21)

rnn_model = RNNHallucinationClassifier().to(device)
optimizer = torch.optim.AdamW(rnn_model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Hold out 20% of the examples for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=21)

In [8]:
rnn_model.train()

# The training data does not change between epochs, so build the tensors and
# move them to the device once rather than on every iteration.
# Each sequence becomes a (1, seq_len, 1) float tensor, i.e. a batch of one.
train_sequences = [
    torch.tensor(seq).view(1, -1, 1).to(torch.float).to(device) for seq in X_train
]
train_labels = torch.tensor(y_train).to(torch.long).to(device)

# Each epoch is a single full-batch optimizer step over the whole training set.
for step in tqdm(range(num_epochs)):
    optimizer.zero_grad()
    preds = torch.stack([rnn_model(seq) for seq in train_sequences])
    loss = torch.nn.functional.cross_entropy(preds, train_labels)
    loss.backward()
    optimizer.step()

100%|██████████| 250/250 [56:08<00:00, 13.47s/it]


In [None]:
classifier_results = {}

# Evaluation mode turns off the GRU's inter-layer dropout.
rnn_model.eval()

# No gradients are needed for inference; disabling autograd avoids building
# and keeping a graph for every test sequence.
with torch.no_grad():
    preds = torch.stack([rnn_model(torch.tensor(i).view(1, -1, 1).to(torch.float).to(device)) for i in X_test])
    preds = torch.nn.functional.softmax(preds, dim=1)

# Column 1 holds the probability of class 1; threshold it at 0.5 for accuracy.
prediction_classes = (preds[:,1]>0.5).type(torch.long).cpu()

classifier_results['attribution_rnn_roc'] = roc_auc_score(y_test, preds[:,1].cpu().numpy())
classifier_results['attribution_rnn_acc'] = (prediction_classes.numpy()==y_test).mean()
classifier_results

{'attribution_rnn_roc': np.float64(0.6075690788677222),
 'attribution_rnn_acc': np.float64(0.6005555555555555)}

# input_x_gradient

In [13]:
# Features come from the `input_x_gradient` column, labels from `correct`.
X, y = md['input_x_gradient'].tolist(), md['correct'].tolist()

In [14]:
# Optimisation hyperparameters for the GRU classifier.
num_epochs = 250        # number of passes of the training loop
learning_rate = 0.0001  # AdamW learning rate
weight_decay = 0.01     # AdamW weight decay

class RNNHallucinationClassifier(torch.nn.Module):
    """Unidirectional stacked GRU over a scalar sequence with a 2-logit linear head.

    Args:
        dropout: Dropout probability applied between stacked GRU layers.
        hidden_dim: Size of the GRU hidden state (default 128).
        num_layers: Number of stacked GRU layers (default 4).
    """

    def __init__(self, dropout=0.25, hidden_dim=128, num_layers=4):
        super().__init__()
        # Inter-layer dropout is meaningless for a single layer and makes
        # PyTorch emit a warning, so it is disabled in that case.
        gru_dropout = dropout if num_layers > 1 else 0.0
        self.gru = torch.nn.GRU(1, hidden_dim, num_layers, dropout=gru_dropout, batch_first=True, bidirectional=False)
        self.linear = torch.nn.Linear(hidden_dim, 2)

    def forward(self, seq):
        """Return the two class logits for the final time step.

        Args:
            seq: Float tensor of shape (batch, seq_len, 1); the GRU is built
                with input_size=1 and batch_first=True.

        Returns:
            A 1-D tensor of 2 logits taken from the last time step of the
            LAST batch element. Callers are expected to pass a batch of one.
        """
        gru_out, _ = self.gru(seq)
        # Select the final hidden state first, then project it. This avoids
        # running the linear head over every time step only to drop the rest.
        return self.linear(gru_out[-1, -1, :])
    
# Seed torch before building the model so that weight initialisation and the
# training that follows start from a fixed RNG state, whatever ran earlier in
# the kernel. The value matches the random_state of the split below.
torch.manual_seed(21)

rnn_model = RNNHallucinationClassifier().to(device)
optimizer = torch.optim.AdamW(rnn_model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Hold out 20% of the examples for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=21)

In [15]:
rnn_model.train()

# The training data does not change between epochs, so build the tensors and
# move them to the device once rather than on every iteration.
# Each sequence becomes a (1, seq_len, 1) float tensor, i.e. a batch of one.
train_sequences = [
    torch.tensor(seq).view(1, -1, 1).to(torch.float).to(device) for seq in X_train
]
train_labels = torch.tensor(y_train).to(torch.long).to(device)

# Each epoch is a single full-batch optimizer step over the whole training set.
for step in tqdm(range(num_epochs)):
    optimizer.zero_grad()
    preds = torch.stack([rnn_model(seq) for seq in train_sequences])
    loss = torch.nn.functional.cross_entropy(preds, train_labels)
    loss.backward()
    optimizer.step()

100%|██████████| 250/250 [55:34<00:00, 13.34s/it]


In [16]:
classifier_results = {}

# Evaluation mode turns off the GRU's inter-layer dropout.
rnn_model.eval()

# No gradients are needed for inference; disabling autograd avoids building
# and keeping a graph for every test sequence.
with torch.no_grad():
    preds = torch.stack([rnn_model(torch.tensor(i).view(1, -1, 1).to(torch.float).to(device)) for i in X_test])
    preds = torch.nn.functional.softmax(preds, dim=1)

# Column 1 holds the probability of class 1; threshold it at 0.5 for accuracy.
prediction_classes = (preds[:,1]>0.5).type(torch.long).cpu()
classifier_results['attribution_rnn_roc'] = roc_auc_score(y_test, preds[:,1].cpu().numpy())
classifier_results['attribution_rnn_acc'] = (prediction_classes.numpy()==y_test).mean()
classifier_results

{'attribution_rnn_roc': np.float64(0.6107160652411883),
 'attribution_rnn_acc': np.float64(0.6044444444444445)}