In [2]:
from transformers import AutoModel, AutoTokenizer

# Load the model
model_path = 'artifact_expert_model_v2\\artifact_expert_model_v2\\checkpoint-1611'
artifact_expert = AutoModel.from_pretrained(model_path)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('google/electra-small-discriminator', use_fast=True)

artifact_expert

  from .autonotebook import tqdm as notebook_tqdm


ElectraModel(
  (embeddings): ElectraEmbeddings(
    (word_embeddings): Embedding(30522, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
  (encoder): ElectraEncoder(
    (layer): ModuleList(
      (0-11): 12 x ElectraLayer(
        (attention): ElectraAttention(
          (self): ElectraSelfAttention(
            (query): Linear(in_features=256, out_features=256, bias=True)
            (key): Linear(in_features=256, out_features=256, bias=True)
            (value): Linear(in_features=256, out_features=256, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): ElectraSelfOutput(
            (dense): Linear(in_features=256, out_features=256, bias=True)
            (LayerNorm): LayerNorm((

In [3]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('veluribharath/snli')

dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'id'],
        num_rows: 549367
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'id'],
        num_rows: 9842
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label', 'id'],
        num_rows: 9824
    })
})

In [15]:
def tokenize_function(example):
    # Tokenize the hypothesis and apply padding
    return tokenizer(example['hypothesis'], truncation=True, padding='max_length', max_length=tokenizer.model_max_length)

# Apply the tokenize function to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove the 'premise' column
tokenized_datasets = tokenized_datasets.remove_columns(["premise"])

# The dataset now has the tokenized and padded hypothesis, labels, and ids, but not the premise
print(tokenized_datasets)


Map:   0%|          | 0/549367 [00:00<?, ? examples/s]

Map: 100%|██████████| 549367/549367 [03:22<00:00, 2717.75 examples/s]
Map: 100%|██████████| 9842/9842 [00:03<00:00, 2627.78 examples/s]
Map: 100%|██████████| 9824/9824 [00:03<00:00, 2697.22 examples/s]

DatasetDict({
    train: Dataset({
        features: ['hypothesis', 'label', 'id', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 549367
    })
    validation: Dataset({
        features: ['hypothesis', 'label', 'id', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9842
    })
    test: Dataset({
        features: ['hypothesis', 'label', 'id', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9824
    })
})





In [16]:
from torch.utils.data import DataLoader
from transformers import default_data_collator
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move model to device
artifact_expert.to(device)

# Define a simple linear classifier with the same number of output units as classes
num_labels = 3  # Adjust the number of labels as per your dataset specifics
classifier = torch.nn.Linear(artifact_expert.config.hidden_size, num_labels).to(device)

def get_predictions(dataset, model, classifier, tokenizer, batch_size=16):
    # DataLoader to handle batching
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size, 
        collate_fn=default_data_collator
    )

    # Ensure model and classifier are on the same device
    model.to(device)
    classifier.to(device)

    model.eval()
    classifier.eval()

    all_predictions = []
    all_confidences = []
    all_true_labels = []

    for batch in dataloader:
        # Move relevant model input keys to the same device as the model
        model_inputs = {k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}

        # Keep 'labels' on CPU (or move to GPU if you plan to use GPU for evaluation)
        labels = batch['labels']

        with torch.no_grad():
            # Get model output using only the model input parts of the batch
            outputs = model(**model_inputs)
            # Apply the classifier to the last hidden states
            logits = classifier(outputs.last_hidden_state[:, 0, :])  # Use the [CLS] representation
            # Calculate softmax probabilities
            probs = torch.nn.functional.softmax(logits, dim=-1)
            # Get predictions and confidence scores
            predictions = probs.argmax(dim=-1)
            confidences = probs.max(dim=-1).values

        all_predictions.extend(predictions.cpu().numpy())
        all_confidences.extend(confidences.cpu().numpy())
        all_true_labels.extend(labels.numpy())  # Assuming labels are not on GPU

    # Convert lists to numpy arrays for indexing
    all_predictions = np.array(all_predictions)
    all_confidences = np.array(all_confidences)
    all_true_labels = np.array(all_true_labels)

    return all_predictions, all_confidences, all_true_labels

# Get predictions, confidences, and true labels for the validation set
predictions, confidences, true_labels = get_predictions(tokenized_datasets['validation'], artifact_expert, classifier, tokenizer)


In [21]:
# Filter out the accurate predictions
accurate_indices = predictions == true_labels
accurate_predictions = predictions[accurate_indices]
accurate_confidences = confidences[accurate_indices]

# Filter out the accurate predictions
inaccurate_indices = predictions != true_labels
inaccurate_predictions = predictions[inaccurate_indices]
inaccurate_confidences = confidences[inaccurate_indices]

print(f'Number of accurate predictions: {len(accurate_predictions)}')
print(f'Number of inaccurate predictions: {len(inaccurate_predictions)}')

#Percentage of accurate predictions and inaccurate predictions
print(f'Percentage of accurate predictions: {len(accurate_predictions)/len(predictions)}')
print(f'Percentage of inaccurate predictions: {len(inaccurate_predictions)/len(predictions)}')

assert len(accurate_predictions) + len(inaccurate_predictions) == len(predictions)

Number of accurate predictions: 4215
Number of inaccurate predictions: 5627
Percentage of accurate predictions: 0.4282666124771388
Percentage of inaccurate predictions: 0.5717333875228612


In [23]:
# dict of accurate output
accurate_output = {
    'predictions': accurate_predictions, 
    'confidences': accurate_confidences,
    'true_labels': true_labels[accurate_indices]
    }

inaccurate_output = {
    'predictions': inaccurate_predictions, 
    'confidences': inaccurate_confidences,
    'true_labels': true_labels[inaccurate_indices]
    }

print(accurate_output)
print(inaccurate_output)

{'predictions': array([1, 1, 2, ..., 0, 2, 1], dtype=int64), 'confidences': array([0.3742352 , 0.38159496, 0.5081238 , ..., 0.41395518, 0.57047284,
       0.3831905 ], dtype=float32), 'true_labels': array([1, 1, 2, ..., 0, 2, 1], dtype=int64)}
{'predictions': array([2, 1, 2, ..., 1, 1, 2], dtype=int64), 'confidences': array([0.48271525, 0.43716353, 0.54936486, ..., 0.3873359 , 0.3891356 ,
       0.3886815 ], dtype=float32), 'true_labels': array([0, 2, 0, ..., 0, 2, 0], dtype=int64)}


In [25]:
import numpy as np

# Function to sort a dictionary based on confidence scores
def sort_by_confidence(output_dict):
    # Extract arrays from the dictionary
    predictions = output_dict['predictions']
    confidences = output_dict['confidences']
    true_labels = output_dict['true_labels']

    # Get the sorted indices based on confidences
    sorted_indices = np.argsort(confidences)[::-1]

    # Sort all arrays using the sorted indices
    sorted_dict = {
        'predictions': predictions[sorted_indices],
        'confidences': confidences[sorted_indices],
        'true_labels': true_labels[sorted_indices]
    }
    return sorted_dict

# Sort both dictionaries
sorted_accurate_output = sort_by_confidence(accurate_output)
sorted_inaccurate_output = sort_by_confidence(inaccurate_output)

# Print the sorted dictionaries
print(sorted_accurate_output)
print(sorted_inaccurate_output)

# Function to print the top n predictions in pretty format
def print_top_n(sorted_dict, n=10):
    # Extract arrays from the dictionary
    predictions = sorted_dict['predictions']
    confidences = sorted_dict['confidences']
    true_labels = sorted_dict['true_labels']

    # Print the top n predictions in a nicely formatted way
    for i in range(n):
        print(f'Prediction: {predictions[i]}, Confidence: {confidences[i]}, True label: {true_labels[i]}')

# Print the top 10 accurate predictions
print_top_n(sorted_accurate_output)

# Print the top 10 inaccurate predictions
print_top_n(sorted_inaccurate_output)

{'predictions': array([2, 2, 2, ..., 0, 0, 0], dtype=int64), 'confidences': array([0.60370725, 0.5984036 , 0.59822285, ..., 0.33801007, 0.33570334,
       0.33429477], dtype=float32), 'true_labels': array([2, 2, 2, ..., 0, 0, 0], dtype=int64)}
{'predictions': array([2, 2, 2, ..., 2, 2, 1], dtype=int64), 'confidences': array([0.62734747, 0.61239845, 0.6118995 , ..., 0.33675805, 0.335899  ,
       0.335044  ], dtype=float32), 'true_labels': array([0, 0, 1, ..., 0, 0, 0], dtype=int64)}
Prediction: 2, Confidence: 0.6037072539329529, True label: 2
Prediction: 2, Confidence: 0.5984035730361938, True label: 2
Prediction: 2, Confidence: 0.5982228517532349, True label: 2
Prediction: 2, Confidence: 0.5926666259765625, True label: 2
Prediction: 2, Confidence: 0.5924074649810791, True label: 2
Prediction: 2, Confidence: 0.5921867489814758, True label: 2
Prediction: 2, Confidence: 0.5902161598205566, True label: 2
Prediction: 2, Confidence: 0.5892191529273987, True label: 2
Prediction: 2, Confidenc

In [29]:
# Function to print the percentage of predictions with confidence above a threshold
def print_percentage_above_threshold(sorted_dict, threshold=0.50):
    # Extract arrays from the dictionary
    predictions = sorted_dict['predictions']
    confidences = sorted_dict['confidences']
    true_labels = sorted_dict['true_labels']

    # Calculate the percentage of predictions with confidence above the threshold
    above_threshold = confidences >= threshold
    percentage_above_threshold = np.mean(above_threshold) * 100

    print(f'Percentage of predictions with confidence above {threshold}: {percentage_above_threshold:.2f}%')

# Print the percentage of accurate predictions with confidence above 0.9
print_percentage_above_threshold(sorted_accurate_output)

# Print the percentage of inaccurate predictions with confidence above 0.9
print_percentage_above_threshold(sorted_inaccurate_output)

Percentage of predictions with confidence above 0.5: 16.09%
Percentage of predictions with confidence above 0.5: 20.99%
