# Load and Preprocess Data

In [None]:
!pip install pandas
!pip install torch
!pip3 install torch torchvision torchaudio
!pip install transformers
!pip install scikit-learn

In [45]:
import argparse
import json
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import classification_report

In [66]:
def read_json(filepath):
    data = []
    
    with open(filepath, 'r', encoding='utf-8') as file:
        for i in file:
          item = json.loads(i)
          data.append(item)
    return data

matched_list = read_json('dev_matched_sampled-1.jsonl')
mismatched_list = read_json('dev_mismatched_sampled-1.jsonl')


# Finetune Pretrained Model


In [95]:
label_mapping = ['contradiction', 'entailment', 'neutral']

def run_model(data,transfomer,output_name,prompt_method,prompt=""):
    print("Loading ", transfomer)
    model = AutoModelForSequenceClassification.from_pretrained(transfomer)
    tokenizer = AutoTokenizer.from_pretrained(transfomer)

    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # model.to(device)
    # model.eval()

    
    actual,pred = [],[]
    premise,hypothesis = [],[]
    
    for i in data:
        if prompt_method == 1:
            s = f"Consider the following situation: The premise is {i['sentence1']}. Now, if I say {i['sentence2']}, would you say this is a Entailed (i.e. always true), Contradicted (i.e. always false), or Neutral (neither entailed nor contradicted) based on the premise?"
        elif prompt_method == 2:
            s = f"{prompt} {i['sentence1']} [SEP] {i['sentence2']}"
        elif prompt_method ==3:
            s = f"Is the following statement consistent with, contradictory to, or unrelated to the given premise? Premise: {i['sentence1']}. Statement: {i['sentence2']}."
        premise.append(i['sentence1'])
        hypothesis.append(i['sentence2'])
        
        actual.append(i['gold_label'])
        input_ids = tokenizer.encode(s, return_tensors="pt",truncation_strategy='only_first')
        
        outputs = model(input_ids)

        predictions = outputs.logits.argmax(dim=1)
        prediction_label = [label_mapping[pred] for pred in predictions]
        pred.append(prediction_label[0])

    # ncorrect = sum([int(pred == label) for pred, label in zip(pred, actual)])
    # nsamples = len(pred)
    
    p = [0 if i=="contradiction" else 1 if i =="entailment" else 2 for i in pred]
    l = [0 if i=="contradiction" else 1 if i =="entailment" else 2 for i in actual]
    print(classification_report(p, l,target_names=label_mapping))
    
    acc = float(sum([int(pred == label) for pred, label in zip(pred, actual)])) / int(len(actual))
    print('Accuracy: ', acc)
    output = list(zip(premise, hypothesis, actual, pred))
    pd.DataFrame(output).to_csv(f'{output_name}.csv')

### 1. BERT-large-uncased


In [74]:
#bert-large-uncased
run_model(matched_list,'bert-large-uncased',"bert_match")
run_model(mismatched_list,'bert-large-uncased',"bert_mismatch")

Loading  bert-large-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

contradiction       0.08      0.36      0.13       178
   entailment       0.94      0.36      0.52      2322
      neutral       0.00      0.00      0.00         0

     accuracy                           0.36      2500
    macro avg       0.34      0.24      0.22      2500
 weighted avg       0.88      0.36      0.49      2500

Accuracy:  0.3604
Loading  bert-large-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


               precision    recall  f1-score   support

contradiction       0.88      0.32      0.47      2205
   entailment       0.15      0.47      0.23       295
      neutral       0.00      0.00      0.00         0

     accuracy                           0.34      2500
    macro avg       0.34      0.26      0.23      2500
 weighted avg       0.80      0.34      0.44      2500

Accuracy:  0.3352


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2. ROBERTA

In [92]:
run_model(matched_list,'roberta-large',"roberta-large_match")
run_model(mismatched_list,'roberta-large',"roberta-large_mismatch")

Loading  roberta-large


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

contradiction       0.00      0.14      0.00         7
   entailment       1.00      0.36      0.52      2493
      neutral       0.00      0.00      0.00         0

     accuracy                           0.36      2500
    macro avg       0.33      0.17      0.18      2500
 weighted avg       0.99      0.36      0.52      2500

Accuracy:  0.3556
Loading  roberta-large


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


               precision    recall  f1-score   support

contradiction       0.98      0.32      0.48      2463
   entailment       0.02      0.46      0.04        37
      neutral       0.00      0.00      0.00         0

     accuracy                           0.32      2500
    macro avg       0.33      0.26      0.17      2500
 weighted avg       0.97      0.32      0.47      2500

Accuracy:  0.3196


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [84]:
run_model(matched_list,'cross-encoder/nli-roberta-base',"roberta_match")
run_model(mismatched_list,'cross-encoder/nli-roberta-base',"roberta_mismatch")

Loading  cross-encoder/nli-roberta-base




               precision    recall  f1-score   support

contradiction       0.89      0.88      0.88       787
   entailment       0.83      0.90      0.86       818
      neutral       0.84      0.78      0.81       895

     accuracy                           0.85      2500
    macro avg       0.85      0.85      0.85      2500
 weighted avg       0.85      0.85      0.85      2500

Accuracy:  0.8428
Loading  cross-encoder/nli-roberta-base




               precision    recall  f1-score   support

contradiction       0.88      0.87      0.88       804
   entailment       0.85      0.91      0.88       837
      neutral       0.84      0.79      0.81       859

     accuracy                           0.86      2500
    macro avg       0.86      0.86      0.86      2500
 weighted avg       0.86      0.86      0.85      2500

Accuracy:  0.8492


### 3. DEBERTA

In [76]:
run_model(matched_list,'cross-encoder/nli-deberta-base',"deberta_match")
run_model(mismatched_list,'cross-encoder/nli-deberta-base',"deberta_mismatch")

Loading  cross-encoder/nli-deberta-base




               precision    recall  f1-score   support

contradiction       0.92      0.89      0.91       803
   entailment       0.85      0.92      0.88       825
      neutral       0.86      0.82      0.84       872

     accuracy                           0.87      2500
    macro avg       0.88      0.88      0.88      2500
 weighted avg       0.88      0.87      0.87      2500

Accuracy:  0.8668
Loading  cross-encoder/nli-deberta-base




               precision    recall  f1-score   support

contradiction       0.92      0.89      0.91       803
   entailment       0.85      0.92      0.88       825
      neutral       0.86      0.82      0.84       872

     accuracy                           0.87      2500
    macro avg       0.88      0.88      0.88      2500
 weighted avg       0.88      0.87      0.87      2500

Accuracy:  0.8668


# Prompt Engineering

## Bert

In [96]:
run_model(matched_list,'bert-large-uncased',"bert_match_m1",1)
run_model(mismatched_list,'bert-large-uncased',"bert_mismatch_m1",1)
run_model(matched_list,'bert-large-uncased',"bert_match_m3",3)
run_model(mismatched_list,'bert-large-uncased',"bert_mismatch_m3",3)

Loading  bert-large-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

contradiction       1.00      0.31      0.47      2500
   entailment       0.00      0.00      0.00         0
      neutral       0.00      0.00      0.00         0

     accuracy                           0.31      2500
    macro avg       0.33      0.10      0.16      2500
 weighted avg       1.00      0.31      0.47      2500

Accuracy:  0.3108
Loading  bert-large-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

contradiction       0.00      0.00      0.00         0
   entailment       1.00      0.36      0.53      2500
      neutral       0.00      0.00      0.00         0

     accuracy                           0.36      2500
    macro avg       0.33      0.12      0.18      2500
 weighted avg       1.00      0.36      0.53      2500

Accuracy:  0.36
Loading  bert-large-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

contradiction       1.00      0.31      0.47      2500
   entailment       0.00      0.00      0.00         0
      neutral       0.00      0.00      0.00         0

     accuracy                           0.31      2500
    macro avg       0.33      0.10      0.16      2500
 weighted avg       1.00      0.31      0.47      2500

Accuracy:  0.3108
Loading  bert-large-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


               precision    recall  f1-score   support

contradiction       0.82      0.31      0.45      2080
   entailment       0.12      0.25      0.16       420
      neutral       0.00      0.00      0.00         0

     accuracy                           0.30      2500
    macro avg       0.31      0.19      0.21      2500
 weighted avg       0.70      0.30      0.40      2500

Accuracy:  0.3036


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [100]:
prompt_sentence1= "According to the premise and hypothesis, determine the relationship and provide explanation:"
run_model(matched_list,'bert-large-uncased',"bert_match_p1",2,prompt_sentence1)
run_model(mismatched_list,'bert-large-uncased',"bert_mismatch_p1",2,prompt_sentence1)

Loading  bert-large-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

contradiction       0.60      0.33      0.42      1417
   entailment       0.49      0.40      0.44      1083
      neutral       0.00      0.00      0.00         0

     accuracy                           0.36      2500
    macro avg       0.36      0.24      0.29      2500
 weighted avg       0.55      0.36      0.43      2500

Accuracy:  0.3596
Loading  bert-large-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


               precision    recall  f1-score   support

contradiction       0.98      0.32      0.48      2461
   entailment       0.01      0.23      0.02        39
      neutral       0.00      0.00      0.00         0

     accuracy                           0.31      2500
    macro avg       0.33      0.18      0.17      2500
 weighted avg       0.96      0.31      0.47      2500

Accuracy:  0.3144


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## nli-roberta-base

In [97]:
run_model(matched_list,'cross-encoder/nli-roberta-base',"nli-roberta-base_match_m1",1)
run_model(mismatched_list,'cross-encoder/nli-roberta-base',"nli-roberta-base_mismatch_m1",1)
run_model(matched_list,'cross-encoder/nli-roberta-base',"nli-roberta-base_match_m3",3)
run_model(mismatched_list,'cross-encoder/nli-roberta-base',"nli-roberta-base_mismatch_m3",3)

Loading  cross-encoder/nli-roberta-base




               precision    recall  f1-score   support

contradiction       0.98      0.32      0.49      2363
   entailment       0.03      0.80      0.06        35
      neutral       0.07      0.56      0.12       102

     accuracy                           0.34      2500
    macro avg       0.36      0.56      0.22      2500
 weighted avg       0.93      0.34      0.47      2500

Accuracy:  0.3384
Loading  cross-encoder/nli-roberta-base




               precision    recall  f1-score   support

contradiction       0.97      0.33      0.50      2319
   entailment       0.06      1.00      0.12        57
      neutral       0.09      0.57      0.15       124

     accuracy                           0.36      2500
    macro avg       0.38      0.64      0.26      2500
 weighted avg       0.91      0.36      0.47      2500

Accuracy:  0.3592
Loading  cross-encoder/nli-roberta-base




               precision    recall  f1-score   support

contradiction       0.91      0.80      0.86       881
   entailment       0.81      0.91      0.86       788
      neutral       0.79      0.79      0.79       831

     accuracy                           0.83      2500
    macro avg       0.84      0.84      0.83      2500
 weighted avg       0.84      0.83      0.83      2500

Accuracy:  0.8288
Loading  cross-encoder/nli-roberta-base




               precision    recall  f1-score   support

contradiction       0.90      0.82      0.86       875
   entailment       0.84      0.90      0.87       845
      neutral       0.78      0.81      0.79       780

     accuracy                           0.84      2500
    macro avg       0.84      0.84      0.84      2500
 weighted avg       0.84      0.84      0.84      2500

Accuracy:  0.8356


In [102]:
run_model(matched_list,'cross-encoder/nli-roberta-base',"nli-roberta-base_match_p1",2,prompt_sentence1)
run_model(mismatched_list,'cross-encoder/nli-roberta-base',"nli-roberta-base_mismatch_p1",2,prompt_sentence1)

Loading  cross-encoder/nli-roberta-base




               precision    recall  f1-score   support

contradiction       0.87      0.87      0.87       779
   entailment       0.79      0.91      0.85       776
      neutral       0.87      0.77      0.81       945

     accuracy                           0.84      2500
    macro avg       0.84      0.85      0.84      2500
 weighted avg       0.85      0.84      0.84      2500

Accuracy:  0.834
Loading  cross-encoder/nli-roberta-base




               precision    recall  f1-score   support

contradiction       0.88      0.87      0.87       798
   entailment       0.84      0.90      0.87       837
      neutral       0.84      0.78      0.81       865

     accuracy                           0.85      2500
    macro avg       0.85      0.85      0.85      2500
 weighted avg       0.85      0.85      0.85      2500

Accuracy:  0.8424


### ROBERTA-Large

In [98]:
run_model(matched_list,'roberta-large',"roberta-large_match_m1",1)
run_model(mismatched_list,'roberta-large',"roberta-large_mismatch_m1",1)
run_model(matched_list,'roberta-large',"roberta-large_match_m3",3)
run_model(mismatched_list,'roberta-large',"roberta-large_mismatch_m3",3)

Loading  roberta-large


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

contradiction       0.71      0.32      0.44      1709
   entailment       0.33      0.37      0.35       791
      neutral       0.00      0.00      0.00         0

     accuracy                           0.34      2500
    macro avg       0.34      0.23      0.26      2500
 weighted avg       0.59      0.34      0.41      2500

Accuracy:  0.336
Loading  roberta-large


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

contradiction       1.00      0.32      0.48      2500
   entailment       0.00      0.00      0.00         0
      neutral       0.00      0.00      0.00         0

     accuracy                           0.32      2500
    macro avg       0.33      0.11      0.16      2500
 weighted avg       1.00      0.32      0.48      2500

Accuracy:  0.3176
Loading  roberta-large


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

contradiction       0.99      0.31      0.47      2466
   entailment       0.01      0.32      0.02        34
      neutral       0.00      0.00      0.00         0

     accuracy                           0.31      2500
    macro avg       0.33      0.21      0.17      2500
 weighted avg       0.97      0.31      0.47      2500

Accuracy:  0.3112
Loading  roberta-large


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


               precision    recall  f1-score   support

contradiction       0.08      0.32      0.13       209
   entailment       0.92      0.36      0.52      2291
      neutral       0.00      0.00      0.00         0

     accuracy                           0.36      2500
    macro avg       0.33      0.23      0.22      2500
 weighted avg       0.85      0.36      0.48      2500

Accuracy:  0.356


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [91]:
prompt_sentence1= "Consider the premise and hypothesis provided. Can you classify their relationship as either Entailed, Contradicted, or Neutral? Additionally, explain your reasoning behind this classification"
run_model(matched_list,'roberta-large',"roberta-large_match_p1",2,prompt_sentence1)
run_model(mismatched_list,'roberta-large',"roberta-large_mismatch_p1",2,prompt_sentence1)


Loading  roberta-large


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

contradiction       0.00      0.00      0.00         0
   entailment       1.00      0.36      0.53      2500
      neutral       0.00      0.00      0.00         0

     accuracy                           0.36      2500
    macro avg       0.33      0.12      0.18      2500
 weighted avg       1.00      0.36      0.53      2500

Accuracy:  0.3568
Loading  roberta-large


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


               precision    recall  f1-score   support

contradiction       0.07      0.28      0.12       212
   entailment       0.89      0.35      0.51      2288
      neutral       0.00      0.00      0.00         0

     accuracy                           0.35      2500
    macro avg       0.32      0.21      0.21      2500
 weighted avg       0.82      0.35      0.47      2500

Accuracy:  0.3456


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##  DEBERTA

In [99]:
run_model(matched_list,'cross-encoder/nli-deberta-base',"nli-deberta-base_match_m1",1)
run_model(mismatched_list,'cross-encoder/nli-deberta-base',"nli-deberta-base_mismatch_m1",1)
run_model(matched_list,'cross-encoder/nli-deberta-base',"nli-deberta-base_match_m3",3)
run_model(mismatched_list,'cross-encoder/nli-deberta-base',"nli-deberta-base_mismatch_m3",3)

Loading  cross-encoder/nli-deberta-base




               precision    recall  f1-score   support

contradiction       0.28      0.96      0.44       230
   entailment       0.04      0.80      0.08        45
      neutral       0.98      0.37      0.53      2225

     accuracy                           0.43      2500
    macro avg       0.43      0.71      0.35      2500
 weighted avg       0.90      0.43      0.52      2500

Accuracy:  0.4132
Loading  cross-encoder/nli-deberta-base




               precision    recall  f1-score   support

contradiction       0.29      0.95      0.45       247
   entailment       0.04      0.94      0.07        36
      neutral       0.99      0.36      0.53      2217

     accuracy                           0.43      2500
    macro avg       0.44      0.75      0.35      2500
 weighted avg       0.90      0.43      0.51      2500

Accuracy:  0.412
Loading  cross-encoder/nli-deberta-base




               precision    recall  f1-score   support

contradiction       0.92      0.84      0.88       854
   entailment       0.81      0.91      0.86       791
      neutral       0.81      0.79      0.80       855

     accuracy                           0.84      2500
    macro avg       0.85      0.85      0.84      2500
 weighted avg       0.85      0.84      0.84      2500

Accuracy:  0.8368
Loading  cross-encoder/nli-deberta-base




               precision    recall  f1-score   support

contradiction       0.89      0.82      0.85       860
   entailment       0.83      0.90      0.86       829
      neutral       0.79      0.79      0.79       811

     accuracy                           0.84      2500
    macro avg       0.84      0.84      0.84      2500
 weighted avg       0.84      0.84      0.84      2500

Accuracy:  0.8296


In [88]:
run_model(matched_list,'cross-encoder/nli-deberta-base',"deberta_match_p1",2,prompt_sentence1)
run_model(mismatched_list,'cross-encoder/nli-deberta-base',"deberta_mismatch_p1",2,prompt_sentence1)

Loading  cross-encoder/nli-deberta-base




               precision    recall  f1-score   support

contradiction       0.89      0.89      0.89       780
   entailment       0.83      0.91      0.87       814
      neutral       0.85      0.78      0.82       906

     accuracy                           0.86      2500
    macro avg       0.86      0.86      0.86      2500
 weighted avg       0.86      0.86      0.86      2500

Accuracy:  0.8484
Loading  cross-encoder/nli-deberta-base




               precision    recall  f1-score   support

contradiction       0.88      0.89      0.88       778
   entailment       0.86      0.90      0.88       859
      neutral       0.84      0.79      0.81       863

     accuracy                           0.86      2500
    macro avg       0.86      0.86      0.86      2500
 weighted avg       0.86      0.86      0.86      2500

Accuracy:  0.852
