In [1]:
import pandas as pd
import joblib 
import numpy as np
import os
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
bad_classes = [
    "prejudicial language",
    "fallacy of slippery slope",
    "slothful induction"
]

In [3]:
all_predictions = os.listdir('cache/predictions/all')

In [4]:
all_predictions = [x for x in all_predictions if x.startswith('outputs_dict')]

In [5]:
data = []
for prediction in all_predictions:
    try:
        outputs_dict = joblib.load('cache/predictions/all/' + prediction)
        if 'meta' in outputs_dict.keys():
            data.append(outputs_dict)
    except Exception as e:
        print(e)
        continue
len(data)





968

In [6]:
def get_new_data(data, note, dataset = "data/finegrained", threshold = 0.29):
    new_data = [
        data_point
        for data_point
        in data
        if data_point["meta"]["data_dir"] == dataset and 'note' in data_point.keys() and (data_point['note'] == note or note == None)
    ]

    new_data = [
        data_point 
        for data_point in new_data
        if data_point['predictions']['metrics']['test_f1'] > threshold
    ]

    print(len(new_data))
    return new_data

In [7]:
_ = get_new_data(data, 'best_hps_final_without_attention', dataset = 'data/bigbench', threshold=0)

60


In [8]:
def get_metrics(y_true, y_pred):
    return {
        'f1': f1_score(y_true, y_pred, average = "weighted"),
        'precision': precision_score(y_true, y_pred, average = "weighted"),
        'recall': recall_score(y_true, y_pred, average = 'weighted'),
        'accuracy': accuracy_score(y_true, y_pred)
    }

In [9]:
def get_unfolded_labels(sample_data):
    label_encoder = sample_data["label_encoder"]
    cbr_labels = sample_data["cbr_labels"]
    all_cbr_labels = []
    for sample_cbr_labels in cbr_labels:
        unfolded_sample_cbr_labels = []
        for retriever_cbr_samples in sample_cbr_labels:
            for filtered_cbr_sample in retriever_cbr_samples:
                for inner_label in filtered_cbr_sample:
                    unfolded_sample_cbr_labels.append(inner_label)
        all_cbr_labels.append(unfolded_sample_cbr_labels)
        
    all_cbr_labels = [
        label_encoder.transform(cbr_labels)
        for cbr_labels
        in all_cbr_labels
    ]
    return all_cbr_labels

In [10]:
def get_overlap(sample_data):
    label_encoder = sample_data["label_encoder"]
    
    labels = sample_data["predictions"]["label_ids"]
    
    cbr_labels = sample_data["cbr_labels"]
    
    all_cbr_labels = []
    for sample_cbr_labels in cbr_labels:
        unfolded_sample_cbr_labels = []
        for retriever_cbr_samples in sample_cbr_labels:
            for filtered_cbr_sample in retriever_cbr_samples:
                for inner_label in filtered_cbr_sample:
                    unfolded_sample_cbr_labels.append(inner_label)
        all_cbr_labels.append(unfolded_sample_cbr_labels)
        
    all_cbr_labels = [
        label_encoder.transform(cbr_labels)
        for cbr_labels
        in all_cbr_labels
    ]
    
    predicted_labels = np.argmax(sample_data["predictions"]["predictions"], axis = -1)
    correct_predictions = np.where(labels == predicted_labels)[0]

    overlap_count = 0
    for index in correct_predictions:
        if labels[index] in all_cbr_labels[index]:
            overlap_count += 1
    return overlap_count / len(correct_predictions)

In [11]:
def get_results_df(new_data):
    total_cbr_texts = []
    total_cbr_labels = []
    total_num_cases = []
    total_retrievers = []
    total_overlaps = []
    total_thresholds = []
    total_f1_scores = []
    total_precisions = []
    total_recalls = []
    total_accuracies = []
    total_true_labels = []
    total_predicted_labels = []
    for sample_data in new_data:
        if 'cbr_labels' in sample_data.keys():
            total_num_cases.append(sample_data['meta']['num_cases'])
            total_retrievers.append(' '.join(sample_data['meta']["retrievers"]))
            total_cbr_texts.append(sample_data['cbr'])
            total_cbr_labels.append(get_unfolded_labels(sample_data))
            
            total_overlaps.append(get_overlap(sample_data))
            total_thresholds.append(sample_data['meta']['cbr_threshold'])
        else:
            total_cbr_texts.append(None)
            total_cbr_labels.append(None)
            total_num_cases.append(None)
            total_retrievers.append(None)
            total_overlaps.append(None)
            total_thresholds.append(None)
            
        total_f1_scores.append(sample_data['predictions']['metrics']['test_f1'])
        total_precisions.append(sample_data['predictions']['metrics']['test_precision'])
        total_recalls.append(sample_data['predictions']['metrics']['test_recall'])
        total_accuracies.append(sample_data['predictions']['metrics']['test_accuracy'])
        total_true_labels.append(sample_data['predictions']['label_ids'])
        total_predicted_labels.append(
            np.argmax(sample_data['predictions']['predictions'], axis = -1).tolist()
        )
    
    
        
    results_df = pd.DataFrame({
        'num_cases': total_num_cases,
        'threshold': total_thresholds,
        'retrievers': total_retrievers,
        'overlaps': total_overlaps,
        'f1': total_f1_scores,
        'precision': total_precisions,
        'recall': total_recalls,
        'accuracy': total_accuracies,
        'cbr': np.array(total_cbr_texts).squeeze().tolist(),
        'cbr_labels': np.array(total_cbr_labels).squeeze().tolist(),
        'label_ids': total_true_labels,
        'predicted_labels': total_predicted_labels
    })
    return results_df

### Fine Grained

In [28]:
new_data = get_new_data(data, "best_hps_final_baseline")
results_df = get_results_df(new_data)
results_df.f1.mean()

18


0.6256211702260767

In [29]:
new_data = get_new_data(data, "best_hps_final_without_attention")
results_df = get_results_df(new_data)
print(results_df.f1.mean())
results_df.groupby('retrievers')[['overlaps', 'f1', 'precision', 'recall', 'accuracy']].mean()

18
0.5916446984296276


Unnamed: 0_level_0,overlaps,f1,precision,recall,accuracy
retrievers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
empathy,0.0,0.590704,0.596686,0.595,0.595
simcse,0.0,0.590486,0.6026,0.593333,0.593333
simcse empathy,0.0,0.596712,0.60668,0.6,0.6


In [30]:
new_data = get_new_data(data, "best_hps_final")
results_df = get_results_df(new_data)
print(results_df.f1.mean())
print(results_df.f1.max())
results_df.groupby('retrievers')[['overlaps', 'f1', 'precision', 'recall', 'accuracy']].mean()

21
0.6190162279101896
0.6430050638282316


  'cbr_labels': np.array(total_cbr_labels).squeeze().tolist(),


Unnamed: 0_level_0,overlaps,f1,precision,recall,accuracy
retrievers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
empathy,0.322404,0.606562,0.614255,0.61,0.61
simcse,0.525773,0.643005,0.64609,0.646667,0.646667
simcse empathy,0.630702,0.607849,0.627357,0.612,0.612


### Coarse Grained

In [31]:
new_data = get_new_data(data, "best_hps_final_baseline", dataset = "data/coarsegrained")
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())

18
f1           0.782848
precision    0.789266
recall       0.787224
accuracy     0.787224
dtype: float64
f1           0.813135
precision    0.812066
recall       0.817814
accuracy     0.817814
dtype: float64


In [78]:
new_data = get_new_data(data, "best_hps_final_without_attention", dataset = "data/coarsegrained", threshold = 0.4)
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())
results_df.groupby('retrievers')[['accuracy', 'precision', 'recall', 'f1']].mean()

19
accuracy     0.783507
precision    0.777657
recall       0.783507
f1           0.778725
dtype: float64
accuracy     0.805668
precision    0.800807
recall       0.805668
f1           0.800680
dtype: float64


Unnamed: 0_level_0,accuracy,precision,recall,f1
retrievers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
empathy,0.768219,0.762251,0.768219,0.764453
simcse,0.803933,0.800543,0.803933,0.799332
simcse empathy,0.773279,0.765336,0.773279,0.76783


In [79]:
new_data = get_new_data(data, "best_hps_final", dataset = "data/coarsegrained", threshold=0.6)
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())
results_df.groupby('retrievers')[['accuracy', 'precision', 'recall', 'f1']].mean()

21
accuracy     0.779834
precision    0.782774
recall       0.779834
f1           0.776281
dtype: float64
accuracy     0.801619
precision    0.801724
recall       0.801619
f1           0.795360
dtype: float64


  'cbr_labels': np.array(total_cbr_labels).squeeze().tolist(),


Unnamed: 0_level_0,accuracy,precision,recall,f1
retrievers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
empathy,0.793522,0.799528,0.793522,0.79077
simcse,0.774629,0.774732,0.774629,0.772452
simcse empathy,0.771761,0.774147,0.771761,0.766476


### Big Bench

In [80]:
new_data = get_new_data(data, None, dataset = "data/bigbench")
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())
results_df.groupby('retrievers')[['overlaps', 'f1', 'precision', 'recall', 'accuracy']].max()

249
accuracy     0.789042
precision    0.761081
recall       0.789042
f1           0.764592
dtype: float64
accuracy     0.997619
precision    0.997630
recall       0.997619
f1           0.997619
dtype: float64


  'cbr': np.array(total_cbr_texts).squeeze().tolist(),
  'cbr_labels': np.array(total_cbr_labels).squeeze().tolist(),


Unnamed: 0_level_0,overlaps,f1,precision,recall,accuracy
retrievers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
empathy,0.975177,0.954781,0.954898,0.954762,0.954762
simcse,0.990881,0.966685,0.966879,0.966667,0.966667
simcse empathy,0.979112,0.911998,0.91508,0.911905,0.911905


In [81]:
new_data = get_new_data(data, "best_hps_final_baseline", dataset = "data/bigbench")
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())

18
accuracy     0.851455
precision    0.886535
recall       0.851455
f1           0.849502
dtype: float64
accuracy     0.997619
precision    0.997630
recall       0.997619
f1           0.997619
dtype: float64


In [84]:
new_data = get_new_data(data, "best_hps_final_without_attention", dataset = "data/bigbench", threshold=0.6)
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())
results_df.groupby('retrievers')[['overlaps', 'f1', 'precision', 'recall', 'accuracy']].max()

41
accuracy     0.826190
precision    0.829776
recall       0.826190
f1           0.824667
dtype: float64
accuracy     0.942857
precision    0.944043
recall       0.942857
f1           0.942916
dtype: float64


Unnamed: 0_level_0,overlaps,f1,precision,recall,accuracy
retrievers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
empathy,0.0,0.942916,0.944043,0.942857,0.942857
simcse,0.0,0.921416,0.921413,0.921429,0.921429
simcse empathy,0.0,0.842674,0.842838,0.842857,0.842857


In [85]:
new_data = get_new_data(data, "best_hps_final", dataset = "data/bigbench", threshold=0.6)
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())
results_df.groupby('retrievers')[['overlaps', 'f1', 'precision', 'recall', 'accuracy']].max()

42
accuracy     0.782710
precision    0.789373
recall       0.782710
f1           0.782221
dtype: float64
accuracy     0.911905
precision    0.915080
recall       0.911905
f1           0.911998
dtype: float64


  'cbr_labels': np.array(total_cbr_labels).squeeze().tolist(),


Unnamed: 0_level_0,overlaps,f1,precision,recall,accuracy
retrievers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
empathy,0.975177,0.864418,0.865343,0.864286,0.864286
simcse,0.98951,0.861946,0.862026,0.861905,0.861905
simcse empathy,0.979112,0.911998,0.91508,0.911905,0.911905


### New Fine Grained

In [37]:
new_data = get_new_data(data, "best_hps_final_baseline", dataset = "data/new_finegrained", threshold  = 0)
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())

94
f1           0.400703
precision    0.421119
recall       0.409751
accuracy     0.409751
dtype: float64
f1           0.629232
precision    0.649869
recall       0.622356
accuracy     0.622356
dtype: float64


In [38]:
new_data = get_new_data(data, "best_hps_final_without_attention", dataset = "data/new_finegrained", threshold=0)
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())
results_df.groupby('retrievers')[['overlaps', 'f1', 'precision', 'recall', 'accuracy']].mean()

40
f1           0.497668
precision    0.506981
recall       0.502341
accuracy     0.502341
dtype: float64
f1           0.598112
precision    0.614553
recall       0.595166
accuracy     0.595166
dtype: float64


Unnamed: 0_level_0,overlaps,f1,precision,recall,accuracy
retrievers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
empathy,0.0,0.572369,0.579059,0.577039,0.577039
simcse,0.0,0.586747,0.598886,0.585887,0.585887
simcse empathy,0.0,0.395183,0.403464,0.404162,0.404162


In [39]:
new_data = get_new_data(data, "best_hps_final", dataset = "data/new_finegrained", threshold=0.4)
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())
results_df.groupby('retrievers')[['overlaps', 'f1', 'precision', 'recall', 'accuracy']].mean()

49
f1           0.512411
precision    0.526223
recall       0.512670
accuracy     0.512670
dtype: float64
f1           0.561420
precision    0.577600
recall       0.558912
accuracy     0.558912
dtype: float64


  'cbr_labels': np.array(total_cbr_labels).squeeze().tolist(),


Unnamed: 0_level_0,overlaps,f1,precision,recall,accuracy
retrievers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
simcse,0.79469,0.489436,0.502549,0.490181,0.490181
simcse empathy,0.566083,0.534466,0.548951,0.53426,0.53426


In [40]:
new_data = get_new_data(data, "best_hps_final", dataset = "data/new_finegrained", threshold=0.4)
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())
results_df.groupby('num_cases')[['overlaps', 'f1', 'precision', 'recall', 'accuracy']].mean()

49
f1           0.512411
precision    0.526223
recall       0.512670
accuracy     0.512670
dtype: float64
f1           0.561420
precision    0.577600
recall       0.558912
accuracy     0.558912
dtype: float64


  'cbr_labels': np.array(total_cbr_labels).squeeze().tolist(),


Unnamed: 0_level_0,overlaps,f1,precision,recall,accuracy
num_cases,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.322432,0.544082,0.554685,0.548338,0.548338
3,0.696257,0.496235,0.511318,0.495945,0.495945
4,0.786249,0.51961,0.53313,0.518966,0.518966
5,0.815951,0.493503,0.509017,0.492447,0.492447


In [41]:
new_data = get_new_data(data, "best_hps_final", dataset = "data/bigbench")

best_result = None
best_f1 = -np.inf
for point in new_data:
    if point['predictions']['metrics']['test_f1'] > best_f1:
        best_f1 = point['predictions']['metrics']['test_f1']
        best_result = point

59


### Best Hps

## New Finegrained

In [42]:
new_data = get_new_data(data, note = "best_hps_final_baseline_best_ps", dataset = "data/new_finegrained", threshold=0)
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())

18
f1           0.501514
precision    0.517504
recall       0.503525
accuracy     0.503525
dtype: float64
f1           0.532737
precision    0.540722
recall       0.537764
accuracy     0.537764
dtype: float64


In [65]:
new_data = get_new_data(data, note = "best_hps_final_without_attention_best_ps", dataset = "data/new_finegrained", threshold=0)
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())
results_df.groupby('retrievers')[['accuracy', 'precision', 'recall', 'f1']].max()

18
accuracy     0.535247
precision    0.542838
recall       0.535247
f1           0.532134
dtype: float64
accuracy     0.540785
precision    0.551131
recall       0.540785
f1           0.539524
dtype: float64


Unnamed: 0_level_0,accuracy,precision,recall,f1
retrievers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
empathy,0.531722,0.535219,0.531722,0.52682
simcse,0.540785,0.551131,0.540785,0.539524
simcse empathy,0.540785,0.543191,0.540785,0.530175


In [12]:
new_data = get_new_data(data, note = "best_hps_final_best_ps", dataset = "data/new_finegrained", threshold=0)
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())
results_df.groupby('retrievers')[['accuracy', 'precision', 'recall', 'f1']].mean()

75
accuracy     0.550211
precision    0.562931
recall       0.550211
f1           0.548913
dtype: float64
accuracy     0.574018
precision    0.593917
recall       0.574018
f1           0.571961
dtype: float64


  'cbr_labels': np.array(total_cbr_labels).squeeze().tolist(),


Unnamed: 0_level_0,accuracy,precision,recall,f1
retrievers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simcse,0.550261,0.563291,0.550261,0.549252
simcse empathy,0.549849,0.560295,0.549849,0.546428


In [13]:
results_df.groupby('num_cases')[['accuracy', 'precision', 'recall', 'f1']].mean()

Unnamed: 0_level_0,accuracy,precision,recall,f1
num_cases,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.540785,0.549701,0.540785,0.535543
2,0.570997,0.593917,0.570997,0.571961
3,0.555891,0.570874,0.555891,0.557301
4,0.559248,0.570435,0.559248,0.5574
5,0.52568,0.525849,0.52568,0.521101
6,0.528701,0.548948,0.528701,0.531194


### Results with Electra

In [55]:
new_data = get_new_data(data, note = "best_hps_final_best_ps_electra", dataset = "data/new_finegrained", threshold=0)
results_df = get_results_df(new_data)
print(results_df[['accuracy', 'precision', 'recall', 'f1']].mean())
print(results_df[['accuracy', 'precision', 'recall', 'f1']].max())
results_df.groupby('retrievers')[['accuracy', 'precision', 'recall', 'f1']].max()

30
f1           0.565262
precision    0.577704
recall       0.566062
accuracy     0.566062
dtype: float64
f1           0.627227
precision    0.638502
recall       0.631420
accuracy     0.631420
dtype: float64


  'cbr_labels': np.array(total_cbr_labels).squeeze().tolist(),


Unnamed: 0_level_0,accuracy,precision,recall,f1
retrievers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simcse,0.63142,0.638502,0.63142,0.627227
simcse empathy,0.498489,0.518512,0.498489,0.501245


In [53]:
results_df.groupby('num_cases')[['accuracy', 'precision', 'recall', 'f1']].max()

Unnamed: 0_level_0,accuracy,precision,recall,f1
num_cases,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.570997,0.579555,0.570997,0.569204
2,0.498489,0.518512,0.498489,0.501245
5,0.63142,0.638502,0.63142,0.627227


### Analysis on the stuff that help the model with CBR

In [None]:
cbr_data = get_new_data(data, note = "best_hps_final_best_ps", dataset = "data/new_finegrained", threshold=0)
cbr_model_df = get_results_df(cbr_data)
cbr_model_df

25


  'cbr_labels': np.array(total_cbr_labels).squeeze().tolist(),


Unnamed: 0,num_cases,threshold,retrievers,overlaps,f1,precision,recall,accuracy,cbr,cbr_labels,label_ids,predicted_labels
0,4,-10000000,simcse,0.790698,0.519892,0.535958,0.519637,0.519637,[Either go to the party or spend the night in ...,"[[10, 1, 3, 1], [11, 11, 11, 11], [7, 7, 7, 7]...","[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[3, 11, 7, 5, 7, 7, 0, 0, 0, 3, 1, 7, 1, 11, 1..."
1,4,-10000000,simcse,0.762162,0.56142,0.57623,0.558912,0.558912,[Either go to the party or spend the night in ...,"[[10, 1, 3, 1], [11, 11, 11, 11], [7, 7, 7, 7]...","[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 12, 5, 11, 11, 0, 0, 12, 3, 1, 7, 1, 1..."
2,4,-10000000,simcse,0.762162,0.56142,0.57623,0.558912,0.558912,[Either go to the party or spend the night in ...,"[[10, 1, 3, 1], [11, 11, 11, 11], [7, 7, 7, 7]...","[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 12, 5, 11, 11, 0, 0, 12, 3, 1, 7, 1, 1..."
3,4,-10000000,simcse empathy,0.879121,0.546428,0.560295,0.549849,0.549849,[Either go to the party or spend the night in ...,"[[10, 1, 3, 1, 3, 8, 8, 3], [11, 11, 11, 11, 1...","[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[11, 11, 3, 12, 11, 3, 0, 0, 0, 3, 1, 7, 1, 9,..."
4,4,-10000000,simcse empathy,0.879121,0.546428,0.560295,0.549849,0.549849,[Either go to the party or spend the night in ...,"[[10, 1, 3, 1, 3, 8, 8, 3], [11, 11, 11, 11, 1...","[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[11, 11, 3, 12, 11, 3, 0, 0, 0, 3, 1, 7, 1, 9,..."
5,4,-10000000,simcse empathy,0.879121,0.546428,0.560295,0.549849,0.549849,[Either go to the party or spend the night in ...,"[[10, 1, 3, 1, 3, 8, 8, 3], [11, 11, 11, 11, 1...","[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[11, 11, 3, 12, 11, 3, 0, 0, 0, 3, 1, 7, 1, 9,..."
6,4,-10000000,simcse,0.762162,0.56142,0.57623,0.558912,0.558912,[Either go to the party or spend the night in ...,"[[10, 1, 3, 1], [11, 11, 11, 11], [7, 7, 7, 7]...","[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 12, 5, 11, 11, 0, 0, 12, 3, 1, 7, 1, 1..."
7,4,-10000000,simcse,0.762162,0.56142,0.57623,0.558912,0.558912,[Either go to the party or spend the night in ...,"[[10, 1, 3, 1], [11, 11, 11, 11], [7, 7, 7, 7]...","[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 12, 5, 11, 11, 0, 0, 12, 3, 1, 7, 1, 1..."
8,4,-10000000,simcse,0.762162,0.56142,0.57623,0.558912,0.558912,[Either go to the party or spend the night in ...,"[[10, 1, 3, 1], [11, 11, 11, 11], [7, 7, 7, 7]...","[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 12, 5, 11, 11, 0, 0, 12, 3, 1, 7, 1, 1..."
9,4,-10000000,simcse empathy,0.879121,0.546428,0.560295,0.549849,0.549849,[Either go to the party or spend the night in ...,"[[10, 1, 3, 1, 3, 8, 8, 3], [11, 11, 11, 11, 1...","[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[11, 11, 3, 12, 11, 3, 0, 0, 0, 3, 1, 7, 1, 9,..."


In [16]:
base_data = get_new_data(data, note = "best_hps_final_baseline_best_ps", dataset = "data/new_finegrained", threshold=0)
base_data_df = get_results_df(base_data)
base_data_df

18


Unnamed: 0,num_cases,threshold,retrievers,overlaps,f1,precision,recall,accuracy,cbr,cbr_labels,label_ids,predicted_labels
0,,,,,0.532737,0.540722,0.537764,0.537764,,,"[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[10, 11, 11, 3, 7, 0, 0, 0, 0, 3, 1, 11, 11, 1..."
1,,,,,0.499677,0.516139,0.501511,0.501511,,,"[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 2, 3, 7, 11, 0, 0, 0, 3, 8, 8, 3, 11, ..."
2,,,,,0.499677,0.516139,0.501511,0.501511,,,"[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 2, 3, 7, 11, 0, 0, 0, 3, 8, 8, 3, 11, ..."
3,,,,,0.499677,0.516139,0.501511,0.501511,,,"[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 2, 3, 7, 11, 0, 0, 0, 3, 8, 8, 3, 11, ..."
4,,,,,0.499677,0.516139,0.501511,0.501511,,,"[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 2, 3, 7, 11, 0, 0, 0, 3, 8, 8, 3, 11, ..."
5,,,,,0.499677,0.516139,0.501511,0.501511,,,"[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 2, 3, 7, 11, 0, 0, 0, 3, 8, 8, 3, 11, ..."
6,,,,,0.499677,0.516139,0.501511,0.501511,,,"[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 2, 3, 7, 11, 0, 0, 0, 3, 8, 8, 3, 11, ..."
7,,,,,0.499677,0.516139,0.501511,0.501511,,,"[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 2, 3, 7, 11, 0, 0, 0, 3, 8, 8, 3, 11, ..."
8,,,,,0.499677,0.516139,0.501511,0.501511,,,"[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 2, 3, 7, 11, 0, 0, 0, 3, 8, 8, 3, 11, ..."
9,,,,,0.499677,0.516139,0.501511,0.501511,,,"[10, 11, 5, 8, 7, 8, 0, 0, 0, 3, 1, 7, 1, 9, 1...","[2, 11, 2, 3, 7, 11, 0, 0, 0, 3, 8, 8, 3, 11, ..."


In [17]:
base_model_index = 1
cbr_model_index = 19

wrong_predictions_in_base_model = np.where(base_data_df.iloc[base_model_index]['predicted_labels'] != base_data_df.iloc[base_model_index]['label_ids'])[0]
predictions_correct_by_cbr = np.where(cbr_model_df.iloc[cbr_model_index]['predicted_labels'] == cbr_model_df.iloc[cbr_model_index]['label_ids'])[0]

NameError: name 'cbr_model_df' is not defined

In [18]:
indices_of_interest = np.intersect1d(wrong_predictions_in_base_model, predictions_correct_by_cbr)

NameError: name 'predictions_correct_by_cbr' is not defined

In [19]:
labels_for_the_indices_of_interest = [base_data_df.iloc[base_model_index]['label_ids'][i] for i in indices_of_interest]
print(labels_for_the_indices_of_interest)

NameError: name 'indices_of_interest' is not defined

In [20]:
cbr_labels_for_the_indices_of_interest = [cbr_model_df.iloc[cbr_model_index]['cbr_labels'][i] for i in indices_of_interest]
print(cbr_labels_for_the_indices_of_interest)

NameError: name 'indices_of_interest' is not defined

In [99]:
label_encoder = cbr_data[0]['label_encoder']

test_df = pd.read_csv(os.path.join('data/new_finegrained', "test.csv"))
test_df = test_df[~test_df["label"].isin(bad_classes)]
test_df['label'] = label_encoder.transform(test_df['label'])

In [106]:
for index, label, cbr in zip(indices_of_interest, labels_for_the_indices_of_interest, cbr_labels_for_the_indices_of_interest):
    print(label, cbr, test_df['text'].iloc[index])
    print(cbr_model_df.iloc[cbr_model_index]['cbr'][index])
    print('--------')

1 [1 1] Everyone is going to get the new smart phone when it comes out this weekend. Why aren’t you?
"I'm gonna get an iPhone because everybody else has an iPhone and they're cool." Everyone wants the iPhone 11 because it's the best phone on the market!
--------
7 [7 7] surgeons have X-rays to guide them during an operation, lawyers have briefs to guide them during a trial, carpenters have blueprints to guide them when they are building a house. Why, then, shouldn’t students be allowed to look at their textbooks during an examination?
Doctors refer to medical books all the time when they are treating patients. In the same way, I should be allowed to use a textbook in my medical exam. If I say that a surgeon should be allowed to use a guidebook to carry out surgery like a student can use open notes on a test, I have made a
--------
1 [1 1] it is used to promote something based on popularity
Argues that because something is popular, it must be right. You appealed to popularity or the fac