Evaluation of all models' predictions on the SI-NLI test set

In [None]:
import pandas as pd
import numpy as np
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

In [None]:
# Load the SI-NLI test set from its tab-separated file
test_path = 'data/SI-NLI/test.tsv'
test = pd.read_csv(test_path, sep='\t')

In [None]:
def i_to_label(i):
	"""Map a class index to its NLI label name.

	0 -> 'entailment', 1 -> 'neutral', 2 -> 'contradiction'; any other
	value yields the string 'error'.
	"""
	ordered_labels = ('entailment', 'neutral', 'contradiction')
	# Compare against each index in order, exactly like an if/elif chain.
	for class_index, label in enumerate(ordered_labels):
		if i == class_index:
			return label
	return 'error'

In [None]:
# Load the predictions of every model in `sloberta_model_names` and convert them to label strings.
# NOTE: in this repository the prediction files live in the `/napovedi/sinli/` folder instead of `data/`.
sloberta_model_names = [
	'sinli',
	'sinli_smaller',
	'sinli_unanimous',
	'esnlisi_4k',
	'esnlisi_40k',
	'esnlisi_50k',
	'esnlisi_sinli',
]
preds = {}
for name in sloberta_model_names:
	# the position of the largest value in each row is taken as the predicted class index
	predicted_indices = pd.read_csv(f'data/predictions_{name}.csv').to_numpy().argmax(axis=1)
	preds[name] = list(map(i_to_label, predicted_indices.tolist()))

In [None]:
# Add the GPT predictions, taken from the '0shot' column of the GPT results file
names = [*sloberta_model_names, 'gpt_0shot']
gpt_frame = pd.read_csv('data/test_gpt.tsv', sep='\t')
preds['gpt_0shot'] = gpt_frame['0shot'].tolist()

In [None]:
# Attach each model's predictions to the test set as a '<model>_pred' column
for name in names:
	pred_column = f'{name}_pred'
	test[pred_column] = preds[name]

In [None]:
# Display names used in the paper (plot titles), listed in the same order as `names`
final_names = [
	'SI-NLI',
	'SI-NLI-manjša',
	'SI-NLI-soglasni',
	'ESNLIsi-4k',
	'ESNLIsi-40k',
	'ESNLIsi',
	'ESNLIsi SI-NLI',
	'GPT-3.5-turbo',
]

## Confusion matrices for all models

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def confusion_matrix_for_column(df, name, display_name):
	"""Plot the confusion matrix of one model's predictions as a heatmap.

	Rows are the true labels and columns the predicted labels, both in the
	order entailment, neutral, contradiction. Samples whose true or predicted
	label is not one of these three are not counted in the matrix.

	Args:
		df: DataFrame holding the gold labels in column 'label' and the
			model's predicted labels in column f'{name}_pred'.
		name: Model identifier used to select the prediction column.
		display_name: Title shown above the plot.
	"""
	label_order = ['entailment', 'neutral', 'contradiction']
	tick_labels = ['implikacija', 'nevtralno', 'kontradikcija']
	cm = confusion_matrix(df['label'], df[f'{name}_pred'], labels=label_order)
	# fmt='d' prints the integer counts in full; seaborn's default '.2g'
	# would render counts of 100 or more in scientific notation (e.g. 1.2e+02).
	sns.heatmap(cm, annot=True, fmt='d', xticklabels=tick_labels, yticklabels=tick_labels)
	plt.xlabel('Napovedana oznaka')
	plt.ylabel('Pravilna oznaka')
	plt.title(display_name)
	plt.show()

In [None]:
# Draw one confusion matrix per model, titled with the matching entry of `final_names`
for position, name in enumerate(names):
	display_name = final_names[position]
	confusion_matrix_for_column(test, name, display_name)

## Metrics

In [None]:
from sklearn import metrics

In [None]:
# Print the classification report (3 decimal places) of every model, each followed by a blank line
for name in names:
	report = metrics.classification_report(test['label'], test[f'{name}_pred'], digits=3)
	print(f'{name}\n{report}\n')