In [2]:
import altair as alt
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score

# Calculating metrics using `sklearn` with `average='macro'`

In [10]:
def get_metrics(fname, model, true_col="label", pred_col="gpt_label"):
    """Compute macro-averaged precision, recall and F1 for a labelled CSV.

    Parameters
    ----------
    fname : str or path-like
        CSV file passed to ``pd.read_csv``; it must contain the ``true_col``
        and ``pred_col`` columns.
    model : str
        Name stored in the ``model`` column of the result.
    true_col : str, default "label"
        Column holding the reference labels.
    pred_col : str, default "gpt_label"
        Column holding the predicted labels.

    Returns
    -------
    pd.DataFrame
        A single row (index 0) with the columns ``model``, ``precision``,
        ``recall`` and ``f1``.

    Raises
    ------
    KeyError
        If ``true_col`` or ``pred_col`` is not a column of the CSV.
    """
    df = pd.read_csv(fname)
    # Only rows missing one of the two label columns are unusable for scoring;
    # a blanket dropna() would also discard rows that merely have gaps in
    # unrelated columns and so shrink the evaluated sample.
    labelled = df.dropna(subset=[true_col, pred_col])
    precision, recall, f1, _ = precision_recall_fscore_support(
        labelled[true_col], labelled[pred_col], average="macro"
    )
    return pd.DataFrame(
        {
            'model': model,
            'precision': precision,
            'recall': recall,
            'f1': f1,
        },
        index=[0],
    )

In [11]:
# Scores for the GPT labels, computed from the results file.
metrics_gpt = get_metrics('../out/results_temp.csv', 'gpt3.5')

# BERT scores are hard-coded here; their source is not shown in this notebook.
# NOTE(review): these differ from the BERT values used in the manual
# comparison further down -- confirm both sets are intended.
metrics_bert = pd.DataFrame(
    {'model': 'BERT', 'precision': 0.21, 'recall': 0.43, 'f1': 0.28},
    index=[0],
)

# Stack both models and reshape to long form: one row per (model, metric).
metrics = (
    pd.concat([metrics_bert, metrics_gpt])
    .melt(id_vars="model", var_name="metric", value_name="score")
)
metrics.query('model == "gpt3.5"')

Unnamed: 0,model,metric,score
1,gpt3.5,precision,0.527336
3,gpt3.5,recall,0.841187
5,gpt3.5,f1,0.525738


In [12]:
def plot_metrics(metrics, metric_order=("precision", "recall", "f1")):
    """Build a faceted bar chart comparing model scores, one facet per metric.

    Parameters
    ----------
    metrics : pd.DataFrame
        Long-format table with ``model``, ``metric`` and ``score`` columns
        (the shape produced by ``DataFrame.melt`` in this notebook).
    metric_order : sequence of str, default ("precision", "recall", "f1")
        Preferred left-to-right order of the facet columns.

    Returns
    -------
    Altair chart
        Bars and score labels layered together and faceted by ``metric``.
    """
    bars = alt.Chart(metrics).mark_bar().encode(
        x=alt.X("model", title=''),
        y=alt.Y("score", title=''),
        color="model",
    )

    # Score labels sit 5px above each bar. Their colour comes from the
    # ``color`` encoding below, so no fixed mark colour is set: an encoding
    # channel takes precedence over the matching mark property.
    text = alt.Chart(metrics).mark_text(dx=0, dy=-5).encode(
        x="model",
        y="score",
        detail='score',
        text=alt.Text('score', format='.2f'),
        color='model',
    )

    chart = alt.layer(bars, text).facet(
        column=alt.Column("metric", sort=list(metric_order))
    )
    return chart

In [13]:
# Display the faceted comparison chart for the long-format `metrics` table built above.
plot_metrics(metrics)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [None]:
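# NOTE: `chart` is not assigned in any cell shown here; store the result of
# `plot_metrics(metrics)` in `chart` before enabling this export.
# chart.save('../out/metrics_50k.png', scale_factor=2.0)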

# Calculating metrics manually

In [3]:
# Load the labelled utterances; the cells below read its `label` and `gpt_label` columns.
utterances = pd.read_csv('../out/50k_labelled_gpt.csv')

In [4]:
# Confusion-matrix cells, treating 1 as the positive class. Each entry is the
# row filter that selects one cell; the count is the number of matching rows.
confusion_queries = {
    'true_pos': 'gpt_label == 1 & label == 1',
    'false_pos': 'gpt_label == 1 & label == 0',
    'true_neg': 'gpt_label == 0 & label == 0',
    'false_neg': 'gpt_label == 0 & label == 1',
}
true_pos, false_pos, true_neg, false_neg = (
    len(utterances.query(row_filter)) for row_filter in confusion_queries.values()
)

In [5]:
# Derive the binary-classification scores from the confusion-matrix counts.
# A zero denominator (e.g. no positive predictions at all) yields 0.0 instead
# of raising ZeroDivisionError, matching the value sklearn reports for
# undefined scores.
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
n_counted = true_pos + false_pos + true_neg + false_neg

precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
accuracy = (true_pos + true_neg) / n_counted if n_counted else 0.0
# F1 is the harmonic mean of precision and recall.
f_one = 2 * ((precision * recall) / (precision + recall)) if (precision + recall) else 0.0

In [6]:
# One-row summary of the manually derived scores, each rounded to 3 decimals.
manual_scores = {
    'precision': precision,
    'recall': recall,
    'accuracy': accuracy,
    'F1': f_one,
}
gpt_metrics_manual = pd.DataFrame(
    {
        'model': 'GPT3.5',
        **{name: round(score, 3) for name, score in manual_scores.items()},
    },
    index=[0],
)

gpt_metrics_manual

Unnamed: 0,model,precision,recall,accuracy,F1
0,GPT3.5,0.057,0.783,0.898,0.105


In [7]:
# BERT scores are hard-coded constants; their provenance is not shown in this
# notebook.
bert_scores = {
    'model': 'BERT',
    'precision': 0.27,
    'recall': 0.39,
    'accuracy': 0.98,
    'F1': 0.32,
}
# Scalars plus an explicit one-element index give a single-row frame.
bert_metrics = pd.DataFrame(bert_scores, index=[0])

bert_metrics

Unnamed: 0,model,precision,recall,accuracy,F1
0,BERT,0.27,0.39,0.98,0.32


In [8]:
# Stack the GPT-3.5 and BERT rows, then reshape to long form (one row per
# model/metric pair), which is the layout plot_metrics reads.
metrics_manual_comparison = (
    pd.concat([gpt_metrics_manual, bert_metrics])
    .melt(id_vars="model", var_name="metric", value_name="score")
)

In [14]:
# Build the faceted chart for the manually computed comparison and keep a
# handle to it so a later cell can save it.
metrics_plot = plot_metrics(metrics_manual_comparison)
# Bare expression: the notebook renders the chart as this cell's output.
metrics_plot

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [16]:
# Write the manual (binary) comparison chart to disk; the format follows the .png extension.
metrics_plot.save('../out/metrics_50k_binary.png')

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


## Using `sklearn` with `average='binary'`

In [15]:
# Cross-check of the manual scores: with average='binary' sklearn reports the
# scores of the positive class only (pos_label defaults to 1).
binary_scores = precision_recall_fscore_support(
    y_true=utterances['label'],
    y_pred=utterances['gpt_label'],
    average='binary',
)
# The fourth element (support) is not used.
precision_skl, recall_skl, f1_skl, _ = binary_scores

print(precision_skl, recall_skl, f1_skl, sep='\n')

0.05652911249293386
0.783289817232376
0.1054481546572935


In [18]:
# Second cross-check: precision_score defaults to average='binary', so it
# should reproduce the precision value printed in the previous cell.
# (precision_score is imported with the other dependencies at the top.)
precision_score(utterances['label'], utterances['gpt_label'])


0.05652911249293386