In [None]:
# This notebook is meant to run on the server only (no Colab-specific functionality).

import os
# Make /app the working directory; relative paths such as "./models/bert/" resolve against it.
os.chdir("/app")

In [None]:
# Datasets: list everything below /app/data/ whose name contains "_dataset_".
!find /app/data/ -name "*_dataset_*"

In [None]:
# Tests: list the contents of /app/logs/ (the predictions loaded below are read from sub-directories of it).
!ls -l /app/logs/

In [None]:
# Constants
MAX_SEQ_LEN = 128
MIN_SEQ_LEN = MAX_SEQ_LEN // 2  # half of the maximum sequence length

# Location of the pretrained model.
PRETRAINED_MODEL_PATH = "./models/bert/"
# OUTPUT_SIZE has to be adjusted when the model is replaced by one with a different H-XXXX size!
OUTPUT_SIZE = 1024

In [None]:
# Data handling; remove the pandas display limits for columns and rows.
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import numpy as np
import tensorflow as tf

import json

# Notebook display and plotting; cufflinks is switched to offline mode.
from IPython.display import display
import plotly
import plotly.graph_objs as go
import cufflinks as cf
cf.go_offline()

# Project helpers for evaluation statistics and plots.
from jalef.statistics import evaluate_result, compare_thresholds, select_errors, remove_uncertain_preds
from jalef.plots import plot_confusion_matrix, enable_plotly_in_cell

enable_plotly_in_cell()

In [None]:
# Seed the global NumPy and TensorFlow random generators with the same fixed value.
RANDOM_SEED = 1234
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)

# Evaluate single test

## Statistics

In [None]:
# Configuration of the test run to evaluate.
SOURCE = "coursera"
EMBEDDING = "bert"
N_INTENTS = 100
BALANCED = True
# Optional extra parameters that are part of the test name (may stay empty).
CUSTOM_PARAMETERS = dict()
# Rendered as "key=value" pairs joined by "_". The pairs come from .items(),
# because iterating a dict directly yields only its keys.
CUSTOM_PARAMETERS_AS_STRING = "_".join(
    [str(key) + "=" + str(value) for key, value in CUSTOM_PARAMETERS.items()]
)

# Derived from the settings above: dataset directory, plot title and the name
# of the test run (used as the sub-directory of /app/logs/ to read from).
DATASET_PATH = "/app/data/{}_dataset_top{}".format(SOURCE, N_INTENTS)
TEST_TITLE = "Predictions for the top {} intents in the {} dataset with {} training.".format(
    N_INTENTS, SOURCE, "balanced" if BALANCED else "normal"
)
TEST_NAME = "dataset={}_embedding={}{}{}intents={}".format(
    SOURCE,
    EMBEDDING,
    "_balanced_" if BALANCED else "_",
    # Append the custom parameters (plus separator) only when there are any.
    CUSTOM_PARAMETERS_AS_STRING + "_" if CUSTOM_PARAMETERS_AS_STRING != "" else "",
    N_INTENTS
)

In [None]:
# Read the test split and both look-up tables from the dataset directory.
test_csv_path = os.path.join(DATASET_PATH, "test.csv")
lut_csv_path = os.path.join(DATASET_PATH, "lut.csv")

dataset = pd.read_csv(test_csv_path)
# lut is indexed by CSV column 0, reverse_lut by CSV column 1.
lut = pd.read_csv(lut_csv_path, names=["Intent"], index_col=0, header=0)
reverse_lut = pd.read_csv(lut_csv_path, names=["Label"], index_col=1, header=0)

# Add the intent that lut holds for each row's label.
dataset["Intent"] = [lut.at[label_id, "Intent"] for label_id in dataset["Label"].values]

print("The test set has {} rows.".format(len(dataset)))
display(dataset.head(3))

In [None]:
# Load the predictions stored for this test run in /app/logs/<TEST_NAME>/.
preds = np.load(os.path.join("/app/logs/", TEST_NAME, "predictions.npy"))

# Translate the arg-max of every prediction and every true label to its intent via lut.
y_pred = np.array([lut.at[np.argmax(e), "Intent"] for e in preds])
y_true = np.array([lut.at[e, "Intent"] for e in dataset["Label"].values])

In [None]:
# Plot the (non-normalized) confusion matrix of the single test.
plot_confusion_matrix(
    y_true=y_true,
    y_pred=y_pred,
    normalize=False,
    title=TEST_TITLE,
)

In [None]:
# Evaluate the model at several uncertainty thresholds.
UNCERTAINTY_THRESHOLDS = [0, 0.7, 0.8, 0.85, 0.9, 0.95]

print("The performance of the model with different uncertainty thresholds:")
compare_thresholds(y_true, y_pred, preds, UNCERTAINTY_THRESHOLDS)

In [None]:
# Show the misclassified cases; select_errors receives the true and predicted intents,
# the raw predictions and the reverse look-up table.
print("These are the cases, where the classifier predicts the wrong class:")
errors = select_errors(y_true, y_pred, preds, reverse_lut)
display(errors)

## Plots

In [None]:
# Intent distribution in the test set: rows per intent, most frequent first.
intent_counts = (
    dataset
    .groupby("Intent")
    .count()
    .sort_values(by=["Label"], ascending=False)
)["Label"]

intent_counts.iplot(
    kind="bar",
    xTitle="Intent",
    yTitle="Count",
    title="Intent distribution in dataset",
    tickfont={"size": 9},
)

# Compare two tests

In [None]:
# Configuration of the first test run to compare.
SOURCE_1 = "coursera"
EMBEDDING_1 = "bert"
N_INTENTS_1 = 100
BALANCED_1 = True
# Optional extra parameters that are part of the test name (may stay empty).
CUSTOM_PARAMETERS_1 = dict()
# Rendered as "key=value" pairs joined by "_". The pairs come from .items(),
# because iterating a dict directly yields only its keys.
CUSTOM_PARAMETERS_AS_STRING_1 = "_".join(
    [str(key) + "=" + str(value) for key, value in CUSTOM_PARAMETERS_1.items()]
)

# Derived from the settings above: dataset directory, plot title and the name
# of the test run (used as the sub-directory of /app/logs/ to read from).
DATASET_PATH_1 = "/app/data/{}_dataset_top{}".format(SOURCE_1, N_INTENTS_1)
TEST_TITLE_1 = "Predictions for the top {} intents in the {} dataset with {} training.".format(
    N_INTENTS_1, SOURCE_1, "balanced" if BALANCED_1 else "normal"
)
TEST_NAME_1 = "dataset={}_embedding={}{}{}intents={}".format(
    SOURCE_1,
    EMBEDDING_1,
    "_balanced_" if BALANCED_1 else "_",
    # Append the custom parameters (plus separator) only when there are any.
    CUSTOM_PARAMETERS_AS_STRING_1 + "_" if CUSTOM_PARAMETERS_AS_STRING_1 != "" else "",
    N_INTENTS_1
)

In [None]:
# Configuration of the second test run to compare.
SOURCE_2 = "coursera"
EMBEDDING_2 = "bert"
N_INTENTS_2 = 100
BALANCED_2 = True
# Optional extra parameters that are part of the test name (may stay empty).
CUSTOM_PARAMETERS_2 = dict()
# Rendered as "key=value" pairs joined by "_". The pairs come from .items(),
# because iterating a dict directly yields only its keys.
CUSTOM_PARAMETERS_AS_STRING_2 = "_".join(
    [str(key) + "=" + str(value) for key, value in CUSTOM_PARAMETERS_2.items()]
)

# Derived from the settings above: dataset directory, plot title and the name
# of the test run (used as the sub-directory of /app/logs/ to read from).
DATASET_PATH_2 = "/app/data/{}_dataset_top{}".format(SOURCE_2, N_INTENTS_2)
TEST_TITLE_2 = "Predictions for the top {} intents in the {} dataset with {} training.".format(
    N_INTENTS_2, SOURCE_2, "balanced" if BALANCED_2 else "normal"
)
TEST_NAME_2 = "dataset={}_embedding={}{}{}intents={}".format(
    SOURCE_2,
    EMBEDDING_2,
    "_balanced_" if BALANCED_2 else "_",
    # Append the custom parameters (plus separator) only when there are any.
    CUSTOM_PARAMETERS_AS_STRING_2 + "_" if CUSTOM_PARAMETERS_AS_STRING_2 != "" else "",
    N_INTENTS_2
)

In [None]:
# Load dataset 1 and its look-up tables.
dataset_1 = pd.read_csv(os.path.join(DATASET_PATH_1, "test.csv"))
# lut_1 is indexed by CSV column 0, reverse_lut_1 by CSV column 1.
lut_1 = pd.read_csv(os.path.join(DATASET_PATH_1, "lut.csv"), names=["Intent"], index_col=0, header=0)
reverse_lut_1 = pd.read_csv(os.path.join(DATASET_PATH_1, "lut.csv"), names=["Label"], index_col=1, header=0)

# Add the intent that lut_1 holds for each row's label.
dataset_1["Intent"] = [lut_1.at[label, "Intent"] for label in dataset_1["Label"].values]

# Load the predictions stored for test 1 in /app/logs/<TEST_NAME_1>/.
preds_1 = np.load(os.path.join("/app/logs/", TEST_NAME_1, "predictions.npy"))

# Translate the arg-max of every prediction and every true label to its intent via lut_1.
y_pred_1 = np.array([lut_1.at[np.argmax(e), "Intent"] for e in preds_1])
y_true_1 = np.array([lut_1.at[e, "Intent"] for e in dataset_1["Label"].values])

In [None]:
# Load dataset 2 and its look-up tables.
dataset_2 = pd.read_csv(os.path.join(DATASET_PATH_2, "test.csv"))
# lut_2 is indexed by CSV column 0, reverse_lut_2 by CSV column 1.
lut_2 = pd.read_csv(os.path.join(DATASET_PATH_2, "lut.csv"), names=["Intent"], index_col=0, header=0)
reverse_lut_2 = pd.read_csv(os.path.join(DATASET_PATH_2, "lut.csv"), names=["Label"], index_col=1, header=0)

# Add the intent that lut_2 holds for each row's label.
dataset_2["Intent"] = [lut_2.at[label, "Intent"] for label in dataset_2["Label"].values]

# Load the predictions stored for test 2 in /app/logs/<TEST_NAME_2>/.
preds_2 = np.load(os.path.join("/app/logs/", TEST_NAME_2, "predictions.npy"))

# Translate the arg-max of every prediction and every true label to its intent via lut_2.
y_pred_2 = np.array([lut_2.at[np.argmax(e), "Intent"] for e in preds_2])
y_true_2 = np.array([lut_2.at[e, "Intent"] for e in dataset_2["Label"].values])

In [None]:
# Compare accuracy, precision, etc. of both tests side by side.
# Each result becomes one Series (metric name -> value). The two Series are
# aligned on the metric names rather than by position, and they are combined
# with concat so that test 2 keeps its own column even when both titles are
# identical (the title contains neither the embedding nor the custom parameters).
metrics_1 = pd.Series(evaluate_result(y_true=y_true_1, y_pred=y_pred_1), name=TEST_TITLE_1)
metrics_2 = pd.Series(evaluate_result(y_true=y_true_2, y_pred=y_pred_2), name=TEST_TITLE_2)
df = pd.concat([metrics_1, metrics_2], axis=1)

df

In [None]:
# Plot the (non-normalized) confusion matrix of each test, test 1 first.
plot_confusion_matrix(
    y_true=y_true_1,
    y_pred=y_pred_1,
    normalize=False,
    title=TEST_TITLE_1,
)
plot_confusion_matrix(
    y_true=y_true_2,
    y_pred=y_pred_2,
    normalize=False,
    title=TEST_TITLE_2,
)

In [None]:
# Compare the threshold results of both tests (same threshold list for each).

# Test 1
# NOTE(review): this call is not the cell's last expression, so a returned value would not be
# rendered automatically — confirm that compare_thresholds displays its result itself.
print(TEST_NAME_1)
compare_thresholds(y_true_1, y_pred_1, preds_1, [0, 0.7, 0.8, 0.85, 0.9, 0.95])

# Test 2
print(TEST_NAME_2)
compare_thresholds(y_true_2, y_pred_2, preds_2, [0, 0.7, 0.8, 0.85, 0.9, 0.95])