In [10]:
import evaluate
from datasets import load_dataset
from evaluate import evaluator
from transformers import AutoModelForSequenceClassification, pipeline
import random

In [3]:
# Enumerate the official (non-community) comparison modules together with
# their metadata. The call is the cell's last expression, so the notebook
# displays the returned list directly.
evaluate.list_evaluation_modules(
    with_details=True,
    include_community=False,
    module_type="comparison",
)

[{'name': 'mcnemar', 'type': 'comparison', 'community': False, 'likes': 1},
 {'name': 'exact_match', 'type': 'comparison', 'community': False, 'likes': 0},
 {'name': 'wilcoxon', 'type': 'comparison', 'community': False, 'likes': 0}]

In [4]:
# Load the "accuracy" module from the evaluate library and keep a handle to it.
accuracy = evaluate.load("accuracy")

In [None]:
# Print the module's description: its name, module type, expected input
# features, and usage text.
print(accuracy)

EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
    

In [8]:
# Take a reproducible 1,000-example sample of the IMDB test split:
# shuffle with a fixed seed, then keep the first 1,000 rows.
data = (
    load_dataset("imdb", split="test")
    .shuffle(seed=42)
    .select(range(1000))
)

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [12]:
# Show the dataset summary, then one example row. A dedicated, seeded
# generator is used instead of the module-level `random.choice` so the
# example shown is the same on every run of this cell, and the global
# `random` state is left untouched.
print(data)
print(random.Random(42).choice(data))

Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})
{'text': "so. i was completely in love with this movie. gaga for it, even with all its plot twists...but the one thing i found really disturbing was the connection between the two best friends in Tim and Kyle. While the writer of the film gave us such a poignant moment between the two, and their sexual experimentation/confusion, he then gives us a plot twist that makes them half brothers?!?! (Although the subject isn't brought up in the film....and left unexplained and unaccounted for) I just thought that it was in bad taste, and the fact that it wasn't even discussed is even worse. (Oops we've created a taboo...now let's not address the situation, because that wouldn't really be P.C.) Otherwise a spectacular film", 'label': 1}


In [None]:
# Create the evaluator object for the "text-classification" task.
task_evaluator = evaluator("text-classification")


In [None]:
# Score the "lvwerra/distilbert-imdb" checkpoint on the sampled reviews,
# computing accuracy, recall, precision and F1 together via a combined metric.
eval_results = task_evaluator.compute(
    data=data,
    model_or_pipeline="lvwerra/distilbert-imdb",
    # Translate label names (presumably the strings this checkpoint emits)
    # into the integer labels stored in the dataset's `label` column.
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
    metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
)
print(eval_results)