In [None]:
from pathlib import Path
from typing import List
import pandas as pd
import json

In [None]:
# Root folder for the datasets, relative to the notebook's working directory.
datasets_path = Path("./datasets")

# JSON files for the dev / test / train splits, all under "emotion_lines".
dev_path, test_path, train_path = (
    datasets_path.joinpath("emotion_lines", file_name)
    for file_name in ("friends_dev.json", "friends_test.json", "friends_train.json")
)

In [None]:
def create_dataset(path: Path):
    """Turn a JSON dialog file into a parquet file and collect its labels.

    The JSON document at ``path`` is read as a sequence of dialogs, where
    each dialog is a sequence of mappings with the keys ``"speaker"``,
    ``"utterance"`` and ``"emotion"``. Every dialog becomes one DataFrame
    row with two list-valued columns:

    * ``sentences`` -- one ``"<speaker>: <utterance>"`` string per turn
    * ``labels``    -- the ``"emotion"`` value of each turn, in the same order

    The frame is written beside the input file, using the same name with a
    ``.parquet`` suffix.

    Args:
        path: Location of the JSON file to convert.

    Returns:
        The set of distinct emotion labels found anywhere in the file.

    Raises:
        KeyError: If a turn lacks one of the three expected keys.
    """
    dialogs = json.loads(path.read_bytes())

    # One pass over the data: pair each formatted sentence with its label so
    # the keys of every turn are looked up in a fixed order.
    paired_dialogs = [
        [(f"{turn['speaker']}: {turn['utterance']}", turn["emotion"]) for turn in dialog]
        for dialog in dialogs
    ]

    # Split the pairs back into the two parallel list-of-lists columns.
    sentences_list: List[List[str]] = [
        [sentence for sentence, _ in pairs] for pairs in paired_dialogs
    ]
    labels_list: List[List[str]] = [
        [emotion for _, emotion in pairs] for pairs in paired_dialogs
    ]

    # Persist next to the source file: <name>.json -> <name>.parquet
    frame = pd.DataFrame({"sentences": sentences_list, "labels": labels_list})
    frame.to_parquet(path.with_suffix(".parquet"))

    # Flatten all per-dialog label lists into the set of distinct labels.
    return {emotion for dialog_labels in labels_list for emotion in dialog_labels}

In [None]:
# Convert each split (dev, test, train -- in that order) to parquet and keep
# the set of labels that create_dataset returns for it.
l1, l2, l3 = (
    create_dataset(split_path) for split_path in (dev_path, test_path, train_path)
)

In [None]:
# Combine the label sets of the three splits and show the result.
all_labels = l1 | l2 | l3
print(all_labels)

In [None]:
# Load each split back from the ".parquet" file that sits beside its JSON file.
dev_df, test_df, train_df = (
    pd.read_parquet(json_path.with_suffix(".parquet"))
    for json_path in (dev_path, test_path, train_path)
)

In [None]:
# Preview the first five rows of the test split.
test_df.head(n=5)

In [None]:
# Count the rows of the dev split whose label list contains "non-neutral".
has_non_neutral = dev_df["labels"].apply(lambda row_labels: "non-neutral" in row_labels)
print(has_non_neutral.sum())

In [None]:
# Number of rows in the test split.
len(test_df.index)