In [None]:
from pathlib import Path
from typing import List
import pandas as pd
import json

In [None]:
# Root folder for all datasets; the three split files sit in its
# "daily_dialog" subfolder.
datasets_path = Path("./datasets")
valid_path = datasets_path.joinpath("daily_dialog", "dailydialog_valid.json")
test_path = datasets_path.joinpath("daily_dialog", "dailydialog_test.json")
train_path = datasets_path.joinpath("daily_dialog", "dailydialog_train.json")

In [None]:
# Maps every raw emotion name to the label it is stored under.
# Several raw spellings collapse onto the same target label.
label_map = {
    "happy": "joy",
    "sad": "sadness",
    "happines": "joy",
    "disgust": "disgust",
    "anger": "anger",
    "excited": "joy",
    "fear": "fear",
    "surprised": "surprise",
    "angry": "anger",
    "neutral": "neutral",
    "surprise": "surprise",
    "sadness": "sadness",
    "happiness": "joy",
}

In [None]:
def create_dataset(path: Path) -> set:
    """Convert one dialog JSON file into a parquet file of sentences and labels.

    The JSON document is read as a mapping from a dialog id to a list of
    conversations, each conversation being a list of utterance dicts with the
    keys ``speaker``, ``utterance`` and ``emotion``. Only the first
    conversation of each dialog is used; a notice is printed when a dialog
    holds more than one.

    Every utterance becomes the string ``"Speaker <speaker>: <utterance>"`` and
    its emotion is translated through the module-level ``label_map``. The
    resulting DataFrame has one row per dialog and two list-valued columns,
    ``sentences`` and ``labels``. It is written next to ``path`` with the
    suffix replaced by ``.parquet``.

    Args:
        path: Location of the JSON file to convert.

    Returns:
        The set of mapped emotion labels that occur in the file.

    Raises:
        KeyError: If an utterance lacks one of the expected keys, or if its
            emotion is missing from ``label_map`` (the message then names the
            dialog and the raw label).
        IndexError: If a dialog has an empty list of conversations.
    """
    # read json file
    data = json.loads(path.read_bytes())

    # extract sentences and labels
    sentences_list: List[List[str]] = []
    labels_list: List[List[str]] = []
    unique_labels = set()
    for dialog_id, conversations in data.items():
        if len(conversations) > 1:
            print(f"Dialog {dialog_id} has more than one conversation")

        sentences: List[str] = []
        labels: List[str] = []
        for utterance in conversations[0]:
            sentences.append(f"Speaker {utterance['speaker']}: {utterance['utterance']}")

            emotion = utterance["emotion"]
            if emotion not in label_map:
                # Same exception type as a plain lookup, but it tells the
                # reader which dialog and which raw label need attention.
                raise KeyError(
                    f"Dialog {dialog_id}: emotion {emotion!r} is not in label_map"
                )
            labels.append(label_map[emotion])

        sentences_list.append(sentences)
        labels_list.append(labels)
        # Collect the label set here instead of re-walking labels_list later.
        unique_labels.update(labels)

    # create dataframe
    df = pd.DataFrame({"sentences": sentences_list, "labels": labels_list})

    # save dataframe
    df.to_parquet(path.with_suffix(".parquet"))

    return unique_labels

In [None]:
# Convert each split to parquet and keep the label set returned for it
# (l1: validation, l2: test, l3: train). Splits are processed in that order.
l1, l2, l3 = [
    create_dataset(split_path)
    for split_path in (valid_path, test_path, train_path)
]

In [None]:
# Merge the three per-split label sets into one and show it.
all_labels = l1.union(l2, l3)
print(all_labels)

In [None]:
# Load the parquet file that sits next to each JSON split into a DataFrame.
valid_df, test_df, train_df = [
    pd.read_parquet(split_path.with_suffix(".parquet"))
    for split_path in (valid_path, test_path, train_path)
]

In [None]:
# Preview the first five rows of the training split.
train_df.head(5)

In [None]:
# Show the label column of the validation split.
valid_df.loc[:, "labels"]

In [None]:
# Number of rows in the training split.
train_df.shape[0]

In [None]:
# Flatten the per-row label sequences into one long Series, then tally how
# often each label occurs in the training split.
label_counts = (
    train_df["labels"]
    .explode()
    .value_counts()
)

In [None]:
# Display the label frequencies held in label_counts.
label_counts

In [None]:
# Select every training row whose label sequence contains "fear".
train_df.loc[train_df["labels"].apply(lambda row_labels: "fear" in row_labels)]

In [None]:
# Inspect the label sequence of the training row with index label 18.
train_df.loc[18, "labels"]

In [None]:
# Count how often "fear" occurs in the labels of training row 18.
# The cell value is converted to a list first: after the parquet round trip
# it may come back as a NumPy array, which has no ``count`` method.
list(train_df["labels"][18]).count("fear")

In [None]:
label_list = list(train_df["labels"][18])

# For every training row that contains at least one "fear" label, print the
# row index followed by a {label: occurrences} tally for that row.
# NOTE(review): this rebinds label_counts to a plain dict on every iteration,
# replacing whatever label_counts held before this cell ran.
for idx, row in train_df.loc[
    train_df["labels"].apply(lambda row_labels: "fear" in row_labels)
].iterrows():
    label_list = list(row["labels"])
    label_counts = dict(
        (label, label_list.count(label)) for label in set(label_list)
    )
    print(idx)
    print(label_counts)


In [None]:
# Materialise the (index, row) pairs of every training row containing "fear".
[*train_df.loc[train_df["labels"].apply(lambda row_labels: "fear" in row_labels)].iterrows()]