In [None]:
import pandas as pd
from pathlib import Path
import random
import json

In [None]:
# Training splits of every corpus processed in this notebook. The paths are
# relative, so they resolve against the notebook's working directory.
train_paths = list(
    map(
        Path,
        (
            "datasets/csabstruct/train.parquet",
            "datasets/pubmed200k/train.parquet",
            "datasets/coarsediscourse/coursediscourse_train.parquet",
            "datasets/daily_dialog/dailydialog_train.parquet",
            "datasets/emotion_lines/friends_train.parquet",
        ),
    )
)

# Sanity check: load each split once and report how many rows it contains.
for train_path in train_paths:
    df = pd.read_parquet(train_path)
    print(f"{train_path}: Number of samples: {len(df)}")

In [None]:
# Build few-shot subsets: for every training split and every shot count k, draw
# up to k random sentences per label and write them next to the split as
# few_shot_nAll_k{k}.json (a JSON object mapping label -> list of sentences).

# Fixed seed so that Restart & Run All regenerates exactly the same files.
FEWSHOT_SEED = 42

# Shot counts (sentences per label) to generate; one output file per value.
K = [2, 4, 8, 16, 32]

for train_path in train_paths:
    df = pd.read_parquet(train_path)

    # Each row stores a list of sentences and a list of labels that are paired
    # position by position. Verify the pairing row by row: once flattened, zip()
    # would silently truncate and shift labels onto the wrong sentences.
    for row_pos, (row_sentences, row_labels) in enumerate(zip(df['sentences'], df['labels'])):
        if len(row_sentences) != len(row_labels):
            raise ValueError(
                f"{train_path}: row {row_pos} has {len(row_sentences)} sentences "
                f"but {len(row_labels)} labels"
            )

    # Flatten the per-row lists into one flat list of sentences / labels.
    sentences = [sentence for sublist in df['sentences'].tolist() for sentence in sublist]
    labels = [label for sublist in df['labels'].tolist() for label in sublist]

    # Group the sentences by label: label -> all sentences carrying that label.
    label2sentences = {}
    for sentence, label in zip(sentences, labels):
        label2sentences.setdefault(label, []).append(sentence)

    # Dedicated generator, re-seeded per dataset, so each dataset's samples do
    # not depend on the order of train_paths or on the global random state.
    rng = random.Random(FEWSHOT_SEED)

    for k in K:
        fewshot_samples = {}
        for label, label_sentences in label2sentences.items():
            # random.sample raises ValueError when asked for more items than
            # exist; rare labels therefore contribute all of their sentences.
            n_draw = min(k, len(label_sentences))
            if n_draw < k:
                print(
                    f"{train_path}: label {label!r} has only {len(label_sentences)} "
                    f"sentences; using {n_draw} instead of {k}"
                )
            fewshot_samples[label] = rng.sample(label_sentences, n_draw)

        # Save the few-shot samples next to the training split.
        with open(train_path.with_name(f"few_shot_nAll_k{k}.json"), "w", encoding="utf-8") as f:
            json.dump(fewshot_samples, f, indent=4)