In [2]:
import pandas as pd
import json
from datasets import load_dataset, load_metric

In [86]:
import os, glob

data_dir = "./data/axis_evals/"

# glob returns paths in whatever order the file system yields them. Sort them
# so the row order of `df` -- and therefore the seeded train/val/test split
# made from it -- is the same on every machine and every run.
json_files = sorted(glob.glob(os.path.join(data_dir, "*.json")))
if not json_files:
    # pd.concat([]) would raise a ValueError with an unhelpful message;
    # keep the exception type but say what is actually wrong.
    raise ValueError(f"No .json files found in {data_dir!r}")

# Each file is read as JSON Lines (one record per line), hence lines=True.
# All files are stacked into one frame with a fresh 0..n-1 index.
df = pd.concat(
    [pd.read_json(path, lines=True) for path in json_files],
    ignore_index=True,
)


def get_article_text(row):
    """Return the source text stored under a record's ``info`` entry.

    The text is looked up as ``row["info"]["article"]``; when that key is
    absent, ``row["info"]["post"]`` is returned instead.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) with an ``"info"``
            entry that is itself a mapping.

    Returns:
        The value stored under ``"article"``, or under ``"post"`` as fallback.

    Raises:
        KeyError: If ``"info"`` is missing, or it has neither ``"article"``
            nor ``"post"``.
        TypeError: If ``row["info"]`` is not subscriptable.
    """
    info = row["info"]
    try:
        return info["article"]
    except KeyError:
        # Only a missing key triggers the fallback; anything else (including
        # KeyboardInterrupt) must propagate rather than be silently swallowed.
        return info["post"]

def get_score(row):
    """Return the ``overall`` axis score of a record's summary.

    Reads ``row["summary"]["axes"]["overall"]``.

    Args:
        row: A mapping-like record (e.g. a DataFrame row).

    Returns:
        The stored ``overall`` value, or the empty string ``""`` when any
        level of the lookup is missing or not subscriptable. The empty string
        acts as a sentinel that the label filter later in this notebook drops.
    """
    try:
        return row["summary"]["axes"]["overall"]
    except (KeyError, TypeError):
        # KeyError: a key is absent. TypeError: an intermediate value is not
        # subscriptable (e.g. None). Other exceptions are left to propagate
        # instead of being hidden by a bare ``except``.
        return ""

# Flatten the nested records into the three flat columns used downstream.
df["article"] = df.apply(get_article_text, axis=1)
df["summary_text"] = df["summary"].map(lambda summary: summary["text"])
df["label"] = df.apply(get_score, axis=1)
print("Number of samples:", len(df))

# Each step below drops rows whose value is exactly the empty string and
# reports how many rows remain.
# NOTE(review): None/NaN values compare unequal to "" and therefore survive
# these filters -- confirm that is intended.

# 1) articles
df = df.loc[df["article"].ne("")]
print("Number of samples after filtering empty articles:", len(df))

# 2) summaries
df = df.loc[df["summary_text"].ne("")]
print("Number of samples after filtering empty summaries:", len(df))

# 3) labels; renumber the surviving rows 0..n-1 (row count is unaffected)
df = df.loc[df["label"].ne("")].reset_index(drop=True)
print("Number of samples after filtering empty labels:", len(df))

Number of samples: 14897
Number of samples after filtering empty articles: 14897
Number of samples after filtering empty summaries: 14876
Number of samples after filtering empty labels: 14826


In [91]:
from sklearn.model_selection import train_test_split

# Columns that are exported; everything else in `df` is left out of the CSVs.
cols = ["article", "summary_text", "label"]

# Full filtered dataset.
df.loc[:, cols].to_csv("./data/axis_evals.csv", index=False)

# 80/10/10 split: hold out 20% of the rows first, then cut that holdout in
# half to obtain validation and test. Both cuts use the same fixed seed.
train, holdout = train_test_split(df, test_size=0.2, random_state=42)
val, test = train_test_split(holdout, test_size=0.5, random_state=42)

split_table = (
    ("Train size:", train, "./data/axis_evals_train.csv"),
    ("Val size:", val, "./data/axis_evals_val.csv"),
    ("Test size:", test, "./data/axis_evals_test.csv"),
)

# Report all sizes first, then write the files, in train/val/test order.
for split_caption, split_frame, split_path in split_table:
    print(split_caption, len(split_frame))

for split_caption, split_frame, split_path in split_table:
    split_frame.loc[:, cols].to_csv(split_path, index=False)

Train size: 11860
Val size: 1483
Test size: 1483
