In [1]:
import csv
from pathlib import Path

import pandas as pd

## Non-GLUE Datasets

In [2]:
# Names of the non-GLUE datasets converted by this notebook.
DATASETS = ("r8", "r52", "mr", "20ng", "ohsumed")

# All paths are resolved relative to the directory the notebook is run from.
WORK_DIR = Path.cwd().absolute()
DATA_PATH = WORK_DIR / "data"
ORIGINAL_DATA_PATH = DATA_PATH / "original-data"
CORPARA_PATH = ORIGINAL_DATA_PATH / "corpus"  # one "<dataset>.txt" per dataset
LABEL_PATH = ORIGINAL_DATA_PATH / "label-info"  # one "<dataset>.txt" per dataset

In [3]:
def get_dataset_df(corpus_path: Path, label_path: Path) -> pd.DataFrame:
    """Load a corpus file and its label file into one row-aligned DataFrame.

    Row ``i`` of the corpus file is paired with row ``i`` of the label file.

    Args:
        corpus_path: Tab-separated, header-less text file. Its first column is
            renamed to ``terms``.
        label_path: Tab-separated, header-less file. Column 1 is renamed to
            ``mode`` and column 2 to ``label``; column 0 is discarded.

    Returns:
        A DataFrame whose columns are the remaining label columns (``mode``,
        ``label``) followed by the corpus columns (``terms``).

    Raises:
        ValueError: If the two files do not yield the same number of rows, in
            which case documents and labels cannot be paired reliably.
    """
    # Documents are free text, so read them verbatim. With pandas' default
    # quote handling a line that starts with '"' is parsed as a quoted field:
    # the quotes are stripped and an unterminated one swallows the following
    # lines into the same row, shifting every later document against its label.
    corpus_df = pd.read_csv(
        corpus_path, sep="\t", header=None, quoting=csv.QUOTE_NONE
    ).rename(columns={0: "terms"})

    label_df = (
        pd.read_csv(label_path, sep="\t", header=None)
        .rename(columns={2: "label", 1: "mode"})
        .drop(columns=[0])
    )

    # pd.concat(axis=1) would pad the shorter frame with NaN instead of
    # failing, so check the alignment explicitly.
    if len(corpus_df) != len(label_df):
        raise ValueError(
            f"Row count mismatch: {corpus_path} has {len(corpus_df)} rows "
            f"but {label_path} has {len(label_df)} rows"
        )

    return pd.concat([label_df, corpus_df], axis=1)

def _canonical_split(raw_mode) -> str:
    """Map a raw mode value from a label file to ``"train"`` or ``"test"``."""
    # str() makes a missing (NaN) mode fail with the ValueError below instead
    # of a TypeError from the substring test.
    text = str(raw_mode)
    if "train" in text:
        return "train"
    if "test" in text:
        return "test"
    raise ValueError(f"Unknown mode: {raw_mode}")


def save_dataset_tsv(dataset_name: str, output_path: Path = DATA_PATH) -> None:
    """Write one header-less TSV file per split of a dataset.

    The corpus and label files named ``<dataset_name>.txt`` are loaded with
    ``get_dataset_df``; rows are grouped by split and written, without the
    ``mode`` column, to ``<output_path>/<dataset_name>-<split>.tsv``.

    Args:
        dataset_name: Base name of the dataset's corpus and label files.
        output_path: Directory the TSV files are written to.

    Raises:
        ValueError: If a mode value contains neither ``"train"`` nor ``"test"``.
    """
    corpus_path = CORPARA_PATH.joinpath(f"{dataset_name}.txt")
    label_path = LABEL_PATH.joinpath(f"{dataset_name}.txt")
    dataset_df = get_dataset_df(corpus_path, label_path)

    # Normalize before grouping: raw values that map to the same split (for
    # example "train" and "training") must land in a single file rather than
    # overwrite each other.
    splits = dataset_df["mode"].map(_canonical_split)
    rows = dataset_df.drop(columns=["mode"])

    # sort=False keeps the splits in order of first appearance.
    for split_name, split_df in rows.groupby(splits, sort=False):
        split_df.to_csv(
            output_path.joinpath(f"{dataset_name}-{split_name}.tsv"),
            index=False,
            header=False,
            sep="\t",
        )

# Convert every dataset listed in DATASETS, writing the TSV files to DATA_PATH.
for dataset in DATASETS:
    save_dataset_tsv(dataset_name=dataset, output_path=DATA_PATH)

## GLUE Datasets

### CoLA

In [None]:
COLA_PATH = DATA_PATH / "cola"
TOKENIZED_PATH = COLA_PATH / "original" / "tokenized"

# Train: keep columns 1 (label) and 3 (sentence) of the in-domain training file.
df = (
    pd.read_csv(TOKENIZED_PATH / "in_domain_train.tsv", sep="\t", header=None)
    .drop(columns=[0, 2])
    .rename(columns={1: "label", 3: "sentence"})
)
df.to_csv(DATA_PATH / "cola-train.tsv", sep="\t", header=False, index=False)

# Dev: the in-domain and out-of-domain dev files are stacked and saved as the
# "test" split.
in_df = pd.read_csv(TOKENIZED_PATH / "in_domain_dev.tsv", sep="\t", header=None)
out_df = pd.read_csv(TOKENIZED_PATH / "out_of_domain_dev.tsv", sep="\t", header=None)
df = (
    pd.concat([in_df, out_df], ignore_index=True)
    .drop(columns=[0, 2])
    .rename(columns={1: "label", 3: "sentence"})
)
df.to_csv(DATA_PATH / "cola-test.tsv", sep="\t", header=False, index=False)

### SST-2

In [None]:
SST2_PATH = DATA_PATH / "sst2"

# Input split -> output split, pairwise: "train" stays "train", "dev" is saved
# as "test".
in_splits, out_splits = ("train", "dev"), ("train", "test")
for source_split, target_split in zip(in_splits, out_splits):
    # Keep only the label and sentence columns, in that order.
    df = pd.read_csv(SST2_PATH / f"{source_split}.tsv", sep="\t").loc[
        :, ["label", "sentence"]
    ]
    df.to_csv(
        DATA_PATH / f"sst2-{target_split}.tsv",
        sep="\t",
        header=False,
        index=False,
    )