In [None]:
import csv
from pathlib import Path
from typing import Optional

import pandas as pd

In [None]:
# Dataset identifiers; each one is also the stem of its input file names.
DATASETS = (
    "r8",
    "r52",
    "mr",
    "20ng",
    "ohsumed",
)
# Dataset handled by this run (first entry of DATASETS).
DATASET = DATASETS[0]

In [None]:
# All locations are resolved relative to the directory the kernel runs in.
WORK_DIR = Path.cwd().absolute()

# <cwd>/data holds the inputs below.
DATA_PATH = WORK_DIR / "data"
ORIGINAL_DATA_PATH = DATA_PATH / "original-data"

# Input sub-directories: corpus text files and their label files.
CORPARA_PATH = ORIGINAL_DATA_PATH / "corpus"
LABEL_PATH = ORIGINAL_DATA_PATH / "label-info"

In [None]:
def get_dataset_df(corpus_path: Path, label_path: Path) -> pd.DataFrame:
    """Load a corpus file and its label file into one positionally aligned frame.

    Both files are read as header-less, tab-separated text. Row ``i`` of the
    corpus is paired with row ``i`` of the label file.

    Args:
        corpus_path: Text file whose first column holds the document text;
            it is exposed as the ``terms`` column.
        label_path: Tab-separated file with at least three columns. Column 1
            is exposed as ``mode`` and column 2 as ``label``; column 0 is
            discarded.

    Returns:
        A DataFrame with the label columns (``mode``, ``label``) followed by
        the corpus columns (``terms``).

    Raises:
        ValueError: If the two files do not yield the same number of rows.
            Pairing them anyway would silently attach labels to the wrong
            documents or pad the result with NaN.
    """
    corpus_df = pd.read_csv(
        corpus_path,
        sep="\t",
        header=None,
        # Documents are free text: a leading '"' must not open a quoted field
        # that swallows the following lines.
        quoting=csv.QUOTE_NONE,
        # Keep every document verbatim: no numeric inference ("007" -> 7) and
        # no NA inference ("null", "NA", ... -> NaN).
        dtype=str,
        keep_default_na=False,
    ).rename(columns={0: "terms"})

    label_df = (
        pd.read_csv(label_path, sep="\t", header=None)
        .rename(columns={2: "label", 1: "mode"})
        .drop(columns=[0])
    )

    if len(corpus_df) != len(label_df):
        raise ValueError(
            f"Row count mismatch: {corpus_path} has {len(corpus_df)} rows but "
            f"{label_path} has {len(label_df)} rows; the files cannot be aligned."
        )

    return pd.concat([label_df, corpus_df], axis=1)

def save_dataset_tsv(
    corpus_path: Path,
    label_path: Path,
    output_path: Path = DATA_PATH,
    dataset_name: Optional[str] = None,
) -> None:
    """Write one header-less TSV file per distinct ``mode`` value.

    The combined frame from ``get_dataset_df`` is partitioned by its ``mode``
    column. Each partition is written, without the ``mode`` column, to
    ``<output_path>/<dataset_name>-<mode>.tsv``.

    Args:
        corpus_path: Corpus file, forwarded to ``get_dataset_df``.
        label_path: Label file, forwarded to ``get_dataset_df``.
        output_path: Directory that receives the TSV files; it is created if
            it does not exist yet.
        dataset_name: Prefix for the output file names. When omitted, the
            module-level ``DATASET`` is used, so existing calls keep producing
            the same file names.
    """
    # Read the global at call time (not as a default argument) so a later
    # reassignment of DATASET is honoured without re-defining this function.
    name = DATASET if dataset_name is None else dataset_name

    # to_csv does not create missing directories.
    output_path.mkdir(parents=True, exist_ok=True)

    dataset_df = get_dataset_df(corpus_path, label_path)
    for mode in dataset_df["mode"].unique().tolist():
        split = dataset_df[dataset_df["mode"] == mode].drop(columns=["mode"])
        split.to_csv(
            output_path.joinpath(f"{name}-{mode}.tsv"),
            index=False,
            header=False,
            sep="\t",
        )

# Convert the selected dataset: both input files share the stem "<DATASET>.txt".
input_name = f"{DATASET}.txt"
corpus_path = CORPARA_PATH / input_name
label_path = LABEL_PATH / input_name
save_dataset_tsv(corpus_path, label_path, output_path=DATA_PATH)