In [9]:
import os
import numpy as np
import pandas as pd
from itertools import product

from utils import read_texts
from tqdm import tqdm

In [10]:
# Root directory that holds the datasets. It can be overridden through the
# DATA_SOURCE environment variable so the notebook also runs on other machines;
# when the variable is unset the original location is used.
DATA_SOURCE = os.environ.get("DATA_SOURCE", "/home/welton/data/datasets")
# Names of the datasets to process (each one is a directory under DATA_SOURCE/data).
DATASETS = ["webkb", "20ng", "reut", "acm"]
# Names of the split files to export (read from <dataset>/splits/<name>.pkl).
SPLITS = ["split_10"]#, "split_10_with_val"]

# Writing per-fold document files

In [11]:
def get_documents_by_id(documents: list, idxs: list):
    """Pick the entries of ``documents`` located at the positions in ``idxs``.

    Args:
        documents: Sequence of documents, indexable by position.
        idxs: Positions to fetch; repeated positions yield repeated entries.

    Returns:
        A new list with one entry per position, in the same order as ``idxs``.
    """
    selected = []
    for position in idxs:
        selected.append(documents[position])
    return selected

# For each dataset and split: load every document once, then write one CSV per
# partition (train/test, plus val for the split that carries one) and per fold.
for dataset, sp in product(DATASETS, SPLITS):
    print(f"[{dataset.upper()} - {sp.upper()}]")
    # Reading documents settings.
    docs_path = f"{DATA_SOURCE}/data/{dataset}/texts.txt"
    documents = list(read_texts(docs_path))
    # NOTE: unpickling can execute arbitrary code; only load split files that
    # come from a trusted source.
    split = pd.read_pickle(f"{DATA_SOURCE}/data/{dataset}/splits/{sp}.pkl")

    # Output file stem -> column of `split` holding the document positions.
    partitions = {"train": "train_idxs", "test": "test_idxs"}
    if sp == "split_10_with_val":
        partitions["val"] = "val_idxs"

    # For each fold (one row of `split` per fold).
    for fold in tqdm(range(split.shape[0]), miniters=10000):

        output_dir = f"{DATA_SOURCE}/documents/{sp}/{dataset}/{fold}"
        os.makedirs(output_dir, exist_ok=True)

        fold_row = split.iloc[fold]
        for part, idxs_col in partitions.items():
            part_docs = get_documents_by_id(documents, fold_row[idxs_col])
            # Build the frame straight from the Python list. Going through
            # np.array() would create a fixed-width unicode array padded to the
            # longest document (large memory overhead) and would strip trailing
            # NUL characters from the texts.
            # The files use ';' as separator: read them back with sep=';'.
            pd.DataFrame({"docs": part_docs}).to_csv(f"{output_dir}/{part}.csv", index=False, sep=';')

[WEBKB - SPLIT_10]


100%|██████████| 10/10 [00:19<00:00,  1.94s/it]


[20NG - SPLIT_10]


100%|██████████| 10/10 [00:40<00:00,  4.07s/it]


[REUT - SPLIT_10]


100%|██████████| 10/10 [00:03<00:00,  2.76it/s]


[ACM - SPLIT_10]


100%|██████████| 10/10 [00:09<00:00,  1.10it/s]


In [16]:
# Sanity check: load one exported file back. The export writes its CSVs with
# sep=';', so the same separator must be used here; with the default ',' any
# comma inside a document would be treated as a column break.
df = pd.read_csv(f"{DATA_SOURCE}/documents/split_10/webkb/0/test.csv", sep=';')

In [18]:
# (rows, columns) of the file loaded above; a single "docs" column is expected.
df.shape

(823, 1)