In [24]:
from pathlib import Path
from typing import List
import pandas as pd

In [None]:
# Directory holding the dataset and the three split files inside it.
datasets_path = Path("./datasets") / "pubmed200k"
valid_path, test_path, train_path = (
    datasets_path / f"{split}.txt" for split in ("dev", "test", "train")
)

In [26]:
def create_dataset(path: Path) -> set:
    """Parse a tab-separated split file, save it as parquet, return its labels.

    The file is streamed line by line:

    * a line of the form ``LABEL<TAB>sentence`` adds one sentence and its
      lower-cased label to the abstract currently being collected;
    * a blank (or whitespace-only) line closes the current abstract;
    * any other line (anything that does not split into exactly two
      tab-separated fields) is ignored.

    The result is written next to the input file, with the suffix replaced
    by ``.parquet``, as a DataFrame with two columns: ``sentences`` and
    ``labels``, each holding one list per abstract.

    Args:
        path: Path of the text file to parse.

    Returns:
        The set of distinct (lower-cased) labels found in the file.

    Raises:
        OSError: If the file cannot be opened.
        ImportError: If pandas has no parquet engine available when saving.
    """
    sentences_list: List[List[str]] = []
    labels_list: List[List[str]] = []

    sentences: List[str] = []
    labels: List[str] = []

    # Explicit encoding so parsing does not depend on the machine's locale;
    # iterating the handle avoids loading the whole file into memory.
    with path.open("r", encoding="utf-8") as file:
        for line in file:
            stripped = line.strip()

            if not stripped:
                # End of an abstract. Skip the flush when nothing was
                # collected so repeated/leading blank lines do not create
                # empty rows.
                if sentences:
                    sentences_list.append(sentences)
                    labels_list.append(labels)
                    sentences = []
                    labels = []
                continue

            splitted = stripped.split("\t")
            if len(splitted) == 2:
                labels.append(splitted[0].lower())
                sentences.append(splitted[1])

    # The last abstract has no closing blank line when the file does not end
    # with one; flush it here so it is not silently dropped.
    if sentences:
        sentences_list.append(sentences)
        labels_list.append(labels)

    # create dataframe
    df = pd.DataFrame({"sentences": sentences_list, "labels": labels_list})

    # save dataframe
    df.to_parquet(path.with_suffix(".parquet"))

    # unique labels
    unique_labels = set()
    for abstract_labels in labels_list:
        unique_labels.update(abstract_labels)
    return unique_labels

In [27]:
# Convert each split: every call writes a .parquet file next to the split's
# .txt file and returns the set of labels found in that split.
l1 = create_dataset(valid_path)  # labels from dev.txt
l2 = create_dataset(test_path)  # labels from test.txt
l3 = create_dataset(train_path)  # labels from train.txt

In [28]:
# Merge the per-split label sets into a single set and show it.
all_labels = l1 | l2 | l3
print(all_labels)

{'results', 'objective', 'background', 'methods', 'conclusions'}


In [29]:
# read datasets
valid_df = pd.read_parquet(valid_path.with_suffix(".parquet"))
test_df = pd.read_parquet(test_path.with_suffix(".parquet"))
train_df = pd.read_parquet(train_path.with_suffix(".parquet"))

In [30]:
# Preview the first five rows of the validation frame.
valid_df.head(n=5)

Unnamed: 0,sentences,labels
0,[IgE sensitization to Aspergillus fumigatus an...,"[background, background, objective, methods, m..."
1,"[Opioid antagonists ( e.g. , naltrexone ) and ...","[background, background, background, objective..."
2,[The sequencing of learning materials greatly ...,"[background, background, background, objective..."
3,[Patient adherence to appointments is key to i...,"[background, background, background, methods, ..."
4,[Insufficient skills in drug dose calculations...,"[background, background, background, backgroun..."


In [31]:
# Inspect the labels column on its own (one list of labels per row).
valid_df.loc[:, "labels"]

0       [background, background, objective, methods, m...
1       [background, background, background, objective...
2       [background, background, background, objective...
3       [background, background, background, methods, ...
4       [background, background, background, backgroun...
                              ...                        
2495    [background, background, background, backgroun...
2496    [background, background, methods, methods, met...
2497    [background, background, methods, methods, met...
2498    [background, methods, methods, methods, method...
2499    [background, methods, methods, methods, method...
Name: labels, Length: 2500, dtype: object