In [1]:
import pandas as pd
from os import listdir
from pathlib import Path

# Directory holding the input parquet files, resolved against the notebook's
# working directory.
data_path = Path("../../data").resolve()

# `listdir` returns names in arbitrary, filesystem-dependent order. The files
# are concatenated in this order and the seeded splits further down depend on
# row order, so sort the names to obtain the same splits on every machine.
# Files whose name contains "captions" are skipped.
parquet_files = sorted(
    x for x in listdir(data_path) if x.endswith(".parquet") and "captions" not in x
)

In [2]:
# Fail early with a clear message instead of an AttributeError on `None`
# further down when no input file was found.
if not parquet_files:
    raise FileNotFoundError(f"no parquet files found in {data_path}")

# Read every file once and concatenate in a single call; growing the frame
# inside a loop re-copies all previously loaded rows on each iteration.
# Each file's own index is kept (no `ignore_index`), so index labels may
# repeat across files.
df = pd.concat([pd.read_parquet(data_path / parquet_file) for parquet_file in parquet_files])

# Collapse the "other" leaves into their parent label (move up one level):
# - "mic.ele.oth" rows whose source is "tinman" become "mic.ele"
#   (author's note: only two sources carry "electron other" images)
# - "exp.gel.oth" rows, whatever their source, become "exp.gel"
df.loc[(df.source == "tinman") & (df.label == "mic.ele.oth"), "label"] = "mic.ele"
df.loc[(df.label == "exp.gel.oth"), "label"] = "exp.gel"


In [3]:
# Count the non-null `img` entries for each label and display the result as a
# two-column table (label, img).
rows_by_label = df.groupby("label")
img_per_label = rows_by_label["img"].count()
img_per_label.reset_index()

Unnamed: 0,label,img
0,exp.gel,15524
1,exp.gel.nor,16
2,exp.gel.rpc,29
3,exp.gel.wes,197
4,exp.pla,590
5,gra,5328
6,gra.3dr,365
7,gra.flow,165
8,gra.his,49407
9,gra.lin,49817


In [4]:
from sklearn.model_selection import StratifiedShuffleSplit
from typing import List

def _last_stratified_split(frame: pd.DataFrame, test_size: float, random_state: int, n_draws: int = 5):
    """Return the ``(train, test)`` index arrays of the last of ``n_draws`` stratified shuffles.

    ``n_draws`` splits are requested from ``StratifiedShuffleSplit`` (stratified
    on ``frame.label``) and only the final one is returned. The draw count is
    kept at five on purpose: it determines which partition a given
    ``random_state`` yields, so lowering it would reshuffle splits that have
    already been exported.
    NOTE(review): when the dataset version is bumped, consider ``n_draws=1``.
    """
    splitter = StratifiedShuffleSplit(n_splits=n_draws, test_size=test_size, random_state=random_state)
    *_, last_split = splitter.split(frame, frame.label)
    return last_split


def split_sets(data: pd.DataFrame, labels: List[str], test_size: float = 0.2, val_size: float = 0.1, random_state: int = 42):
    """Assign every row whose label is in ``labels`` to a TRAIN, VAL or TEST split.

    The split is stratified by ``label`` and reproducible for a given
    ``random_state`` and row order.

    Args:
        data: frame with at least a ``label`` column.
        labels: labels to keep; all other rows are discarded.
        test_size: fraction of the selected rows assigned to TEST.
        val_size: fraction of the selected rows (not of the training rows)
            assigned to VAL. When it rounds down to zero rows, no VAL split
            is created.
        random_state: seed used for both the test and the validation shuffle.

    Returns:
        A new frame with the TRAIN rows first, then VAL, then TEST, a fresh
        0..n-1 index and a ``split_set`` column. The two ``reset_index()``
        calls below keep the previous indexes as extra columns (named
        ``index`` and ``level_0`` for an unnamed input index); they are left
        in place because the notebook drops them in a later cell.

    Raises:
        ValueError: if no row of ``data`` carries any of ``labels``, or if
            scikit-learn rejects the requested split sizes.
    """
    df_set = data[data.label.isin(labels)].reset_index()
    if df_set.empty:
        raise ValueError(f"no rows in `data` carry any of the labels {labels}")

    # df_set has a fresh 0..n-1 index, so the positions returned by the
    # splitter can be used directly as `.loc` labels.
    _, test_index = _last_stratified_split(df_set, test_size, random_state)
    df_set["split_set"] = "TRAIN"
    df_set.loc[test_index, "split_set"] = "TEST"

    df_test = df_set[df_set.split_set == "TEST"].reset_index()
    df_train = df_set[df_set.split_set == "TRAIN"].reset_index()

    # The validation size is expressed relative to the whole selection, so
    # convert it to a fraction of the remaining training rows.
    num_val = int(df_set.shape[0] * val_size)
    if num_val > 0:
        val_fraction = num_val / df_train.shape[0]
        _, val_index = _last_stratified_split(df_train, val_fraction, random_state)
        df_train.loc[val_index, "split_set"] = "VAL"

    df_val = df_train[df_train.split_set == "VAL"]
    df_train = df_train[df_train.split_set == "TRAIN"]

    return pd.concat([df_train, df_val, df_test]).reset_index(drop=True)



In [5]:
def add_children(parent_df: pd.DataFrame, child_df: pd.DataFrame, label: str):
    """Append the rows of ``child_df`` to ``parent_df``, relabelled as ``label``.

    Args:
        parent_df: rows of the parent-level classifier.
        child_df: rows of a child classifier; not modified.
        label: value written to the ``label`` column of every child row.

    Returns:
        A new frame with the parent rows followed by the relabelled child
        rows and a fresh 0..n-1 index. All other child columns are carried
        over unchanged.
    """
    # `assign` returns a copy and always writes a real column. Attribute-style
    # assignment (`frame.label = ...`) only sets a Python attribute when the
    # column does not exist yet, leaving the rows without a label.
    relabeled = child_df.assign(label=label)
    return pd.concat([parent_df, relabeled]).reset_index(drop=True)

# Fractions of each label group reserved for the TEST and VAL splits; shared by
# every classifier built in this cell.
test_size = 0.1
val_size  = 0.1

# Each classifier gets its own frame: the rows carrying one of its labels are
# split with `split_sets`. Where a label has a finer-grained child classifier,
# the child rows are appended with `add_children`, relabelled to the parent
# label and keeping the split they were already assigned.

# gels classifier
print("gels")
gel_labels = ["exp.gel.nor", "exp.gel.rpc", "exp.gel.wes"]
df_gel = split_sets(df, gel_labels, test_size=test_size, val_size=val_size)

# experimental classifier
print("experimental")
exp_labels =  ["exp.gel", "exp.pla"]
df_exp = split_sets(df, exp_labels, test_size=test_size, val_size=val_size)
df_exp = add_children(df_exp, df_gel, "exp.gel")

# electron classifier
print("electron")
electron_labels = ["mic.ele.sca", "mic.ele.tra"]
df_electron = split_sets(df, electron_labels, test_size=test_size, val_size=val_size)

# microscopy classifier
print("microscopy")
microscopy_labels = ["mic.ele", "mic.flu", "mic.lig"]
df_microscopy = split_sets(df, microscopy_labels, test_size=test_size, val_size=val_size)
df_microscopy = add_children(df_microscopy, df_electron, "mic.ele")

# graphics classifier
print("graphics")
graphics_labels = ["gra.3dr", "gra.flow", "gra.his", "gra.lin", "gra.sca", "gra.oth", "gra.sig"]
df_graphics = split_sets(df, graphics_labels, test_size=test_size, val_size=val_size)

# radiology classifier
print("radiology")
radiology_labels = ["rad.cmp", "rad.uls", "rad.xra", "rad.ang"]
df_radiology = split_sets(df, radiology_labels, test_size=test_size, val_size=val_size)

# molecular classifier
print("molecular")
molecular_labels = ["mol.3ds", "mol.che", "mol.dna", "mol.pro"]
df_molecular = split_sets(df, molecular_labels, test_size=test_size, val_size=val_size)

# photography classifier
print("photography")
photo_labels = ["pho.der", "pho.org"]
df_photo = split_sets(df, photo_labels, test_size=test_size, val_size=val_size)

# high modality classifier
print("high modality")
high_modality_labels = ["exp", "mic", "gra", "rad", "mol", "pho", "oth"]
# Pass the same fractions as every other classifier. Without them `split_sets`
# falls back to its default test_size of 0.2, so the rows labelled directly at
# this level would be split differently from the child rows appended below.
# NOTE(review): assumes the omission was an oversight; revert if the larger
# test share was intentional.
df_high_modality = split_sets(df, high_modality_labels, test_size=test_size, val_size=val_size)
for child_df, parent_label in [
    (df_exp, "exp"),
    (df_microscopy, "mic"),
    (df_graphics, "gra"),
    (df_radiology, "rad"),
    (df_molecular, "mol"),
    (df_photo, "pho"),
]:
    df_high_modality = add_children(df_high_modality, child_df, parent_label)


gels
experimental
electron
microscopy
graphics
radiology
molecular
photography
high modality


In [6]:
# `split_sets` leaves the columns created by its `reset_index()` calls in every
# frame; remove them before the frames are saved.
cols_to_drop = ["level_0", "index"]
for frame in (
    df_gel,
    df_exp,
    df_electron,
    df_microscopy,
    df_graphics,
    df_molecular,
    df_photo,
    df_radiology,
    df_high_modality,
):
    # The drop stays in place so the named frames themselves are updated.
    # errors="ignore" keeps the cell re-runnable: on a second execution the
    # columns are already gone and a plain drop would raise KeyError.
    frame.drop(columns=cols_to_drop, inplace=True, errors="ignore")

In [7]:
# Write one parquet file per classifier into the "0" sub-folder of the data
# directory. The folder is derived from `data_path` (the resolved
# "../../data") instead of repeating the relative path on every line, and is
# created when missing so the export does not fail on a fresh checkout.
output_dir = data_path / "0"
output_dir.mkdir(parents=True, exist_ok=True)

frames_by_file = {
    "cord19_gel_v1.parquet": df_gel,
    "cord19_experimental_v1.parquet": df_exp,
    "cord19_electron_v1.parquet": df_electron,
    "cord19_microscopy_v1.parquet": df_microscopy,
    "cord19_graphics_v1.parquet": df_graphics,
    "cord19_molecular_v1.parquet": df_molecular,
    "cord19_photography_v1.parquet": df_photo,
    "cord19_radiology_v1.parquet": df_radiology,
    "cord19_higher-modality_v1.parquet": df_high_modality,
}
for file_name, frame in frames_by_file.items():
    frame.to_parquet(output_dir / file_name)