In [None]:
from pathlib import Path
import datasets
from datasets import Dataset, Features, Image
import os.path

# dataset_config: 'mimic-cxr' or 'mimic-iii'
# split: 'train', 'validate', or 'test'
def build_dataset(dataset_config, split,
                  data_path='/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'):
    """Build a findings-text / image dataset for one split.

    Reads ``<data_path>/<dataset_config>/<split>.findings.tok`` and
    ``<split>.image.tok`` line by line, pairs every findings line with the
    first comma-separated entry of the matching image line, drops rows whose
    image file does not exist, and casts the ``image`` column to
    ``datasets.Image``.

    Args:
        dataset_config: Sub-directory of ``data_path`` (e.g. 'mimic-cxr').
        split: File-name prefix of the split (e.g. 'train').
        data_path: Root data directory. Defaults to the location that was
            previously hard-coded in the function body.

    Returns:
        A ``datasets.Dataset`` with the columns ``text`` and ``image``.
    """
    config_dir = Path(data_path).joinpath(dataset_config)

    def read_lines(path):
        # Context manager so the file handle is closed as soon as it is read.
        with open(path) as handle:
            return handle.readlines()

    def generate_image_path(line):
        # Only the first comma-separated entry of an image line is used.
        return str(config_dir.joinpath(line.strip().split(',')[0]))

    findings_file_path = config_dir.joinpath(split + '.findings.tok')
    image_file_path = config_dir.joinpath(split + '.image.tok')

    # The impression file used to be read here too, but its contents were
    # never used, so it is no longer loaded.
    findings = [line.strip() for line in read_lines(findings_file_path)]
    image_paths = [generate_image_path(line) for line in read_lines(image_file_path)]

    dataset = datasets.Dataset.from_dict({"text": findings, "image": image_paths})

    def check_img_exists(example):
        return os.path.isfile(example["image"])

    # Filter on the path strings first, then let the column decode as images.
    dataset = dataset.filter(check_img_exists, num_proc=4)
    dataset = dataset.cast_column("image", Image())

    return dataset

In [None]:
# Build the "train" split of the "mimic-cxr" configuration.
raw_dataset = build_dataset("mimic-cxr", "train")

In [None]:
# Show the dataset summary and the "image" entry of the first row.
print(raw_dataset)
print(raw_dataset[0]["image"])

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("GanjinZero/biobart-base")


def tokenize(samples):
    """Add an ``input_ids`` column for a batch of rows.

    Each entry of ``samples["text"]`` is prefixed with "summarize: " and the
    resulting list is passed to the module-level ``tokenizer``.
    """
    prompts = []
    for report in samples["text"]:
        prompts.append("summarize: " + report)
    encoded = tokenizer(prompts)
    samples["input_ids"] = encoded["input_ids"]
    return samples


# Tokenize in batches; the raw "text" column is dropped from the result.
train_dataset = raw_dataset.map(
    tokenize,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

In [None]:
# Inspect the first row after tokenization (the "text" column was removed by map above).
print(train_dataset[0])

In [None]:
from torchvision import transforms

# Per-channel mean / std handed to Normalize.
normalize = transforms.Normalize(
    mean=(0.48145466, 0.4578275, 0.40821073),
    std=(0.26862954, 0.26130258, 0.27577711),
)
transform = transforms.Compose([transforms.ToTensor(), normalize])


def image_transforms(samples):
    """Add a ``query`` column built from the batch's ``image`` column.

    Every image is converted to RGB, resized to 384x384 and passed through
    the module-level ``transform`` (ToTensor followed by Normalize).
    """
    queries = []
    for image in samples["image"]:
        resized = image.convert("RGB").resize((384, 384))
        queries.append(transform(resized))
    samples["query"] = queries
    return samples

#image_text_dataset = raw_dataset.map(image_transforms, remove_columns=["image"], batched=True)

In [None]:
# Register image_transforms as the transform used when rows of train_dataset are accessed.
train_dataset.set_transform(image_transforms)

In [None]:
# Fetch the first two rows through the registered transform.
print(train_dataset[:2])

In [None]:
from pathlib import Path
import datasets
from datasets import Image
from torchvision import transforms
import os.path

# dataset_config: 'mimic-cxr' or 'mimic-iii'
# split: 'train', 'validate', or 'test'
def build_dataset(dataset_config, tokenizer, split,
                  data_path='/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'):
    """Build a tokenized findings / image dataset with an on-access transform.

    Reads ``<data_path>/<dataset_config>/<split>.findings.tok`` and
    ``<split>.image.tok`` line by line, keeps only rows whose image file
    exists, tokenizes "summarize: <findings>" into ``input_ids`` and registers
    a transform that produces tensors when rows are accessed.

    Args:
        dataset_config: Sub-directory of ``data_path`` (e.g. 'mimic-cxr').
        tokenizer: Callable applied to a list of strings; its result must
            expose ``.input_ids``.
        split: File-name prefix of the split (e.g. 'train').
        data_path: Root data directory. Defaults to the location that was
            previously hard-coded in the function body.

    Returns:
        A ``datasets.Dataset`` whose accessed rows carry ``query`` (normalized
        384x384 RGB image tensor) and ``input_ids`` (1-D long tensor).
    """
    # Local import: the notebook's import cell does not import torch directly.
    import torch

    config_dir = Path(data_path).joinpath(dataset_config)

    def read_lines(path):
        # Context manager so the file handle is closed as soon as it is read.
        with open(path) as handle:
            return handle.readlines()

    def generate_image_path(line):
        # Only the first comma-separated entry of an image line is used.
        return str(config_dir.joinpath(line.strip().split(',')[0]))

    findings_file_path = config_dir.joinpath(split + '.findings.tok')
    image_file_path = config_dir.joinpath(split + '.image.tok')

    # The impression file was previously read but never used; it is skipped.
    findings = [line.strip() for line in read_lines(findings_file_path)]
    image_paths = [generate_image_path(line) for line in read_lines(image_file_path)]

    dataset = datasets.Dataset.from_dict({"text": findings, "query": image_paths})

    def check_img_exists(example):
        return os.path.isfile(example["query"])

    dataset = dataset.filter(check_img_exists, num_proc=4)
    dataset = dataset.cast_column("query", Image())

    def tokenize(samples):
        input_text = [" ".join(['summarize:', text]) for text in samples["text"]]
        samples["input_ids"] = tokenizer(input_text).input_ids
        return samples

    dataset = dataset.map(tokenize, batched=True, num_proc=4, remove_columns=["text"])

    normalize = transforms.Normalize(
        (0.48145466, 0.4578275, 0.40821073),
        (0.26862954, 0.26130258, 0.27577711),
    )
    transform = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    def image_transforms(samples):
        samples["query"] = [
            transform(image.convert("RGB").resize((384, 384)))
            for image in samples["query"]
        ]
        # This line used to be a dangling `samples["input_ids"] =` (a syntax
        # error). Token ids are converted per row because sequences differ in
        # length. The conversion lives here because set_transform() replaces
        # any format installed earlier with set_format(), so the previous
        # set_format("pt", ...) call had no effect.
        samples["input_ids"] = [
            torch.tensor(ids, dtype=torch.long) for ids in samples["input_ids"]
        ]
        return samples

    dataset.set_transform(image_transforms)

    return dataset


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("GanjinZero/biobart-base")

# Uses the three-argument build_dataset(dataset_config, tokenizer, split).
train_data = build_dataset("mimic-cxr", tokenizer, "train")
#train_data.set_format(type="torch")
print(train_data)

In [None]:
# "query" entries of the first two rows.
print(train_data[:2]["query"])

In [None]:
# Shape of the first row's "query" entry.
print(train_data[0]["query"].shape)

In [None]:
# "input_ids" entry of the first row.
print(train_data[0]["input_ids"])

In [None]:
# Python type of the first row's "input_ids" entry.
print(type(train_data[0]["input_ids"]))

In [None]:
from transformers import AutoTokenizer

# Reloads the same checkpoint name that earlier cells load.
tokenizer = AutoTokenizer.from_pretrained("GanjinZero/biobart-base")

In [None]:
# Tokenize two identical strings and print the resulting ids and attention mask.
input_text = ["no acute process"]*2
print(len(input_text[1]))  # character length of the second string
text = tokenizer(input_text, return_tensors="pt")
print(text.input_ids)
print(text.attention_mask)

In [None]:
from pathlib import Path
import datasets
from datasets import Image
from torchvision import transforms
import os.path

def generate_image_path(line):
    """Return the image path (as a string) for one line of the image file.

    Takes the first comma-separated entry of ``line`` and joins it onto
    ``data_path/dataset_config``; both are module-level names that are looked
    up when the function is called.
    """
    return str(Path(data_path).joinpath(dataset_config).joinpath(line.strip().split(',')[0]))

dataset_config = "mimic-cxr"
split = "train"
data_path = '/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'

findings_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.findings.tok')
impression_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.impression.tok')
image_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.image.tok')

# Context managers close each file as soon as it has been read; the bare
# open(...).readlines() calls used before left the handles open.
with open(findings_file_path) as findings_file:
    findings = [line.strip() for line in findings_file]
with open(impression_file_path) as impression_file:
    impression = [line.strip() for line in impression_file]
with open(image_file_path) as image_file:
    image_paths = [generate_image_path(line) for line in image_file]

# Row i pairs findings line i with the image path built from image line i.
dataset = datasets.Dataset.from_dict({"text":findings,"query":image_paths})

def check_img_exists(example):
    """Return True when the path stored under ``example["query"]`` is an existing file."""
    image_path = example["query"]
    return os.path.isfile(image_path)

# Keep only rows whose image file exists, then cast the path column to Image().
dataset = dataset.filter(
    check_img_exists,
    num_proc=4,
)
dataset = dataset.cast_column(
    "query",
    Image(),
)


def tokenize(samples):
    """Add an ``input_ids`` column for a batch of rows.

    Each ``text`` entry is joined to 'summarize:' with a single space and the
    list is passed to the module-level ``tokenizer``.
    """
    prompts = []
    for report in samples["text"]:
        prompts.append(" ".join(['summarize:', report]))
    samples["input_ids"] = tokenizer(prompts).input_ids
    return samples


# Tokenize in batches; the raw "text" column is dropped from the result.
dataset = dataset.map(
    tokenize,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

In [None]:
# Dataset summary followed by the first row.
print(dataset)
print(dataset[0])

In [None]:
# Defined here but not applied by image_transforms below.
normalize = transforms.Normalize(
    mean=(0.48145466, 0.4578275, 0.40821073),
    std=(0.26862954, 0.26130258, 0.27577711),
)
transform = transforms.Compose([transforms.ToTensor(), normalize])


def image_transforms(samples):
    """Convert every ``query`` image of the batch to RGB and resize it to 384x384."""
    resized = []
    for image in samples["query"]:
        resized.append(image.convert("RGB").resize((384, 384)))
    samples["query"] = resized
    return samples


dataset = dataset.map(
    image_transforms,
    batched=True,
)

In [None]:
# Dataset summary followed by the first row, after the image map above.
print(dataset)
print(dataset[0])

In [None]:
#dataset.set_format(type="torch")

In [None]:
# Write the processed dataset to the "ppo_dataset.hf" directory.
dataset.save_to_disk("ppo_dataset.hf")

In [None]:
print(type(raw_dataset['input_ids'][:2]))

In [None]:
print(raw_dataset[:2])

In [None]:
raw_dataset.set_format(type="torch", columns=['input_ids'],output_all_columns=True)
print(raw_dataset[:2])

In [None]:
print(raw_dataset)

In [None]:
print(raw_dataset[0])

In [None]:
print(raw_dataset[0]["query"])

# create_ppo_dataset.py

In [1]:
from pathlib import Path
import datasets
from datasets import Image
from torchvision import transforms
import os.path


def generate_image_path(line):
    """Return the image path (as a string) for one line of the image file.

    Takes the first comma-separated entry of ``line`` and joins it onto
    ``data_path/dataset_config``; both are module-level names that are looked
    up when the function is called.
    """
    return str(Path(data_path).joinpath(dataset_config).joinpath(line.strip().split(',')[0]))

dataset_config = "mimic-cxr"
split = "train"
data_path = '/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'

findings_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.findings.tok')
impression_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.impression.tok')
image_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.image.tok')

# Context managers close each file as soon as it has been read; the bare
# open(...).readlines() calls used before left the handles open.
with open(findings_file_path) as findings_file:
    findings = [line.strip() for line in findings_file]
with open(impression_file_path) as impression_file:
    impression = [line.strip() for line in impression_file]
with open(image_file_path) as image_file:
    image_paths = [generate_image_path(line) for line in image_file]

# Row i pairs findings line i with the image path built from image line i.
dataset = datasets.Dataset.from_dict({"text":findings,"query":image_paths})

def check_img_exists(example):
    """Return True when the path stored under ``example["query"]`` is an existing file."""
    return os.path.isfile(example["query"])

# Filter on the path strings first, then cast the column to Image().
dataset = dataset.filter(check_img_exists, num_proc=4)
dataset = dataset.cast_column("query", Image())

print(dataset[0])

Filter (num_proc=4):   0%|          | 0/125417 [00:00<?, ? examples/s]

{'text': 'No focal consolidation is seen. There is no pleural effusion or pneumothorax. The cardiac and mediastinal silhouettes are unremarkable.', 'query': <PIL.JpegImagePlugin.JpegImageFile image mode=L size=512x615 at 0x15337A0BCAC0>}


In [None]:
from transformers import ImageFeatureExtractionMixin

feature_extractor = ImageFeatureExtractionMixin()


def to_pixels(image):
    """Resize ``image`` with ``size=384`` and then convert it to RGB."""
    resized = feature_extractor.resize(image, size=384)
    return feature_extractor.convert_rgb(resized)


def process(examples):
    """Add a ``pixel_values`` column holding ``to_pixels`` of every ``query`` image."""
    pixel_values = []
    for image in examples["query"]:
        pixel_values.append(to_pixels(image))
    examples["pixel_values"] = pixel_values
    return examples


# Process in batches of 256 rows and show the first processed row.
prep_dataset = dataset.map(
    process,
    batched=True,
    batch_size=256,
)
print(prep_dataset[0])

Map:   0%|          | 0/101503 [00:00<?, ? examples/s]

In [None]:
from transformers import ImageFeatureExtractionMixin

feature_extractor = ImageFeatureExtractionMixin()


def to_pixels(image):
    """Resize, convert to RGB and normalize one image.

    Args:
        image: Image accepted by ``ImageFeatureExtractionMixin.resize``
            (the ``query`` column yields PIL images).

    Returns:
        The normalized image as returned by ``feature_extractor.normalize``.
    """
    image = feature_extractor.resize(image, size=384)
    image = feature_extractor.convert_rgb(image)
    # normalize() turns a PIL input into a rescaled, channel-first numpy array
    # by itself. Passing that array through to_numpy_array() again, as was done
    # before, transposes the axes a second time and scrambles the layout, so
    # the extra call is dropped.
    return feature_extractor.normalize(
        image,
        mean=(0.48145466, 0.4578275, 0.40821073),
        std=(0.26862954, 0.26130258, 0.27577711),
    )


def process(examples):
    """Add a ``pixel_values`` column holding ``to_pixels`` of every ``query`` image."""
    examples["pixel_values"] = [to_pixels(image) for image in examples["query"]]
    return examples


#features = Features({"pixel_values":})

prep_dataset = dataset.map(process, batched=True, batch_size=256)
print(prep_dataset[0])

In [None]:
# Per-channel mean / std handed to Normalize.
normalize = transforms.Normalize(
    mean=(0.48145466, 0.4578275, 0.40821073),
    std=(0.26862954, 0.26130258, 0.27577711),
)
transform = transforms.Compose([transforms.ToTensor(), normalize])


def image_transforms(samples):
    """Replace every ``query`` image of the batch with its transformed version.

    Each image is converted to RGB, resized to 384x384 and passed through the
    module-level ``transform`` (ToTensor followed by Normalize).
    """
    queries = []
    for image in samples["query"]:
        resized = image.convert("RGB").resize((384, 384))
        queries.append(transform(resized))
    samples["query"] = queries
    return samples


dataset = dataset.map(
    image_transforms,
    batched=True,
)

# Write the processed dataset to the "ppo_dataset.hf" directory.
dataset.save_to_disk("ppo_dataset.hf")