## Perceptual Loss

In [32]:
from losses.Losses import *
from PIL import Image
from torchvision import transforms
from utils.environment_settings import env_settings
import torch
import torch.nn as nn
import torch.nn.functional as F 
import pandas as pd
import numpy as np
import torchvision.transforms as transforms
import torchvision.models as models
from health_multimodal.image.model.pretrained import get_biovil_t_image_encoder, get_biovil_image_encoder
import torch.nn as nn

In [2]:
#transform = transforms.ToTensor()
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),  # Convert to 3-channel grayscale
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

In [3]:
def preprocess_image(image_path, transform):
    """Load an image from disk and turn it into a single-item batch.

    Args:
        image_path: Path of the image file, passed to ``PIL.Image.open``.
        transform: Callable applied to the opened PIL image. Its result must
            support ``unsqueeze`` (e.g. a tensor produced by ``ToTensor``).

    Returns:
        The transformed image with a new leading dimension of size 1.
    """
    # The context manager releases the file handle deterministically, even if
    # the transform raises; the transform must consume the pixels inside it.
    with Image.open(image_path) as image:
        image_tensor = transform(image)
    return image_tensor.unsqueeze(0)


In [4]:
test_image_path = env_settings.DATA + "/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg"
gt_image_path = env_settings.DATA + "/p10/p10000764/s57375967/096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4.jpg"

# Reuse the shared helper instead of repeating open -> transform -> unsqueeze
# by hand; it returns the image with a leading batch dimension of size 1.
test_image = preprocess_image(test_image_path, transform)
gt_image = preprocess_image(gt_image_path, transform)
print(gt_image.shape)

torch.Size([1, 3, 224, 224])


In [5]:
test_image_path

'/home/data/DIVA/mimic/mimic-cxr-jpg/2.0.0/files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg'

In [11]:
# Registry of image-encoder factories, keyed by the model name that
# PerceptualLoss_BioVil accepts as `base_model`.
biovil_model_dict = {
    "biovil_t": get_biovil_t_image_encoder,
    "biovil": get_biovil_image_encoder,
}

class ModifiedMultiImageEncoder(nn.Module):
    """Expose intermediate ResNet and ViT activations of a wrapped encoder.

    The wrapped model must provide three attributes:

    * ``encoder`` with ``conv1``, ``bn1``, ``relu``, ``maxpool`` and
      ``layer1`` .. ``layer4`` sub-modules,
    * ``backbone_to_vit``, applied to the output of ``layer4``,
    * ``vit_pooler`` with ``pos_embed``, ``type_embed``, ``pos_drop`` and
      ``blocks``.

    Args:
        original_model: The model to wrap. Its ``vit_pooler`` is switched to
            eval mode on construction; the mode of the other sub-modules is
            left untouched.
            NOTE(review): ``encoder`` is not put in eval mode here -- confirm
            that the caller does so if it contains batch-norm layers.
    """

    # Encoder stages whose outputs are collected, in execution order.
    _RESNET_STAGES = ('layer1', 'layer2', 'layer3', 'layer4')

    def __init__(self, original_model):
        super().__init__()
        self.original_model = original_model
        # `.eval()` mutates the module in place, so no re-assignment is needed.
        self.original_model.vit_pooler.eval()

    def forward(self, x):
        """Run ``x`` through the wrapped model and collect activations.

        Args:
            x: Input batch fed to ``encoder.conv1``.

        Returns:
            A tuple ``(resnet_features, vit_features)``: the outputs of
            ``layer1`` .. ``layer4`` (4 tensors) and the output of every
            block in ``vit_pooler.blocks``, each in execution order.
        """
        encoder = self.original_model.encoder
        vit_pooler = self.original_model.vit_pooler

        # ---- ResNet stem -------------------------------------------------
        x = encoder.conv1(x)
        x = encoder.bn1(x)
        x = encoder.relu(x)
        x = encoder.maxpool(x)

        # ---- ResNet stages: keep the output of every stage ---------------
        resnet_features = []
        for stage_name in self._RESNET_STAGES:
            x = getattr(encoder, stage_name)(x)
            resnet_features.append(x)

        # ---- Backbone -> token sequence ----------------------------------
        x = self.original_model.backbone_to_vit(x)
        # Collapse the spatial dims and move them in front of the channels:
        # (B, C, H, W) -> (B, H*W, C).
        x = x.flatten(2).transpose(1, 2)
        batch_size, num_tokens, _ = x.shape

        # Positional embedding is cropped to the actual token count; index 0
        # of the type embedding is used for every token.
        pos_embed = vit_pooler.pos_embed[:, :num_tokens, :].repeat(batch_size, 1, 1)
        type_embed = vit_pooler.type_embed[0].expand(batch_size, num_tokens, -1)
        pos_and_type_embed = pos_embed + type_embed

        # ---- ViT blocks: keep the output of every block ------------------
        x = vit_pooler.pos_drop(x)
        vit_features = []
        for block in vit_pooler.blocks:
            x = block(x, pos_and_type_embed)
            vit_features.append(x)

        # The final norm / patch-grid reshape of the pooler is intentionally
        # not computed: its result was never part of the returned features.
        return resnet_features, vit_features
    
class PerceptualLoss_BioVil(nn.Module):
    """Perceptual loss built on intermediate features of a BioViL encoder.

    Both inputs are passed through the same feature extractor; the loss is
    the sum of the per-layer mean-squared errors between their ResNet
    features plus the same sum over their ViT features.

    Args:
        base_model: Key of ``biovil_model_dict`` selecting the encoder
            factory (``"biovil_t"`` or ``"biovil"``).
        return_all: When true, ``forward`` returns
            ``(total, resnet_loss, vit_loss)`` instead of only the total.
        device: Device the encoder is moved to. Defaults to CUDA when it is
            available (the previous hard-coded behaviour) and to CPU
            otherwise, so the module can also be built on CPU-only hosts.

    NOTE(review): the encoder weights are not frozen here. If this module is
    owned by a trainable model, confirm they are excluded from the optimizer.
    NOTE(review): inputs are used as given; no ImageNet-style normalisation
    is applied.
    """

    def __init__(self, base_model, return_all=False, device=None):
        super().__init__()
        self.return_all = return_all
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.original_model = biovil_model_dict[base_model]().encoder.to(device)
        self.feature_extractor = ModifiedMultiImageEncoder(self.original_model)

    @staticmethod
    def _summed_mse(true_features, pred_features):
        """Sum the MSE of each paired feature map (0.0 when there are none)."""
        accumulated = 0.0
        for true_feat, pred_feat in zip(true_features, pred_features):
            accumulated = accumulated + nn.functional.mse_loss(true_feat, pred_feat)
        return accumulated

    def forward(self, y_true, y_pred):
        """Compute the perceptual loss between two image batches.

        Args:
            y_true: Reference batch, fed to the feature extractor.
            y_pred: Batch to compare against ``y_true``; same processing.

        Returns:
            The total loss, or ``(total, resnet_loss, vit_loss)`` when the
            module was built with ``return_all=True``.
        """
        true_resnet_features, true_vit_features = self.feature_extractor(y_true)
        pred_resnet_features, pred_vit_features = self.feature_extractor(y_pred)

        resnet_loss = self._summed_mse(true_resnet_features, pred_resnet_features)
        vit_loss = self._summed_mse(true_vit_features, pred_vit_features)

        # Both feature families contribute with equal weight.
        total_loss = resnet_loss + vit_loss
        if self.return_all:
            return total_loss, resnet_loss, vit_loss
        return total_loss

In [12]:
perceptual = PerceptualLoss_BioVil("biovil_t", return_all=True)

Using downloaded and verified file: /tmp/biovil_t_image_model_proj_size_128.pt


In [13]:
# Put the inputs on whatever device the loss network lives on instead of
# assuming CUDA.
loss_device = next(perceptual.parameters()).device
gt_image = gt_image.to(loss_device)
test_image = test_image.to(loss_device)

# Pure evaluation: no autograd graph is needed for these comparisons.
with torch.no_grad():
    loss_0, loss_01, loss_02 = perceptual(gt_image, gt_image)
    loss_1, loss_11, loss_12 = perceptual(gt_image, test_image)
    loss_2, loss_21, loss_22 = perceptual(test_image, test_image)

# `.item()` prints plain Python floats rather than tensor reprs.
print(f'gt_vs_gt Total Loss: {loss_0.item()}, ResNet Loss: {loss_01.item()}, ViT Loss: {loss_02.item()}')
print(f'gt_vs_test Total Loss: {loss_1.item()}, ResNet Loss: {loss_11.item()}, ViT Loss: {loss_12.item()}')
print(f'test_vs_test Total Loss: {loss_2.item()}, ResNet Loss: {loss_21.item()}, ViT Loss: {loss_22.item()}')


gt_vs_gt Total Loss: 0.0, ResNet Loss: 0.0, ViT Loss: 0.0
gt_vs_test Total Loss: 0.2605489492416382, ResNet Loss: 0.1153903678059578, ViT Loss: 0.1451585739850998
test_vs_test Total Loss: 0.0, ResNet Loss: 0.0, ViT Loss: 0.0


In [None]:
import os
from utils.utils import *
params = read_config(env_settings.CONFIG)
path = 'data/master_df_zeros.csv'
data_frame = pd.read_csv(path)
filter_df_no_findings = data_frame[data_frame['No Finding'] == 1]

# Positional columns read from each row.
IMAGE_PATH_COL = 4    # relative image path, appended to `image_base`
NO_FINDING_COL = 15   # NOTE(review): presumably the 'No Finding' flag -- confirm against the CSV header

total_loss = 0
count = 0
image_base = env_settings.DEBUG
loss_device = next(perceptual.parameters()).device

# Evaluation only: skip building an autograd graph for every image pair.
with torch.no_grad():
    for _, reference_row in filter_df_no_findings.iterrows():
        # Paths are formed by plain concatenation, exactly as before
        # (`os.path.join` with a single argument was a no-op).
        first_image_path = image_base + reference_row.iloc[IMAGE_PATH_COL]
        if not os.path.exists(first_image_path):
            # Same policy as for the compared images: skip missing files.
            continue
        first_image = preprocess_image(first_image_path, transform).to(loss_device)

        # Distinct loop variables: the inner loop no longer shadows the outer row.
        for _, other_row in data_frame.iterrows():
            other_image_path = image_base + other_row.iloc[IMAGE_PATH_COL]
            if not os.path.exists(other_image_path):
                continue

            other_image = preprocess_image(other_image_path, transform).to(loss_device)
            total_loss_value, resnet_loss_value, vit_loss_value = perceptual(first_image, other_image)

            # Accumulate only the total loss
            total_loss += total_loss_value.item()

            # `.iloc` makes the positional lookup explicit; `row[15]` on a
            # label-indexed Series is deprecated and emits a warning.
            if other_row.iloc[NO_FINDING_COL] == 1:
                print(f'No finding and no finding: Total Loss: {total_loss_value.item()}, ResNet Loss: {resnet_loss_value.item()}, ViT Loss: {vit_loss_value.item()}')
            else:
                print(f'No finding and other: Total Loss: {total_loss_value.item()}, ResNet Loss: {resnet_loss_value.item()}, ViT Loss: {vit_loss_value.item()}')
            count += 1

# Mean over every compared pair (0 when nothing was compared).
mean_loss = total_loss / count if count > 0 else 0
print(f"Mean Perceptual Loss: {mean_loss}")

  if row[15] == 1:


No finding and no finding: Total Loss: 0.0, ResNet Loss: 0.0, ViT Loss: 0.0
No finding and no finding: Total Loss: 0.18618245422840118, ResNet Loss: 0.0901113748550415, ViT Loss: 0.09607107937335968
No finding and no finding: Total Loss: 0.24803365767002106, ResNet Loss: 0.11520323157310486, ViT Loss: 0.1328304260969162
No finding and no finding: Total Loss: 0.1339447796344757, ResNet Loss: 0.06982745975255966, ViT Loss: 0.06411731988191605
No finding and other: Total Loss: 0.2605489492416382, ResNet Loss: 0.1153903678059578, ViT Loss: 0.1451585739850998
No finding and no finding: Total Loss: 0.13433673977851868, ResNet Loss: 0.05954261124134064, ViT Loss: 0.07479412853717804
No finding and no finding: Total Loss: 0.18134000897407532, ResNet Loss: 0.09228979796171188, ViT Loss: 0.08905021101236343
No finding and other: Total Loss: 0.19169454276561737, ResNet Loss: 0.09989026188850403, ViT Loss: 0.09180428087711334
No finding and other: Total Loss: 0.2138615995645523, ResNet Loss: 0.099

## Diffusion Model

In [1]:
from models.DDPM import ContextUnet, DDPM
import numpy as np
import pytorch_lightning as pl
from torchvision.datasets import MNIST, ImageFolder
from torch.utils.data import DataLoader
from torchvision import models, transforms
from pytorch_lightning.loggers import TensorBoardLogger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
n_epoch = 100
n_T = 400 # 500
device = "cuda:0"
n_classes = 3
n_feat = 128 # 128 ok, 256 better (but slower)
image_size = 64
lrate = 1e-4
batch_size = 32
save_model = False

In [3]:
model = DDPM(nn_model=ContextUnet(in_channels=1, n_feat=n_feat, n_classes=n_classes), 
                          betas=(1e-4,0.01),
                          n_T=n_T, drop_prob=0.1)

In [4]:
model

DDPM(
  (nn_model): ContextUnet(
    (init_conv): ResidualConvBlock(
      (conv1): Sequential(
        (0): Conv2d(1, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): GELU(approximate='none')
      )
      (conv2): Sequential(
        (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): GELU(approximate='none')
      )
    )
    (down1): UnetDown(
      (model): Sequential(
        (0): ResidualConvBlock(
          (conv1): Sequential(
            (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): GELU(approximate='none')
          )
          (conv2): Sequential(
            (0): Conv2d(128, 128, kernel_size=(3, 

## Discriminator

In [1]:
from models.Discriminator import *

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
import torch
import torch.nn as nn
import numpy as np
# Your ReportDiscriminator class (as you provided)

# Initialize the discriminator
input_dim = 13  # Example input dimension, change it as per your needs
discriminator = ReportDiscriminator(input_dim)

# Generate a random tensor as input
# The shape of this tensor should match the expected input shape of the discriminator
# For instance, if your input is a 1D tensor with 'input_dim' elements
random_input = torch.randn(1, input_dim)  # Batch size is 1, change it if needed

# Pass the random input through the discriminator
output = discriminator(random_input)

# Print the output
print(output)


tensor([[0.0082]], grad_fn=<AddmmBackward>)


In [14]:
np.ones((random_input.size(0), *discriminator.output_shape))

array([[1.]])

In [10]:
random_input.shape

torch.Size([1, 13])

In [5]:
import torch
import torch.nn as nn

# Your ImageDiscriminator class (as you provided)

# Initialize the discriminator
# Example input shape: 3 channels (RGB image), 64x64 pixels
input_shape = (3, 64, 64) 
discriminator = ImageDiscriminator(input_shape)

# Generate a random tensor as input
# For a single image, the shape would be (1, channels, height, width)
# You can change the batch size as needed
random_input = torch.randn(1, *input_shape)  # Batch size is 1

# Pass the random input through the discriminator
output = discriminator(random_input)

# Print the output
print(output)


tensor([[[[-0.1691, -0.0934, -0.0404, -0.2769],
          [-0.4468, -0.8547,  0.4016,  0.1725],
          [ 0.3373, -0.5122, -0.4857,  0.0630],
          [-0.0303, -0.3434, -0.2633,  0.0101]]]],
       grad_fn=<MkldnnConvolutionBackward>)


## DataLoader

In [2]:
from modules.ChexpertModule import ChexpertDataModule
from modules.CycleGANModule import CycleGAN
from tensorboard import program
from utils.utils import read_config
from torch.utils.data import DataLoader, random_split
from utils._prepare_data import DataHandler

[2024-02-08 02:52:47,141] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [8]:
import tempfile
from enum import Enum, unique
from pathlib import Path
from typing import List, Optional, Tuple, Union

import requests
from torchvision.datasets.utils import check_integrity

from health_multimodal.image import ImageInferenceEngine
from health_multimodal.image.data.transforms import create_chest_xray_transform_for_inference
from health_multimodal.image.model.pretrained import get_biovil_t_image_encoder
from health_multimodal.text.utils import BertEncoderType, get_bert_inference
from health_multimodal.vlp.inference_engine import ImageTextInferenceEngine

RESIZE = 512
CENTER_CROP_SIZE = 512

In [9]:
params = read_config(env_settings.CONFIG)

In [10]:
processor = DataHandler(opt=params["dataset"])
chexpert_data_module = ChexpertDataModule(opt=params['dataset'], processor=processor)

length of train images: 8000
length of val images: 1000
shuffling train images to
Created unpaired dataset.


In [11]:
chexpert_data_module.setup()

In [12]:
val_dataloder = chexpert_data_module.val_dataloader()

In [13]:
from itertools import islice

# Number of validation batches kept for the experiments below.
N_PREVIEW_BATCHES = 10

images = []
labels = []
# `islice` stops after the requested number of batches without pulling an
# extra one from the loader, which the manual counter + `break` did.
for batch in islice(val_dataloder, N_PREVIEW_BATCHES):
    images.append(batch['target'])
    labels.append(batch['report'])

In [14]:
len(images)

10

In [15]:
RESIZE = 512
CENTER_CROP_SIZE = 512
# `@unique` makes the enum definition fail if two members share a value.
@unique
class ClassType(str, Enum):
    """Enum for the different types of CXR abnormality classes.

    Members are also ``str`` instances (the class mixes in ``str``), so they
    compare equal to their plain string values. Only the two pneumonia
    classes are defined here.
    """

    PNEUMONIA = "pneumonia"
    NO_PNEUMONIA = "no_pneumonia"

In [16]:
chexpert_prompts = {
    'Atelectasis': {
        'positive': ["Atelectasis observed", "Partial lung collapse noted", "Signs of atelectasis"],
        'negative': ["No atelectasis", "Lungs fully expanded", "Atelectasis absent"],
    },
    'Cardiomegaly': {
        'positive': ["Cardiomegaly present", "Enlarged cardiac silhouette", "Heart size increased"],
        'negative': ["No cardiomegaly", "Normal heart size", "Cardiac silhouette within normal limits"],
    },
    'Consolidation': {
        'positive': ["Consolidation seen", "Areas of consolidation", "Lung consolidation present"],
        'negative': ["No consolidation", "Clear of consolidation", "Consolidation absent"],
    },
    'Edema': {
        'positive': ["Pulmonary edema", "Signs of edema", "Edema in lung tissue"],
        'negative': ["No edema", "No signs of pulmonary edema", "Absence of edema"],
    },
    'Enlarged Cardiomediastinum': {
        'positive': ["Cardiomediastinum enlarged", "Widened mediastinum", "Mediastinal enlargement"],
        'negative': ["No enlargement of the cardiomediastinum", "Normal mediastinal contours", "Cardiomediastinum not enlarged"],
    },
    'Fracture': {
        'positive': ["Bone fracture identified", "Fracture visible", "Evidence of fracture"],
        'negative': ["No fracture detected", "Bone integrity maintained", "Absence of fracture"],
    },
    'Lung Lesion': {
        'positive': ["Lung lesion observed", "Presence of lung lesion", "Lesion in lung"],
        'negative': ["No lung lesion", "Lung fields clear of lesions", "No evidence of lung lesion"],
    },
    'Lung Opacity': {
        'positive': ["Lung opacity noted", "Opacity within lung", "Areas of opacity"],
        'negative': ["No lung opacity", "Clear lung fields", "Opacity absent"],
    },
    'No Finding': {
        'positive': ["No acute findings", "No abnormalities detected", "Normal chest X-ray"],
        'negative': ["Abnormalities present", "Findings noted", "Not a normal study"],  # Context-specific; might be less used
    },
    'Pleural Effusion': {
        'positive': ["Pleural effusion", "Fluid in pleural space", "Signs of pleural effusion"],
        'negative': ["No pleural effusion", "Pleural spaces clear", "Absence of pleural fluid"],
    },
    'Pleural Other': {
        'positive': ["Pleural abnormalities", "Non-effusion pleural abnormality", "Pleural thickening"],
        'negative': ["No pleural abnormalities", "Pleura appears normal", "No pleural thickening or masses"],
    },
    'Pneumonia': {
        'positive': ["Evidence of pneumonia", "Pneumonic infiltrates", "Signs consistent with pneumonia"],
        'negative': ["No pneumonia", "Clear of pneumonia", "No pneumonic infiltrates"],
    },
    'Pneumothorax': {
        'positive': ["Pneumothorax present", "Air in pleural space", "Signs of pneumothorax"],
        'negative': ["No pneumothorax", "No air leakage", "Pleural cavity intact"],
    },
}


In [17]:
import random

def construct_prompts_from_vector(condition_vector, chexpert_prompts, rng=None):
    """
    Constructs a combined prompt based on the presence/absence of conditions in a binary vector.

    Conditions are matched to vector entries by position, following the
    iteration order of ``chexpert_prompts``.

    Args:
    - condition_vector (list): One entry per condition; a truthy entry selects
      the condition's 'positive' prompts, a falsy entry its 'negative' prompts.
    - chexpert_prompts (dict): A dictionary mapping conditions to their positive and negative prompts.
    - rng (optional): Object with a ``choice(seq)`` method, e.g. a seeded
      ``random.Random``, used to pick the prompts. Defaults to the global
      ``random`` module; pass a seeded instance for reproducible output.

    Returns:
    - str: The selected prompts, one per condition, joined with ", ".

    Raises:
    - AssertionError: If the vector length differs from the number of conditions.
    """
    # Ensure the vector matches the number of conditions
    assert len(condition_vector) == len(chexpert_prompts), (
        f"condition_vector has {len(condition_vector)} entries but "
        f"chexpert_prompts defines {len(chexpert_prompts)} conditions"
    )

    chooser = random if rng is None else rng

    selected_prompts = []
    for condition, presence in zip(chexpert_prompts.keys(), condition_vector):
        # Present (truthy) -> positive phrasing, absent (falsy) -> negative.
        polarity = 'positive' if presence else 'negative'
        selected_prompts.append(chooser.choice(chexpert_prompts[condition][polarity]))

    # Combine the selected prompts into a single comma-separated string
    return ", ".join(selected_prompts)


In [18]:
idx = 6 
label = labels[idx]
image= images[idx]

In [19]:
# Example binary vector representing conditions (1 for present, 0 for absent)
condition_vector = label.tolist()[0]
# Assuming `chexpert_prompts` is defined as shown in the previous example
combined_prompt = construct_prompts_from_vector(condition_vector, chexpert_prompts)
combined_prompt

'Atelectasis observed, No cardiomegaly, Clear of consolidation, Absence of edema, Mediastinal enlargement, Bone integrity maintained, Lung fields clear of lesions, Lung opacity noted, Abnormalities present, Absence of pleural fluid, No pleural abnormalities, Evidence of pneumonia, No air leakage'

In [22]:
def _get_vlp_inference_engine() -> "Tuple[ImageTextInferenceEngine, ImageInferenceEngine]":
    """Build the BioViL-T image-text inference engine.

    Returns:
        A tuple ``(img_txt_inference, image_inference)``: the joint
        image-text engine and the image-only engine it wraps, so callers can
        also query image embeddings directly.
    """
    image_inference = ImageInferenceEngine(
        image_model=get_biovil_t_image_encoder(),
        # NOTE(review): the chest X-ray inference transform is left disabled,
        # so images reach the model as the caller provides them -- confirm
        # this is intended.
        #transform=create_chest_xray_transform_for_inference(resize=RESIZE, center_crop_size=CENTER_CROP_SIZE),
    )
    img_txt_inference = ImageTextInferenceEngine(
        image_inference_engine=image_inference,
        text_inference_engine=get_bert_inference(BertEncoderType.BIOVIL_T_BERT),
    )
    return img_txt_inference, image_inference

In [23]:
img_txt_inference,image_inference = _get_vlp_inference_engine()

Using downloaded and verified file: /tmp/biovil_t_image_model_proj_size_128.pt


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'CXRBertTokenizer'.
You are using a model of type bert to instantiate a model of type cxr-bert. This is not supported for all configurations of models and can yield errors.


In [173]:
score = img_txt_inference.get_similarity_score_from_raw_data(
    image=image, query_text=combined_prompt
)

In [24]:
img_embedding = image_inference.get_projected_global_embedding(images[6])

In [176]:
import health_multimodal
print(health_multimodal.__path__)

['/home/guests/usr_mlmi/.conda/envs/ark_biovil/lib/python3.9/site-packages/health_multimodal']


In [28]:
img_embedding

tensor([ 7.2087e-02,  8.0008e-03, -1.3950e-02, -1.4212e-01,  4.9195e-02,
         1.5357e-02, -3.1353e-02, -1.5227e-01,  4.0827e-02,  6.5098e-02,
         8.8336e-02, -1.2466e-01,  1.8646e-01,  6.0860e-02,  1.7561e-01,
        -1.6772e-01, -1.0939e-01, -5.8289e-02, -6.2366e-02, -7.3033e-02,
         1.0772e-02, -5.4196e-02, -2.2908e-02,  3.7817e-02,  1.0001e-02,
         1.0504e-01,  4.1775e-02, -4.3272e-02,  2.0983e-01,  1.9906e-02,
         1.5746e-02, -5.8027e-02, -3.7310e-02,  1.3939e-01,  1.3950e-02,
         7.3272e-02,  6.0896e-03, -5.4726e-02,  1.3109e-01,  1.2778e-02,
         2.2804e-02,  1.2257e-01,  3.1408e-02, -8.5368e-02, -1.4192e-01,
         1.2374e-02,  5.9470e-02, -1.0540e-01,  5.2910e-02,  1.2760e-02,
         1.4495e-01, -1.7663e-01, -2.6403e-02,  3.6074e-02,  9.5359e-02,
         9.0694e-02, -1.0298e-01, -2.9791e-02,  1.3576e-01, -6.2569e-02,
        -1.3589e-01,  5.7179e-02,  6.2819e-02,  9.0243e-02,  9.8306e-02,
         5.1156e-02, -7.9588e-02, -1.1480e-01,  9.7

In [4]:
import torch
dummy_inputs = torch.rand(5, 3, 224, 224)  # Shape: (batch_size, channels, height, width)

In [1]:
import torch
import timm
import torch.nn as nn
import pytorch_lightning as pl
import health_multimodal.image
from health_multimodal.image.model.model import BaseImageModel
from health_multimodal.image.utils import ImageModelType
from health_multimodal.image.model.pretrained import get_biovil_t_image_encoder, get_biovil_image_encoder

  from .autonotebook import tqdm as notebook_tqdm


[2024-02-08 20:10:12,948] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
image_model = get_biovil_t_image_encoder()

Using downloaded and verified file: /tmp/biovil_t_image_model_proj_size_128.pt


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


freeze_encoder: True
encoder train false


In [3]:
image_model.feature_size

512

In [6]:
embedding = image_model(dummy_inputs)

In [18]:
embedding.img_embedding.shape

torch.Size([5, 512])

In [21]:
from health_multimodal.image.model.modules import MultiTaskModel

In [26]:
multi_task_classifier = MultiTaskModel(
            input_dim=512, 
            classifier_hidden_dim=256, 
            num_classes=1, 
            num_tasks=13  # Assuming num_classes = num_tasks for Chexpert
)

In [27]:
logits = multi_task_classifier(embedding.img_embedding)

In [28]:
logits.shape

torch.Size([5, 1, 13])

In [29]:
logits

tensor([[[ 0.0491, -0.1006,  0.0317,  0.0799, -0.0240, -0.0249, -0.0636,
           0.0263, -0.0513,  0.0954,  0.0109, -0.0928,  0.0112]],

        [[ 0.1667, -0.1378,  0.1990,  0.1589, -0.1340,  0.1135, -0.0812,
           0.0549, -0.0145,  0.1042,  0.1995,  0.1188,  0.0225]],

        [[ 0.0387, -0.0797, -0.0546,  0.0086, -0.1265, -0.0036,  0.0849,
          -0.0069, -0.1375,  0.0649, -0.1081, -0.0219, -0.0632]],

        [[-0.0438,  0.1444,  0.0495, -0.0079,  0.0499, -0.1205,  0.1190,
           0.0109, -0.0693, -0.0235, -0.1987, -0.1430, -0.0202]],

        [[ 0.1045, -0.2022, -0.0081,  0.0211,  0.0224, -0.0231, -0.0844,
           0.0021, -0.0183,  0.0627,  0.0508,  0.0287, -0.0146]]],
       grad_fn=<CopySlices>)

In [1]:
from models.BioViL import *
import os
from utils.environment_settings import env_settings
from utils.utils import *
import torch
dummy_inputs = torch.rand(5, 3, 224, 224)  # Shape: (batch_size, channels, height, width)
opt = read_config(env_settings.CONFIG)

  from .autonotebook import tqdm as notebook_tqdm


[2024-02-08 20:36:21,577] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
 model = BioViL(embedding_size=opt["report_generator"]["embedding_size"], 
      num_classes=13, 
      hidden_1=opt["report_generator"]["classification_head_hidden1"],
      hidden_2=opt["report_generator"]["classification_head_hidden2"], 
      dropout_rate=opt["report_generator"]["dropout_prob"]
)

Using downloaded and verified file: /tmp/biovil_t_image_model_proj_size_128.pt


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


freeze_encoder: True
encoder train false


In [3]:
c_logits = model(dummy_inputs)

In [4]:
c_logits.shape

torch.Size([5, 13])

In [5]:
 model2 = BioViL_V2()

Using downloaded and verified file: /tmp/biovil_t_image_model_proj_size_128.pt
freeze_encoder: True
encoder train false


In [6]:
c_logits2 = model2(dummy_inputs)

In [7]:
c_logits2.shape

torch.Size([5, 13])

In [11]:
logit_raw.shape

torch.Size([5, 13, 1])

In [2]:
import torch
label = torch.tensor([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]).float()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
label

tensor([1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.])

In [4]:
# When adding Gaussian noise to labels, the noise must keep the labels within
# their intended range: 0 to 1.
LABEL_NOISE_STD = 0.1  # Adjust the scale as needed
positive_noise = torch.randn_like(label) * LABEL_NOISE_STD
negative_noise = torch.randn_like(label) * LABEL_NOISE_STD

# Add noise conditionally: positives are clamped to [0.5, 1] and negatives to
# [0, 0.5], so a softened label never crosses to the other class.
soft_label = torch.where(label > 0.5,
                        torch.clamp(label + positive_noise, 0.5, 1),
                        torch.clamp(label + negative_noise, 0, 0.5))


In [5]:
soft_label

tensor([0.9283, 0.1551, 0.9496, 0.0385, 0.7264, 0.0000, 0.8289, 0.0024, 0.9555,
        0.1157, 1.0000, 0.0000, 0.8345, 0.1431])

In [7]:
import health_multimodal.image
from health_multimodal.image.model.model import BaseImageModel
from health_multimodal.image.utils import ImageModelType
from health_multimodal.image.model.modules import MultiTaskModel
from health_multimodal.image.model.pretrained import get_biovil_t_image_encoder, get_biovil_image_encoder
model = get_biovil_t_image_encoder()

Using downloaded and verified file: /tmp/biovil_t_image_model_proj_size_128.pt


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [9]:
layers = list(model.children()) 

In [13]:
layers[-1]

MLP(
  (model): Sequential(
    (0): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))
  )
)

In [20]:
layers[1]

MultiImageEncoder(
  (encoder): ResNetHIML(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequenti