In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline

## Loading the dataset and define the test set relative size

In [2]:
images_path = "/kaggle/input/kittiroadsegmentation/training/image_2"
masks_path = "/kaggle/input/kittiroadsegmentation/training/gt_image_2"

test_set_fraction = 0.20 # 20% of images in the test set
train_set_fraction = 0.80 # 80% of images in the train set

In [23]:

import glob
train_images_filenames = (glob.glob(f"{images_path}/*.png"))[:250]
print(len(train_images_filenames))

250


## Set random seed

In [3]:
np.random.seed(0)

## Load the images and masks

In [4]:
train_set_size = int(len(os.listdir(images_path)) * train_set_fraction)
test_set_size = int(len(os.listdir(masks_path)) * test_set_fraction)

print(f"Number of images in the train set = {train_set_size} and number of images in the test set = {test_set_size}")

Number of images in the train set = 231 and number of images in the test set = 76


In [5]:
def load_images(path, mask = False, verbose = False):
    result = []
    images = os.listdir(path)
    
    if mask is True:
        images = [entry for entry in images if "road" in entry]
    
    images = sorted(images)
    if verbose is True:
        print(f"Processing {len(images)} images")
    
    
    for img in images:
        im = Image.open(os.path.join(path, img))
        resized_im = im.resize((128, 128))
        res = np.asarray(resized_im)
        result.append(res)
    
    return np.array(result)

In [6]:
data = load_images(images_path, mask = False, verbose = True)

Processing 289 images


In [7]:
true_mask = load_images(masks_path, mask = True, verbose = True)

Processing 289 images


## Convert masks to binary format (Road and no road)

In [8]:
road_label = np.array([255, 0, 255])

In [9]:
def convert_to_binary_mask(true_mask):
    num_masks = len(true_mask)
    for i in range(num_masks):
        cond = np.all(true_mask[i] == road_label, axis = 2)
        true_mask[i] = true_mask[i] * cond[..., np.newaxis]

In [10]:
convert_to_binary_mask(true_mask)

## Preprocessing of images and masks

In [11]:
true_mask=np.dot(true_mask[..., :3], [0.2989, 0.5870, 0.1140])
true_mask=np.expand_dims(true_mask, axis=-1)

In [12]:
true_mask[true_mask != 0.0]=1.0

In [13]:
data  = data / 255.0

In [14]:
print(f"Shape of data = {data.shape} and shape of masks = {true_mask.shape}")

Shape of data = (289, 128, 128, 3) and shape of masks = (289, 128, 128, 1)


In [15]:
data=data.astype(np.float32)
true_mask=true_mask.astype(np.float32)

## Saving the numpy arrays as `.npy` files

In [16]:
np.save('images.npy', data)
np.save('binary_masks.npy', true_mask)

## Load BLIP Model

In [17]:
from PIL import Image

def load_image(file_name):
    im = Image.open(file_name).convert("RGB")
    return im

In [18]:
!pip install salesforce-lavis

Collecting salesforce-lavis
  Downloading salesforce_lavis-1.0.2-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting contexttimer (from salesforce-lavis)
  Downloading contexttimer-0.3.3.tar.gz (4.9 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting decord (from salesforce-lavis)
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting einops>=0.4.1 (from salesforce-lavis)
  Obtaining dependency information for einops>=0.4.1 from https://files.pythonhosted.org/packages/29/0b/2d1c0ebfd092e25935b86509a9a817159212d82aa43d7fb07eca4eeff2c2/einops-0.7.0-py3-none-any.whl.metadata
  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)
Collecting fairscale==0.4.4 (from salesforce

In [19]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from lavis.models import load_model_and_preprocess

model, vis_processors, txt_processors = load_model_and_preprocess(name="blip_feature_extractor", model_type="base", is_eval=True, device=device)



Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

100%|██████████| 1.97G/1.97G [00:08<00:00, 242MB/s]


In [20]:
def get_blip_embeddings(raw_image):
    global model, vis_processors, device
    image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    sample = {"image": image, "text_input": None}
    features_image = model.extract_features(sample, mode="image")
    return features_image.image_embeds

In [24]:
from tqdm import tqdm

all_blip_embeddings = []

for file_name in tqdm(train_images_filenames):
    raw_image = load_image(file_name)
    embeds = get_blip_embeddings(raw_image)
    all_blip_embeddings.append(embeds)

100%|██████████| 250/250 [00:11<00:00, 22.31it/s]


In [25]:
sum_similarities = 0.0

N = len(all_blip_embeddings)
# all_blip_embeddings = np.array(all_blip_embeddings)

for i in tqdm(range(0, N - 1)):
    for j in range(i+1, N):
        u_x = all_blip_embeddings[i]
        u_y = all_blip_embeddings[j]
        sim = u_x[:,0,:] @ u_y[:,0,:].t()
        sum_similarities += float(sim[0][0])

100%|██████████| 249/249 [00:01<00:00, 142.07it/s]


In [27]:
sum_similarities

7493446.417144775

In [28]:
den = (N*(N-1))/2

In [30]:
avg = sum_similarities/den

In [31]:
avg

240.75329854280403