In [1]:
from model import GeoCLIP
from model import ImageEncoder
from model import LocationEncoder
from train import train
from train import dataloader
import os
from torch import nn
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def haversine_distance(lon1, lat1, lon2, lat2):
    '''Find distance between locations in meters'''

    R = 6371000 # radius of Earth in meters
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = (np.sin(delta_phi / 2))**2 + np.cos(phi1) * np.cos(phi2) * (np.sin(delta_lambda / 2))**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

In [3]:
df = pd.read_csv("/workspace/mappilary_street_level/train_val/amsterdam/query/raw.csv")
df_no_panorama = df[~df["pano"]]

In [4]:
df_distance = df_no_panorama[["lat", "lon"]].values
distances_map = dict()
min_distance = 100
start_coords = df_distance[0]
for i in range(1, len(df_distance)):
    dist = haversine_distance(start_coords[1], start_coords[0], df_distance[i][1], df_distance[i][0])
    if dist >= min_distance:
        start_coords = df_distance[i]
        distances_map[i] = dist

In [21]:
selected_images_no_jpg = df_no_panorama.iloc[list(distances_map.keys())].key.values
selected_images = [f"{img}.jpg" for img in selected_images_no_jpg]
gps_coords = list(distances_map.keys())

In [22]:
class AmsterdamData(torch.utils.data.Dataset):
    def __init__(self, root: str, selected_images: np.ndarray, gps_coords: np.ndarray, transform=None):
        self.root = root
        self.transform = transform
        self.images = selected_images
        self.coordinates = gps_coords

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        gps = self.coordinates[idx]
        img = Image.open(os.path.join(self.root, self.images[idx]))
        if self.transform:
            img = self.transform(img)
        return img

In [7]:
def device_as(t1, t2):
   """
   Moves t1 to the device of t2
   """
   return t1.to(t2.device)

class SimCLRLoss(nn.Module):

    def __init__(self, batch_size, temperature):
        self.batch_size = batch_size
        self.temperature = temperature
        self.mask = (~torch.eye(batch_size * 2, batch_size * 2, dtype=bool)).float()


    def calc_similarity_batch(self, a, b):
       representations = torch.cat([a, b], dim=0)
       return F.cosine_similarity(representations.unsqueeze(1), representations.unsqueeze(0), dim=2)

        
    def forward(self, proj_1, proj_2):
        """
        proj_1 and proj_2 are batched embeddings [batch, embedding_dim]
        where corresponding indices are pairs
        z_i, z_j in the SimCLR paper
        """
        batch_size = proj_1.shape[0]
        z_i = F.normalize(proj_1, p=2, dim=1)
        z_j = F.normalize(proj_2, p=2, dim=1)

        similarity_matrix = self.calc_similarity_batch(z_i, z_j)

        # Postive similarities are on the off-diagonals
        sim_ij = torch.diag(similarity_matrix, batch_size)
        sim_ji = torch.diag(similarity_matrix, -batch_size)

        positives = torch.cat([sim_ij, sim_ji], dim=0)

        nominator = torch.exp(positives / self.temperature)

        # Mask out the main diagonal and calculate the softmax
        denominator = (self.mask, similarity_matrix) * torch.exp(similarity_matrix / self.temperature)

        all_losses = -torch.log(nominator / torch.sum(denominator, dim=1))
        loss = torch.sum(all_losses) / (2 * self.batch_size)
        return loss

In [None]:
class BestModel(nn.Module):
    def __init__(self, image_encoder, location_encoder):
        super(BestModel, self).__init__()
        self.Geoclip = GeoCLIP()
        for param in self.Geoclip.parameters():
            param.requires_grad = False
        self.fc = nn.Linear(512, 512)

    def forward(self, image, location):
        x = self.Geoclip(image, location)
        return self.fc(x)



In [8]:
#loader = dataloader.GeoDataLoader("data/geojsons", "data/images")
geo_clip = GeoCLIP()
#train(train_dataloader=loader, model=geo_clip)

#geo_clip.save("model")

config.json: 100%|██████████| 4.52k/4.52k [00:00<00:00, 20.4MB/s]
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.
model.safetensors: 100%|██████████| 1.71G/1.71G [02:23<00:00, 11.9MB/s]
preprocessor_config.json: 100%|██████████| 316/316 [00:00<00:00, 2.12MB/s]
tokenizer_config.json: 100%|██████████| 905/905 [00:00<00:00, 6.85MB/s]
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict`

In [9]:
geo_clip.predict("/workspace/geoclip/images/Kauai.png", 10)

(tensor([[  22.1980, -159.6219],
         [  22.1785, -159.6501],
         [  22.1759, -159.6542],
         [  22.1751, -159.6559],
         [  22.1502, -159.6636],
         [  22.2178, -159.5888],
         [  21.4986, -158.1512],
         [  22.2207, -159.5827],
         [  21.3382, -157.8054],
         [  20.1172, -155.5842]]),
 tensor([0.0731, 0.0704, 0.0679, 0.0669, 0.0527, 0.0454, 0.0370, 0.0314, 0.0293,
         0.0288]))

In [10]:
def img_train_transform():
    train_transform_list = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.RandomApply([transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)], p=0.8),
        transforms.RandomGrayscale(p=0.2),
        transforms.PILToTensor(),
        transforms.ConvertImageDtype(torch.float),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    return train_transform_list

In [23]:
optim = torch.optim.SGD(geo_clip.parameters(), lr=0.1)
criterion = SimCLRLoss(10, 0.1)
EPOCHS = 2
BATCH_SIZE = 2
scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=10, gamma=0.1)
dataset = AmsterdamData(root="/workspace/mappilary_street_level/train_val/amsterdam/query/images", selected_images=selected_images, gps_coords=gps_coords, transform=img_train_transform)
loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [25]:
data = iter(loader)

In [27]:
a, b = data

  0%|          | 0/185 [00:29<?, ?it/s]


TypeError: img_train_transform() takes 0 positional arguments but 1 was given

In [24]:
train(loader, geo_clip, criterion, optim, scheduler, epoch=EPOCHS, batch_size=BATCH_SIZE, device = "gpu")

Starting Epoch 2




AttributeError: 'range' object has no attribute 'to'