In [1]:
import copy
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import src.GLIP.maskrcnn_benchmark as maskrcnn_benchmark
import torch
import torch.utils.data as data
from sklearn.model_selection import train_test_split
from torchvision.ops import nms
from tqdm import tqdm

sys.modules['maskrcnn_benchmark'] = maskrcnn_benchmark

In [2]:
dataset = pickle.load(open('../data/dataset_for_segmentation.p','rb'))
dataset.head()

Unnamed: 0,idx,filename,caption,model,expr,conf,bbox,labels,image_id,caption_raw,caption_preprocessed,title_raw,title_preprocessed
0,1027,JOMU_32980_2k_324w.jpg,people buying sweets at the market.,GLIP,caption,"[tensor(0.8280), tensor(0.7924), tensor(0.6899...","[[tensor(178.8594), tensor(116.5122), tensor(2...","[people, people, people, people, people, peopl...",1488.0,people buying sweets at the market.,people buying sweets at the market.,"Saint Nicholas Festival Market, Place de Notre...","saint nicholas festival market, place de notre..."
1,183,CAPO_02480_2k_324w.jpg,a group of soldiers stand in front of a building.,GLIP,caption,"[tensor(0.8242), tensor(0.5884), tensor(0.5781...","[[tensor(0.8775), tensor(131.9302), tensor(81....","[soldiers, a building, a building, a building]",516.0,a group of soldiers stand in front of a building.,a group of soldiers stand in front of a building.,[Mobilization at Perolles in August 1914],[mobilization at perolles in august 1914]
2,543,JATH_26232_2k_324w.jpg,soldiers stand in front of a military vehicle.,GLIP,caption,"[tensor(0.7898), tensor(0.7305), tensor(0.6872...","[[tensor(97.0861), tensor(137.5495), tensor(14...","[soldiers, soldiers, a military vehicle, soldi...",950.0,soldiers stand in front of a military vehicle.,soldiers stand in front of a military vehicle.,"Additional service for women, Barracks de la P...","additional service for women, barracks de la p..."
3,430,JATH_10616_2k_324w.jpg,women walking down a street.,GLIP,caption,"[tensor(0.6900), tensor(0.6777), tensor(0.6676...","[[tensor(125.4207), tensor(111.6184), tensor(1...","[women, women, women, a street, women]",836.0,women walking down a street.,women walking down a street.,Procession on the route to the Alps during a w...,procession on the route to the alps during a w...
4,408,HAWI_01023_2k_324w.jpg,a photograph of a large tropical cyclone.,GLIP,caption,[tensor(0.7263)],"[[tensor(4.5861), tensor(13.1419), tensor(318....",[a large tropical cyclone],814.0,a photograph of a large tropical cyclone.,a a large tropical cyclone.,Tornado over Lake Morat,tornado over lake morat


In [3]:
pickle_dir = '../data/phrase_grounding_results/'
MDETR_caption = pickle.load(open(pickle_dir + 'MDETR_full_caption.p', 'rb'))
MDETR_title = pickle.load(open(pickle_dir + 'MDETR_full_title.p', 'rb'))
GLIP_caption = pickle.load(open(pickle_dir + 'GLIP_full_caption.p', 'rb'))
GLIP_title = pickle.load(open(pickle_dir + 'GLIP_full_title.p', 'rb'))
dataset_dict = pickle.load(open('../data/dataset_for_phrase_grounding/dataset.p', 'rb'))

In [4]:
def GLIP2MDETR(glip_array):
    mdetr_array = []
    for elem in glip_array:
        caption = [elem[1][k -1] if k < len(elem[1]) else elem[1][len(elem[1]) - 1] for k in elem[0].get_field('labels')]
        mdetr_array.append([elem[0].get_field('scores'), elem[0].bbox, caption])
    return mdetr_array
GLIP_caption = GLIP2MDETR(GLIP_caption)
GLIP_title = GLIP2MDETR(GLIP_title)

In [5]:
def det_nms(segmentation_array):
    segmentation_array_ = []
    for i, elem in enumerate(segmentation_array):
        segmentation_array_.append(list(elem))
    seg_filtered = copy.deepcopy(segmentation_array_)
    for index_, elem in enumerate(segmentation_array_):
        unique_caption = set(elem[2])
        if len(elem[2]) != 0:
            boolean_index = [[elem_ == cap for elem_ in elem[2]] for cap in list(unique_caption)]
            idx = [[i for i, x in enumerate(bool_idx) if x] for bool_idx in boolean_index]
            idx_to_keep = [nms(boxes=torch.index_select(elem[1], 0, torch.tensor(idx_)),
                               scores=torch.index_select(elem[0], 0, torch.tensor(idx_)), iou_threshold=0.2) for idx_ in
                           idx]
            scores = []
            boxes = []
            captions = []
            for idx_, idx_tokeep, caption in zip(idx, idx_to_keep, list(unique_caption)):
                scores += (elem[0][idx_][idx_tokeep])
                boxes += (elem[1][idx_][idx_tokeep])
                captions += (
                    [elem[2][i].removeprefix(' ') for i in torch.index_select(torch.tensor(idx_), 0, idx_tokeep)])

            seg_filtered[index_][0] = torch.stack(scores, dim=0)
            seg_filtered[index_][1] = torch.stack(boxes, dim=0)
            seg_filtered[index_][2] = captions

    return seg_filtered


def global_det_nms(segmentation_array):
    segmentation_array_ = []
    for i, elem in enumerate(segmentation_array):
        segmentation_array_.append(list(elem))
    seg_filtered = copy.deepcopy(segmentation_array_)
    for index_, elem in enumerate(segmentation_array_):
        if len(elem[2]) != 0:
            idx_to_keep = nms(boxes=elem[1], scores=elem[0], iou_threshold=0.9)
            scores = []
            boxes = []
            captions = []
            scores += (elem[0][idx_to_keep])
            boxes += (elem[1][idx_to_keep])
            captions += ([elem[2][i].removeprefix(' ') for i in idx_to_keep])

            seg_filtered[index_][0] = torch.stack(scores, dim=0)
            seg_filtered[index_][1] = torch.stack(boxes, dim=0)
            seg_filtered[index_][2] = captions

    return seg_filtered

In [6]:
MDETR_caption = global_det_nms(det_nms(MDETR_caption))
MDETR_title = global_det_nms(det_nms(MDETR_title))
GLIP_caption = global_det_nms(det_nms(GLIP_caption))
GLIP_title = global_det_nms(det_nms(GLIP_title))

In [7]:
MDETR_title = pd.DataFrame(MDETR_title).rename({0:'conf', 1:'bbox', 2:'expr'}, axis=1)
MDETR_caption = pd.DataFrame(MDETR_caption).rename({0:'conf', 1:'bbox', 2:'expr'}, axis=1)
GLIP_caption = pd.DataFrame(GLIP_caption).rename({0:'conf', 1:'bbox', 2:'expr'}, axis=1)
GLIP_title = pd.DataFrame(GLIP_title).rename({0:'conf', 1:'bbox', 2:'expr'}, axis=1)
MDETR_title.head()

Unnamed: 0,conf,bbox,expr
0,"[tensor(0.9756), tensor(0.9001), tensor(0.8900)]","[[tensor(65.0094), tensor(1.1163), tensor(324....","[the wall of father girard, inauguration plaqu..."
1,[tensor(0.9990)],"[[tensor(81.8896), tensor(160.1014), tensor(22...",[patient of dr xavier cuony city]
2,[tensor(0.9510)],"[[tensor(86.9352), tensor(87.6198), tensor(244...",[felsenegg]
3,"[tensor(0.9978), tensor(0.9940)]","[[tensor(39.9502), tensor(162.3252), tensor(27...","[his bike, male ( hermann nussbaumer]"
4,"[tensor(0.9960), tensor(0.9914), tensor(0.9859...","[[tensor(141.7642), tensor(84.6272), tensor(27...","[two women one, two women one, regional costum..."


In [8]:
# for each element in the dataset, we want to filter results that have confidence < 0.75
# import compress
from itertools import compress

def filter_results(elem):
    conf = elem['conf']
    bbox = elem['bbox']
    expr = elem['expr']
    keep = conf > 0.75
    conf = conf[keep]
    bbox = bbox[keep]
    expr = list(compress(expr, keep))

    return {'conf': conf, 'bbox': bbox, 'expr': expr}

def filter_results_df(df):
    new_df = df.copy()
    df = df.apply(filter_results, axis=1)
    new_df['conf'] = df.apply(lambda x: x['conf'])
    new_df['bbox'] = df.apply(lambda x: x['bbox'])
    new_df['expr'] = df.apply(lambda x: x['expr'])
    return new_df

MDETR_title = filter_results_df(MDETR_title)
MDETR_caption = filter_results_df(MDETR_caption)

In [9]:
def get_min_bbox(bbox):
    try:
        return int((bbox[:,2:] - bbox[:,:2]).prod(axis=1).min())
    except:
        return np.nan

def get_max_bbox(bbox):
    try:
        return int((bbox[:,2:] - bbox[:,:2]).prod(axis=1).max())
    except:
        return np.nan

def get_median_bbox(bbox):
    try:
        return int((bbox[:,2:] - bbox[:,:2]).prod(axis=1).median())
    except:
        return np.nan

MDETR_title['num_bbox'] = MDETR_title.conf.apply(lambda x: len(x))
MDETR_caption['num_bbox'] = MDETR_caption.conf.apply(lambda x: len(x))
GLIP_caption['num_bbox'] = GLIP_caption.conf.apply(lambda x: len(x))
GLIP_title['num_bbox'] = GLIP_title.conf.apply(lambda x: len(x))

MDETR_title['min_bbox_area'] = MDETR_title.bbox.apply(lambda x: get_min_bbox(x))
MDETR_caption['min_bbox_area'] = MDETR_caption.bbox.apply(lambda x: get_min_bbox(x))
GLIP_caption['min_bbox_area'] = GLIP_caption.bbox.apply(lambda x: get_min_bbox(x))
GLIP_title['min_bbox_area'] = GLIP_title.bbox.apply(lambda x: get_min_bbox(x))

MDETR_title['max_bbox_area'] = MDETR_title.bbox.apply(lambda x: get_max_bbox(x))
MDETR_caption['max_bbox_area'] = MDETR_caption.bbox.apply(lambda x: get_max_bbox(x))
GLIP_caption['max_bbox_area'] = GLIP_caption.bbox.apply(lambda x: get_max_bbox(x))
GLIP_title['max_bbox_area'] = GLIP_title.bbox.apply(lambda x: get_max_bbox(x))

MDETR_title['median_bbox_area'] = MDETR_title.bbox.apply(lambda x: get_median_bbox(x))
MDETR_caption['median_bbox_area'] = MDETR_caption.bbox.apply(lambda x: get_median_bbox(x))
GLIP_caption['median_bbox_area'] = GLIP_caption.bbox.apply(lambda x: get_median_bbox(x))
GLIP_title['median_bbox_area'] = GLIP_title.bbox.apply(lambda x: get_median_bbox(x))


MDETR_title.drop(columns=['conf', 'bbox', 'expr'], inplace=True)
MDETR_caption.drop(columns=['conf', 'bbox', 'expr'], inplace=True)
GLIP_caption.drop(columns=['conf', 'bbox', 'expr'], inplace=True)
GLIP_title.drop(columns=['conf', 'bbox', 'expr'], inplace=True)

In [10]:
dataset_dict = pd.DataFrame(dataset_dict)

In [11]:
dataset_dict['len_caption'] = dataset_dict.caption.apply(lambda x: len(x['raw'].split()) if x['raw'] is not None else 0)
dataset_dict['len_title'] = dataset_dict.title.apply(lambda x: len(x['raw'].split()) if x['raw'] is not None else 0)

In [12]:
dataset_dict = dataset_dict.drop(columns=['title', 'caption'])
dataset_dict.head()

Unnamed: 0,image_id,filename,len_caption,len_title
0,2,ALCU_00005_2k_324w.jpg,6,18
1,4,ALCU_00033_2k_324w.jpg,5,8
2,5,ALNU_00015_2k_324w.jpg,8,5
3,6,ALNU_00016_2k_324w.jpg,7,6
4,7,ALNU_00052_2k_324w.jpg,7,6


In [13]:
dataset_dict = pd.merge(dataset_dict, MDETR_title, left_index=True, right_index=True).rename({'num_bbox': 'num_bbox_MDETR_title', 'min_bbox_area': 'min_bbox_area_MDETR_title', 'max_bbox_area': 'max_bbox_area_MDETR_title', 'median_bbox_area': 'median_bbox_area_MDETR_title'}, axis=1)
dataset_dict = pd.merge(dataset_dict, MDETR_caption, left_index=True, right_index=True).rename({'num_bbox': 'num_bbox_MDETR_caption', 'min_bbox_area': 'min_bbox_area_MDETR_caption', 'max_bbox_area': 'max_bbox_area_MDETR_caption', 'median_bbox_area': 'median_bbox_area_MDETR_caption'}, axis=1)
dataset_dict = pd.merge(dataset_dict, GLIP_caption, left_index=True, right_index=True).rename({'num_bbox': 'num_bbox_GLIP_caption', 'min_bbox_area': 'min_bbox_area_GLIP_caption', 'max_bbox_area': 'max_bbox_area_GLIP_caption', 'median_bbox_area': 'median_bbox_area_GLIP_caption'}, axis=1)
dataset_dict = pd.merge(dataset_dict, GLIP_title, left_index=True, right_index=True).rename({'num_bbox': 'num_bbox_GLIP_title', 'min_bbox_area': 'min_bbox_area_GLIP_title', 'max_bbox_area': 'max_bbox_area_GLIP_title', 'median_bbox_area': 'median_bbox_area_GLIP_title'}, axis=1)
dataset_dict.head()

Unnamed: 0,image_id,filename,len_caption,len_title,num_bbox_MDETR_title,min_bbox_area_MDETR_title,max_bbox_area_MDETR_title,median_bbox_area_MDETR_title,num_bbox_MDETR_caption,min_bbox_area_MDETR_caption,max_bbox_area_MDETR_caption,median_bbox_area_MDETR_caption,num_bbox_GLIP_caption,min_bbox_area_GLIP_caption,max_bbox_area_GLIP_caption,median_bbox_area_GLIP_caption,num_bbox_GLIP_title,min_bbox_area_GLIP_title,max_bbox_area_GLIP_title,median_bbox_area_GLIP_title
0,2,ALCU_00005_2k_324w.jpg,6,18,3,708.0,85733.0,1182.0,3,6862.0,50244.0,42928.0,1,53361.0,53361.0,53361.0,1,2266.0,2266.0,2266.0
1,4,ALCU_00033_2k_324w.jpg,5,8,1,37941.0,37941.0,37941.0,2,39104.0,146906.0,39104.0,1,42747.0,42747.0,42747.0,1,42669.0,42669.0,42669.0
2,5,ALNU_00015_2k_324w.jpg,8,5,1,52499.0,52499.0,52499.0,1,50734.0,50734.0,50734.0,1,52736.0,52736.0,52736.0,1,53524.0,53524.0,53524.0
3,6,ALNU_00016_2k_324w.jpg,7,6,2,44221.0,62845.0,44221.0,2,41603.0,62280.0,41603.0,1,42702.0,42702.0,42702.0,2,41893.0,67260.0,41893.0
4,7,ALNU_00052_2k_324w.jpg,7,6,5,26689.0,67220.0,33665.0,2,68115.0,113862.0,68115.0,3,32610.0,117776.0,39382.0,3,33316.0,66190.0,39599.0


In [14]:
dataset.head()

Unnamed: 0,idx,filename,caption,model,expr,conf,bbox,labels,image_id,caption_raw,caption_preprocessed,title_raw,title_preprocessed
0,1027,JOMU_32980_2k_324w.jpg,people buying sweets at the market.,GLIP,caption,"[tensor(0.8280), tensor(0.7924), tensor(0.6899...","[[tensor(178.8594), tensor(116.5122), tensor(2...","[people, people, people, people, people, peopl...",1488.0,people buying sweets at the market.,people buying sweets at the market.,"Saint Nicholas Festival Market, Place de Notre...","saint nicholas festival market, place de notre..."
1,183,CAPO_02480_2k_324w.jpg,a group of soldiers stand in front of a building.,GLIP,caption,"[tensor(0.8242), tensor(0.5884), tensor(0.5781...","[[tensor(0.8775), tensor(131.9302), tensor(81....","[soldiers, a building, a building, a building]",516.0,a group of soldiers stand in front of a building.,a group of soldiers stand in front of a building.,[Mobilization at Perolles in August 1914],[mobilization at perolles in august 1914]
2,543,JATH_26232_2k_324w.jpg,soldiers stand in front of a military vehicle.,GLIP,caption,"[tensor(0.7898), tensor(0.7305), tensor(0.6872...","[[tensor(97.0861), tensor(137.5495), tensor(14...","[soldiers, soldiers, a military vehicle, soldi...",950.0,soldiers stand in front of a military vehicle.,soldiers stand in front of a military vehicle.,"Additional service for women, Barracks de la P...","additional service for women, barracks de la p..."
3,430,JATH_10616_2k_324w.jpg,women walking down a street.,GLIP,caption,"[tensor(0.6900), tensor(0.6777), tensor(0.6676...","[[tensor(125.4207), tensor(111.6184), tensor(1...","[women, women, women, a street, women]",836.0,women walking down a street.,women walking down a street.,Procession on the route to the Alps during a w...,procession on the route to the alps during a w...
4,408,HAWI_01023_2k_324w.jpg,a photograph of a large tropical cyclone.,GLIP,caption,[tensor(0.7263)],"[[tensor(4.5861), tensor(13.1419), tensor(318....",[a large tropical cyclone],814.0,a photograph of a large tropical cyclone.,a a large tropical cyclone.,Tornado over Lake Morat,tornado over lake morat


In [15]:
merged_dataset = pd.merge(dataset, dataset_dict.drop('image_id', axis=1), left_on='filename', right_on='filename', how='left')

In [18]:
((merged_dataset.model == 'GLIP') & (merged_dataset.expr==('caption'))).sum()

1130

In [19]:
((merged_dataset.model == 'GLIP') & (merged_dataset.expr==('title'))).sum()

224

In [20]:
((merged_dataset.model == 'MDETR') & (merged_dataset.expr==('caption'))).sum()

91

In [21]:
((merged_dataset.model == 'MDETR') & (merged_dataset.expr==('title'))).sum()

15

|         | MDETR | GLIP |
|---------|-------|------|
| Caption | 91    | 1130 |
| Title   | 15    | 224  |

In [24]:
merged_dataset['conf'] =  merged_dataset.conf.apply(lambda x: torch.tensor(x).numpy())
merged_dataset['bbox'] =  merged_dataset.bbox.apply(lambda x: torch.tensor(x).numpy())

  merged_dataset['conf'] =  merged_dataset.conf.apply(lambda x: torch.tensor(x).numpy())
  merged_dataset['bbox'] =  merged_dataset.bbox.apply(lambda x: torch.tensor(x).numpy())


In [30]:
def get_label(elem):
    if elem['model'] == 'MDETR':
        if elem['expr'] == 'caption':
            return 0
        else:
            return 1
    else:
        if elem['expr'] == 'caption':
            return 2
        else:
            return 3

merged_dataset['label'] = merged_dataset.apply(lambda x: get_label(x), axis=1)
merged_dataset[['label', 'model', 'expr']].sample(10)

Unnamed: 0,label,model,expr
1356,2,GLIP,caption
1176,2,GLIP,caption
1193,2,GLIP,caption
375,0,MDETR,caption
489,2,GLIP,caption
904,2,GLIP,caption
620,3,GLIP,title
1205,2,GLIP,caption
555,2,GLIP,caption
214,3,GLIP,title


In [31]:
merged_dataset.keys()
features = merged_dataset.dropna()[[ 'len_caption', 'len_title',
       'num_bbox_MDETR_title', 'min_bbox_area_MDETR_title',
       'max_bbox_area_MDETR_title', 'median_bbox_area_MDETR_title',
       'num_bbox_MDETR_caption', 'min_bbox_area_MDETR_caption',
       'max_bbox_area_MDETR_caption', 'median_bbox_area_MDETR_caption',
       'num_bbox_GLIP_caption', 'min_bbox_area_GLIP_caption',
       'max_bbox_area_GLIP_caption', 'median_bbox_area_GLIP_caption',
       'num_bbox_GLIP_title', 'min_bbox_area_GLIP_title',
       'max_bbox_area_GLIP_title', 'median_bbox_area_GLIP_title']]
labels = merged_dataset.dropna()[['label']]


In [43]:
labels

array([[2],
       [2],
       [2],
       ...,
       [2],
       [3],
       [2]])

In [52]:
labels = np.array(labels)
labels_one_hot = np.eye(4)[labels].reshape(-1, 4)
# Convert to numpy array
features = np.array(features)
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_one_hot, test_size = 0.25, random_state = 42)

In [53]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)


Training Features Shape: (820, 18)
Training Labels Shape: (820, 4)
Testing Features Shape: (274, 18)
Testing Labels Shape: (274, 4)


In [54]:
test_labels

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])

In [55]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels);

In [57]:
# get accuracy 
predictions = rf.predict(test_features)
predictions = np.argmax(predictions, axis=1)
gd = np.argmax(test_labels, axis=1)
print('Accuracy:', (predictions == gd).sum()/ len(gd))

Accuracy: 0.7664233576642335


In [59]:
gd

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 0, 3,
       2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 0, 3, 3, 2, 2, 2, 3, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 3, 3, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2,
       2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1,
       2, 3, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 3, 3, 2, 2, 3, 2, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 3, 3, 2, 0, 3, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       0, 2, 2, 2, 2, 2, 0, 3, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 0, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 0, 3, 2, 3, 1,
       2, 2, 0, 3, 0, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 0, 2, 2, 2,
       2, 2, 3, 2, 3, 3, 2, 3, 2, 2])

## With PyTorch

In [60]:
train_features = torch.tensor(train_features).float()
train_labels = torch.tensor(train_labels).float()

In [72]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class PyDataset(data.Dataset):

    def __init__(self, features, label):
        """
        Inputs:
            size - Number of data points we want to generate
            seed - The seed to use to create the PRNG state with which we want to generate the data points
            std - Standard deviation of the noise (see generate_continuous_xor function)
        """
        super().__init__()
        self.features = features
        self.label = label

    def __len__(self):
        # Number of data point we have. Alternatively self.data.shape[0], or self.label.shape[0]
        return len(self.label)

    def __getitem__(self, idx):
        # Return the idx-th data point of the dataset
        # If we have multiple things to return (data point and label), we can return them as tuple
        data_point = self.features[idx]
        data_label = self.label[idx]
        return data_point, data_label


def train_NN(model, train_loader, optimizer, loss_fn, device, epochs=10):
    """
    Inputs:
        model - The model we want to train
        train_loader - The data loader that will give us the data points
        optimizer - The optimizer we want to use to update the model parameters
        loss_fn - The loss function we want to use to compute the loss
        device - The device on which we want to train the model
    """
    model.train()
    pbar = tqdm(range(epochs))
    for epoch in pbar:
        loss_array = []
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()
            loss_array.append(loss.item())
        pbar.set_description(f"Epoch {epoch} - Loss: {np.mean(loss_array)}")

train_dataset = PyDataset(torch.tensor(train_features).float().cuda(), torch.tensor(train_labels).float().cuda())
train_data_loader = data.DataLoader(train_dataset, batch_size=8, shuffle=True)

classifier = nn.Sequential(
    nn.Linear(18, 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Linear(128, 4),
    nn.Softmax(dim=1)
)
classifier = classifier.cuda()

optimizer = optim.Adam(classifier.parameters(), lr=0.0001)
loss = nn.MSELoss()

train_NN(classifier, train_data_loader, optimizer, loss, device='cuda', epochs=2)

  train_dataset = PyDataset(torch.tensor(train_features).float().cuda(), torch.tensor(train_labels).float().cuda())
Epoch 1 - Loss: 0.4047330097087379: 100%|██████████| 2/2 [00:00<00:00,  5.36it/s] 


In [73]:
# eval
def eval_NN(model, test_loader, device):
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.argmax(dim=1, keepdim=True).view_as(pred)).sum().item()

    print(f'Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)')

test_dataset = PyDataset(torch.tensor(test_features).float().cuda(), torch.tensor(test_labels).float().cuda())
test_data_loader = data.DataLoader(test_dataset, batch_size=8, shuffle=True)

eval_NN(classifier, test_data_loader, device='cuda')

Accuracy: 41/274 (15%)


In [None]:
# save the results of the model
predictions = []
for batch in test_data_loader:
    data_input, labels = batch
    output = classifier(data_input.float())
    _, predicted = torch.max(output.data, 1)
    predictions.append(predicted)
    print(output)

In [190]:
import torch
import torch.nn as nn

# Define the model
class Classifier(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(Classifier, self).__init__()
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, output_size)
  
  def forward(self, x):
    x = self.fc1(x)
    x = torch.relu(x)
    x = self.fc2(x)
    return x

# Create an instance of the model
model = Classifier(19, 32, 4)
model.train()
# Define the loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
for epoch in range(400):
  # Forward pass
  output = model(torch.tensor(X_resampled).float())
  loss = loss_fn(output, torch.tensor(y_resampled).float())

  # Backward pass and optimization
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()


In [191]:
model.eval()

# Calculate the accuracy on the test dataset
with torch.no_grad():
  output = model(torch.tensor(X_resampled).float())
  predictions = output.argmax(dim=1)
  accuracy = (predictions == torch.tensor(y_resampled).float().argmax(dim=1)).float().mean()
  print('Test accuracy:', accuracy)

Test accuracy: tensor(0.2268)


In [181]:
model.eval()

# Calculate the accuracy on the test dataset
with torch.no_grad():
  output = model(torch.tensor(features).float())
  predictions = output.argmax(dim=1)
  accuracy = (predictions == torch.tensor(labels).float().argmax(dim=1)).float().mean()
  print('Test accuracy:', accuracy)

Test accuracy: tensor(0.4610)


In [185]:
(np.argmax(test_features, axis=1) == 0).sum() / len(test_features)

0.0821917808219178

## Dataset creation

In [84]:
df = merged_dataset

In [85]:
import pandas
import torch
df_temp = pd.DataFrame(columns=['label', 'caption', 'title'])
df_temp['caption'] = df.caption_raw.str.lower()
df_temp['title'] = df.title_raw.str.lower()
df_temp['label'] = df.caption == df.title_raw
df_temp['label'] = df_temp.label.astype(int)
# df_temp['label'] = df_temp.label.apply(lambda x: torch.tensor([1,0]) if x==0 else torch.tensor([0,1]))
df_temp['title'] = df_temp['title'].astype(str)
df_temp['caption'] = df_temp['caption'].astype(str)
df_temp = df_temp.reset_index().rename({'index':'idx'}, axis=1)
df_temp.head()

Unnamed: 0,idx,label,caption,title
0,0,0,people buying sweets at the market.,"saint nicholas festival market, place de notre..."
1,1,0,a group of soldiers stand in front of a building.,[mobilization at perolles in august 1914]
2,2,0,soldiers stand in front of a military vehicle.,"additional service for women, barracks de la p..."
3,3,0,women walking down a street.,procession on the route to the alps during a w...
4,4,0,a photograph of a large tropical cyclone.,tornado over lake morat


In [86]:
# swap caption and title for 50% of the data, switch label for them
df_temp2 = df_temp.copy()
df_temp2['caption'] = df_temp['title']
df_temp2['title'] = df_temp['caption']
df_temp2['label'] = df_temp2['label'].apply(lambda x: 1-x)
df_temp2.head()

df_temp = pd.concat([df_temp, df_temp2])
df_temp = df_temp.reset_index().rename({'index':'idx'}, axis=1)


In [87]:
df_temp

Unnamed: 0,idx,idx.1,label,caption,title
0,0,0,0,people buying sweets at the market.,"saint nicholas festival market, place de notre..."
1,1,1,0,a group of soldiers stand in front of a building.,[mobilization at perolles in august 1914]
2,2,2,0,soldiers stand in front of a military vehicle.,"additional service for women, barracks de la p..."
3,3,3,0,women walking down a street.,procession on the route to the alps during a w...
4,4,4,0,a photograph of a large tropical cyclone.,tornado over lake morat
...,...,...,...,...,...
2915,1455,1455,1,"celebration of the fête-dieu, the rest of the ...",monarch and noble person celebrate the wedding...
2916,1456,1456,1,"cortège de la saint-nicolas, torchbearers, fri...",a group of men lighting a fire in the dark.
2917,1457,1457,0,woman in a boat on lake morat,portrait of a young woman.
2918,1458,1458,1,"waves breaking on the jetty and rowing boat, l...",fishing boat on the beach.


In [198]:
train_df = pd.DataFrame(columns=['selected', 'expression'])
temp_train_df = pd.DataFrame(columns=['selected', 'expression'])
temp_train_df['expression'] = df['caption_raw']
temp_train_df['selected'] = df['caption'] == df['caption_raw']
train_df = train_df.append(temp_train_df)

temp_train_df = pd.DataFrame(columns=['selected', 'expression'])
temp_train_df['expression'] = df['title_raw']
temp_train_df['selected'] = df['caption'] == df['title_raw']
train_df = train_df.append(temp_train_df)
train_df.head()

  train_df = train_df.append(temp_train_df)
  train_df = train_df.append(temp_train_df)
  train_df = train_df.append(temp_train_df)


Unnamed: 0,selected,expression
0,True,people buying sweets at the market.
1,True,a group of soldiers stand in front of a building.
2,True,soldiers stand in front of a military vehicle.
3,True,women walking down a street.
4,True,a photograph of a large tropical cyclone.


In [201]:
pickle.dump(train_df, open('train_df.pkl', 'wb'))

# Get info from GLIP MDETR

In [None]:
import seaborn as sns

In [None]:
df = merged_dataset

In [None]:
def show_dist(df, ax, columns, feature, model_names, log_scale, discrete=False):
    temp_df = pd.DataFrame(columns=[feature, 'model'])
    for name, column in zip(model_names, columns):
        for column_ in column:
            temp_df = temp_df.append(pd.DataFrame({feature: df[column_], 'model': name}))
    temp_df.dropna(inplace=True)

    temp_df.index = range(len(temp_df))
    # distribution of the feature for each model in percentage
    return sns.histplot(temp_df, ax=ax, x=feature, hue='model', fill=True, common_norm=False, stat='percent', log_scale=log_scale, discrete=discrete)


In [None]:
columns = [['median_bbox_area_GLIP_caption', 'median_bbox_area_GLIP_title'], ['median_bbox_area_MDETR_caption', 'median_bbox_area_MDETR_title']]
model_names = ['GLIP', 'MDETR']
fig, ax = plt.subplots(1,2, figsize=(12, 5))
show_dist(df, ax[1], columns, 'median_bbox_area', model_names, log_scale=True)
ax[1].set_ylabel('Percentage')
ax[1].set_xlabel('Median Bounding Box Area')
ax[1].set_title('Distribution of Median Bounding Box Area')


# do the same as above but for num_bbox
columns = [['num_bbox_GLIP_caption', 'num_bbox_GLIP_title'], ['num_bbox_MDETR_caption', 'num_bbox_MDETR_title']]
model_names = ['GLIP', 'MDETR']
filterd_df = df.copy()
for column in columns:
    for column_ in column:
        filterd_df = filterd_df.loc[(filterd_df[column_] < 15) & (filterd_df[column_] > 0)]

show_dist(filterd_df,ax[0], columns, 'num_bbox', model_names, log_scale=False, discrete=True)
ax[0].set_xlabel('Number of Bounding Boxes')
ax[0].set_ylabel('Percentage')
ax[0].set_title('Distribution of Number of Bounding Boxes')

plt.savefig('../docs/assets/num_bbox_median_bbox_area.png', dpi=300)
plt.show()

# show_dist_plotly(df, columns, 'median_bbox_area', model_names)

In [None]:
df.loc[df.num_bbox_GLIP_caption > 20] 

In [None]:
temp_df.index = np.arange(len(temp_df))

In [None]:
sns.displot(temp_df, x="mean_area", hue="model", kind="kde", fill=True)

In [None]:
temp_df.loc[temp_df['mean_area'] < 0]