In [2]:
import torch, torchvision
import matplotlib.pyplot as plt
import json
import cv2
import numpy as np
from copy import deepcopy
import os
from getpass import getpass
import urllib

In [3]:
# Report whether PyTorch can see a CUDA device (this run printed False).
torch.cuda.is_available()

False

In [4]:
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.structures.image_list import ImageList
from detectron2.data import transforms as T
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputs
from detectron2.structures.boxes import Boxes
from detectron2.layers import nms
from detectron2 import model_zoo
from detectron2.config import get_cfg

In [6]:
def bordered_resize(img, scale, center=None, border_size=None, border_color=None):
    """Scale ``img`` about an anchor point and fill uncovered pixels with a border color.

    Args:
        img: Image array; ``img.shape[0]`` is read as the height and
            ``img.shape[1]`` as the width.
        scale: Scale factor passed to ``cv2.getRotationMatrix2D`` (the
            rotation angle is fixed at 0).
        center: ``(x, y)`` anchor of the scaling. Defaults to
            ``(width, height)``.
        border_size: ``(width, height)`` of the output image. Defaults to the
            input size.
        border_color: Fill value for pixels outside the warped image.
            Defaults to ``(255, 255, 255)``.

    Returns:
        The array produced by ``cv2.warpAffine``.
    """
    height, width = img.shape[0], img.shape[1]

    # Identity checks instead of `== None`: comparing an array-valued argument
    # with `==` yields an elementwise result that cannot be used in an `if`.
    if center is None:
        # NOTE(review): (width, height) is the bottom-right corner rather than
        # the middle of the image - confirm this default is intended.
        center = (int(width), int(height))
    if border_size is None:
        border_size = (width, height)
    if border_color is None:
        border_color = (255, 255, 255)

    affine = cv2.getRotationMatrix2D(center, 0, scale)
    return cv2.warpAffine(img, affine, border_size, borderValue=border_color)

In [7]:
cfg_path = "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml"

def load_config_and_model_weights(cfg_path, device="cpu", score_thresh=0.5):
    """Create a Detectron2 config for a model-zoo entry and point it at the zoo checkpoint.

    Args:
        cfg_path: Model-zoo config path (for example the Mask R-CNN YAML above).
        device: Value stored in ``cfg.MODEL.DEVICE``. Defaults to ``"cpu"``,
            the value this function used to hard-code.
        score_thresh: Value stored in ``cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST``.
            Defaults to ``0.5``, the previously hard-coded threshold.

    Returns:
        The populated config object.
    """
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(cfg_path))

    # ROI heads score threshold
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = score_thresh

    # Device is a parameter now, so no line needs to be commented out to switch.
    cfg.MODEL.DEVICE = device

    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(cfg_path)

    return cfg

cfg = load_config_and_model_weights(cfg_path)

In [8]:
def get_model(cfg):
    """Build the model described by ``cfg``, load ``cfg.MODEL.WEIGHTS`` and return it in eval mode."""
    detector = build_model(cfg)

    # Restore the checkpoint referenced by the config into the fresh model.
    weight_loader = DetectionCheckpointer(detector)
    weight_loader.load(cfg.MODEL.WEIGHTS)

    # Inference behaviour only.
    detector.eval()
    return detector

model = get_model(cfg)

The checkpoint state_dict contains keys that are not used by the model:
  [35mproposal_generator.anchor_generator.cell_anchors.{0, 1, 2, 3, 4}[0m


In [9]:
def prepare_image_inputs(cfg, img_list, size_divisibility=None):
    """Resize, tensorise and normalise images, then batch them into an ``ImageList``.

    Args:
        cfg: Config providing ``INPUT.MIN_SIZE_TEST``, ``INPUT.MAX_SIZE_TEST``,
            ``MODEL.PIXEL_MEAN`` and ``MODEL.PIXEL_STD``.
        img_list: Images as H x W x C arrays (they are transposed to C x H x W).
        size_divisibility: Padding divisibility handed to
            ``ImageList.from_tensors``. When omitted, the value is read from
            the notebook-level ``model.backbone.size_divisibility``, which is
            what this function always did before the parameter existed.

    Returns:
        ``(images, batched_inputs)``: the batched ``ImageList`` and one dict
        per image with keys ``"image"``, ``"height"`` and ``"width"``. The
        height and width are those of the *resized* image, because the dicts
        are built after the resize step.
    """
    # Resize according to the configuration.
    resize = T.ResizeShortestEdge(
        [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
    )
    resized = [resize.get_transform(img).apply_image(img) for img in img_list]

    # Convert to C,H,W float tensors and record the (resized) spatial size.
    batched_inputs = [
        {
            "image": torch.Tensor(img.astype("float32").transpose(2, 0, 1)),
            "height": img.shape[0],
            "width": img.shape[1],
        }
        for img in resized
    ]

    # Per-channel normalisation with the statistics from the config.
    num_channels = len(cfg.MODEL.PIXEL_MEAN)
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1)
    normalized = [(entry["image"] - pixel_mean) / pixel_std for entry in batched_inputs]

    if size_divisibility is None:
        # Legacy behaviour: depend on the global detector. Passing the value
        # explicitly avoids breaking when `model` is rebound to something else.
        size_divisibility = model.backbone.size_divisibility

    images = ImageList.from_tensors(normalized, size_divisibility)

    return images, batched_inputs

In [10]:
def get_features(model, images):
    """Run ``model.backbone`` on ``images.tensor`` and return whatever it produces."""
    return model.backbone(images.tensor)

def get_proposals(model, images, features):
    """Call ``model.proposal_generator`` and return the first item of its two-item result."""
    generated, _unused = model.proposal_generator(images, features)
    return generated

In [11]:
def get_box_features(model, features, proposals):
    """Pool per-proposal features and run them through the box head up to ``fc2``.

    Args:
        model: Object exposing ``roi_heads.box_pooler`` and a
            ``roi_heads.box_head`` with ``flatten``, ``fc1``, ``fc_relu1`` and
            ``fc2`` callables.
        features: Mapping that contains the keys ``'p2'``..``'p5'``.
        proposals: One entry per image, each with a ``proposal_boxes`` attribute.

    Returns:
        ``(box_features, features_list)`` where ``box_features`` has shape
        ``(len(proposals), boxes_per_image, feature_dim)`` and
        ``features_list`` is the list of the four selected feature maps.
    """
    features_list = [features[level] for level in ('p2', 'p3', 'p4', 'p5')]
    box_head = model.roi_heads.box_head

    box_features = model.roi_heads.box_pooler(
        features_list, [p.proposal_boxes for p in proposals]
    )
    box_features = box_head.flatten(box_features)
    box_features = box_head.fc1(box_features)
    box_features = box_head.fc_relu1(box_features)
    box_features = box_head.fc2(box_features)

    # Derive the layout from the data instead of assuming 1 image x 1000
    # proposals x 1024 features. The reshape still requires every image to
    # contribute the same number of proposals.
    box_features = box_features.reshape(len(proposals), -1, box_features.shape[-1])
    return box_features, features_list

In [12]:
def get_prediction_logits(model, features_list, proposals):
    """Pool features for the proposal boxes, apply the box head, then the box predictor.

    Returns the two values produced by ``model.roi_heads.box_predictor``:
    the class logits and the proposal deltas.
    """
    roi_heads = model.roi_heads
    pooled = roi_heads.box_pooler(
        features_list, [candidate.proposal_boxes for candidate in proposals]
    )
    head_output = roi_heads.box_head(pooled)
    class_logits, proposal_deltas = roi_heads.box_predictor(head_output)
    return class_logits, proposal_deltas

In [13]:
def get_box_scores(cfg, pred_class_logits, pred_proposal_deltas, proposals=None):
    """Turn raw box-head predictions into boxes and class probabilities.

    Args:
        cfg: Config providing ``MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS`` and
            ``MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA``.
        pred_class_logits: Class logits from the box predictor.
        pred_proposal_deltas: Box regression deltas from the box predictor.
        proposals: Proposals the predictions were computed for. When omitted,
            the notebook-level variable ``proposals`` is used, which is what
            the original three-argument form silently relied on.

    Returns:
        ``(boxes, scores, image_shapes)`` as reported by ``FastRCNNOutputs``
        (``predict_boxes()``, ``predict_probs()`` and ``image_shapes``).
    """
    if proposals is None:
        # Legacy call style: fall back to the global. Prefer passing the
        # proposals explicitly so the result does not depend on kernel state.
        proposals = globals()["proposals"]

    box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
    smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA

    outputs = FastRCNNOutputs(
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta,
    )

    boxes = outputs.predict_boxes()
    scores = outputs.predict_probs()
    image_shapes = outputs.image_shapes

    return boxes, scores, image_shapes

In [14]:
def get_output_boxes(boxes, batched_inputs, image_size):
    """Rescale predicted boxes to the requested output size and clip them to it.

    Args:
        boxes: Tensor of box coordinates for one image; it is viewed as rows
            of four values.
        batched_inputs: The input dict of a single image; its ``"height"`` and
            ``"width"`` entries define the output size.
        image_size: ``(height, width)`` of the frame the boxes are currently
            expressed in.

    Returns:
        A ``Boxes`` object holding the rescaled, clipped boxes.
    """
    out_height, out_width = batched_inputs["height"], batched_inputs["width"]
    scale_x, scale_y = out_width / image_size[1], out_height / image_size[0]

    output_boxes = Boxes(boxes.reshape(-1, 4).clone())
    output_boxes.scale(scale_x, scale_y)
    # After scaling, the boxes are expressed in the output frame, so that is
    # the frame to clip against. Clipping against `image_size` is only
    # equivalent when both sizes coincide.
    output_boxes.clip((out_height, out_width))

    return output_boxes

In [15]:
def select_boxes(cfg, output_boxes, scores):
    """Run per-class NMS and keep boxes whose best surviving class score passes the threshold.

    Args:
        cfg: Config providing ``MODEL.ROI_HEADS.SCORE_THRESH_TEST`` and
            ``MODEL.ROI_HEADS.NMS_THRESH_TEST``.
        output_boxes: ``Boxes`` whose tensor holds, for every proposal, one
            box per foreground class.
        scores: Class probabilities with one row per proposal. Detectron2
            places the background class in the *last* column.

    Returns:
        ``(keep_boxes, max_conf)``: indices of the proposals that pass the
        score threshold, and for every proposal the highest foreground score
        that survived NMS (0 if none did).
    """
    test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
    test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST

    cls_prob = scores.detach()
    num_boxes = cls_prob.shape[0]
    # Shapes are derived from the data instead of assuming 1000 proposals and 80 classes.
    cls_boxes = output_boxes.tensor.detach().reshape(num_boxes, -1, 4)
    num_classes = cls_boxes.shape[1]

    max_conf = torch.zeros(num_boxes)
    for cls_ind in range(num_classes):
        # Column `cls_ind` pairs with box slot `cls_ind`. Offsetting the score
        # column by one would skip class 0 and score the background column
        # against the last class's boxes.
        cls_scores = cls_prob[:, cls_ind]
        det_boxes = cls_boxes[:, cls_ind, :]
        keep = nms(det_boxes, cls_scores, test_nms_thresh)
        max_conf[keep] = torch.where(
            cls_scores[keep] > max_conf[keep], cls_scores[keep], max_conf[keep]
        )

    keep_boxes = torch.where(max_conf >= test_score_thresh)[0]
    return keep_boxes, max_conf

In [16]:
MIN_BOXES=10
MAX_BOXES=100
def filter_boxes(keep_boxes, max_conf, min_boxes, max_boxes):
    """Force the number of selected boxes into ``[min_boxes, max_boxes]``.

    Args:
        keep_boxes: Indices selected so far; only its length is inspected
            unless it is returned unchanged.
        max_conf: Per-box confidence (CPU torch tensor or array-like).
        min_boxes: Lower bound on the number of boxes returned.
        max_boxes: Upper bound on the number of boxes returned.

    Returns:
        ``keep_boxes`` itself when its length is already within bounds;
        otherwise a NumPy index array with the ``min_boxes`` (too few) or
        ``max_boxes`` (too many) highest-confidence boxes, best first.
    """
    num_kept = len(keep_boxes)
    if num_kept < min_boxes:
        limit = min_boxes
    elif num_kept > max_boxes:
        limit = max_boxes
    else:
        return keep_boxes

    # Convert explicitly rather than calling np.argsort on a tensor and then
    # `.numpy()` on the result, which depends on NumPy wrapping the output
    # back into a tensor and fails for plain array input.
    ranked = np.argsort(np.asarray(max_conf))[::-1]
    # Copy so the result is a contiguous array rather than a reversed view.
    return ranked[:limit].copy()

In [17]:
def get_visual_embeds(box_features, keep_boxes):
    """Select the rows of ``box_features`` named by ``keep_boxes``.

    Args:
        box_features: Tensor with one row per candidate box.
        keep_boxes: Row indices, either a NumPy array or a torch tensor.

    Returns:
        The selected rows of ``box_features``.
    """
    if isinstance(keep_boxes, np.ndarray):
        # Index arrays may be reversed views; copy to get a plain contiguous array.
        keep_boxes = keep_boxes.copy()
    # Tensors have no `.copy()` and can index directly.
    return box_features[keep_boxes]

In [18]:
# Names of the images to process. The directory literal must match the
# "Data/img/" prefix used when the files are read; trailing spaces in the
# path only resolve on platforms that strip them.
file_names = os.listdir("Data/img")

In [19]:
# Number of directory entries collected in file_names.
len(file_names)

10000

In [20]:
# Bounds on how many region embeddings are kept per image; constant for the whole run.
MIN_BOXES=10
MAX_BOXES=100

# Inference only: no_grad avoids building an autograd graph for every image.
with torch.no_grad():
  for file_name in file_names:
    # cv2.imread yields an 8-bit, 3-channel BGR array. Reading PNGs with
    # plt.imread instead gives floats in [0, 1], which does not match the
    # 0-255 pixel statistics the detector normalises with.
    image_bgr = cv2.imread("Data/img/"+file_name)
    if image_bgr is None:
      raise FileNotFoundError("Could not read image: Data/img/"+file_name)
    image_bgr = cv2.resize(image_bgr, (365, 252))
    img_list = [image_bgr]

    # Detector forward pass, stage by stage.
    images, batched_inputs = prepare_image_inputs(cfg, img_list)
    features = get_features(model, images)
    proposals = get_proposals(model, images, features)
    box_features, features_list = get_box_features(model, features, proposals)
    pred_class_logits, pred_proposal_deltas = get_prediction_logits(model, features_list, proposals)
    boxes, scores, image_shapes = get_box_scores(cfg, pred_class_logits, pred_proposal_deltas)
    output_boxes = [get_output_boxes(boxes[i], batched_inputs[i], proposals[i].image_size) for i in range(len(proposals))]

    # Per-image box selection, then clamp the count to [MIN_BOXES, MAX_BOXES].
    selections = [select_boxes(cfg, output_boxes[i], scores[i]) for i in range(len(scores))]
    keep_boxes = [filter_boxes(keep_box, mx_conf, MIN_BOXES, MAX_BOXES) for keep_box, mx_conf in selections]
    visual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]

    # One image per batch, so only entry 0 exists; np.save appends ".npy".
    np.save("Emb_feature/test_features/"+file_name[:-4],visual_embeds[0].detach().numpy())

  max_size = (max_size + (stride - 1)) // stride * stride
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [3]:
# Spot check: load one saved per-image feature file.
a = np.load('Emb_feature/test_features/01235.npy')


In [4]:
# Shape of the loaded feature array (printed: (100, 1024)).
np.shape(a)

(100, 1024)

In [24]:
# List the GPUs visible to TensorFlow.
# NOTE(review): `tf` is only imported in a later cell (`import tensorflow as tf`),
# so this cell depends on out-of-order execution.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
# Names of the saved feature files, in the order the directory listing returns them.
file_names = [entry for entry in os.listdir("Emb_feature/test_features")]

In [7]:
# Number of feature files found.
len(file_names)

10000

In [None]:
# import glob
# npfiles = glob.glob("Emb_feature/test_features/*.npy")
# npfiles.sort()
# for npfile in npfiles:

In [8]:
path = 'Emb_feature/test_features/'
# Load every saved feature array, in directory-listing order.
trainImages = [np.load(path + entry) for entry in os.listdir(path)]

In [5]:
# Data
# Load the annotation table; the preview below shows the columns
# id, img, label, text, image and gen_caption.
# NOTE(review): pandas is imported here, mid-notebook, rather than in the import cell at the top.
import pandas as pd
FMS = pd.read_csv("FMS_final.csv")
FMS.head()

Unnamed: 0,id,img,label,text,image,gen_caption
0,42953,img/42953.png,0,its their character not their color that matters,42953.png,man in black and white cap is holding up sign .
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...,23058.png,man wearing black shirt and white shirt is sta...
2,13894,img/13894.png,0,putting bows on your pet,13894.png,small child is jumping over the snow .
3,37408,img/37408.png,0,i love everything and everybody! except for sq...,37408.png,black dog with blue collar is walking through ...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h...",82403.png,two men are standing in front of white building .


In [6]:
# Rows x columns of the annotation table (printed: (8313, 6)).
np.shape(FMS)

(8313, 6)

In [7]:
# For every saved feature file, look up the matching FMS row(s) by numeric id
# and collect the combined caption/text string(s) and the label(s).
text_train, label_train = [], []
for file_name in os.listdir("Emb_feature/test_features"):
    # The file stem (name minus its 4-character extension) is the sample id.
    row_mask = FMS['id']==int(file_name[:-4])

    captions = FMS['gen_caption'].values[row_mask]
    texts = FMS['text'].values[row_mask].ravel()

    # Each entry is an array produced by boolean-mask indexing, not a scalar.
    text_train.append('In the picture '+ captions + ' And the text says: ' + texts)
    label_train.append(FMS['label'].values[row_mask].ravel())
    

In [42]:
# Pair each label entry with its combined caption/text entry and write the table to CSV.
# NOTE(review): both lists hold NumPy arrays (results of boolean-mask indexing), so the
# CSV cells presumably contain array representations rather than plain scalars - confirm.
dataset = pd.DataFrame({'label': label_train, 'testNdesc': text_train}, columns=['label', 'testNdesc'])
dataset.to_csv("final_dataset.csv")

In [6]:
# Shape of the stacked per-image features (printed: (10000, 100, 1024)).
np.shape(trainImages)

(10000, 100, 1024)

In [8]:
# Save to an explicit file name. With a bare directory path np.save appends
# ".npy" and writes a file literally named ".npy" inside that directory,
# which is not the "final_image.npy" the following cell loads.
np.save('Emb_feature/final_test_features/final_image.npy', trainImages)

In [9]:
# Reload the stacked feature file to verify it round-trips.
a = np.load('Emb_feature/final_test_features/final_image.npy')

In [10]:
# Shape of the reloaded array (printed: (10000, 100, 1024)).
np.shape(a)

(10000, 100, 1024)

In [3]:
# Replace the kernel's argv with a single placeholder entry and let absl parse it.
# NOTE(review): presumably a workaround so that modules defining absl flags (such as
# the downloaded tokenization script) can be used inside a notebook - confirm.
import sys
from absl import flags
sys.argv=['preserve_unused_tokens=False']
flags.FLAGS(sys.argv)

['preserve_unused_tokens=False']

In [2]:
pip install bert-tensorflow

Collecting bert-tensorflow
  Using cached bert_tensorflow-1.0.4-py2.py3-none-any.whl (64 kB)
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.4
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Download the BERT tokenization script published in the tensorflow/models repository.
# NOTE(review): the URL tracks the `master` branch, so the fetched file can change over time.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [5]:
pip install tensorflow_hub

Collecting tensorflow_hub
  Using cached tensorflow_hub-0.12.0-py2.py3-none-any.whl (108 kB)
Installing collected packages: tensorflow_hub
Successfully installed tensorflow_hub-0.12.0
Note: you may need to restart the kernel to use updated packages.


In [39]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split

# from bert import tokenization

In [40]:
# Data
# Load the resampled annotation table. This rebinds `FMS`, which an earlier
# cell loaded from FMS_final.csv.
FMS = pd.read_csv("FMS_resampled.csv")
FMS.head()

Unnamed: 0.1,Unnamed: 0,id,img,label,text,image,gen_caption,textNdesc
0,0,7293,img/07293.png,1,anyone that says white people can't jump obvio...,07293.png,an old man is standing in front of white build...,In the picture an old man is standing in front...
1,1,53280,img/53280.png,0,my dog loves the attention,53280.png,little girl is sitting on the couch .,In the picture little girl is sitting on the c...
2,2,74562,img/74562.png,1,this old ass honky is a porch monkey 4-life yo...,74562.png,the man is wearing the white shirt and holding...,In the picture the man is wearing the white sh...
3,3,61589,img/61589.png,0,dementia or just another lying democrat it's h...,61589.png,man in black jacket and white shirt and black ...,In the picture man in black jacket and white s...
4,4,7218,img/07218.png,1,when you and ya classmate are hiding from the ...,07218.png,little girl in yellow dress and white dress is...,In the picture little girl in yellow dress and...


In [41]:
# Combine the generated caption and the `text` column into one string per row.
# The loaded CSV already has a `textNdesc` column (see the preview above); this overwrites it.
FMS['textNdesc'] = 'In the picture '+ FMS.gen_caption + ' And the text says: ' + FMS.text
# Show the first combined string, then one full row as a spot check.
print(FMS.textNdesc[0], '\n\n')
FMS.loc[8]

In the picture an old man is standing in front of white building . And the text says: anyone that says white people can't jump obviosly hasn't seen the 9/11 footage "white people can't jump" 




Unnamed: 0                                                     8
id                                                         76219
img                                                img/76219.png
label                                                          0
text                      i am thinking about investing my money
image                                                  76219.png
gen_caption        two men are standing in front of brick wall .
textNdesc      In the picture two men are standing in front o...
Name: 8, dtype: object

In [42]:
# Rows x columns of the resampled table (printed: (5952, 8)).
np.shape(FMS)

(5952, 8)

In [43]:
path = 'Emb_feature/test_features/'

# Load one feature file per row of FMS, in row order, so entry k of
# trainImages corresponds to row k of the table. The file stem is the
# `image` value minus its 4-character extension.
# Iterating the column avoids a per-row `.loc` lookup and avoids rebinding
# the builtin `id` at notebook scope.
trainImages = [
    np.load(path + image_name[:-4] + ".npy")
    for image_name in FMS['image']
]
    

In [44]:
# Save the collected per-row feature arrays as a single .npy file.
np.save('Emb_feature/resampled_test_features/final_images.npy', trainImages)

In [19]:
# Load the stacked visual embeddings used as the image input for training.
# NOTE(review): this reads from `dev_test_features`, not the `resampled_test_features`
# file written by the preceding cell, and the shape printed below has 494 rows -
# confirm this is the intended array.
visual_emb = np.load('Emb_feature/dev_test_features/final_images.npy')

In [20]:
# Shape of the loaded embeddings (printed: (494, 100, 1024)).
np.shape(visual_emb)

(494, 100, 1024)

In [9]:
def bert_encode(texts, tokenizer, max_len=512):
    """Encode texts into fixed-length token-id, mask and segment-id arrays.

    Each text is tokenised, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with zeros to ``max_len``.

    Args:
        texts: Iterable of strings.
        tokenizer: Object with ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)`` methods.
        max_len: Length of every encoded row.

    Returns:
        Three NumPy arrays, one row per text: token ids, padding mask
        (1 for real tokens, 0 for padding) and segment ids (all zeros).
    """
    id_rows, mask_rows, segment_rows = [], [], []
    body_budget = max_len - 2  # room left once [CLS] and [SEP] are added

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:body_budget]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(sequence)
        padding = max_len - used

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += [0] * padding

        id_rows.append(ids)
        mask_rows.append([1] * used + [0] * padding)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

In [10]:
def build_model(bert_layer, max_len=512):
    """Build and compile a binary classifier over BERT output joined with image features.

    NOTE(review): this definition reuses the name ``build_model`` that the
    notebook also imports from ``detectron2.modeling``; once this cell has
    run, ``get_model`` would call this function instead.

    Args:
        bert_layer: Callable taking ``[word_ids, mask, segment_ids]`` and
            returning two outputs, the second being the per-token sequence output.
        max_len: Token sequence length of the three text inputs.

    Returns:
        A compiled Keras ``Model`` with four inputs (word ids, mask, segment
        ids, image features of shape ``(100, 1024)``) and one sigmoid output.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    image_input = tf.keras.Input(
        shape=(100, 1024),
        batch_size=None,
        name="image_input",
    )

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    # Concatenate with its default axis joins the two tensors along the last
    # dimension. NOTE(review): that presumably requires max_len to equal the
    # 100 rows of image_input - confirm.
    concatted = tf.keras.layers.Concatenate()([sequence_output, image_input])

    # Only position 0 along axis 1 of the joined tensor reaches the classifier.
    clf_output = concatted[:, 0, :]

    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids, image_input], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

    return model

In [11]:
%%time
# Load the BERT encoder from TF Hub as a trainable Keras layer
# (this run reported a wall time of about two minutes).
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

Wall time: 2min 9s
Parser   : 162 ms


In [24]:
# `tokenization` is not imported anywhere else in the notebook (the
# `from bert import tokenization` line in the import cell is commented out),
# so bring it into scope here. Prefer the tokenization.py script fetched by
# the wget cell; fall back to the bert-tensorflow package.
try:
    import tokenization
except ImportError:
    from bert import tokenization

# The hub layer carries its own vocabulary file and casing flag.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [30]:
# Encode the combined caption/text strings. bert_encode returns a tuple of three
# arrays (token ids, padding masks, segment ids), each with max_len=100 columns.
train_input = bert_encode(FMS.textNdesc.values, tokenizer, max_len=100)
# test_input = bert_encode(X_test.values, tokenizer, max_len=160)
# Binary target, one entry per FMS row.
train_labels = FMS.label.values
# print(len(X_train), len(X_test), len(y_train), len(y_test))

In [31]:
# Build the classifier with max_len=100, the same length used for bert_encode.
# NOTE(review): this rebinds `model`, the name earlier cells use for the Detectron2
# detector (prepare_image_inputs reads that global).
model = build_model(bert_layer, max_len=100)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 100)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 100)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 1024),       335141889   ['input_word_ids[0][0]',         
                                 (None, 100, 1024)]               'input_mask[0][0]',       

  super().__init__(name, **kwargs)


In [32]:
# Reset Keras' global state.
# NOTE(review): this runs after `model` has been built, and it imports the standalone
# `keras` package while the rest of the notebook uses `tensorflow.keras` - confirm
# both are intended.
from keras import backend as K
K.clear_session()

In [33]:
# Element type of the label array (printed: numpy.int64).
type(train_labels[0])

numpy.int64

In [34]:
%%time
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)

train_history = model.fit(
    [train_input, visual_emb[:8313]], train_labels,
    validation_split=0.25,
    epochs=20,
    callbacks=[checkpoint],
    batch_size=2,
    steps_per_epoch=12
)

Epoch 1/20