In [5]:
import glob
import math
import os
from pathlib import Path
from typing import Optional

import cv2
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from matplotlib.animation import ArtistAnimation
from sklearn.manifold import TSNE
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup

# Root directories of the task_D_D training / validation splits read by the cells below.
# The commented-out assignments are alternative dataset locations kept for reference.
# datapath_dd_training = '/media/tim/6f37312f-8eb4-400c-a4e7-e229c18bbf2c/datasets/calvin_debug_dataset/training'
datapath_dd_training = '/media/tim/E/datasets/task_D_D/training'
datapath_dd_val = '/media/tim/E/datasets/task_D_D/validation'
# datapath_dd_training = '/media/tim/6f37312f-8eb4-400c-a4e7-e229c18bbf2c/datasets/hulc2/unprocessed/real_world/500k_all_tasks_dataset_15hz'


## Visualize Calvin Data

In [6]:
def visualize(path=None, data=("rgb_static", "rgb_gripper"), start_idx=50000):
    """Step through the frames of a dataset directory in OpenCV windows.

    The directory must contain ``scene_info.npy``,
    ``lang_annotations/auto_lang_ann.npy`` and one ``episode_XXXXXXX.npz``
    file per frame. Whenever the current frame enters a different annotated
    segment, that segment's language annotation is printed.

    Keys (read with ``cv2.waitKey``): ``q`` quits, key code 83 (right arrow)
    moves one frame forward, key code 81 (left arrow) one frame back; both
    wrap around at the ends.

    Args:
        path: Dataset directory. ``None`` (default) uses the notebook-level
            ``datapath_dd_training``.
        data: Names of the image arrays inside each ``.npz`` to display.
        start_idx: Position in the frame list to start from; wrapped into the
            valid range so a small dataset does not raise ``IndexError``.

    Returns:
        None. If ``path`` is not a directory a message is printed and the
        function returns (it does not call ``exit()``, which would shut down
        the notebook kernel).
    """
    if path is None:
        path = datapath_dd_training

    if not Path(path).is_dir():
        print(f"Path {path} is either not a directory, or does not exist.")
        return

    # NOTE: allow_pickle=True executes pickled payloads; only use on trusted files.
    scene_info = np.load(f"{path}/scene_info.npy", allow_pickle=True)
    print(scene_info)

    # The first scene entry holds the inclusive [first, last] frame numbers.
    bounds = next(iter(scene_info.item().values()))
    indices = list(range(bounds[0], bounds[1] + 1))
    if not indices:
        print(f"No frames listed in {path}/scene_info.npy")
        return

    annotations = np.load(f"{path}/lang_annotations/auto_lang_ann.npy", allow_pickle=True).item()
    annotations = list(zip(annotations["info"]["indx"], annotations["language"]["ann"]))
    print(annotations)
    print(len(annotations))

    idx = start_idx % len(indices)
    ann_idx = -1  # index of the annotation printed last, to avoid repeating it

    try:
        while True:
            frame_id = indices[idx]

            # Context manager closes the underlying zip file after each frame.
            with np.load(f"{path}/episode_{frame_id:07d}.npz", allow_pickle=True) as t:
                for d in data:
                    if d not in t:
                        print(f"Data {d} cannot be found in transition")
                        continue

                    img = cv2.resize(t[d], (400, 400))
                    # Reverse the channel axis (RGB -> BGR) for cv2.imshow.
                    cv2.imshow(d, img[:, :, ::-1])

            for n, ((low, high), ann) in enumerate(annotations):
                if low <= frame_id <= high and n != ann_idx:
                    print(f"{ann}")
                    ann_idx = n

            key = cv2.waitKey(0)
            if key == ord("q"):
                break
            elif key == 83:  # Right arrow
                idx = (idx + 1) % len(indices)
            elif key == 81:  # Left arrow
                idx = (len(indices) + idx - 1) % len(indices)
            else:
                print(f'Unrecognized keycode "{key}"')
    finally:
        # Always release the viewer windows, including on errors / interrupts.
        cv2.destroyAllWindows()

        

In [5]:
# Launch the interactive frame viewer; the call blocks until "q" is pressed in an OpenCV window.
visualize()

{'calvin_scene_D': [0, 611098]}
[((315660, 315724), 'move the door to the left side'), ((191730, 191794), 'slide the door to the left side'), ((305439, 305503), 'slide down the switch'), ((340730, 340794), 'toggle the button to turn on the green light'), ((542337, 542401), 'toggle the light switch to turn on the yellow light'), ((536830, 536894), 'push the switch upwards'), ((575627, 575691), 'push down the button to turn on the led'), ((80243, 80307), 'open the cabinet drawer'), ((68433, 68497), 'grasp the drawer handle and open it'), ((370674, 370738), 'move up the switch'), ((526635, 526699), 'pull the handle of the drawer'), ((485616, 485680), 'move the sliding door to the left'), ((473791, 473839), 'put the block in the drawer'), ((201910, 201957), 'toggle the light switch to turn off the light bulb'), ((292365, 292429), 'move the door to the right side'), ((425910, 425974), 'turn on the yellow light'), ((91077, 91141), 'grasp the blue block and lift it up'), ((610343, 610407), 't

In [7]:
# Open one frame file and report which arrays it holds and the shapes of the
# three arrays used later by the data loader.
datapath_test = '/media/tim/E/datasets/task_D_D/training/episode_0053819.npz'
data = np.load(datapath_test)
print(list(data.keys()))
for array_name in ('actions', 'rgb_static', 'rgb_gripper'):
    print(data[array_name].shape)

['actions', 'rel_actions', 'robot_obs', 'scene_obs', 'rgb_static', 'rgb_gripper', 'rgb_tactile', 'depth_static', 'depth_gripper', 'depth_tactile']
(7,)
(200, 200, 3)
(84, 84, 3)


## Captioning
### Data Loader

In [18]:
class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    ``d.key`` is equivalent to ``d["key"]`` and ``d.key = v`` to
    ``d["key"] = v``. Reading a missing key through attribute access raises
    ``AttributeError`` (not ``KeyError``) so ``hasattr``/``getattr`` with a
    default behave as usual.
    """

    # Attribute assignment stores an item instead of an instance attribute.
    __setattr__ = dict.__setitem__

    def __getattr__(self, attr):
        """Return the item stored under ``attr``.

        Python calls this only after normal attribute lookup has failed.

        Raises:
            AttributeError: if ``attr`` is not a key of the dictionary.
        """
        try:
            return self[attr]
        except KeyError:
            raise AttributeError("Attribute %r not found" % attr) from None

    def __getstate__(self):
        """Return the mapping itself as the pickle/copy state."""
        return self

    def __setstate__(self, d):
        """Restore the items from state ``d``.

        The previous body (``self = d``) only rebound a local name and
        restored nothing. The identity check skips the no-op case where the
        state is this very object, which is what ``__getstate__`` produces.
        """
        if d is not self:
            self.update(d)

class CustomDataset(Dataset):
    """One sample per language annotation: a caption plus its frame window.

    Every annotation is ``((first_frame, last_frame), caption)`` with an
    inclusive frame range. A sample holds the padded caption token ids and up
    to ``seq_length`` consecutive frames starting at ``first_frame``; each
    frame is read from ``episode_<7-digit frame number>.npz`` in ``data_path``.

    Args:
        data_path: Directory containing the ``episode_*.npz`` frame files.
        caption_path: Path of the ``.npy`` annotation file.
        tokenizer: Object with an ``encode(text, add_special_tokens=...,
            max_length=..., truncation=...)`` method returning token ids.
        max_seq_length: Caption length in tokens after truncation/padding.
        seq_length: Number of frame slots per sample (default 64, the value
            that used to be hard-coded).
        pad_token_id: Id used to right-pad captions (default 0, the value that
            used to be hard-coded).
    """

    def __init__(self, data_path, caption_path, tokenizer, max_seq_length,
                 seq_length=64, pad_token_id=0):
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.seq_length = seq_length
        self.pad_token_id = pad_token_id
        self.data_path = data_path
        self.caption_data = self.load_caption_data(caption_path)
        # Not used for indexing (samples follow the annotations); kept so the
        # attribute remains available to existing code.
        self.data_files = [f for f in os.listdir(data_path) if f.startswith('episode_')]

    def __len__(self):
        """Number of annotated segments."""
        return len(self.caption_data)

    def __getitem__(self, idx):
        """Build the sample for annotation ``idx``.

        Returns:
            AttrDict with
              * ``token``: ``(max_seq_length,)`` integer tensor of caption ids,
              * ``instruction``: the caption string,
              * ``actions``: ``(seq_length, 7)`` float tensor,
              * ``rgb_static``: ``(seq_length, 200, 200, 3)`` float tensor,
              * ``rgb_gripper``: ``(seq_length, 84, 84, 3)`` float tensor.
            Frame slots beyond the end of the annotated segment stay all-zero.
        """
        (start_epi, end_epi), caption = self.caption_data[idx]

        tokens = self.tokenizer.encode(
            caption, add_special_tokens=True, max_length=self.max_seq_length, truncation=True
        )
        # A non-positive repeat count yields [], so full-length captions are untouched.
        tokens = list(tokens) + [self.pad_token_id] * (self.max_seq_length - len(tokens))
        token_tensor = torch.tensor(tokens)

        start_epi, end_epi = int(start_epi), int(end_epi)
        # Never read past the segment's last frame: the following files belong
        # to a different behaviour (or do not exist at the end of the data).
        n_frames = max(0, min(self.seq_length, end_epi - start_epi + 1))

        actions = torch.zeros(self.seq_length, 7)
        rgb_static = torch.zeros(self.seq_length, 200, 200, 3)
        rgb_gripper = torch.zeros(self.seq_length, 84, 84, 3)
        for i in range(n_frames):
            file_path = os.path.join(self.data_path, f"episode_{start_epi + i:07d}.npz")
            # Close each archive right away; otherwise every sample leaves
            # its file handles open until garbage collection.
            with np.load(file_path) as data:
                actions[i] = torch.from_numpy(data['actions'])
                rgb_static[i] = torch.from_numpy(data['rgb_static'])
                rgb_gripper[i] = torch.from_numpy(data['rgb_gripper'])

        return AttrDict({'token': token_tensor, 'instruction': caption,'actions': actions,'rgb_static': rgb_static,'rgb_gripper': rgb_gripper})

    def load_caption_data(self, caption_path):
        """Load the annotation file as a list of ``((first, last), caption)``.

        NOTE: ``allow_pickle=True`` executes pickled payloads; only load
        annotation files from a trusted source.
        """
        annotations = np.load(f"{caption_path}", allow_pickle=True).item()
        annotations = list(zip(annotations["info"]["indx"], annotations["language"]["ann"]))
        return annotations


# GPT-2 tokenizer; captions are truncated / padded to max_seq_length tokens.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
max_seq_length = 16

# Split directories and the annotation file stored inside each of them.
datapath_dd_training = '/media/tim/E/datasets/task_D_D/training'
datapath_dd_val = '/media/tim/E/datasets/task_D_D/validation'
caption_path_training = f"{datapath_dd_training}/lang_annotations/auto_lang_ann.npy"
caption_path_val = f"{datapath_dd_val}/lang_annotations/auto_lang_ann.npy"

# Build the training dataset first, then the validation dataset.
train_dataset, val_dataset = (
    CustomDataset(split_dir, split_captions, tokenizer, max_seq_length)
    for split_dir, split_captions in (
        (datapath_dd_training, caption_path_training),
        (datapath_dd_val, caption_path_val),
    )
)

batch_size = 32
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,
)


#### Test the data loader: print the shape of every field in each training batch

In [22]:

"""
for actions, rgb_static, rgb_gripper in train_dataloader:
    print("Actions: ", actions.shape)
    print("rgb-static: ", rgb_static.shape)
    print("rgb-gripper: ", rgb_gripper.shape)
"""

# Walk over every training batch and report the collated fields.
for batch in train_dataloader:
    print("token: ", batch.token.shape)
    print(batch.instruction)
    tensor_fields = (
        ("actions: ", batch.actions),
        ("rgb_static: ", batch.rgb_static),
        ("rgb_gripper: ", batch.rgb_gripper),
    )
    for label, tensor in tensor_fields:
        print(label, tensor.shape)
    print("#############################################")

token:  torch.Size([32, 16])
['pick up the pink block', 'push down the button to turn off the led', 'take the blue block and rotate it right', 'lift the blue block in the drawer', 'grasp the pink block from the drawer', 'move to the drawer, then store the object', 'pick up the red block from the drawer', 'take the pink block and rotate it left', 'place the object in the cabinet', 'pick up the blue block lying in the drawer', 'slide the door to the right side', 'turn on the green lamp', 'take the blue block and rotate it left', 'push the red block towards the right', 'go open the drawer', 'move the door all the way to the left and let go', 'slide the block into the drawer', 'grasp the blue block lying on the shelf', 'grasp the blue block in the drawer', 'grasp the blue block in the drawer', 'grasp the handle of the drawer, then close it', 'grasp the red block, then lift it up', 'turn the red block left', 'rotate the blue block to the left', 'rotate the red block to the left', 'in the sl

### Models
Captioning Model

In [None]:

class BehaviourEncoder(nn.Module):
    """Transformer encoder over per-step ``[image feature, action]`` vectors.

    All sizes come from the notebook-level ``CFG`` object (``d_model``,
    ``n_heads``, ``d_ff``, ``dropout``, ``n_layers``). ``CFG`` and
    ``PositionalEncoding`` are defined outside this cell.
    """

    def __init__(self):
        super().__init__()
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        # The positional encoding is added to the same tensor that is scaled by
        # sqrt(CFG.d_model) and fed to the CFG.d_model-wide encoder, so its
        # width has to be CFG.d_model as well (it was hard-coded to 2050).
        # Assumes PositionalEncoding's first argument is the feature width, as
        # in the earlier commented-out call — TODO confirm.
        self.pos_encoder = PositionalEncoding(CFG.d_model, dropout=0.1)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=CFG.d_model, nhead=CFG.n_heads, batch_first=True, dim_feedforward=CFG.d_ff, dropout=CFG.dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=CFG.n_layers)

    def forward(self, src):
        """Encode a batch of behaviour sequences.

        Args:
            src: Object with ``observations`` and ``actions`` tensors that
                share their leading (batch, step) dimensions; they are
                concatenated on the last axis, so their feature sizes must add
                up to ``CFG.d_model``.

        Returns:
            The transformer encoder output for the concatenated features.
        """
        image_features = src.observations
        actions = src.actions

        # A step whose image features average to exactly zero is treated as padding.
        src_key_padding_mask = image_features.mean(dim=2) == 0.0
        features = torch.cat((image_features, actions), dim=-1)

        # Scale the inputs, then add the positional encoding.
        features = features * math.sqrt(CFG.d_model)
        features = self.pos_encoder(features)
        behaviour_encoding = self.transformer_encoder(features, src_key_padding_mask=src_key_padding_mask)

        return behaviour_encoding


class ClipCaptionModel(nn.Module):
    """GPT-2 captioner conditioned on an encoded behaviour sequence.

    The behaviour encoder output is linearly projected to GPT-2's embedding
    width and prepended, as a prefix, to the caption token embeddings.
    """

    def __init__(self, prefix_length: int, clip_length: Optional[int] = None, prefix_size: int = 512,
                 num_layers: int = 8, mapping_type: MappingType = MappingType.MLP):
        super(ClipCaptionModel, self).__init__()
        self.prefix_length = prefix_length
        self.gpt = GPT2LMHeadModel.from_pretrained('gpt2')
        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
        # Not used by forward(); kept so the module's attributes and
        # parameters stay the same.
        self.clip_project = TransformerMapper(prefix_size, self.gpt_embedding_size, prefix_length,
                                                                     clip_length, num_layers)
        self.behaviour_encoder = BehaviourEncoder()
        # Take the input width from the encoder instead of the former magic
        # number 514, and do not pin this layer to "cuda": it follows the
        # model on .to(device) and no longer fails on CPU-only machines.
        encoder_width = self.behaviour_encoder.encoder_layer.self_attn.embed_dim
        self.project_to_gpt = nn.Linear(encoder_width, self.gpt_embedding_size)

    def get_dummy_token(self, batch_size: int, device: torch.device) -> torch.Tensor:
        """Return a ``(batch_size, prefix_length)`` int64 tensor of zeros on ``device``."""
        return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device)

    def forward(self, data: AttrDict, labels: Optional[torch.Tensor] = None):
        """Run GPT-2 on ``[behaviour prefix | caption tokens]``.

        Args:
            data: Batch exposing ``tokens`` (caption token ids),
                ``observations`` and ``actions`` (consumed by the behaviour
                encoder). NOTE(review): ``CustomDataset`` in this notebook
                emits the key ``token`` (singular) and raw images, so a
                preprocessing step is presumably expected in between — confirm.
            labels: Optional target ids aligned with ``data.tokens``. When
                given, the prefix positions are filled with -100 so the
                language-model loss ignores them.

        Returns:
            The output of ``GPT2LMHeadModel``; it contains ``loss`` only when
            ``labels`` is given.
        """
        tokens = data.tokens
        embedding_text = self.gpt.transformer.wte(tokens)

        behaviour_encoding = self.behaviour_encoder(data)
        prefix_projections = self.project_to_gpt(behaviour_encoding)

        # 1.0 for real behaviour steps, 0.0 for padded steps (all-zero features).
        prefix_mask = (data.observations.mean(dim=2) != 0.0).float()
        # Every caption position is attended to; padded caption tokens cannot
        # be told apart here because the pad id is an ordinary token id.
        text_mask = torch.ones_like(tokens, dtype=prefix_mask.dtype)
        total_mask = torch.cat((prefix_mask, text_mask), dim=1)

        embedding_cat = torch.cat((prefix_projections, embedding_text), dim=1)
        if labels is not None:
            # Labels must be as long as the concatenated input, i.e. cover the
            # actual prefix length (which need not equal self.prefix_length).
            prefix_labels = torch.full(
                prefix_projections.shape[:2], -100, dtype=torch.int64, device=tokens.device
            )
            labels = torch.cat((prefix_labels, labels), dim=1)
        out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=total_mask)
        return out