In [None]:
from custom_models.CustomViT import CustomViT
from custom_models.CustomViTMAE import CustomViTMAE
import torch
# call CustomViT
from transformers import AutoImageProcessor, ViTMAEForPreTraining, ViTMAEConfig
from PIL import Image

# Directory that holds the fine-tuned checkpoint (pytorch_model.bin).
output_dir='/home/ubuntu/camelmera'
# trained_model_name = 'multimodal'
# output_dir='/home/ubuntu/weights/' + trained_model_name

# Both the encoder and the full MAE model are configured from the same
# pretrained ViT-MAE configuration.
model_name = "facebook/vit-mae-base"

# Initialize a new CustomViT model
vit_config = ViTMAEConfig.from_pretrained(model_name)
vit_config.output_hidden_states=True
vit_model = CustomViT(config=vit_config)

# Initialize a new CustomViTMAE model and swap in the custom encoder so the
# checkpoint keys under `vit.` are loaded into it.
config = ViTMAEConfig.from_pretrained(model_name)
config.output_hidden_states=True
custom_model = CustomViTMAE(config=config)
custom_model.vit = vit_model

# Load the state_dict from the saved model.
# map_location="cpu" makes the load independent of the device the checkpoint
# was saved from and avoids keeping a second copy of the weights on the GPU;
# load_state_dict copies the values into the model's own parameters.
# NOTE(review): torch.load unpickles the file, so only load trusted checkpoints.
state_dict = torch.load(f"{output_dir}/pytorch_model.bin", map_location="cpu")
custom_model.load_state_dict(state_dict)

# don't need decoders: only the encoder is used to produce embeddings
vit_encoder = custom_model.vit

In [None]:
import numpy as np

def reward_function(state_embedding, goal_embedding, threshold=0.02, goal_reward=100):
    """Score a state by its Euclidean distance to the goal embedding.

    Args:
        state_embedding: array-like embedding of the current state.
        goal_embedding: array-like embedding of the goal, broadcastable
            against ``state_embedding``.
        threshold: distances less than or equal to this count as reaching
            the goal.
        goal_reward: value returned when the goal is reached.

    Returns:
        ``goal_reward`` when the distance is within ``threshold``; otherwise
        the negated distance, so states farther from the goal score lower.
    """
    gap = np.linalg.norm(state_embedding - goal_embedding)

    # Goal reached: hand back the fixed bonus unchanged.
    if gap <= threshold:
        return goal_reward

    # Otherwise penalise in proportion to how far away the state is.
    return -gap

In [None]:
from tem_dataloader import MultimodalDatasetPerTrajectory
import functools
import os
from torch.utils.data import Dataset, DataLoader

environment_name = 'AbandonedFactoryExposure'
environemnt_directory = f'/mnt/temp_mount/{environment_name}/Data_hard'
OBSERVATION_SIZE = 768
ACTION_SIZE = 7
BATCH_SIZE = 64
# Reward that reward_function returns at the goal; also used to derive terminals.
GOAL_REWARD = 100

# np.save does not create missing directories, so make sure the output folder
# exists before spending time on the encoder passes.
os.makedirs('hard', exist_ok=True)

# Moving the encoder to the GPU and switching to eval mode only needs to
# happen once, not on every batch.
vit_encoder.cuda()
vit_encoder.eval()

for i in range(0,10):
    # NOTE(review): trajectory P007 is skipped; the reason is not visible here.
    if i==7:
        continue
    trajectory_folder_path = os.path.join(environemnt_directory, f'P00{i}')
    my_dataset = MultimodalDatasetPerTrajectory(trajectory_folder_path)
    train_dataloader = DataLoader(my_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Collect per-batch arrays and stack once per trajectory instead of
    # re-copying the growing array on every batch. Each list is seeded with an
    # empty, typed array so the stacked result keeps the same dtype/shape as
    # before, including when the trajectory yields no batches.
    observation_chunks = [np.empty((0, OBSERVATION_SIZE))]
    action_chunks = [np.empty((0, ACTION_SIZE))]
    reward_chunks = [np.empty(0)]
    terminal_chunks = [np.empty(0, dtype=bool)]

    for batch_idx, data in enumerate(train_dataloader):
        # get embedding: first token of the encoder's last hidden state
        pixel_values = data["pixel_values"].cuda()
        pixel_values1 = data["pixel_values1"].cuda()
        pixel_values2 = data["pixel_values2"].cuda()
        # Inference only: skip autograd bookkeeping to save GPU memory.
        with torch.no_grad():
            outputs = vit_encoder(pixel_values,pixel_values1,pixel_values2,noise=None)
        embedding = outputs.last_hidden_state[:,0,:]
        observation = embedding.detach().cpu().numpy()
        # get action: difference between consecutive poses inside the batch,
        # padded with one zero row so actions line up with observations
        pose = data["pose_values"]
        action = torch.diff(pose, dim=0).numpy()
        action = np.concatenate((action, np.zeros((1, ACTION_SIZE))), axis=0)
        # get reward: the last observation of the batch is used as the goal
        goal = observation[-1]
        partial_function = functools.partial(
            reward_function, goal_embedding=goal, goal_reward=GOAL_REWARD
        )
        reward = np.apply_along_axis(partial_function, 1, observation)
        # get terminals: a step is terminal exactly when it earned the goal reward
        terminals = np.zeros_like(reward, dtype=int)
        terminals[reward == GOAL_REWARD] = 1

        observation_chunks.append(observation)
        action_chunks.append(action)
        reward_chunks.append(reward)
        terminal_chunks.append(terminals)

    # Concatenate observations, actions, rewards, and terminals
    all_observations = np.vstack(observation_chunks)
    all_actions = np.vstack(action_chunks)
    all_rewards = np.hstack(reward_chunks)
    all_terminals = np.hstack(terminal_chunks)

    print("All observations shape:", all_observations.shape)
    print("All actions shape:", all_actions.shape)
    print("All rewards shape:", all_rewards.shape)
    print("All terminals shape:", all_terminals.shape)

    np.save(f'hard/all_observations_P00{i}.npy', all_observations)
    np.save(f'hard/all_actions_P00{i}.npy', all_actions)
    np.save(f'hard/all_rewards_P00{i}.npy', all_rewards)
    np.save(f'hard/all_terminals_P00{i}.npy', all_terminals)

In [None]:
# The bare string below is reference text only (it has no runtime effect). Its
# argument names match the MDPDataset call in the commented-out line at the
# end of this cell.
'''
Args:
        observations (numpy.ndarray): N-D array. If the
            observation is a vector, the shape should be
            `(N, dim_observation)`. If the observations is an image, the shape
            should be `(N, C, H, W)`.
        actions (numpy.ndarray): N-D array. If the actions-space is
            continuous, the shape should be `(N, dim_action)`. If the
            action-space is discrete, the shape should be `(N,)`.
        rewards (numpy.ndarray): array of scalar rewards. The reward function
            should be defined as :math:`r_t = r(s_t, a_t)`.
        terminals (numpy.ndarray): array of binary terminal flags.
        episode_terminals (numpy.ndarray): array of binary episode terminal
            flags. The given data will be splitted based on this flag.
            This is useful if you want to specify the non-environment
            terminations (e.g. timeout). If ``None``, the episode terminations
            match the environment terminations.
        discrete_action (bool): flag to use the given actions as discrete
            action-space actions. If ``None``, the action type is automatically
            determined.
    '''
# Load the combined "hard" arrays from disk.
# NOTE(review): no cell visible in this notebook writes 'hard/all_*.npy'
# (only the per-trajectory 'hard/all_*_P00{i}.npy' files are saved) — confirm
# where these combined files come from.
hard_all_observations = np.load('hard/all_observations.npy')
hard_all_actions = np.load('hard/all_actions.npy')
hard_all_rewards = np.load('hard/all_rewards.npy')
hard_all_terminals = np.load('hard/all_terminals.npy')
# Number of steps flagged as terminal in the hard set.
print(np.count_nonzero(hard_all_terminals == 1))
# cql_dataset = MDPDataset(observations=all_observations,actions=all_actions,rewards=all_rewards,terminals=all_terminals,episode_terminals=all_terminals)

In [None]:
# MDPDataset is used below but is otherwise only imported by a later cell, so
# on a fresh kernel this cell would fail with NameError. Import it here.
from d3rlpy.dataset import MDPDataset

# Load the arrays for the default (non-"hard") data set from the working directory.
all_observations = np.load('all_observations.npy')
all_actions = np.load('all_actions.npy')
all_rewards = np.load('all_rewards.npy')
all_terminals = np.load('all_terminals.npy')
# Number of steps flagged as terminal.
print(np.count_nonzero(all_terminals == 1))
# Optional: uncomment to append the "hard" arrays (hard_all_*) before building the dataset.
# all_observations = np.vstack((all_observations, hard_all_observations))
# all_actions = np.vstack((all_actions, hard_all_actions))
# all_rewards = np.hstack((all_rewards, hard_all_rewards))
# all_terminals = np.hstack((all_terminals, hard_all_terminals))
# print("All observations shape:", all_observations.shape)
# print("All actions shape:", all_actions.shape)
# print("All rewards shape:", all_rewards.shape)
# print("All terminals shape:", all_terminals.shape)
# The terminal flags double as episode boundaries.
cql_dataset = MDPDataset(observations=all_observations,actions=all_actions,rewards=all_rewards,terminals=all_terminals,episode_terminals=all_terminals)

In [None]:
# Sanity check: shape of the action array held by the dataset built in an earlier cell.
print(cql_dataset.actions.shape)

In [None]:
from d3rlpy.algos import CQL

# setup CQL algorithm (GPU enabled, initial_alpha set to 1.0)
cql = CQL(use_gpu=True, initial_alpha=1.0)

# split train and test episodes (disabled: the full dataset is used for training)
# train_episodes, test_episodes = train_test_split(cql_dataset, test_size=0.25)

# start training: 80 epochs on `cql_dataset`, with no evaluation episodes and
# no scorers passed.
cql.fit(cql_dataset,
        eval_episodes=None,
        n_epochs=80,
        scorers=None)

In [None]:
# Total reward over the first 64 steps (64 matches BATCH_SIZE used elsewhere in this notebook).
print(np.sum(all_rewards[0:64]))

In [None]:
import numpy as np
from d3rlpy.dataset import Episode, MDPDataset, Transition

OBSERVATION_SIZE = 768
ACTION_SIZE = 7
BATCH_SIZE = 64

# Zero-row, typed starting points that each trajectory's arrays are appended to.
all_observations = np.empty((0, OBSERVATION_SIZE))
all_actions = np.empty((0, ACTION_SIZE))
all_rewards = np.empty(0)
all_terminals = np.empty(0, dtype=bool)

# Trajectories P000..P009, leaving out P007.
for i in (idx for idx in range(10) if idx != 7):
    # Per-trajectory arrays saved under hard/.
    observation = np.load(f'hard/all_observations_P00{i}.npy')
    action = np.load(f'hard/all_actions_P00{i}.npy')
    reward = np.load(f'hard/all_rewards_P00{i}.npy')
    terminals = np.load(f'hard/all_terminals_P00{i}.npy')

    # Append this trajectory: rows for the 2-D arrays, elements for the 1-D ones.
    all_observations = np.concatenate((all_observations, observation), axis=0)
    all_actions = np.concatenate((all_actions, action), axis=0)
    all_rewards = np.concatenate((all_rewards, reward))
    all_terminals = np.concatenate((all_terminals, terminals))

    # Running shapes after each trajectory has been merged.
    for label, stacked in (
        ("All observations shape:", all_observations),
        ("All actions shape:", all_actions),
        ("All rewards shape:", all_rewards),
        ("All terminals shape:", all_terminals),
    ):
        print(label, stacked.shape)

# Build the dataset; the terminal flags are also used as episode boundaries.
cql_dataset = MDPDataset(
    observations=all_observations,
    actions=all_actions,
    rewards=all_rewards,
    terminals=all_terminals,
    episode_terminals=all_terminals,
)

In [None]:
from d3rlpy.algos import CQL
# Create a CPU-only CQL instance, build it against `cql_dataset`, then load the
# epoch-40 checkpoint from an earlier run.
# NOTE(review): presumably build_with_dataset is needed so the networks exist
# before load_model — confirm against the d3rlpy documentation.
cql01 = CQL(use_gpu=False)
cql01.build_with_dataset(cql_dataset)
cql01.load_model('/home/ubuntu/camelmera/models/gym/multimodal/d3rlpy_logs/CQL_20230503011241/model_40.pt')

In [None]:
from d3rlpy.algos import CQL

# Recreate the CQL object from the parameters saved by a previous run.
# This rebinds `cql`, replacing any instance created in an earlier cell.
cql = CQL.from_json('d3rlpy_logs/CQL_20230504012746/params.json')

# ready to load: restore the weights saved as model_310.pt in the same run directory
cql.load_model('d3rlpy_logs/CQL_20230504012746/model_310.pt')

In [None]:
# start training: 30 more epochs on `cql_dataset`, without evaluation episodes or scorers.
# `cql` is whichever instance was bound most recently (fresh CQL or one restored from disk).
cql.fit(cql_dataset,
        eval_episodes=None,
        n_epochs=30,
        scorers=None)

In [None]:
# start training: a further 10 epochs on `cql_dataset`, without evaluation episodes or scorers.
# `cql` is whichever instance was bound most recently (fresh CQL or one restored from disk).
cql.fit(cql_dataset,
        eval_episodes=None,
        n_epochs=10,
        scorers=None)

In [None]:
import numpy as np
# Reload the saved rewards and show the first 64 entries (one BATCH_SIZE-sized chunk).
all_rewards = np.load('all_rewards.npy')
print(all_rewards[0:64])

In [None]:
import functools
import numpy as np

# Recompute rewards and terminal flags from the saved observations, treating
# every run of 64 consecutive rows as one segment whose last row is the goal.
# Relies on `reward_function` defined in an earlier cell.
all_observations = np.load('all_observations.npy')
# observation = np.load('hard/all_observations_P000.npy')
print(all_observations.shape)
# get reward
all_rewards = np.empty(0)
all_terminals = np.empty(0)
for i in range(0, len(all_observations), 64):
    # Slicing clips at the end of the array, so the final (possibly shorter)
    # segment needs no special case; its last row is the last row overall.
    observation = all_observations[i:i+64]
    goal = observation[-1]
    partial_function = functools.partial(reward_function, goal_embedding=goal,threshold=0.01)
    reward = np.apply_along_axis(partial_function, 1, observation)
    # get terminals: 1 where the goal reward (100) was given, 0 elsewhere
    terminal = (reward == 100).astype(int)
    # print(np.count_nonzero(terminal == 1))
    all_rewards = np.concatenate((all_rewards, reward))
    all_terminals = np.concatenate((all_terminals, terminal))
# Shows the terminal flags of the last segment only.
print(terminal)

In [None]:
# Compare the number of terminal flags with the number of 64-step segments.
count_100 = np.count_nonzero(all_terminals == 1)
print("Number of terminals:", count_100)
# Segment count as a float; a fractional part means the last segment is shorter than 64.
print(len(all_terminals)/64)

In [None]:
# Persist the current in-memory terminals and rewards.
# This overwrites 'all_terminals.npy' and 'all_rewards.npy', which other cells load.
np.save('all_terminals.npy',all_terminals)
np.save('all_rewards.npy',all_rewards)

In [None]:
# Inspect the terminal flags saved for trajectory P000 of the hard set:
# the raw array, how many flags are set, and the length in 64-step units.
original_terminals = np.load('hard/all_terminals_P000.npy')
print(original_terminals)
print(np.count_nonzero(original_terminals == 1))
print(len(original_terminals)/64)

In [None]:
# Sum of the first 100 entries of `reward`.
# NOTE(review): `reward` is whatever the most recently executed cell left in
# the kernel (several cells assign it) — the result depends on execution order.
np.sum(reward[0:100])

In [None]:
# Length of the `reward` array currently in the kernel (set by an earlier cell).
print(len(reward))

In [None]:
# check difference around goal = observation[-100]
# Keep every 10th row of `observation` (all columns).
# NOTE(review): `observation` comes from an earlier cell's leftover state.
downsampled_observation = observation[::10,:]
print(downsampled_observation.shape)

In [None]:
# get reward: use the 23rd-from-last downsampled row as the goal and score
# every downsampled row against it (threshold 0.02).
# Relies on `reward_function`, `functools` and `downsampled_observation` from earlier cells.
goal = downsampled_observation[-23]
partial_function = functools.partial(reward_function, goal_embedding=goal,threshold=0.02)
reward = np.apply_along_axis(partial_function, 1, downsampled_observation)
# get terminals: 1 where the goal reward (100) was given, 0 elsewhere
terminals = np.zeros_like(reward, dtype=int)
terminals[reward == 100] = 1
# How many rows fall within the threshold of the chosen goal.
count_100 = np.count_nonzero(reward == 100)
print("Number of elements with the value 100:", count_100)

In [None]:
# Compare per-column action statistics between the default and "hard" sets.
easy_actions = np.load('all_actions.npy')
hard_actions = np.load('hard/all_actions.npy')
for actions in (easy_actions, hard_actions):
    # Column-wise statistics, computed once per data set.
    col_min = np.min(actions,axis=0)
    col_max = np.max(actions,axis=0)
    col_mean = np.mean(actions,axis=0)
    col_std = np.std(actions,axis=0)
    # One line per inspected column: min, max, mean, std.
    # NOTE(review): what columns 0 and 3 represent is not shown here — confirm
    # against the pose layout in the data loader.
    for column in (0, 3):
        print(col_min[column],col_max[column],col_mean[column],col_std[column])