# Housekeep human preferences dataset

In this notebook, we provide a walkthrough of our Housekeep human preferences dataset. 

Download the dataset [here](https://drive.google.com/drive/folders/1pnFWwEAtSupY0MCmvfYYaf6-kD9c0dp8) and upload it before running this notebook.

## Import packages and compile helper functions

In [1]:
import numpy as np
import pandas as pd

In [2]:
def receptacle_labels2vec(x, num_receptacles=128):
  """Turn a list of receptacle indices into a multi-hot vector.

  Args:
    x: iterable of integer receptacle indices.
    num_receptacles: length of the output vector; defaults to 128, the
      number of room-receptacles in the dataset.

  Returns:
    A float array of shape (num_receptacles,) holding 1 at every index
    listed in x and 0 everywhere else.

  Raises:
    IndexError: if an index in x is out of range for the vector.
  """
  result = np.zeros((num_receptacles,))
  for recep in x:
    result[recep] = 1
  return result


def distance_metric(x, y):
  """Jaccard distance between two multi-hot vectors.

  jaccard distance = 1 - jaccard similarity = 1 - (a n b)/(a U b), where the
  intersection is the dot product and the union is |x|^2 + |y|^2 - x.y.

  Args:
    x: 1-D numeric array.
    y: 1-D numeric array of the same length as x.

  Returns:
    The distance as a float. Two all-zero vectors (empty union) are treated
    as identical and yield 0.0 instead of the NaN that 0/0 would produce.
  """
  intersection = np.dot(x, y)
  union = np.dot(x, x) + np.dot(y, y) - intersection

  # Both vectors are empty: avoid 0/0, which would put NaN in the distance matrix.
  if union == 0:
    return 0.0

  return 1 - intersection/union

## Load dataset

In [3]:
# Location of the downloaded dataset file.
data_path = './housekeep.npy'
# The .npy file stores a pickled Python dict; .item() unwraps it from the
# 0-d object array that np.load returns.
# NOTE(security): allow_pickle=True unpickles arbitrary objects and can run
# code embedded in the file -- only load a copy obtained from a trusted source.
data_dict = np.load(file=data_path, allow_pickle=True).item()

In [4]:
# Pull the four top-level entries out of the loaded dictionary in one go.
objects, rooms, room_receps, data = (
    data_dict[key] for key in ('objects', 'rooms', 'room_receptacles', 'data')
)

## Visualizing the data

### DataFrame

Our data is organized as a pandas DataFrame with the following fields:



**annotator_idx**: The participants are indexed as 0, 1, ..., 370. 

**assignment_idx**: The assignments are indexed as 0, 1, ... . Each assignment has a set of 10 tasks which are solved by a single participant. Each individual task involves finding and ranking *correct* and *misplaced* receptacles in a given room for a given object. 

We collect 10 annotations for each object-room pair.

**object_idx**: object indexed as 0, 1, ..., 267.  

**room_idx**: room indexed as 0, 1, ..., 16.

**correct**/**misplaced**: ranked list of *correct*/*misplaced* room-receptacles. Each room-receptacle is indexed as 0, 1, ..., 127.

**implausible**: list of implausible room-receptacles indexed as 0, 1, ..., 127.


In [None]:
# Show the size and the first five entries of each name list.
for label, names in (("Objects", objects),
                     ("Rooms", rooms),
                     ("Room receptacles", room_receps)):
  print(f"{label} ({len(names)}): {', '.join(names[:5])}, ...")


In [None]:
# Report the number of annotation rows, then preview the first five.
total_rows = len(data)
print(f"Total number of rows: {total_rows} (10 x n_objects x n_rooms)")
data.head(n=5)

In [None]:
# Let's look at a sample annotation
# (named sample_id rather than `id` so the builtin id() is not shadowed
# for the rest of the kernel session)
sample_id = 1

row = data.loc[sample_id]


def receptacle_names(recep_indices):
  """Map room-receptacle indices to the part of the entry after the '|'."""
  return [room_receps[r].split('|')[1] for r in recep_indices]


object_name = objects[row['object_idx']]
room_name = rooms[row['room_idx']]
correct_receps = receptacle_names(row['correct'])
misplaced_receps = receptacle_names(row['misplaced'])
implausible_receps = receptacle_names(row['implausible'])

print(f'Object: {object_name}, Room: {room_name}')
print(f"Correct (ranked): {', '.join(correct_receps)}")
print(f"Incorrect (ranked): {', '.join(misplaced_receps)}")
print(f"Implausible:  {', '.join(implausible_receps)}")

## Cluster object-room pairs based on annotator receptacle annotations

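Each object has 10 x num_rooms_obj annotations, i.e. 10 annotations for every object-room pair. This section reads `housekeep_less_than_fair_agreement.txt`, which is written by the Fleiss' Kappa section further down, so run that section first.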

In [6]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import AgglomerativeClustering
import numpy as np
import pandas as pd
import seaborn as sns

# hyperparameters
CLUSTERS = 4          # largest number of clusters to try for a pair
MIN_CLUSTER_SIZE = 3  # a clustering is accepted only if every cluster is at least this big

disagreed_object_names = []

# Load the "<object>/<room>" keys of low-agreement pairs from file.
# Each line is "<object>/<room>,<score>"; only the key before the comma is kept.
# NOTE: this file is produced by the Fleiss' kappa cell later in the notebook.
with open('housekeep_less_than_fair_agreement.txt', 'r') as fh:
  for line in fh:
    disagreed_object_names.append(line.split(',')[0])

# Set for O(1) membership tests inside the double loop.
disagreed_pairs = set(disagreed_object_names)

# One dict per clustered pair; turned into a DataFrame once at the end
# instead of growing a frame with pd.concat on every iteration.
clustering_records = []

for object_idx in range(len(objects)):
  object_name = objects[object_idx]

  for room_idx in range(len(rooms)):
    room_name = rooms[room_idx]

    if '{}/{}'.format(object_name, room_name) not in disagreed_pairs:
      continue

    print(f'obj: {object_name}, room: {room_name}') #DEBUG

    # Annotations for this object-room pair only. The previous filter matched
    # on object_idx alone, so every room of an object was clustered on the
    # same pooled rows.
    pair_rows = data[(data['object_idx'] == object_idx) & (data['room_idx'] == room_idx)]

    # Clustering needs at least two annotations.
    if len(pair_rows) < 2:
      continue

    annotators = []
    datapoints = []

    # lists to vector: [correct | misplaced | implausible] multi-hot blocks
    for _, d in pair_rows.iterrows():
      vec = np.concatenate([receptacle_labels2vec(d['correct']),
                            receptacle_labels2vec(d['misplaced']),
                            receptacle_labels2vec(d['implausible'])])

      annotators.append(d['annotator_idx'])
      datapoints.append(vec[np.newaxis, :])

    # array of datapoints, one row per annotation
    datapoints_array = np.concatenate(datapoints, axis=0)

    # pairwise distance matrix
    pairwise_dist_mat = np.zeros((len(datapoints_array), len(datapoints_array)))
    for i1, d1 in enumerate(datapoints_array):
      for i2, d2 in enumerate(datapoints_array):
        pairwise_dist_mat[i1, i2] = distance_metric(d1, d2)

    # Try CLUSTERS, CLUSTERS-1, ..., 2 clusters and keep the first clustering
    # whose clusters all reach MIN_CLUSTER_SIZE. num_clusters is the loop
    # variable, so the value recorded below is the one actually used.
    clusters = None
    max_clusters = min(CLUSTERS, len(datapoints_array))
    for num_clusters in range(max_clusters, 1, -1):
      # NOTE(review): newer scikit-learn releases rename `affinity` to
      # `metric`; switch the keyword if this call fails -- confirm against
      # the installed version.
      candidate = AgglomerativeClustering(n_clusters=num_clusters, affinity="precomputed", linkage="single").fit_predict(pairwise_dist_mat)
      _, cluster_counts = np.unique(candidate, return_counts=True)
      # print(f'for {num_clusters} clusters, cluster asgns: {candidate}') #DEBUG
      if all(count >= MIN_CLUSTER_SIZE for count in cluster_counts):
        clusters = candidate
        break

    if clusters is None:
      # print(f'{object_name}: no clustering!')
      continue

    clustering_records.append({
        'objectid': object_idx,
        'roomid': room_idx,
        'num_clusters': num_clusters,
        'cluster_asgns': (len(clusters)*'{}-').format(*clusters),
        'object': object_name,
    })

clustering_pandas_df = pd.DataFrame(
    clustering_records,
    columns=['objectid', 'roomid', 'num_clusters', 'cluster_asgns', 'object'])

print('done')


obj: action_figure, room: bathroom
obj: action_figure, room: bedroom
obj: action_figure, room: childs_room
obj: action_figure, room: closet
obj: action_figure, room: corridor
obj: action_figure, room: dining_room
obj: action_figure, room: exercise_room
obj: action_figure, room: garage
obj: action_figure, room: home_office
obj: action_figure, room: kitchen
obj: action_figure, room: living_room
obj: action_figure, room: lobby
obj: action_figure, room: pantry_room
obj: action_figure, room: playroom
obj: action_figure, room: storage_room
obj: action_figure, room: television_room
obj: action_figure, room: utility_room
obj: adjustable_wrench, room: bathroom
obj: adjustable_wrench, room: bedroom
obj: adjustable_wrench, room: closet
obj: adjustable_wrench, room: corridor
obj: adjustable_wrench, room: dining_room
obj: adjustable_wrench, room: exercise_room
obj: adjustable_wrench, room: garage
obj: adjustable_wrench, room: lobby
obj: adjustable_wrench, room: pantry_room
obj: android_figure, room

## Calculate annotator agreement scores (Fleiss' Kappa)

In [None]:
def fleiss_kappa(subject_label_mat):
    """Compute a Fleiss' kappa agreement score from a subject-by-category count matrix.

    Args:
        subject_label_mat: 2-D np.ndarray where entry [i, j] is the number of
            ratings that put subject i into category j.

    Returns:
        The kappa score as a float. Subjects with fewer than two ratings are
        ignored, because they contain no rater pair to agree or disagree
        (a single rating would otherwise give 0/0 and turn the whole score
        into NaN). If no subject is left, NaN is returned.
    """

    assert isinstance(subject_label_mat, np.ndarray)

    rowwise_sum = np.sum(subject_label_mat, axis=1)
    rated_rows = np.where(rowwise_sum >= 2)[0]

    # print('rated rows: ', rated_rows) #DEBUG

    # Nothing to score: return NaN directly instead of dividing by zero below.
    if len(rated_rows) == 0:
        return np.nan

    working_matrix = subject_label_mat[rated_rows, :]
    rowwise_sum_working = np.sum(working_matrix, axis=1) # number of ratings per subject

    #DEBUG
    # print('number of receptacles: ', len(working_matrix), ' | num of ratings for this obj: ', np.sum(working_matrix))
    # TODO: assert len(working_matrix)*10 == np.sum(working_matrix) # 10 annotators per  

    # proportion of assignments belonging to jth category/column
    p_j_array = np.sum(working_matrix, axis=0)/np.sum(working_matrix)
    assert len(p_j_array) == working_matrix.shape[1]

    # agreement score for subject i w.r.t. total number of annotator pairs
    P_i_array = np.sum(working_matrix**2 - working_matrix, axis=1)/(rowwise_sum_working**2 - rowwise_sum_working)
    assert len(P_i_array) == working_matrix.shape[0]

    # mean observed agreement
    P_i_mean = np.sum(P_i_array)/len(P_i_array)

    # expected chance agreement
    P_e_mean = np.sum(p_j_array**2)

    # The 1e-3 keeps the denominator non-zero when all ratings fall in one category.
    return 1.0*(P_i_mean-P_e_mean)/(1-P_e_mean+1e-3)

In [None]:
# Scores below this value are treated as less than fair agreement.
FAIR_AGREEMENT_THRESHOLD = 0.2

# Maps "<object>/<room>" to the Fleiss' kappa of that pair's annotations.
object_agreement_scores = dict({})

# Sizes come from the loaded lists rather than hard-coded counts.
for object_idx in range(len(objects)):
    for room_idx in range(len(rooms)):
        # Boolean mask selects this pair's rows independently of the index labels.
        pair_rows = data[(data['object_idx']==object_idx)&(data['room_idx']==room_idx)]

        # receptacle2labels[rcpt] = number of annotations per label, where
        # column 0 = correct and column 1 = incorrect (misplaced or implausible)
        receptacle2labels = np.zeros((len(room_receps), 2))

        for _, d in pair_rows.iterrows():
            for rcpt in d['correct']:
                receptacle2labels[rcpt, 0] += 1

            for rcpt in d['misplaced']:
                receptacle2labels[rcpt, 1] += 1

            for rcpt in d['implausible']:
                receptacle2labels[rcpt, 1] += 1

        object_agreement_scores['{}/{}'.format(objects[object_idx], rooms[room_idx])] = fleiss_kappa(receptacle2labels)

# save all object-room pairs with agreement scores less than the threshold,
# one "<object>/<room>,<score>" line each
with open('housekeep_less_than_fair_agreement.txt', 'w') as fw:
    for k, v in object_agreement_scores.items():
        if v < FAIR_AGREEMENT_THRESHOLD:
            fw.write(f'{k},{v}\n')


## Misc: Clustering object-receptacle pairs based on distance threshold

In [None]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import pandas as pd
import seaborn as sns

# Distance threshold passed to AgglomerativeClustering as distance_threshold.
THRESHOLD = 0.3

# One dict per object; converted to a DataFrame once after the loop instead
# of growing a frame with pd.concat on every iteration.
clustering_records = []

for object_idx in range(len(objects)):

  # All annotations of this object (across rooms), selected with a boolean
  # mask so the result does not depend on the frame's index labels.
  object_rows = data[data['object_idx'] == object_idx]

  # Clustering needs at least two annotations.
  if len(object_rows) < 2:
    continue

  annotators = []
  datapoints = []

  # lists to vector: [correct | misplaced | implausible] multi-hot blocks
  for _, d in object_rows.iterrows():

    vec = np.concatenate([receptacle_labels2vec(d['correct']),
                          receptacle_labels2vec(d['misplaced']),
                          receptacle_labels2vec(d['implausible'])])

    annotators.append(d['annotator_idx'])
    datapoints.append(vec[np.newaxis, :])

  # array of datapoints, one row per annotation
  datapoints_array = np.concatenate(datapoints, axis=0)

  # pairwise distance matrix
  pairwise_dist_mat = np.zeros((len(datapoints_array), len(datapoints_array)))
  for i1, d1 in enumerate(datapoints_array):
    for i2, d2 in enumerate(datapoints_array):
      pairwise_dist_mat[i1, i2] = distance_metric(d1, d2)

  # clusters: with n_clusters=None the count is determined by THRESHOLD
  # NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
  # switch the keyword if this call fails -- confirm against the installed version.
  clusters = AgglomerativeClustering(n_clusters=None, affinity="precomputed", linkage="single", distance_threshold=THRESHOLD).fit_predict(pairwise_dist_mat)
  num_clusters = 1 + max(clusters)

  clustering_records.append({
      'objectid': object_idx,
      'num_clusters': num_clusters,
      'cluster_asgns': (len(clusters)*'{}-').format(*clusters),
  })

# 'roomid' is kept in the schema (left empty) because this pass clusters per object.
clustering_pandas_df = pd.DataFrame(
    clustering_records,
    columns=['objectid', 'roomid', 'num_clusters', 'cluster_asgns'])

In [None]:
# Output file for each supported threshold; any other value saves nothing.
csv_path_by_threshold = {
    0.5: './dataframe_distthresh-5en1.csv',
    0.7: './dataframe_distthresh-7en1.csv',
    0.3: './dataframe_distthresh-3en1.csv',
}

if THRESHOLD in csv_path_by_threshold:
    clustering_pandas_df.to_csv(csv_path_by_threshold[THRESHOLD], encoding = 'utf-8-sig')

In [None]:
import matplotlib.pyplot as plt

THRESHOLD = 0.3

# Reload the clustering results saved for this threshold; for any other
# value the in-memory clustering_pandas_df is used as is.
csv_path = {
    0.5: './dataframe_distthresh-5en1.csv',
    0.7: './dataframe_distthresh-7en1.csv',
    0.3: './dataframe_distthresh-3en1.csv',
}.get(THRESHOLD)
if csv_path is not None:
    clustering_pandas_df = pd.read_csv(csv_path)

# Build the plotting frame in one shot instead of concatenating a
# one-row DataFrame per input row.
tuples_to_plot = pd.DataFrame({
    'obj_room': ['o{}r{}'.format(objectid, roomid)
                 for objectid, roomid in zip(clustering_pandas_df['objectid'],
                                             clustering_pandas_df['roomid'])],
    'num_clusters': clustering_pandas_df['num_clusters'].to_numpy(),
})

# Histogram of how many clusters each entry was split into.
plt.figure(figsize=(12, 8))
sns.histplot(data=tuples_to_plot, x='num_clusters', discrete=True)
plt.xticks(range(11))
tuples_to_plot.head()
