In [1]:
import pandas as pd
import numpy as np

In [2]:
def converter(instr):
    """Parse a bracketed, comma-separated list of ints from a CSV cell.

    Parameters
    ----------
    instr : str
        Text such as ``"[1, 2, 3]"``. Surrounding whitespace is ignored; the
        first and last remaining characters are treated as the brackets and
        dropped.

    Returns
    -------
    numpy.ndarray
        1-D integer array with the parsed values; an empty integer array
        when nothing is left between the brackets.
    """
    # Strip padding first so the slice really removes the brackets and not a
    # stray space or newline around them.
    inner = instr.strip()[1:-1]
    if not inner.strip():
        # Nothing between the brackets: return an explicit empty int array.
        return np.array([], dtype=int)
    return np.fromstring(inner, sep=',', dtype=int)

In [3]:
bing_label_ids_path = '../../gt_data/triple_inspected_May23rd/bing/label_idx.csv'
google_label_ids_path = '../../gt_data/triple_inspected_May23rd/google/label_idx.csv'


def _load_label_dict(csv_path):
    """Build a ``{lower-cased label: ilabel}`` dict from a label-index CSV.

    Parameters
    ----------
    csv_path : str
        Path of a CSV file that has a ``label`` and an ``ilabel`` column.

    Returns
    -------
    dict
        Maps each lower-cased ``label`` value to the ``ilabel`` value of the
        same row.

    Raises
    ------
    ValueError
        If two rows carry the same label once lower-cased.
    """
    label_table = pd.read_csv(csv_path)
    labels_dict = {}
    # Walk the two columns in lockstep instead of materialising a Series per
    # row with iterrows().
    for raw_label, ilabel in zip(label_table['label'], label_table['ilabel']):
        label = raw_label.lower()
        if label in labels_dict:
            raise ValueError("Label happening multiple times!")
        labels_dict[label] = ilabel
    return labels_dict


# One mapping per data source; both are built by the same helper so the two
# sources cannot drift apart.
bing_labels_dict = _load_label_dict(bing_label_ids_path)
google_labels_dict = _load_label_dict(google_label_ids_path)

In [4]:
# Number of distinct (lower-cased) labels loaded for each source.
print(len(bing_labels_dict))
print(len(google_labels_dict))

In [5]:
# Id that the Bing label table assigns to 'not sure/confusing'.
bing_labels_dict['not sure/confusing']

In [6]:
# Id that the Google label table assigns to 'not sure/confusing'.
google_labels_dict['not sure/confusing']

In [31]:
# Display the full Bing label -> id mapping.
bing_labels_dict

# Finalized labels list in order, i.e., conditions, states, activities, others

### label -> ids

In [7]:
# Union of the label names known to either source.
all_labels = set(bing_labels_dict) | set(google_labels_dict)

In [8]:
# Size of the combined Bing + Google label vocabulary.
len(all_labels)

In [9]:
# Display the combined label set.
all_labels

In [10]:
# Hand-curated category membership for the finalized label list.
conditions = ['alone', 'group', 'couple/2people']
states = ['on wheelchair', 'sitting', 'standing', 'walking', 'running', 'biking', 'mobility aids', 'riding carriage', 'riding motorcycle']
others = ['with luggage', 'with coffee or drinks', 'with bike', 'pet', 'kid', 'police', 'teenager', 'elderly', 'baby/infant']
# Labels that are excluded from the finalized list and later stripped from
# the per-box label ids.
to_be_removed = ['model_hint', 'multi-label', 'duplicate', 'no people', 'not sure/confusing', 'no interaction', 'skating']
# Every remaining label counts as an activity. The result is sorted because
# the iteration order of a set of strings changes between interpreter runs
# (string hash randomisation), and the finalized ids are assigned from the
# position of each label in this list - an unsorted list would hand out
# different ids on every kernel restart.
activities = sorted(all_labels - set(conditions) - set(states) - set(others) - set(to_be_removed))
activities

In [11]:
# Finalized label order: conditions first, then states, activities, others.
finalalized_labels = [*conditions, *states, *activities, *others]
finalalized_labels

In [12]:
# A label's finalized id is its position in `finalalized_labels`.
finalalized_labels_dict = {
    label_name: position
    for position, label_name in enumerate(finalalized_labels)
}
finalalized_labels_dict

In [13]:
import pickle

# Persist the label -> finalized id mapping.
path = '../../gt_data/triple_inspected_May23rd/merged/label_id_dict.pickle'
with open(path, mode='wb') as pickle_file:
    pickle.dump(finalalized_labels_dict, pickle_file)

In [13]:
# NOTE(review): this cell re-binds `finalalized_labels_dict` to whatever is
# stored in the pickle below, replacing the dict built earlier in the notebook.
# The path read here ('../../gt_data/merged/...') is not the path the dump
# cell writes to ('../../gt_data/triple_inspected_May23rd/merged/...') -
# confirm which file is intended before relying on the ids.
import pickle

path = '../../gt_data/merged/label_id_dict.pickle'
with open(path, 'rb') as f:
    # pickle.load can execute arbitrary code; only load files you trust.
    finalalized_labels_dict = pickle.load(f)

### label_id -> category_type

In [14]:
# Map every finalized label id to the name of the category its label is in.
ids_category_dict = {}
for category_labels, category_name in (
    (conditions, 'condition'),
    (states, 'state'),
    (activities, 'activity'),
    (others, 'other'),
):
    for label in category_labels:
        ids_category_dict[finalalized_labels_dict[label]] = category_name

In [15]:
# Display the finalized id -> category mapping.
ids_category_dict

In [16]:
import pickle

# Persist the finalized id -> category mapping.
path = '../../gt_data/triple_inspected_May23rd/merged/labelId_categoryType_dict.pickle'
with open(path, mode='wb') as pickle_file:
    pickle.dump(ids_category_dict, pickle_file)

# Add "is_challenging" column

In [17]:
# Per-box ground-truth tables for both sources. The 'ilabel' column is parsed
# with `converter` while the file is read.
bing_gt_bbox_labels_path = '../../gt_data/triple_inspected_May23rd/bing/per_file_per_box.csv'
google_gt_bbox_labels_path = '../../gt_data/triple_inspected_May23rd/google/per_file_per_box.csv'

bing_gt_bbox_labels = pd.read_csv(
    bing_gt_bbox_labels_path,
    converters={'ilabel': converter},
)
google_gt_bbox_labels = pd.read_csv(
    google_gt_bbox_labels_path,
    converters={'ilabel': converter},
)

In [18]:
def bing_set_is_challenging(label_ids):
    """Tell whether a box carries Bing's 'not sure/confusing' label.

    Parameters
    ----------
    label_ids : sequence of int
        Bing label ids attached to one box.

    Returns
    -------
    bool
        True if the id stored under 'not sure/confusing' in
        ``bing_labels_dict`` occurs in ``label_ids``, otherwise False.
    """
    return bing_labels_dict['not sure/confusing'] in label_ids

def google_set_is_challenging(label_ids):
    """Tell whether a box carries Google's 'not sure/confusing' label.

    Parameters
    ----------
    label_ids : sequence of int
        Google label ids attached to one box.

    Returns
    -------
    bool
        True if the id stored under 'not sure/confusing' in
        ``google_labels_dict`` occurs in ``label_ids``, otherwise False.
    """
    return google_labels_dict['not sure/confusing'] in label_ids

In [19]:
# Flag every box whose label ids contain the source's 'not sure/confusing' id.
bing_gt_bbox_labels['is_challenging'] = bing_gt_bbox_labels['ilabel'].apply(bing_set_is_challenging)
google_gt_bbox_labels['is_challenging'] = google_gt_bbox_labels['ilabel'].apply(google_set_is_challenging)

In [20]:
# Number of Bing boxes flagged as challenging.
bing_gt_bbox_labels['is_challenging'].sum()

In [21]:
# Number of Google boxes flagged as challenging.
google_gt_bbox_labels['is_challenging'].sum()

# Remove unwanted labels from bing and google

In [22]:
# Source-specific ids of the labels listed in `to_be_removed`; labels a
# source does not know are skipped.
bing_to_be_removed_label_ids = [
    bing_labels_dict[unwanted]
    for unwanted in to_be_removed
    if unwanted in bing_labels_dict
]
google_to_be_removed_label_ids = [
    google_labels_dict[unwanted]
    for unwanted in to_be_removed
    if unwanted in google_labels_dict
]

def bing_remove_unwanted_labels(label_ids):
    """Drop the ids listed in ``bing_to_be_removed_label_ids``.

    Parameters
    ----------
    label_ids : iterable of int
        Bing label ids attached to one box.

    Returns
    -------
    list
        The remaining ids, in their original order.
    """
    kept_ids = []
    for candidate in label_ids:
        if candidate in bing_to_be_removed_label_ids:
            continue
        kept_ids.append(candidate)
    return kept_ids

def google_remove_unwanted_labels(label_ids):
    """Drop the ids listed in ``google_to_be_removed_label_ids``.

    Parameters
    ----------
    label_ids : iterable of int
        Google label ids attached to one box.

    Returns
    -------
    list
        The remaining ids, in their original order.
    """
    kept_ids = []
    for candidate in label_ids:
        if candidate in google_to_be_removed_label_ids:
            continue
        kept_ids.append(candidate)
    return kept_ids

In [23]:
# Strip the unwanted label ids from every box; 'ilabel' cells become lists.
bing_gt_bbox_labels['ilabel'] = bing_gt_bbox_labels['ilabel'].apply(bing_remove_unwanted_labels)
google_gt_bbox_labels['ilabel'] = google_gt_bbox_labels['ilabel'].apply(google_remove_unwanted_labels)

#### Check if any ilabel array is empty

In [24]:
# Count the Bing boxes that are left without any label id.
bing_is_empty = bing_gt_bbox_labels['ilabel'].apply(lambda ids: len(ids) == 0)
bing_is_empty.sum()

In [25]:
# Count the Google boxes that are left without any label id.
google_is_empty = google_gt_bbox_labels['ilabel'].apply(lambda ids: len(ids) == 0)
google_is_empty.sum()

# Unify labels: map bing and google ids to the finalized label ids

### get id->label dict

In [26]:
# Invert the per-source label -> id mappings into id -> label lookups.
bing_id_label_dict = dict(zip(bing_labels_dict.values(), bing_labels_dict.keys()))
google_id_label_dict = dict(zip(google_labels_dict.values(), google_labels_dict.keys()))

In [27]:
def bing_replace_labelids_with_finalized_ids(label_ids):
    """Translate Bing label ids into finalized label ids.

    Each id is looked up in ``bing_id_label_dict`` to get its label name,
    and that name is looked up in ``finalalized_labels_dict``.

    Parameters
    ----------
    label_ids : iterable of int
        Bing label ids attached to one box.

    Returns
    -------
    list
        Finalized ids, one per input id, in the same order.

    Raises
    ------
    KeyError
        If an id or its label name is missing from the lookup dicts.
    """
    return [
        finalalized_labels_dict[bing_id_label_dict[source_id]]
        for source_id in label_ids
    ]

def google_replace_labelids_with_finalized_ids(label_ids):
    """Translate Google label ids into finalized label ids.

    Each id is looked up in ``google_id_label_dict`` to get its label name,
    and that name is looked up in ``finalalized_labels_dict``.

    Parameters
    ----------
    label_ids : iterable of int
        Google label ids attached to one box.

    Returns
    -------
    list
        Finalized ids, one per input id, in the same order.

    Raises
    ------
    KeyError
        If an id or its label name is missing from the lookup dicts.
    """
    return [
        finalalized_labels_dict[google_id_label_dict[source_id]]
        for source_id in label_ids
    ]

In [28]:
# Label ids of the Bing row with index label 0, before remapping to finalized ids.
bing_gt_bbox_labels['ilabel'][0]

In [29]:
# Rewrite every box's source-specific label ids as finalized label ids.
bing_gt_bbox_labels['ilabel'] = bing_gt_bbox_labels['ilabel'].apply(bing_replace_labelids_with_finalized_ids)
google_gt_bbox_labels['ilabel'] = google_gt_bbox_labels['ilabel'].apply(google_replace_labelids_with_finalized_ids)

In [30]:
# Same Bing row (index label 0) after remapping, for comparison.
bing_gt_bbox_labels['ilabel'][0]

In [32]:
# Save each source's table (is_challenging added, unwanted labels removed,
# ids remapped) into that source's folder, without the DataFrame index.
path = '../../gt_data/triple_inspected_May23rd/bing/label_per_box_sanity_checked_removed_unwanted_labels_unified_labels.csv'
bing_gt_bbox_labels.to_csv(path, index=False)

path = '../../gt_data/triple_inspected_May23rd/google/label_per_box_sanity_checked_removed_unwanted_labels_unified_labels.csv'
google_gt_bbox_labels.to_csv(path, index=False)

# Concatenate google and bing into one single df

In [33]:
# Tag every row with its source before merging: 'BSV' for the Bing table,
# 'GSV' for the Google table.
bing_gt_bbox_labels['data_source'] = 'BSV'
google_gt_bbox_labels['data_source'] = 'GSV'

In [34]:
# Stack the Bing rows on top of the Google rows with a fresh 0..n-1 index,
# then discard the 'Unnamed: 0' column.
final_df = pd.concat(
    [bing_gt_bbox_labels, google_gt_bbox_labels],
    ignore_index=True,
).drop(columns=['Unnamed: 0'])
final_df.tail()

In [35]:
# (rows, columns) of the merged table.
final_df.shape

In [36]:
# Number of boxes flagged as challenging across both sources.
final_df['is_challenging'].sum()

In [37]:
# Save the merged Bing + Google table, without the DataFrame index.
path = '../../gt_data/triple_inspected_May23rd/merged/label_per_box_sanity_checked_removed_unwanted_labels_unified_labels.csv'
final_df.to_csv(path, index=False)