In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
def converter(instr):
    """Parse a delimited, comma-separated string of integers into an int array.

    The first and last characters of ``instr`` (the enclosing delimiters of a
    string such as ``"[1,2,3]"``) are dropped before the remainder is parsed.
    """
    comma_separated = instr[1:-1]
    parsed = np.fromstring(comma_separated, dtype=int, sep=',')
    return parsed

In [3]:
# Location of the merged per-box label CSV, relative to the notebook's working directory.
path = '../../gt_data/triple_inspected_May23rd/merged/label_per_box_sanity_checked_removed_unwanted_labels_unified_labels.csv'
# Disabled alternative: would run `converter` on the `ilabel` column while reading.
# With the plain read below, `ilabel` is left exactly as the default CSV parser yields it.
#df = pd.read_csv(path, converters={'ilabel':converter})
df = pd.read_csv(path)
df.head()

In [4]:
# (rows, columns) of the freshly loaded annotations.
df.shape

In [5]:
from collections import Counter


def label_per_box(df):
    """Collect the distinct ``ilabel`` values attached to each bounding box.

    A box is identified by the combination of ``ifile``, ``normx``, ``normy``,
    ``normwidth`` and ``normheight``.

    Returns a frame with one row per box holding the box key columns, the
    sorted list of distinct ``ilabel`` values (``ilabel``) and the length of
    that list (``num_labels``).
    """
    box_keys = ['ifile', 'normx', 'normy', 'normwidth', 'normheight']
    distinct_per_box = df.groupby(box_keys)['ilabel'].unique()
    grouped = distinct_per_box.reset_index()
    # Turn each array of distinct values into a sorted plain list.
    grouped['ilabel'] = grouped['ilabel'].apply(lambda values: sorted(values))
    grouped['num_labels'] = list(map(len, grouped['ilabel']))
    return grouped

def combo_freq(df):
    """Tabulate how many boxes carry each distinct label combination.

    Uses ``label_per_box`` to get one label list per box, then counts
    identical lists. Returns a frame with the columns ``'Label Combination'``
    (a tuple) and ``'Frequency'`` (the number of boxes with that tuple).
    """
    per_box = label_per_box(df)
    # Lists are unhashable, so every combination is counted as a tuple.
    combo_counts = Counter(tuple(labels) for labels in per_box['ilabel'])
    return pd.DataFrame(
        list(combo_counts.items()),
        columns=['Label Combination', 'Frequency'],
    )


In [6]:
# Frequency of every per-box label combination in the loaded annotations.
freqs = combo_freq(df)

In [7]:
# Display the combination/frequency table.
freqs

In [8]:
# Highest count reached by any single label combination.
freqs['Frequency'].max()

In [9]:
# Order combinations from most to least frequent and renumber the index from 0.
freqs = freqs.sort_values('Frequency', ascending=False, ignore_index=True)

In [10]:
# One bar per label combination, in the current row order of `freqs`.
sns.barplot(freqs, x='Label Combination', y='Frequency')

In [11]:
# Show the most frequent combination(s). The maximum is computed here rather than
# hard-coded, so the cell stays correct when the input CSV changes.
freqs[freqs['Frequency'] == freqs['Frequency'].max()]

In [12]:
import pickle

# Load the pickled dictionary stored in label_id_dict.pickle.
# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
with open('../../gt_data/triple_inspected_May23rd/merged/label_id_dict.pickle', 'rb') as f:
    labels_dict = pickle.load(f)

In [13]:
# Invert `labels_dict` so that its values become the lookup keys.
id_labels_dict = {
    label_id: label_name
    for label_name, label_id in labels_dict.items()
}

In [14]:
# Look up the entries stored under ids 0 and 6.
# NOTE(review): presumably the ids of a combination inspected above — confirm.
print(id_labels_dict[0], id_labels_dict[6])

In [15]:
# Look up the entries stored under ids 0 and 5.
# NOTE(review): presumably the ids of a combination inspected above — confirm.
print(id_labels_dict[0], id_labels_dict[5])

## Processing/Filtering annotations that have freq < 5

In [16]:
# Keep only the rare combinations (fewer than 5 occurrences).
# NOTE: this overwrites `freqs`, so the full table is no longer available under that name.
freqs = freqs[freqs['Frequency'] < 5]
freqs

In [18]:
# Take the first element of every rare combination tuple.
not_allowed_combinations = [
    combo[0] for combo in freqs['Label Combination']
]
not_allowed_combinations

## Remove rare labels from the rare label_combos 

### Which labels are making the combos rare?

In [19]:
import pickle
import ast

# Load the pickled dictionary stored in labelId_categoryType_dict.pickle.
# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
# NOTE(review): `lId_category_dict` is not referenced again in the visible cells — confirm it is still needed.
with open('../../gt_data/triple_inspected_May23rd/merged/labelId_categoryType_dict.pickle', 'rb') as f:
    lId_category_dict = pickle.load(f)


In [20]:
# Count how many rows mention each individual label.
single_label_freqs = {}

for raw_labels in df['ilabel']:
    # Each cell stores its label list as text; parse it before counting.
    for label_id in ast.literal_eval(raw_labels):
        single_label_freqs[label_id] = single_label_freqs.get(label_id, 0) + 1

single_label_freqs_df = pd.DataFrame({
    'Label': list(single_label_freqs.keys()),
    'Frequency': list(single_label_freqs.values()),
})
single_label_freqs_df.head()

In [21]:
from matplotlib import pyplot as plt


# Most frequent labels first, with a fresh 0..n-1 index.
single_label_freqs_df = single_label_freqs_df.sort_values(
    by='Frequency', ascending=False, ignore_index=True
)
# Store the label ids as text before plotting them on the x axis.
single_label_freqs_df['Label'] = single_label_freqs_df['Label'].map(str)
plt.figure(figsize=(15, 8))
sns.barplot(single_label_freqs_df, x='Label', y='Frequency')

## Put a threshold for label freq 
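If a single label x appears rarely (fewer than 100 times, the threshold used in the next cell), remove that label from the rare combinations that contain it.
Removing only the rare label, rather than the whole annotation, allows us to still keep the bbox.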

In [22]:
# Labels that occur at least 100 times, converted back from their text form to ints.
is_frequent = single_label_freqs_df['Frequency'] >= 100
labels_more_than_100freq = [
    int(label_text)
    for label_text in single_label_freqs_df.loc[is_frequent, 'Label']
]

In [23]:
# How many individual labels pass the frequency threshold.
print(f'Number of remaining single labels: {len(labels_more_than_100freq)}')

In [24]:
# Translate the surviving label ids through `id_labels_dict` for readability.
remaining_single_labels = [id_labels_dict[i] for i in labels_more_than_100freq]
print(f"Remaining single labels: {remaining_single_labels}")

In [25]:
# Inspection only: container type of the rare-combination collection.
type(not_allowed_combinations)

In [26]:
# Inspection only: type of a single entry, checked before the entries are parsed below.
type(not_allowed_combinations[0])

In [27]:
# Parse the text form of every rare combination into a Python object so it can be
# compared against parsed label lists. Entries that are already parsed are kept
# unchanged, so re-running this cell does not raise.
not_allowed_combinations = [
    ast.literal_eval(combo) if isinstance(combo, str) else combo
    for combo in not_allowed_combinations
]

In [28]:
def remove_rare_labels(label_ids, rare_combos=None, frequent_labels=None):
    """Strip infrequent labels from a label list that forms a rare combination.

    Parameters
    ----------
    label_ids : list
        Label ids attached to one bounding box.
    rare_combos : list of list, optional
        Combinations regarded as too rare. When omitted, the notebook-level
        ``not_allowed_combinations`` is used.
    frequent_labels : collection, optional
        Label ids that may be kept. When omitted, the notebook-level
        ``labels_more_than_100freq`` is used.

    Returns
    -------
    str
        ``str()`` of the label list: unchanged when ``label_ids`` is not a rare
        combination, otherwise restricted to the frequent labels.

    Raises
    ------
    ValueError
        If ``label_ids`` is a rare combination and none of its labels is frequent.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if rare_combos is None:
        rare_combos = not_allowed_combinations
    if frequent_labels is None:
        frequent_labels = labels_more_than_100freq

    if label_ids not in rare_combos:
        return str(label_ids)

    new_label_ids = [i for i in label_ids if i in frequent_labels]
    if len(new_label_ids) == 0:
        raise ValueError(f'Empty label list after label list {label_ids}!')
    return str(new_label_ids)

In [29]:
# Inspection only: type of the first `ilabel` entry before it is parsed below.
type(df['ilabel'][0])

In [30]:
# Parse each stored label string and clean it in a single pass. The column is only
# overwritten once the whole pass has succeeded, so if `remove_rare_labels` raises,
# `ilabel` still holds the original strings and the cell can simply be re-run.
df['ilabel'] = df['ilabel'].apply(
    lambda raw_labels: remove_rare_labels(ast.literal_eval(raw_labels))
)
df['ilabel']

Note that at this point the dataframe has the same size as the initial dataframe loaded at the beginning of the notebook. This confirms that no bbox has been removed during the filtering process so far: only the label lists were rewritten.

In [31]:
# Recompute the combination frequencies on the cleaned labels and plot them, most frequent first.
freqs = combo_freq(df)
freqs = freqs.sort_values('Frequency', ascending=False, ignore_index=True)
sns.barplot(freqs, x='Label Combination', y='Frequency')

In [32]:
# Display the recomputed combination/frequency table.
freqs

In [33]:
# Shape of the sub-table of combinations with a frequency below 2.
freqs[freqs['Frequency'] < 2].shape

### From here, we need to remove the files that contain the rare combos
This is because the alternative is to remove only the bbox, which would not be feasible: we would end up with an image containing, say, 3 people, 2 of them with annotated bboxes and the third one left unannotated. That would be wrong.

In [34]:
# Keep only the combinations with a frequency below 2 (overwrites `freqs`),
# then take the first element of each combination tuple.
freqs = freqs[freqs['Frequency'] < 2]
not_allowed_combinations = freqs['Label Combination'].apply(lambda x: x[0]).tolist()
not_allowed_combinations

In [35]:
# Build a file key of the form "<data_source>_<ifile>".
df['unique_ifile'] = df['data_source'] + df['ifile'].map('_{}'.format)
# Files that contain at least one box whose label entry is a disallowed combination.
has_rare_combo = df['ilabel'].isin(not_allowed_combinations)
to_be_removed_files = df.loc[has_rare_combo, 'unique_ifile'].tolist()
to_be_removed_files

In [62]:
# Number of distinct files before any file is removed.
df['unique_ifile'].nunique()

In [64]:
# Number of distinct files left once the flagged files are excluded
# (the flagged list may repeat a file, hence the set).
df['unique_ifile'].nunique() - len(set(to_be_removed_files))

In [37]:
# Drop every row that belongs to a flagged file, so all boxes of such a file go together.
filtered_df = df[~df['unique_ifile'].isin(to_be_removed_files)]

In [38]:
# Preview the first rows of the filtered annotations.
filtered_df.head()

In [63]:
# Number of distinct files that remain after filtering.
filtered_df['unique_ifile'].nunique()

In [39]:
# Recompute the combination frequencies on the filtered annotations and plot them, most frequent first.
freqs = combo_freq(filtered_df)
freqs = freqs.sort_values('Frequency', ascending=False, ignore_index=True)
sns.barplot(freqs, x='Label Combination', y='Frequency')

In [40]:
# Shape of the frequency table; its row count is the number of distinct combinations left.
freqs.shape

In [41]:
# Shape of the sub-table of remaining combinations with fewer than 5 occurrences.
freqs[freqs['Frequency'] < 5].shape

In [42]:
# Every individual label id that still occurs somewhere in the filtered annotations.
unique_labels = {
    label_id
    for raw_labels in filtered_df['ilabel']
    for label_id in ast.literal_eval(raw_labels)
}

final_remaining_single_labels = [
    id_labels_dict[label_id] for label_id in unique_labels
]
print(f"Final remaining single labels: {final_remaining_single_labels}")

In [43]:
# Number of individual labels that survive all filtering steps.
len(final_remaining_single_labels)

### Fix the num_labels column

In [44]:
import ast
# Replace the existing `num_labels` column with the length of each parsed label list.
# Dropping it first makes the recomputed column land at the end of the frame.
filtered_df = (
    filtered_df
    .drop(columns=['num_labels'])
    .assign(num_labels=lambda frame: frame['ilabel'].map(
        lambda raw_labels: len(ast.literal_eval(raw_labels))))
)
filtered_df.head()

## Re-arrange the labels after thresholding

In [45]:
# Display the original id lookup built from `labels_dict`, for reference.
id_labels_dict

In [46]:
# Ids present in the original dictionary that no longer occur in the filtered annotations.
removed_labels = set(id_labels_dict.keys()) - unique_labels
removed_labels

In [47]:
# Original id lookup restricted to the ids that survived (original ids kept as keys).
new_id_dict = {key: value for key, value in id_labels_dict.items() if key not in removed_labels}
new_id_dict

In [48]:
# Re-number the surviving entries 0, 1, 2, ... following the iteration order of `new_id_dict`.
final_id_dict = dict(enumerate(new_id_dict.values()))

final_id_dict

In [49]:
# Inverse of `final_id_dict`: its values become the keys and the new ids become the values.
final_labels_dict = {v: k for k, v in final_id_dict.items()}
final_labels_dict

In [50]:
# Persist `final_labels_dict` as a pickle next to the input data.
path = '../../gt_data/triple_inspected_May23rd/merged/label_id_dict_after_distr_thresholding.pickle'

with open(path, 'wb') as f:
    pickle.dump(final_labels_dict, f)

In [51]:
# Also write the mapping as a two-column CSV (`label`, `id`) without the index column.
path = '../../gt_data/triple_inspected_May23rd/merged/label_id_dict_after_distr_thresholding.csv'
labels_df = pd.DataFrame.from_dict({'label': final_id_dict.values(), 'id': final_id_dict.keys()})
labels_df.to_csv(path, index=False)

#### Fix label_id to category_type dict

In [52]:
# Hand-maintained label groups; every remaining label not listed here is treated as an activity.
all_labels = set(final_labels_dict.keys())
conditions = ['alone', 'group', 'couple/2people']
states = ['sitting', 'standing', 'walking', 'running', 'biking', 'mobility aids', 'riding carriage']
others = ['with luggage', 'with coffee or drinks', 'with bike', 'pet', 'kid', 'teenager', 'elderly', 'baby/infant']
# Sorted so the list (and its displayed output) is the same on every run; the
# iteration order of a set of strings can differ between interpreter sessions.
activities = sorted(all_labels - set(conditions) - set(states) - set(others))
activities

In [53]:
# Assign a category name to the new id of every label, one label group at a time.
# A label missing from `final_labels_dict` raises KeyError.
ids_category_dict = {}
category_groups = (
    ('condition', conditions),
    ('state', states),
    ('activity', activities),
    ('other', others),
)
for category_name, group_labels in category_groups:
    for label in group_labels:
        ids_category_dict[final_labels_dict[label]] = category_name
ids_category_dict

In [54]:
# Rebuild the dict with its keys (the label ids) in ascending order.
ids_category_dict = dict(sorted(ids_category_dict.items()))
ids_category_dict

In [55]:
import pickle

# Persist the id -> category mapping as a pickle next to the input data.
path = '../../gt_data/triple_inspected_May23rd/merged/labelId_categoryType_dict_after_distr_thresholding.pickle'
with open(path, 'wb') as f:
    pickle.dump(ids_category_dict, f)

### Fix label ids in the final bbox annotations

In [56]:
def new_label_ids(label_ids):
    """Translate a stringified list of old label ids into the re-indexed ids.

    ``label_ids`` is parsed with ``ast.literal_eval``. Each old id is looked up
    in ``id_labels_dict``, and the result is then looked up in
    ``final_labels_dict``. Returns the list of new ids in the same order.
    """
    old_ids = ast.literal_eval(label_ids)

    # First pass: old id -> entry of the original lookup.
    label_names = []
    for old_id in old_ids:
        label_names.append(id_labels_dict[old_id])

    # Second pass: that entry -> new id.
    remapped_ids = []
    for label_name in label_names:
        remapped_ids.append(final_labels_dict[label_name])
    return remapped_ids

In [57]:
# Replace each row's stringified old ids with a list of the re-indexed ids.
# NOTE: afterwards `ilabel` holds lists rather than strings, so this cell cannot be run a
# second time (`new_label_ids` parses its argument with ast.literal_eval).
filtered_df['ilabel'] = filtered_df['ilabel'].apply(lambda x: new_label_ids(x))
filtered_df.head()

In [59]:
# (rows, columns) of the final annotation table.
filtered_df.shape

In [58]:
# Write the final annotations to a new CSV (index column omitted); the input file is left untouched.
path = '../../gt_data/triple_inspected_May23rd/merged/label_per_box_sanity_checked_removed_unwanted_labels_unified_labels_after_distr_thresholding.csv'
filtered_df.to_csv(path, index=False)

In [60]:
# Number of distinct files in the saved annotations.
filtered_df['unique_ifile'].nunique()