In [1]:
import numpy as np
import pandas as pd 
import pickle
from glob import glob
import os

In [2]:
# Template stuff
imagenet_root_path = '/bigstor/zsarwar/Imagenet_2012'
imagenet_subsets_path = '/bigstor/zsarwar/Imagenet_2012_subsets'
template_path = os.path.join(imagenet_subsets_path, "Dogs_vs_Wolves_metadata.pkl")

df_template = pd.read_pickle(template_path)
df_template = pd.DataFrame.from_dict(df_template, orient='index')

dataset_path = "/bigstor/common_data/UPMC-food-101/images/test/*"
df_path = "/bigstor/common_data/UPMC-food-101/DF"
label_files = "/bigstor/common_data/UPMC-food-101/texts/test_titles.csv"

In [3]:
all_folders = glob(dataset_path)

In [4]:
cols = df_template.columns
metadata = []
for idx, folder in enumerate(all_folders):
    label = (folder.split("/")[-1])
    all_images = glob(folder + "/*")
    all_images = [img for img in all_images]
    for j in range(len(all_images)):
        t_dict = {}
        t_dict[cols[0]] = None
        t_dict[cols[1]] = label
        t_dict[cols[2]] = idx
        t_dict[cols[3]] = 'UPMC-food-101'
        t_dict[cols[4]] = 'UPMC-food-101'
        t_dict[cols[5]] = None
        t_dict[cols[6]] = all_images[j]
        t_dict['index'] = '/'.join(all_images[j].split("/")[-1:])
        metadata.append(t_dict)        



In [5]:
df_upmc = pd.DataFrame.from_dict(metadata)
df_upmc = df_upmc.set_index('index')
df_upmc.index.name = None

In [6]:
# Load foof101 df
df_food101 = pd.read_pickle('/bigstor/common_data/food_101/DF/df_food101_train.pkl')

In [7]:
# Allign labels of food_101 and UPMC

food101_classes = df_food101['class'].unique().tolist()
upmc_classes = df_upmc['class'].unique().tolist()

upmc_classes_clean = [cl.replace("_", " ") for cl in upmc_classes]
upmc_upmc_classes = {upmc_classes[i] : upmc_classes_clean[i] for i in range(len(upmc_classes))}
upmc_classes_unmatched = upmc_classes_clean

# Full string matching
upmc_food101 = {}
used_upmc_classes = []
for im_class in food101_classes:
    im = im_class.lower()
    upmc_classes_unmatched = [cl for cl in upmc_classes_unmatched if cl not in used_upmc_classes] 
    for ts_class in upmc_classes_clean:
        ts = ts_class.lower()
        if ts in im:
            used_upmc_classes.append(ts_class)
            upmc_food101[ts_class] = im_class
            break
upmc_classes_unmatched = [cl for cl in upmc_classes_unmatched if cl not in used_upmc_classes] 
food101_classes_unmatched = [cl for cl in food101_classes if cl not in list(upmc_food101.values())]
# Match remaining classes manually

In [8]:
# Get labels of matched classes in food101

matched_food101 = list(upmc_food101.values())


In [9]:
df_food101_matched = df_food101[df_food101['class'].isin(matched_food101)]

upmc_class_label = {}
for cl in upmc_food101.keys():
    key = upmc_food101[cl]
    lab = df_food101[df_food101['class'] == key]['label'].unique()[0]
    upmc_class_label[cl] = lab



In [10]:
# Relabel matched classes first
for key, val in upmc_upmc_classes.items():
    if val in upmc_food101.keys():
        df_upmc.loc[df_upmc['class'] == key, 'label'] = upmc_class_label[val]
        df_upmc.loc[df_upmc['class'] == key, 'class'] = upmc_food101[val]


In [11]:
used_labels = list(upmc_class_label.values())
# Relabel remaining classes
upmc_used = list(upmc_food101.keys())
upmc_classes = [cl for cl in upmc_classes if upmc_upmc_classes[cl] not in upmc_used] 

In [12]:
upmc_classes

[]

In [13]:
curr_label = 0
for cl in upmc_classes:
    while curr_label in used_labels:
        curr_label+=1
    df_upmc.loc[df_upmc['class'] == cl, 'label'] = curr_label
    curr_label+=1

In [14]:
df_upmc

Unnamed: 0,image_url,class,label,data_type,dataset,query,img_path
omelette_749.jpg,,Omelette,67,UPMC-food-101,UPMC-food-101,,/bigstor/common_data/UPMC-food-101/images/test...
omelette_850.jpg,,Omelette,67,UPMC-food-101,UPMC-food-101,,/bigstor/common_data/UPMC-food-101/images/test...
omelette_385.jpg,,Omelette,67,UPMC-food-101,UPMC-food-101,,/bigstor/common_data/UPMC-food-101/images/test...
omelette_886.jpg,,Omelette,67,UPMC-food-101,UPMC-food-101,,/bigstor/common_data/UPMC-food-101/images/test...
omelette_822.jpg,,Omelette,67,UPMC-food-101,UPMC-food-101,,/bigstor/common_data/UPMC-food-101/images/test...
...,...,...,...,...,...,...,...
beef_tartare_147.jpg,,Beef tartare,4,UPMC-food-101,UPMC-food-101,,/bigstor/common_data/UPMC-food-101/images/test...
beef_tartare_196.jpg,,Beef tartare,4,UPMC-food-101,UPMC-food-101,,/bigstor/common_data/UPMC-food-101/images/test...
beef_tartare_308.jpg,,Beef tartare,4,UPMC-food-101,UPMC-food-101,,/bigstor/common_data/UPMC-food-101/images/test...
beef_tartare_877.jpg,,Beef tartare,4,UPMC-food-101,UPMC-food-101,,/bigstor/common_data/UPMC-food-101/images/test...


In [15]:
# Check if all labels / classes match



In [16]:


all_classes_food = df_food101['class'].unique()
all_classes_upmc = df_upmc['class'].unique()

In [17]:
for m_class in upmc_food101.values():
    og_label = df_food101[df_food101['class'] == m_class]['label'].iloc[0]
    upmc_label = df_upmc[df_upmc['class'] == m_class]['label'].iloc[0]
    if (og_label != upmc_label):
        print("Labels do not match")


In [18]:
out_val_path = os.path.join(df_path, "df_upmc-food-101_val.pkl")
df_upmc.to_pickle(out_val_path)
#out_train_path = os.path.join(df_path, "df_upmc-food-101_train.pkl")
#df_upmc.to_pickle(out_train_path)