In [1]:
import numpy as np
import pandas as pd 
import pickle
from glob import glob
import os

In [2]:
# Template stuff
#
# Load the metadata template whose column layout the UEC-Food-256 frame will
# reuse, and collect the per-class image folders of the dataset.
# NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
imagenet_root_path = '/bigstor/zsarwar/Imagenet_2012'
imagenet_subsets_path = '/bigstor/zsarwar/Imagenet_2012_subsets'
template_path = os.path.join(imagenet_subsets_path, "Dogs_vs_Wolves_metadata.pkl")
df_template = pd.read_pickle(template_path)
# presumably the pickle holds a dict of row-dicts keyed by index -- TODO confirm
df_template = pd.DataFrame.from_dict(df_template, orient='index')


dataset_path = "/bigstor/common_data/UECFOOD256/*"
df_path = "/bigstor/common_data/UECFOOD256/DF/"
# glob() returns entries in arbitrary order; sort them so every later step
# (row order, seeded sampling) is reproducible across machines.
# The ".txt" / "DF" exclusions are tested against the entry name only, so they
# keep working if the dataset root itself ever contains those substrings.
all_folders = sorted(
    fold
    for fold in glob(dataset_path)
    if ".txt" not in os.path.basename(fold) and "DF" not in os.path.basename(fold)
)
class_mapping = "/bigstor/common_data/UECFOOD256/category.txt"

In [3]:
# Process label to class mapping
#
# Parse the tab-separated category file into {int label: class name}.
# The first line is a header and is skipped; only the first two fields of
# each row are used.
labels_classes = {}
with open(class_mapping, 'r') as i_file:
    next(i_file, None)  # drop the header row (no-op on an empty file)
    for line in i_file:
        line = line.rstrip("\n")
        if not line.strip():
            # A blank (e.g. trailing) line has no id/name fields to parse.
            continue
        fields = line.split("\t")
        labels_classes[int(fields[0])] = fields[1]

In [4]:
# Build one metadata record per image, reusing the template's column names.
# Columns are addressed by position: cols[1] receives the class name, cols[2]
# the integer label, cols[3]/cols[4] the dataset tag and cols[6] the image
# path; cols[0] and cols[5] are left empty.
# (presumably cols[1] == 'class' and cols[2] == 'label', since later cells
#  read df_uec['class'] / df_uec['label'] -- TODO confirm against the template)
cols = df_template.columns
metadata = []
# Folders and images are visited in sorted order because glob() gives no
# ordering guarantee and the seeded train/val sampling depends on row order.
for folder in sorted(all_folders):
    label = int(os.path.basename(folder))
    class_name = labels_classes[label]
    all_images = sorted(img for img in glob(folder + "/*") if ".txt" not in img)
    for image_path in all_images:
        metadata.append({
            cols[0]: None,
            cols[1]: class_name,
            cols[2]: label,
            cols[3]: 'uec_food256',
            cols[4]: 'uec_food256',
            cols[5]: None,
            cols[6]: image_path,
            # "<label folder>/<file name>" becomes the row index later on.
            'index': '/'.join(image_path.split("/")[-2:]),
        })



In [5]:
# Turn the collected records into a frame indexed by the per-image 'index'
# key, then clear the index name so it is anonymous.
df_uec = pd.DataFrame(metadata).set_index('index').rename_axis(None)

In [6]:
# Finding repeated classes
#
# This check is disabled: the code is wrapped in a triple-quoted string, so
# running the cell only evaluates (and echoes) that string literal.
# If re-enabled it would walk the unique labels of df_uec, take the class name
# of the first row of each label, and print any class name that was already
# recorded under a previously seen label.
#####
"""
label_class_dict = {}

uni_labels = df_uec['label'].unique().tolist()
for uni in uni_labels:
    df_temp = df_uec[df_uec['label'] == uni]
    cl = df_temp.iloc[0]['class']
    if cl in list(label_class_dict.values()):
        print(cl)
    else:
        label_class_dict[uni] = cl
"""
####
        

"\nlabel_class_dict = {}\n\nuni_labels = df_uec['label'].unique().tolist()\nfor uni in uni_labels:\n    df_temp = df_uec[df_uec['label'] == uni]\n    cl = df_temp.iloc[0]['class']\n    if cl in list(label_class_dict.values()):\n        print(cl)\n    else:\n        label_class_dict[uni] = cl\n"

In [7]:
# Load the Food-101 metadata frame (later cells read its 'class' and 'label'
# columns). NOTE: pd.read_pickle can execute arbitrary code -- only load
# pickle files from a trusted source.
df_food101 = pd.read_pickle('/bigstor/common_data/food_101/DF/df_food101_train.pkl')

In [8]:
# Distinct class names of each dataset, in order of first appearance.
food101_classes, uec256_classes = (
    frame['class'].unique().tolist() for frame in (df_food101, df_uec)
)


In [9]:
# Number of distinct labels currently present in the UEC frame.
len(df_uec['label'].unique())

256

In [10]:
# Align labels of food_101 and UEC256
#
# Builds `uec256_food101`: UEC-256 class name (exactly as stored in
# df_uec['class']) -> the Food-101 class whose lower-cased name contains the
# UEC name. Matching rules:
#   * a UEC class is matched at most once (the first Food-101 class, in
#     `food101_classes` order, that claims it keeps it);
#   * when several unused UEC names occur inside one Food-101 name, the
#     longest one wins (so "fried rice" beats "rice"), with the name itself as
#     tie-breaker -- the result no longer depends on the order of the lists.

food101_classes = df_food101['class'].unique().tolist()
uec256_classes = df_uec['class'].unique().tolist()

# Underscore-free spellings are used only for comparison with Food-101 names.
uec256_classes_clean = [cl.replace("_", " ") for cl in uec256_classes]
uec256_uec256_classes = dict(zip(uec256_classes, uec256_classes_clean))

# Substring matching
uec256_food101 = {}
used_uec256_classes = []
for im_class in food101_classes:
    im = im_class.lower()
    # (length, cleaned name, raw name) for every still-unused UEC class whose
    # cleaned name occurs inside this Food-101 name.
    candidates = [
        (len(ts_class), ts_class, raw_class)
        for raw_class, ts_class in uec256_uec256_classes.items()
        if raw_class not in uec256_food101 and ts_class.lower() in im
    ]
    if not candidates:
        continue
    _, ts_class, raw_class = max(candidates)
    used_uec256_classes.append(ts_class)
    # Keyed by the raw name because later cells compare these keys against
    # df_uec['class'], which holds the raw (uncleaned) spelling.
    uec256_food101[raw_class] = im_class

# Left-overs on both sides, kept for manual inspection.
uec256_classes_temp = [cl for cl in uec256_classes_clean if cl not in used_uec256_classes]
food101_matched_names = set(uec256_food101.values())
food101_classes_temp = [cl for cl in food101_classes if cl not in food101_matched_names]
# Match remaining classes manually

In [11]:
# Food-101 class names that received a UEC-256 counterpart
# (the values of the UEC -> Food-101 mapping).
matched_food101 = [*uec256_food101.values()]


In [12]:
# Food-101 rows whose class has a UEC-256 counterpart.
df_food101_matched = df_food101.loc[df_food101['class'].isin(matched_food101)]

# For each matched UEC-256 class, the label of its Food-101 partner
# (the first distinct label found among that partner's rows).
uec_class_label = {
    uec_name: df_food101.loc[df_food101['class'] == food101_name, 'label'].unique()[0]
    for uec_name, food101_name in uec256_food101.items()
}


In [13]:
# Inspect which UEC-256 class names were matched to a Food-101 class
# (the keys of the UEC -> Food-101 mapping).
uec256_food101.keys()

dict_keys(['fried rice', 'churro', 'french fries', 'spaghetti', 'sushi', 'pizza', 'sashimi', 'shortcake', 'pho', 'takoyaki', 'hot dog', 'tiramisu', 'miso soup', 'pancake', 'tacos', 'grilled salmon', 'hamburger', 'apple pie', 'paella', 'waffle', 'Caesar salad', 'steak', 'omelet', 'lasagna', 'french toast', 'bibimbap', 'nachos'])

In [14]:
# Relabel matched classes first: rows of every matched UEC class take over
# the label and the class name of their Food-101 partner.
for cl, food101_name in uec256_food101.items():
    matched_rows = df_uec['class'] == cl
    df_uec.loc[matched_rows, 'label'] = uec_class_label[cl]
    df_uec.loc[matched_rows, 'class'] = food101_name


In [15]:
# Labels already taken by the classes matched to Food-101.
used_labels = list(uec_class_label.values())
# Relabel remaining classes
uec_used = list(uec256_food101.keys())
# Built from the pre-rename class list (uec256_classes) on purpose: df_uec's
# 'class' column already carries Food-101 names for the matched rows.
uec_classes = [cl for cl in uec256_classes if cl not in uec_used]

In [16]:
# Hand out increasing integer labels to the unmatched UEC classes, skipping
# every value listed in used_labels.
# NOTE(review): used_labels only holds labels of the *matched* Food-101
# classes, so ids assigned here can coincide with labels of unmatched
# Food-101 classes -- confirm that is intended.
curr_label = 0
for cl in uec_classes:
    while curr_label in used_labels:
        curr_label = curr_label + 1
    class_rows = df_uec['class'] == cl
    df_uec.loc[class_rows, 'label'] = curr_label
    curr_label = curr_label + 1

In [17]:
# Split into train val
#
# Hold out `val_per_class` rows of every label for validation (df_val); the
# remaining rows stay in df_uec, which is the training frame from here on.
# NOTE: this cell shrinks df_uec, so running it a second time removes another
# batch of rows -- rebuild df_uec before re-running.
val_per_class = 20

uni_labels = df_uec['label'].unique().tolist()

val_frames = []
for lab in uni_labels:
    df_temp = df_uec[df_uec['label'] == lab]
    # Same fixed seed for every label; pandas raises ValueError when a label
    # has fewer than val_per_class rows.
    df_temp_val = df_temp.sample(n=val_per_class, random_state=42)
    val_frames.append(df_temp_val)

# Concatenate and drop once instead of once per label. The list is reversed
# so the label sampled last comes first in df_val.
df_val = pd.concat(val_frames[::-1]) if val_frames else None
if df_val is not None:
    df_uec = df_uec.drop(df_val.index)
    

In [18]:
# Persist both splits as pickles under df_path.
# Create the output directory first so a fresh dataset checkout does not fail
# on the first write.
os.makedirs(df_path, exist_ok=True)
out_val_path = os.path.join(df_path, "df_uec256_val.pkl")
df_val.to_pickle(out_val_path)
out_train_path = os.path.join(df_path, "df_uec256_train.pkl")
df_uec.to_pickle(out_train_path)

In [23]:
# Row index labels of the training frame as a plain Python list.
idx = list(df_uec.index)