# In [1]:
# %%
import numpy as np
import pandas as pd 
import pickle
from glob import glob
import os

# %%
# Template metadata: an existing ImageNet-subset metadata pickle is loaded and
# turned into a DataFrame (one row per top-level key); its columns are reused
# further down as the column layout for the ISIA-Food-500 records.
imagenet_root_path = '/bigstor/zsarwar/Imagenet_2012'
imagenet_subsets_path = '/bigstor/zsarwar/Imagenet_2012_subsets'
template_path = os.path.join(imagenet_subsets_path, "Dogs_vs_Wolves_metadata.pkl")
df_template = pd.DataFrame.from_dict(pd.read_pickle(template_path), orient='index')


# ISIA-Food-500 locations: image folders, output directory for the pickled
# DataFrames, and the metadata text files shipped with the dataset.
dataset_path = "/bigstor/common_data/ISIA-Food-500/ISIA_Food500/images/*"
df_path = "/bigstor/common_data/ISIA-Food-500/DF"
# Every entry directly under the images directory; each one is later treated
# as a class folder.
all_folders = glob(dataset_path)
class_mapping = "/bigstor/common_data/ISIA-Food-500/metadata_ISIAFood_500/ISIAFood500_classLabel.txt"

train_samples_path = "/bigstor/common_data/ISIA-Food-500/metadata_ISIAFood_500/train_feature.txt"

# %%
# Parse the class mapping file. Each non-blank line is split on tabs; the
# first field is used as the class name and the second as its label.
# Blank lines (e.g. a trailing empty line at the end of the file) are skipped
# so they do not raise IndexError when the second field is read.
with open(class_mapping, 'r') as ifile:
    records = [line.rstrip("\n").split("\t") for line in ifile if line.strip()]
class_names = [rec[0] for rec in records]
# Labels are kept exactly as read from the file, i.e. as strings.
class_labels = [rec[1] for rec in records]
# class name -> label
class_label = dict(zip(class_names, class_labels))

# %%
# Build one metadata record per image file. Keys follow the template's column
# order; the extra 'index' key holds the image file name and becomes the
# DataFrame index later.
cols = df_template.columns
metadata = []
for folder in all_folders:
    # The folder name is the class name.
    label = folder.split("/")[-1]
    for image_path in glob(folder + "/*"):
        record = {
            cols[0]: None,
            cols[1]: label,
            cols[2]: class_label[label],
            cols[3]: 'ISIA-FOOD-500',
            cols[4]: 'ISIA-FOOD-500',
            cols[5]: None,
            cols[6]: image_path,
            'index': image_path.split("/")[-1],
        }
        metadata.append(record)






# %%
# Turn the per-image records into a DataFrame indexed by the 'index' field
# (the image file name), then drop the index's name.
df_isia = pd.DataFrame.from_dict(metadata).set_index('index')
df_isia.index.name = None

# %%
# Load the pickled Food-101 training DataFrame; its 'class' and 'label'
# columns are used below to align ISIA labels with Food-101.
df_food101 = pd.read_pickle(
    '/bigstor/common_data/food_101/DF/df_food101_train.pkl'
)

# %%
# Align labels of food_101 and isia

food101_classes = df_food101['class'].unique().tolist()
isia_classes = df_isia['class'].unique().tolist()

# Raw ISIA class name -> the same name with underscores replaced by spaces.
isia_classes_clean = [cl.replace("_", " ") for cl in isia_classes]
isia_isia_classes = dict(zip(isia_classes, isia_classes_clean))

# Case-insensitive substring matching: an ISIA class is paired with a Food-101
# class when the cleaned ISIA name occurs inside the Food-101 name.
# NOTE(review): the test is containment (`in`), not equality, so a short ISIA
# name can pair with a longer Food-101 name -- confirm this is intended.
isia_food101 = {}       # cleaned ISIA class -> Food-101 class
used_isia_classes = []  # cleaned ISIA classes that already have a partner
isia_classes_unmatched = isia_classes_clean
for im_class in food101_classes:
    im = im_class.lower()
    # Only ISIA classes without a partner are candidates. Scanning the full
    # list instead would let a later Food-101 class overwrite an earlier
    # pairing of the same ISIA class.
    isia_classes_unmatched = [cl for cl in isia_classes_unmatched if cl not in isia_food101]
    for ts_class in isia_classes_unmatched:
        if ts_class.lower() in im:
            used_isia_classes.append(ts_class)
            isia_food101[ts_class] = im_class
            # At most one ISIA class per Food-101 class: first hit wins.
            break
isia_classes_unmatched = [cl for cl in isia_classes_unmatched if cl not in isia_food101]
matched_food101_classes = set(isia_food101.values())
food101_classes_unmatched = [cl for cl in food101_classes if cl not in matched_food101_classes]
# Match remaining classes manually

# %%
# Food-101 class names that were paired with an ISIA class.

matched_food101 = list(isia_food101.values())


# %%
# Food-101 rows belonging to any of the paired classes.
df_food101_matched = df_food101[df_food101['class'].isin(matched_food101)]

# Cleaned ISIA class name -> first distinct 'label' value found among the
# Food-101 rows of its paired class.
isia_class_label = {
    isia_name: df_food101.loc[df_food101['class'] == food_name, 'label'].unique()[0]
    for isia_name, food_name in isia_food101.items()
}






# %%
# Matched classes first: rows of an ISIA class that has a Food-101 partner take
# over the partner's label and class name.
for raw_name, clean_name in isia_isia_classes.items():
    if clean_name not in isia_food101:
        continue
    # Select the rows before 'class' is overwritten so both assignments hit
    # the same rows.
    rows = df_isia['class'] == raw_name
    df_isia.loc[rows, 'label'] = isia_class_label[clean_name]
    df_isia.loc[rows, 'class'] = isia_food101[clean_name]


# %%
# Labels already taken by the matched classes.
used_labels = list(isia_class_label.values())
# Keep only the raw ISIA classes whose cleaned name has no Food-101 partner.
isia_used = list(isia_food101)
isia_classes = [
    raw_name for raw_name in isia_classes
    if isia_isia_classes[raw_name] not in isia_used
]

# %%
# Give every remaining class the next integer, counting up from 0, that is
# not found in used_labels.
# NOTE(review): the membership test compares an int counter against the
# Food-101 label values as stored -- assumes those are integers; confirm.
curr_label = 0
for class_name in isia_classes:
    while curr_label in used_labels:
        curr_label += 1
    df_isia.loc[df_isia['class'] == class_name, 'label'] = curr_label
    curr_label += 1

# %%
len(isia_classes)

# %%
# Sanity check: for every paired Food-101 class, the label of its first row in
# the Food-101 frame must equal the label of its first row in the ISIA frame.
all_classes_food = df_food101['class'].unique()
all_classes_isia = df_isia['class'].unique()
for m_class in isia_food101.values():
    og_label = df_food101.loc[df_food101['class'] == m_class, 'label'].iloc[0]
    isia_label = df_isia.loc[df_isia['class'] == m_class, 'label'].iloc[0]
    if og_label != isia_label:
        print("Labels do not match")


# %%
# Load train samples: keep only the part of each line after the last "/",
# which is what the ISIA frame is indexed by.
with open(train_samples_path, 'r') as iffile:
    train_samples = [line.replace("\n", "").split("/")[-1] for line in iffile]

# Remove bad samples (first batch), matched by file name against the index.
bad_samples = [
    '/bigstor/common_data/ISIA-Food-500/ISIA_Food500/images/Biryani/Biryani_0089.jpg',
    '/bigstor/common_data/ISIA-Food-500/ISIA_Food500/images/Abalone/Abalone_0062.jpg',
    '/bigstor/common_data/ISIA-Food-500/ISIA_Food500/images/Broccoli_slaw/Broccoli_slaw_0144.jpg',
]
bad_samples_index = [path.split("/")[-1] for path in bad_samples]
df_isia = df_isia.loc[~df_isia.index.isin(bad_samples_index)]

# Remove bad samples (second batch).
bad_samples = [
    '/bigstor/common_data/ISIA-Food-500/ISIA_Food500/images/Ceviche/Ceviche_0200.jpg',
    '/bigstor/common_data/ISIA-Food-500/ISIA_Food500/images/Chicken_cacciatore/Chicken_cacciatore_0139.jpg',
    '/bigstor/common_data/ISIA-Food-500/ISIA_Food500/images/Crab_puff/Crab_puff_0971.jpg',
    '/bigstor/common_data/ISIA-Food-500/ISIA_Food500/images/Fish_steak/Fish_steak_0892.jpg',
    '/bigstor/common_data/ISIA-Food-500/ISIA_Food500/images/Doughnut/Doughnut_0206.jpg',
    '/bigstor/common_data/ISIA-Food-500/ISIA_Food500/images/Abalone/Abalone_0447.jpg',
    '/bigstor/common_data/ISIA-Food-500/ISIA_Food500/images/Charqui/Charqui_0223.jpg',
]
bad_samples_index = [path.split("/")[-1] for path in bad_samples]
df_isia = df_isia.loc[~df_isia.index.isin(bad_samples_index)]



# Split train/val: rows whose file name is listed in the train file go to the
# train frame, every other row goes to the validation frame.
is_train = df_isia.index.isin(train_samples)
df_isia_train = df_isia[is_train]

df_isia_val = df_isia[~is_train]

# %%
# Pickle the train and validation frames into the ISIA DF directory.
train_path = os.path.join(df_path, "df_ISIA-Food-500_train.pkl")
val_path = os.path.join(df_path, "df_ISIA-Food-500_val.pkl")

df_isia_train.to_pickle(train_path)
df_isia_val.to_pickle(val_path)
