In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [4]:
# Root folder that is scanned for class sub-directories.
BASE_PATH = 'data/'
# Names of the entries under BASE_PATH that are treated as classes.
classes = [
    'food-images',
    'non-food-images',
]

In [5]:
def get_files(root_path):
    """Recursively list every file below ``root_path`` in a stable order.

    Parameters
    ----------
    root_path : str
        Directory to walk. A path that does not exist yields an empty list,
        because ``os.walk`` ignores listing errors by default.

    Returns
    -------
    list of str
        File paths built as ``os.path.join(directory, file_name)``. Within
        each directory the files come first (sorted by name), followed by
        the contents of its sub-directories (visited in sorted order).
    """
    file_list = []
    for dirr, subdirs, files in os.walk(root_path):
        # os.walk reports entries in arbitrary, filesystem-dependent order.
        # Sorting the sub-directory list in place steers the top-down
        # traversal, so the result is identical from run to run.
        subdirs.sort()
        file_list.extend(os.path.join(dirr, name) for name in sorted(files))
    return file_list

def dir_to_labels(BASE_PATH, classes):
    """Build a table of image paths and class labels from class folders.

    Every entry of ``BASE_PATH`` whose name appears in ``classes`` is walked
    recursively with ``get_files``; each file found becomes one row labelled
    with the name of that entry.

    Parameters
    ----------
    BASE_PATH : str
        Directory containing the class folders.
    classes : collection of str
        Names of the entries in ``BASE_PATH`` to include.

    Returns
    -------
    pandas.DataFrame
        Columns ``img`` (file path) and ``label`` (class folder name), with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no entry of ``BASE_PATH`` matches ``classes``.
    """
    # os.listdir returns names in arbitrary order; sort so the row order does
    # not depend on the filesystem.
    class_names = sorted(name for name in os.listdir(BASE_PATH) if name in classes)
    if not class_names:
        raise ValueError(
            f"no entries named {list(classes)!r} found in {BASE_PATH!r}"
        )

    # Use the folder name itself as the label rather than re-parsing it out of
    # the joined path, which would depend on the platform's path separator.
    frames = [
        pd.DataFrame({'img': get_files(os.path.join(BASE_PATH, name)), 'label': name})
        for name in class_names
    ]
    return pd.concat(frames).reset_index(drop=True)

In [6]:
# Scan the class folders under BASE_PATH into a path/label table.
df = dir_to_labels(BASE_PATH=BASE_PATH, classes=classes)

In [24]:
# Food-101 images were added and some labels moved to non-food, so the data
# directory is scanned again into a separate frame.
df_new = dir_to_labels(BASE_PATH=BASE_PATH, classes=classes)
# Share of rows per label.
df_new['label'].value_counts(normalize=True)

food-images        0.663701
non-food-images    0.336299
Name: label, dtype: float64

In [25]:
# Integer-encode the string labels. Judging by the value_counts outputs in
# this notebook, code 0 corresponds to 'food-images' and 1 to 'non-food-images'.
le = LabelEncoder()
df_new['enc_label'] = le.fit_transform(df_new['label'])

In [31]:
# Share of rows per encoded label (should mirror the string-label shares).
df_new['enc_label'].value_counts(normalize=True)

0    0.663701
1    0.336299
Name: enc_label, dtype: float64

In [33]:
# Stratified 80/20 split over row positions (0..n-1), seeded for repeatability.
targets = df_new['enc_label']
train_idx, test_idx = train_test_split(
    np.arange(len(targets)),
    test_size=0.2,
    stratify=targets,
    random_state=42,
)

In [34]:
# train_idx / test_idx are row *positions* (they were drawn from np.arange),
# whereas .loc selects by index *label*. Translating through df_new.index
# keeps the assignment correct even if the frame's index is not 0..n-1.
df_new.loc[df_new.index[train_idx], 'set'] = 'train'
df_new.loc[df_new.index[test_idx], 'set'] = 'test'

In [7]:
# Share of rows per label in the original scan.
df['label'].value_counts(normalize=True)

non-food-images    0.945398
food-images        0.054602
Name: label, dtype: float64

In [8]:
# Integer-encode the string labels of the original scan.
# Note: this rebinds `le`, replacing any encoder fitted in another cell.
le = LabelEncoder()
df['enc_label'] = le.fit_transform(df['label'])

In [9]:
# Balance the classes: keep every food image and draw a fixed-size sample of
# 3000 non-food images.
food = df[df.label == 'food-images']
# random_state pins the draw so a kernel restart reproduces the same subset
# (without it the balanced set, and the CSV written from it, change each run).
non_food = df[df.label == 'non-food-images'].sample(n=3000, random_state=42)

# Stack both parts and renumber the rows 0..n-1.
df_balanced = pd.concat([food, non_food]).reset_index(drop=True)

In [10]:
# Row count per label after balancing.
df_balanced['label'].value_counts()

non-food-images    3000
food-images        2985
Name: label, dtype: int64

In [239]:
# Stratified 80/20 split over row positions (0..n-1), seeded for repeatability.
targets = df_balanced['enc_label']
train_idx, test_idx = train_test_split(
    np.arange(len(targets)),
    test_size=0.2,
    stratify=targets,
    random_state=42,
)

In [244]:
# train_idx / test_idx are row *positions* (they were drawn from np.arange),
# whereas .loc selects by index *label*. Translating through the frame's index
# keeps the assignment correct even if that index is not 0..n-1.
df_balanced.loc[df_balanced.index[train_idx], 'set'] = 'train'
df_balanced.loc[df_balanced.index[test_idx], 'set'] = 'test'

In [251]:
# Third '/'-separated component of each path. With BASE_PATH = 'data/' this is
# presumably the folder directly beneath the class directory; verify if
# BASE_PATH changes depth.
df_balanced['subclass'] = [path.split('/')[2] for path in df_balanced['img']]

In [258]:
# Row count per split.
df_balanced['set'].value_counts()

train    4788
test     1197
Name: set, dtype: int64

In [254]:
# Row count per subclass, restricted to the food images.
df_balanced.loc[df_balanced['label'] == 'food-images', 'subclass'].value_counts()

clementine      57
taro            57
satsuma         57
brick           57
cherry          57
man             57
life            57
chicory         57
sapodilla       57
dessert         57
periwinkle      57
onion           57
tangelo         57
Scandinavian    57
blue            57
hog             57
monkey          57
ginger          57
bagel           57
cress           56
bean            56
tabasco         56
chip            56
bass            56
lemon           56
bap             56
mandarin        56
lime            56
house           56
pulse           56
sardine         56
star            56
shoulder        56
light           55
king            55
soy             55
nutmeg          55
papaya          55
corn            55
lingonberry     55
cream           55
pickerel        54
ackee           54
garlic          54
baguet          54
medlar          54
eggplant        53
raisin          52
breakfast       52
muscadine       52
whiting         52
frijole         51
cornmeal    

In [248]:
# Persist the full scan and the balanced subset as CSV, without the index column.
df.to_csv('data/foodNotFood.csv', index=False)
df_balanced.to_csv('data/foodNotFood-balanced.csv', index=False)

In [39]:
# Persist the re-scanned dataset as CSV, without the index column.
df_new.to_csv(path_or_buf='data/foodNotFood-101.csv', index=False)

In [21]:
# Reload the saved dataset. Note: this rebinds `df`, replacing whatever frame
# that name held before.
df = pd.read_csv(filepath_or_buffer='data/foodNotFood-101.csv')

In [22]:
# Prefix every image path with "../".
# Not idempotent: re-running this cell prepends "../" again.
df['img'] = ["../" + path for path in df['img']]

In [23]:
# Write the frame with the re-prefixed paths to the data2/ folder.
df.to_csv(path_or_buf='data2/foodNotFood-101.csv', index=False)