In [1]:
import numpy as np
import pandas as pd 
import pickle
from glob import glob
import os

In [2]:
# Template stuff
# Locations of the ImageNet data and of the metadata pickle used as a column template.
imagenet_root_path = '/bigstor/zsarwar/Imagenet_2012'
imagenet_subsets_path = '/bigstor/zsarwar/Imagenet_2012_subsets'
template_path = os.path.join(imagenet_subsets_path, "Dogs_vs_Wolves_metadata.pkl")

# The pickle is converted with orient='index' (one row per top-level key).
# In this notebook only the resulting column names are read.
# NOTE: unpickling executes arbitrary code — only load files from a trusted source.
df_template = pd.DataFrame.from_dict(pd.read_pickle(template_path), orient='index')

# Tsinghua inputs: one entry per item directly under high-resolution/, and the output folder.
# NOTE: glob() does not guarantee any ordering, yet later cells rely on row positions.
dataset_path = "/bigstor/zsarwar/Tsinghua/high/high-resolution/*"
df_path = "/bigstor/zsarwar/Tsinghua/DF/"
all_folders = glob(dataset_path)

In [3]:
# Build one record per image. The template's first seven columns are addressed by
# position; the extra 'index' entry becomes the DataFrame index in the next cell.
cols = df_template.columns
metadata = []
for idx, folder in enumerate(all_folders):
    # Class name = text after the last '-' in the folder's final path component
    label = folder.split("/")[-1].split("-")[-1]
    all_images = glob(folder + "/*")

    for image_path in all_images:
        metadata.append({
            cols[0]: None,
            cols[1]: label,
            cols[2]: idx,  # provisional label: position of the folder in all_folders
            cols[3]: 'tsinghua_dogs',
            cols[4]: 'tsinghua_dogs',
            cols[5]: None,
            cols[6]: image_path,
            # last two path components, i.e. '<folder>/<file>'
            'index': '/'.join(image_path.split("/")[-2:]),
        })



In [4]:
# Turn the per-image records into a DataFrame indexed by their 'index' entry,
# then clear the index name.
df_tsinghua = (
    pd.DataFrame(metadata)
    .set_index('index')
    .rename_axis(None)
)

In [5]:
# Load splits
def _read_split_list(list_path):
    """Read a split file and return one trimmed path per line.

    Newline characters are removed from every line, then the first two
    '/'-separated components are dropped and the rest is re-joined with '/'.
    """
    with open(list_path) as split_file:
        raw_entries = split_file.readlines()
    entries = [entry.replace("\n", "") for entry in raw_entries]
    return ['/'.join(entry.split("/")[2:]) for entry in entries]


train_list = _read_split_list("/bigstor/zsarwar/Tsinghua/TrainAndValList/train.lst")
val_list = _read_split_list("/bigstor/zsarwar/Tsinghua/TrainAndValList/validation.lst")





In [6]:
# Remove bad images
# Row positions (not index labels) of the images to discard; the next cell feeds
# this list to .iloc, so it must stay a list of integers.
bad_images = [
    11796, 15420, 18503, 24881, 25426, 28916,
    30939, 35629, 40499, 46151, 56842,
]


In [7]:
# Translate the row positions in bad_images into index labels and drop those rows.
# NOTE: not idempotent — re-running this cell removes whichever rows now occupy
# those positions.
drop_indices = df_tsinghua.index[bad_images]
df_tsinghua = df_tsinghua.drop(index=drop_indices)

In [8]:
# Load Imagenet and align the labels
df_imagenet = pd.read_pickle("/bigstor/zsarwar/Imagenet/DF/df_imagenet_dogs_train.pkl")

# Distinct class names of each dataset, in order of first appearance
tsinghua_classes, imagenet_classes = (
    frame['class'].unique().tolist() for frame in (df_tsinghua, df_imagenet)
)



In [9]:
# Cleaned class names: underscores replaced by spaces
tsinghua_classes_clean = [name.replace("_", " ") for name in tsinghua_classes]
# raw Tsinghua class name -> cleaned class name
tsinghua_tsinghua_classes = dict(zip(tsinghua_classes, tsinghua_classes_clean))

# Full string matching
# For every ImageNet class, take the text before the first comma, lowercase it and
# turn hyphens into spaces; the first cleaned Tsinghua class whose lowercase name is
# a substring of that text is recorded as its counterpart.
tsinghua_imagenet = {}
used_tsinghua_classes = []
for im_class in imagenet_classes:
    im = im_class.lower().split(",")[0].replace("-", " ")
    matched = next(
        (ts_class for ts_class in tsinghua_classes_clean if ts_class.lower() in im),
        None,
    )
    if matched is not None:
        used_tsinghua_classes.append(matched)
        tsinghua_imagenet[matched] = im_class

# NOTE(review): the search above runs over every cleaned Tsinghua class, including
# ones that were already matched, so a later ImageNet class can overwrite an earlier
# match. The two *_temp lists below only report what is left over — confirm whether
# already-used classes were meant to be skipped during matching.
tsinghua_classes_temp = [cl for cl in tsinghua_classes_clean if cl not in used_tsinghua_classes]
matched_imagenet_classes = list(tsinghua_imagenet.values())
imagenet_classes_temp = [cl for cl in imagenet_classes if cl not in matched_imagenet_classes]
# Match remaining classes manually
tsinghua_imagenet['Border collie'] = 'collie'

In [10]:

# Relabel tsinghua to imagenet
# Every raw Tsinghua class gets a final class name and a final label:
#   * classes with an ImageNet counterpart take the ImageNet class name and label;
#   * all others keep their cleaned name and receive a fresh label from start_label on
#     (their raw names are printed).
# The mappings are built from a snapshot of the original 'class' column and applied
# in one pass. Renaming in place while looping compares against a column that is
# being mutated, so rows already renamed to a string that equals a not-yet-processed
# raw class name would be picked up and renamed/relabelled a second time.
start_label = 120  # first label for unmatched classes; presumably ImageNet dog labels are 0-119 — TODO confirm
original_classes = df_tsinghua['class'].copy()
unique_classes = original_classes.unique()

class_by_raw_name = {}
label_by_raw_name = {}
for uni_cl in unique_classes:
    clean_name = tsinghua_tsinghua_classes[uni_cl]
    if clean_name not in tsinghua_imagenet:
        # No ImageNet counterpart: keep the cleaned name and allocate a new label
        class_by_raw_name[uni_cl] = clean_name
        label_by_raw_name[uni_cl] = start_label
        start_label += 1
        print(uni_cl)
    else:
        # Change class
        imagenet_class = tsinghua_imagenet[clean_name]
        class_by_raw_name[uni_cl] = imagenet_class
        # Change label: label of the first ImageNet row carrying that class
        imagenet_label = df_imagenet[df_imagenet['class'] == imagenet_class]['label'].iloc[0]
        label_by_raw_name[uni_cl] = imagenet_label

# Apply both mappings against the untouched snapshot of the raw class names
df_tsinghua['class'] = original_classes.map(class_by_raw_name)
df_tsinghua['label'] = original_classes.map(label_by_raw_name)


Black_sable
Shiba_Dog
chinese_rural_dog
Australian_Shepherd
Cane_Carso
Fila Braziliero
Japanese_Spitzes
teddy
Chinese_Crested_Dog
Bichon_Frise


In [11]:
# Spot check: ImageNet rows carrying label 1
df_imagenet.loc[df_imagenet['label'] == 1]

Unnamed: 0,image_url,class,label,data_type,dataset,query,img_path
n02085782_8225.JPEG,,Japanese spaniel,1,imagenet_baseline,imagenet,breed of toy dogs originating in Japan having ...,/bigstor/zsarwar/Imagenet_2012/train/n02085782...
n02085782_8341.JPEG,,Japanese spaniel,1,imagenet_baseline,imagenet,breed of toy dogs originating in Japan having ...,/bigstor/zsarwar/Imagenet_2012/train/n02085782...
n02085782_3810.JPEG,,Japanese spaniel,1,imagenet_baseline,imagenet,breed of toy dogs originating in Japan having ...,/bigstor/zsarwar/Imagenet_2012/train/n02085782...
n02085782_1925.JPEG,,Japanese spaniel,1,imagenet_baseline,imagenet,breed of toy dogs originating in Japan having ...,/bigstor/zsarwar/Imagenet_2012/train/n02085782...
n02085782_9914.JPEG,,Japanese spaniel,1,imagenet_baseline,imagenet,breed of toy dogs originating in Japan having ...,/bigstor/zsarwar/Imagenet_2012/train/n02085782...
...,...,...,...,...,...,...,...
n02085782_3354.JPEG,,Japanese spaniel,1,imagenet_baseline,imagenet,breed of toy dogs originating in Japan having ...,/bigstor/zsarwar/Imagenet_2012/train/n02085782...
n02085782_14768.JPEG,,Japanese spaniel,1,imagenet_baseline,imagenet,breed of toy dogs originating in Japan having ...,/bigstor/zsarwar/Imagenet_2012/train/n02085782...
n02085782_3292.JPEG,,Japanese spaniel,1,imagenet_baseline,imagenet,breed of toy dogs originating in Japan having ...,/bigstor/zsarwar/Imagenet_2012/train/n02085782...
n02085782_3855.JPEG,,Japanese spaniel,1,imagenet_baseline,imagenet,breed of toy dogs originating in Japan having ...,/bigstor/zsarwar/Imagenet_2012/train/n02085782...


In [12]:
# Spot check: Tsinghua rows whose class is now 'Japanese spaniel'
df_tsinghua.loc[df_tsinghua['class'] == 'Japanese spaniel']

Unnamed: 0,image_url,class,label,data_type,dataset,query,img_path
249-n000106-Japanese_spaniel/n132489.jpg,,Japanese spaniel,1,tsinghua_dogs,tsinghua_dogs,,/bigstor/zsarwar/Tsinghua/high/high-resolution...
249-n000106-Japanese_spaniel/n132500.jpg,,Japanese spaniel,1,tsinghua_dogs,tsinghua_dogs,,/bigstor/zsarwar/Tsinghua/high/high-resolution...
249-n000106-Japanese_spaniel/n132617.jpg,,Japanese spaniel,1,tsinghua_dogs,tsinghua_dogs,,/bigstor/zsarwar/Tsinghua/high/high-resolution...
249-n000106-Japanese_spaniel/n132507.jpg,,Japanese spaniel,1,tsinghua_dogs,tsinghua_dogs,,/bigstor/zsarwar/Tsinghua/high/high-resolution...
249-n000106-Japanese_spaniel/n132610.jpg,,Japanese spaniel,1,tsinghua_dogs,tsinghua_dogs,,/bigstor/zsarwar/Tsinghua/high/high-resolution...
...,...,...,...,...,...,...,...
249-n000106-Japanese_spaniel/n132536.jpg,,Japanese spaniel,1,tsinghua_dogs,tsinghua_dogs,,/bigstor/zsarwar/Tsinghua/high/high-resolution...
249-n000106-Japanese_spaniel/n132469.jpg,,Japanese spaniel,1,tsinghua_dogs,tsinghua_dogs,,/bigstor/zsarwar/Tsinghua/high/high-resolution...
249-n000106-Japanese_spaniel/n132626.jpg,,Japanese spaniel,1,tsinghua_dogs,tsinghua_dogs,,/bigstor/zsarwar/Tsinghua/high/high-resolution...
249-n000106-Japanese_spaniel/n132531.jpg,,Japanese spaniel,1,tsinghua_dogs,tsinghua_dogs,,/bigstor/zsarwar/Tsinghua/high/high-resolution...


In [13]:
# Number of distinct class names after relabelling
len(pd.unique(df_tsinghua['class']))

130

In [14]:
# Number of distinct labels after relabelling
len(pd.unique(df_tsinghua['label']))

130

In [15]:
# Check that every class has exactly one label and that no label is shared
# between two classes. Problems are reported with print; nothing is raised.
unique_classes = df_tsinghua['class'].unique()
tot_labels = []   # number of distinct labels found for each class
used_label = []   # labels already seen on previously checked classes
for uni in unique_classes:
    all_labels = df_tsinghua.loc[df_tsinghua['class'] == uni, 'label'].unique()
    tot_labels.append(len(all_labels))
    # A class spread over several labels used to pass unnoticed because only the
    # first label was ever inspected.
    if len(all_labels) > 1:
        print("Class has more than one label: ", uni, list(all_labels))
    for class_label in all_labels:
        if class_label in used_label:
            print("Label already used")
        used_label.append(class_label)
    

In [16]:
# Split DF
# Rows are assigned by index membership in the split lists; rows found in neither
# list end up in neither frame.
in_train = df_tsinghua.index.isin(train_list)
in_val = df_tsinghua.index.isin(val_list)

df_tsinghua_train = df_tsinghua.loc[in_train]
df_tsinghua_val = df_tsinghua.loc[in_val]

In [17]:
# Write both splits as pickles into df_path (validation first, then train)
out_val_path = os.path.join(df_path, "df_tsinghua_val.pkl")
out_train_path = os.path.join(df_path, "df_tsinghua_train.pkl")

df_tsinghua_val.to_pickle(out_val_path)
df_tsinghua_train.to_pickle(out_train_path)

# Process non-8-bit images

In [None]:
from dataset import CustomImageDataset
from tqdm import tqdm

In [None]:
# Wrap the Tsinghua DataFrame in the project's CustomImageDataset; the scan below
# indexes it by integer position.
# NOTE(review): presumably indexing loads/decodes the image of that row — confirm in dataset.py.
loader = CustomImageDataset(df=df_tsinghua)

In [None]:
# Scan the dataset and collect the positions of images that cannot be loaded.
# NOTE(review): the scan starts at position 24881, not 0 — looks like a resumed
# run; confirm before relying on the result as a complete list.
bad_images = []
for i in tqdm(range(24881, len(loader))):
    try:
        img = loader[i]
    except Exception:
        # Exception (rather than a bare except) keeps Ctrl-C / kernel interrupt
        # working during the long scan.
        print("Bad image at index ", i)
        # Record the failing position; appending the list to itself was a bug.
        bad_images.append(i)


In [None]:
# Hard-coded positions of bad images; same values as the list used earlier in the
# notebook to drop rows.
bad_images = [
    11796, 15420, 18503, 24881, 25426, 28916,
    30939, 35629, 40499, 46151, 56842,
]