In [None]:
import pandas as pd
import os
import numpy as np
from scipy.sparse import coo_matrix

## Visual Features

In [None]:
def _download_if_missing(file_name, url):
    """Fetch ``url`` into ``file_name`` unless a readable copy already exists.

    The transfer goes to a ``.part`` file that is renamed on success, so an
    interrupted download is not mistaken for a complete file on the next call.
    """
    # Readability is what matters for a cache hit; the file never needs to be writable.
    if os.access(file_name, os.R_OK):
        return
    # Local import: keeps this helper usable without touching the notebook's import cell.
    from urllib.request import urlretrieve
    partial_name = file_name + '.part'
    urlretrieve(url, partial_name)
    os.replace(partial_name, file_name)


def load_labels():
    """Load the Places365 label files, downloading any that are missing.

    Four files are looked up in the current working directory and fetched from
    their upstream URLs when no readable copy is present.

    Returns:
        classes: tuple of category names (first whitespace-separated token of each
            line with its first three characters removed).
        labels_IO: integer ``np.ndarray`` holding the last token of each line minus one.
        labels_attribute: list of attribute names, one per line.
        W_attribute: the array stored in ``W_sceneattribute_wideresnet18.npy``.

    Raises:
        OSError: if a missing file cannot be downloaded or a file cannot be read.
    """
    # scene category relevant
    file_name_category = 'categories_places365.txt'
    _download_if_missing(
        file_name_category,
        'https://raw.githubusercontent.com/csailvision/places365/master/categories_places365.txt',
    )
    with open(file_name_category) as class_file:
        classes = tuple(line.strip().split(' ')[0][3:] for line in class_file)

    # indoor and outdoor relevant
    file_name_IO = 'IO_places365.txt'
    _download_if_missing(
        file_name_IO,
        'https://raw.githubusercontent.com/csailvision/places365/master/IO_places365.txt',
    )
    with open(file_name_IO) as f:
        # 0 is indoor, 1 is outdoor
        labels_IO = np.array([int(line.rstrip().split()[-1]) - 1 for line in f])

    # scene attribute relevant
    file_name_attribute = 'labels_sunattribute.txt'
    _download_if_missing(
        file_name_attribute,
        'https://raw.githubusercontent.com/csailvision/places365/master/labels_sunattribute.txt',
    )
    with open(file_name_attribute) as f:
        labels_attribute = [line.rstrip() for line in f]

    file_name_W = 'W_sceneattribute_wideresnet18.npy'
    _download_if_missing(
        file_name_W,
        'http://places2.csail.mit.edu/models_places365/W_sceneattribute_wideresnet18.npy',
    )
    W_attribute = np.load(file_name_W)

    return classes, labels_IO, labels_attribute, W_attribute

In [None]:
def idx_to_mat(M, M_ori, ID, k):
    """Scatter the entries of ``M`` into a dense array shaped like ``M_ori``.

    Entry ``M[i, j]`` is written to row ``i``, column ``ID[i, j]`` of the result;
    positions that receive no entry are zero.

    Args:
        M: array of values, with ``k`` entries per row.
        M_ori: array whose ``shape`` the result takes.
        ID: column indices, flattened in the same order as ``M``.
        k: number of entries per row of ``M``.

    Returns:
        Dense ``np.ndarray`` of shape ``M_ori.shape``.
    """
    # Row index for every flattened entry: 0 repeated k times, then 1, and so on.
    row_of_entry = np.repeat(np.arange(0, len(M)), k)
    scattered = coo_matrix(
        (M.flatten(), (row_of_entry, ID.flatten())),
        shape=M_ori.shape,
    )
    return scattered.toarray()

In [None]:
def k_hot_filter(M, k):
    """Keep the ``k`` largest entries of each row and flatten the rest.

    For every row the ``k`` largest values stay in place. Each remaining position
    is set to ``(1 - sum_of_kept_values) / (n_cols - k)``.

    Values are gathered with ``np.take_along_axis``. The earlier
    ``np.take(M, ID, axis=1)`` built an ``(n_rows, n_rows, k)`` intermediate only to
    read its diagonal, which needs quadratic memory in the number of rows.

    Args:
        M: 2-D ``np.ndarray``.
        k: number of entries to keep per row, ``0 <= k <= M.shape[1]``.
            ``k == 0`` gives uniform rows of ``1 / n_cols``; previously the slice
            ``[:, -0:]`` selected every column, so the input came back unchanged.

    Returns:
        ``np.ndarray`` with the shape of ``M`` and a floating dtype.

    Raises:
        ValueError: if ``k`` is negative or larger than the number of columns.
    """
    n_rows, n_cols = M.shape
    if not 0 <= k <= n_cols:
        raise ValueError(f'k must be between 0 and {n_cols}, got {k}')
    n_rest = n_cols - k

    # Ascending argsort: the last k columns of `order` index the k largest values.
    order = np.argsort(M, axis=1)
    top_idx = order[:, n_rest:]
    top_val = np.take_along_axis(M, top_idx, axis=1)
    conf = top_val.sum(axis=1)

    out_dtype = np.result_type(M.dtype, np.float64)
    if n_rest:
        fill = (1 - conf).astype(out_dtype) / n_rest
    else:
        # Every position is overwritten below; avoid dividing by zero.
        fill = np.zeros(n_rows, dtype=out_dtype)

    filtered = np.empty(M.shape, dtype=out_dtype)
    filtered[:] = fill[:, None]
    np.put_along_axis(filtered, top_idx, top_val, axis=1)
    return filtered

In [None]:
classes, labels_IO, labels_attribute, W_attribute = load_labels()

In [None]:
def take_per_row(A, indx, num_elem=1):
    """Return ``num_elem`` consecutive entries from each row of ``A``.

    Row ``i`` of the result holds ``A[i, indx[i]]`` through
    ``A[i, indx[i] + num_elem - 1]``.
    """
    offsets = np.arange(num_elem)
    col_idx = indx[:, np.newaxis] + offsets
    row_idx = np.arange(col_idx.shape[0])[:, np.newaxis]
    return A[row_idx, col_idx]

In [None]:
VIS_FEAT = pd.read_csv('dataset/Venice/Visual_Features.csv', sep='\t', index_col='Unnamed: 0')

In [None]:
np.save('dataset/Venice/Visual_Features.npy',np.array(VIS_FEAT))

In [None]:
np.load('dataset/Venice/Visual_Features.npy', allow_pickle=True).shape

(2951, 984)

In [None]:
VIS_FEAT = pd.read_csv('dataset/Venice-XL/Visual_Features.csv', sep='\t', index_col='Unnamed: 0')

In [None]:
np.save('dataset/Venice-XL/Visual_Features.npy',np.array(VIS_FEAT))

## Textual Features

In [None]:
TEX_FEAT = pd.read_csv('dataset/Venice/Textual_Features.csv', sep='\t', index_col='Unnamed: 0')

In [None]:
np.save('dataset_np/Venice/Textual_Features.npy',np.array(TEX_FEAT))

In [None]:
TEX_FEAT = pd.read_csv('dataset/Venice-XL/Textual_Features.csv', sep='\t', index_col='Unnamed: 0')

In [None]:
np.save('dataset_np/Venice-XL/Textual_Features.npy',np.array(TEX_FEAT))

## Heritage Values and Attributes Labels

In [None]:
# Column names of the ten criteria plus a catch-all 'Others'.
# NOTE: this rebinds `classes`, which an earlier cell assigned from load_labels().
classes = ['Criteria ' + numeral for numeral in 'i ii iii iv v vi vii viii ix x'.split()] + ['Others']

In [None]:
VAL_LAB = pd.read_csv('dataset/Venice/Value_Labels.csv', sep='\t', index_col='Unnamed: 0')

In [3]:
ATT_LAB = pd.read_csv('dataset/Venice/Attribute_Labels.csv', sep='\t', index_col='Unnamed: 0')

In [223]:
np.savez('dataset_np/Venice/labels.npz', ATT_LAB=ATT_LAB, VAL_LAB=VAL_LAB)

In [None]:
VAL_LAB = pd.read_csv('dataset/Venice-XL/Value_Labels.csv', sep='\t', index_col='Unnamed: 0')

In [None]:
ATT_LAB = pd.read_csv('dataset/Venice-XL/Attribute_Labels.csv', sep='\t', index_col='Unnamed: 0')

In [220]:
np.savez('dataset_np/Venice-XL/labels.npz', ATT_LAB=ATT_LAB, VAL_LAB=VAL_LAB)

## Saving Tags as Numpy

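### Node Types

Note: `node_type` is derived from whichever `VAL_LAB` is currently loaded. Load the matching dataset's labels before running each save cell below; otherwise the Venice and Venice-XL files receive the same array.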

In [66]:
node_type = np.array(VAL_LAB['text_bool'].astype(int))

In [68]:
node_type.shape

(2951,)

In [69]:
np.save('dataset_np/Venice/node_types.npy', node_type)

In [61]:
np.save('dataset_np/Venice-XL/node_types.npy', node_type)

In [125]:
node_type = np.load('dataset_np/Venice/node_types.npy')

In [199]:
node_type = np.load('dataset_np/Venice-XL/node_types.npy')

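### Train_Val_Test_Split

Note: the cells in this section are executed once per dataset: first with the Venice labels loaded (result saved to `dataset_np/Venice/train_val_test_idx.npz`), then with the Venice-XL labels (result saved to `dataset_np/Venice-XL/train_val_test_idx.npz`). `VAL_LAB`, `ATT_LAB` and `node_type` must all belong to the same dataset when each half is run.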

In [10]:
VAL_LAB

Unnamed: 0,index,text_bool,Criteria i,Criteria ii,Criteria iii,Criteria iv,Criteria v,Criteria vi,Criteria vii,Criteria viii,...,max_2_col,max_3_val,max_3_col,max_1,max_3,conf_3,conf_1,same_3,same_1,labelled
0,51859675789,True,0.023335,0.00691,0.02033,0.010409,0.003758,0.020427,0.119277,0.21231,...,Criteria x,0.21231,Criteria viii,0.325591,0.78243,0.8486,0.453421,0.5,False,True
1,51859339946,False,,,,,,,,,...,,,,,,,,,,False
2,51858385807,False,,,,,,,,,...,,,,,,,,,,False
3,51859339546,False,,,,,,,,,...,,,,,,,,,,False
4,51858385377,False,,,,,,,,,...,,,,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3132,38397950426,False,,,,,,,,,...,,,,,,,,,,False
3133,38397948496,False,,,,,,,,,...,,,,,,,,,,False
3134,38454117381,False,,,,,,,,,...,,,,,,,,,,False
3135,38397940256,False,,,,,,,,,...,,,,,,,,,,False


In [11]:
ATT_LAB

Unnamed: 0,ID,Building Elements,Urban Form Elements,Gastronomy,Interior Scenery,Natural Features and Land-scape Scenery,Monuments and Buildings,People’s Activity and Association,Artifact Products,Urban Scenery,category,category_id,category_vote,cat_id_vote,category_stack,cat_id_stack,conf,category_same,labelled
0,23903381607,0.036499,0.056327,0.009916,0.784756,0.008825,0.013589,0.051456,0.033642,0.004991,interior,3,interior,3,interior,3,0.784756,True,True
1,23903645987,0.040601,0.625583,0.040528,0.023794,0.008027,0.038982,0.196209,0.013067,0.013209,form,1,form,1,form,1,0.625583,True,False
2,23919231927,0.073656,0.355539,0.029174,0.022502,0.358969,0.026122,0.086723,0.018699,0.028617,,-1,landscape scenery and natural features,4,form,1,0.481422,False,False
3,23940920237,0.243404,0.300955,0.013184,0.188603,0.008440,0.129352,0.033844,0.074431,0.007787,,-1,architectural elements,0,form,1,0.365565,False,False
4,23952125997,0.044220,0.017232,0.009817,0.842258,0.004573,0.003968,0.038865,0.035273,0.003794,interior,3,interior,3,interior,3,0.842258,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3132,51859424958,0.266205,0.078821,0.007417,0.014943,0.073707,0.445106,0.022718,0.011466,0.079618,monuments,5,monuments,5,monuments,5,0.445106,True,False
3133,51859673444,0.158670,0.634293,0.016275,0.019310,0.014877,0.074055,0.056887,0.010266,0.015369,form,1,form,1,form,1,0.634293,True,False
3134,51859675789,0.370720,0.309098,0.009464,0.033456,0.141545,0.047814,0.069930,0.010328,0.007645,,-1,architectural elements,0,form,1,0.435263,False,False
3135,51859999785,0.064675,0.553818,0.009878,0.009191,0.066401,0.222615,0.038820,0.006277,0.028325,form,1,form,1,form,1,0.553818,True,False


In [77]:
sum(VAL_LAB['labelled']), sum(ATT_LAB['labelled']), sum(ATT_LAB['labelled'] & VAL_LAB['labelled'])

(756, 1356, 361)

In [7]:
sum(VAL_LAB['labelled']), sum(ATT_LAB['labelled']), sum(ATT_LAB['labelled'] & VAL_LAB['labelled'])

(1247, 1825, 639)

In [12]:
sum(VAL_LAB['labelled']), sum(ATT_LAB['labelled']), sum(ATT_LAB['labelled'] & VAL_LAB['labelled'])

(322, 1151, 118)

In [41]:
np.unique(np.array(VAL_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['max_1_col', 'max_2_col', 'max_3_col']]), return_counts=True)

(array(['Criteria i', 'Criteria ii', 'Criteria iii', 'Criteria iv',
        'Criteria ix', 'Criteria v', 'Criteria vi', 'Criteria vii',
        'Criteria x'], dtype=object),
 array([172, 188, 247, 261,   1,   7, 205,   1,   1]))

In [44]:
np.unique(np.array(ATT_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['category']]), return_counts=True)

(array(['architectural elements', 'form', 'gastronomy', 'interior',
        'landscape scenery and natural features', 'monuments', 'people',
        'product', 'urban scenery'], dtype=object),
 array([ 71, 101,   9,  25,  30,  69,  49,   1,   6]))

In [105]:
sum((node_type==1) & (VAL_LAB['labelled'] & ATT_LAB['labelled'])), sum((node_type==1) & (VAL_LAB['labelled'] & ~ATT_LAB['labelled'])),  sum((node_type==1) & (~VAL_LAB['labelled'] & ATT_LAB['labelled'])) , sum((node_type==1) & (~VAL_LAB['labelled'] & ~ATT_LAB['labelled']))

(361, 395, 450, 555)

In [106]:
sum((node_type==0) & (VAL_LAB['labelled'] & ATT_LAB['labelled'])), sum((node_type==0) & (VAL_LAB['labelled'] & ~ATT_LAB['labelled'])),  sum((node_type==0) & (~VAL_LAB['labelled'] & ATT_LAB['labelled'])) , sum((node_type==0) & (~VAL_LAB['labelled'] & ~ATT_LAB['labelled']))

(0, 0, 545, 645)

In [109]:
(395+450+545)/2

695.0

In [157]:
train_idx = np.array(VAL_LAB.index[(node_type==1) & (VAL_LAB['labelled'] & ATT_LAB['labelled'])])
train_idx

array([   1,   28,   30,   34,   35,   42,   46,   47,   62,   64,   94,
         95,  100,  101,  104,  106,  110,  120,  127,  151,  158,  160,
        165,  178,  179,  199,  222,  245,  251,  252,  260,  261,  264,
        284,  291,  292,  298,  304,  331,  346,  349,  362,  363,  364,
        365,  375,  397,  404,  407,  409,  410,  413,  421,  451,  455,
        456,  457,  458,  460,  462,  463,  466,  467,  468,  469,  474,
        475,  493,  523,  525,  537,  541,  551,  558,  562,  563,  571,
        593,  619,  646,  652,  665,  669,  682,  703,  705,  706,  707,
        712,  715,  718,  727,  742,  750,  753,  757,  782,  798,  807,
        810,  819,  826,  878,  913,  920,  926,  934,  935,  936,  937,
        943,  946,  955,  964,  972,  974, 1003, 1004, 1005, 1058, 1063,
       1077, 1078, 1095, 1096, 1099, 1106, 1115, 1116, 1130, 1134, 1149,
       1163, 1165, 1166, 1168, 1183, 1185, 1200, 1207, 1208, 1209, 1210,
       1211, 1213, 1219, 1221, 1222, 1224, 1233, 12

In [163]:
# Validation/test pool: rows labelled in exactly one of the two label tables (exclusive or).
val_test = np.array(
    VAL_LAB.index[VAL_LAB['labelled'] ^ ATT_LAB['labelled']]
)
val_test

array([   0,    2,    3, ..., 2947, 2948, 2949])

In [167]:
import random
def set_seed_everywhere(seed):
    """Seed NumPy's global random state and Python's ``random`` module with ``seed``."""
    for seeder in (np.random.seed, random.seed):
        seeder(seed)

In [169]:
set_seed_everywhere(42)
val_test_p = np.random.permutation(val_test)
val_test_p

array([1171,  356,  516, ..., 2711, 1794, 2367])

In [172]:
val_idx = val_test_p[:int(len(val_test)*.5)]
len(val_idx)

695

In [173]:
test_idx = val_test_p[int(len(val_test)*.5):]

In [41]:
np.unique(np.array(VAL_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['max_1_col', 'max_2_col', 'max_3_col']]), return_counts=True)

(array(['Criteria i', 'Criteria ii', 'Criteria iii', 'Criteria iv',
        'Criteria ix', 'Criteria v', 'Criteria vi', 'Criteria vii',
        'Criteria x'], dtype=object),
 array([172, 188, 247, 261,   1,   7, 205,   1,   1]))

In [174]:
np.unique(np.array(VAL_LAB.loc[train_idx].loc[VAL_LAB['labelled']][['max_1_col', 'max_2_col', 'max_3_col']]), return_counts=True)

(array(['Criteria i', 'Criteria ii', 'Criteria iii', 'Criteria iv',
        'Criteria ix', 'Criteria v', 'Criteria vi', 'Criteria vii',
        'Criteria x'], dtype=object),
 array([172, 188, 247, 261,   1,   7, 205,   1,   1]))

In [239]:
# Share of each criterion among the three top-ranked columns of the rows labelled in both
# tables. The row count comes from the mask itself instead of a hard-coded 361, so the
# cell stays correct when a different dataset is loaded.
(
    np.unique(
        np.array(VAL_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['max_1_col', 'max_2_col', 'max_3_col']]),
        return_counts=True,
    )[1]
    / (ATT_LAB['labelled'] & VAL_LAB['labelled']).sum()
    / 3
)

array([0.1588181 , 0.17359187, 0.22807018, 0.24099723, 0.00092336,
       0.00646353, 0.18928901, 0.00092336, 0.00092336])

In [176]:
np.unique(np.array(VAL_LAB.loc[val_idx].loc[VAL_LAB['labelled']][['max_1_col', 'max_2_col', 'max_3_col']]), return_counts=True)

(array(['Criteria i', 'Criteria ii', 'Criteria iii', 'Criteria iv',
        'Criteria v', 'Criteria vi'], dtype=object),
 array([108, 107, 131, 142,   8, 113]))

In [177]:
np.unique(np.array(VAL_LAB.loc[test_idx].loc[VAL_LAB['labelled']][['max_1_col', 'max_2_col', 'max_3_col']]), return_counts=True)

(array(['Criteria i', 'Criteria ii', 'Criteria iii', 'Criteria iv',
        'Criteria v', 'Criteria vi', 'Criteria vii', 'Criteria viii',
        'Criteria x'], dtype=object),
 array([ 94, 103, 136, 133,   3, 103,   2,   1,   1]))

In [44]:
np.unique(np.array(ATT_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['category']]), return_counts=True)

(array(['architectural elements', 'form', 'gastronomy', 'interior',
        'landscape scenery and natural features', 'monuments', 'people',
        'product', 'urban scenery'], dtype=object),
 array([ 71, 101,   9,  25,  30,  69,  49,   1,   6]))

In [240]:
# Share of each attribute category among the rows labelled in both tables. The row
# count comes from the mask itself instead of a hard-coded 361.
(
    np.unique(
        np.array(ATT_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['category']]),
        return_counts=True,
    )[1]
    / (ATT_LAB['labelled'] & VAL_LAB['labelled']).sum()
)

array([0.1966759 , 0.27977839, 0.02493075, 0.06925208, 0.08310249,
       0.19113573, 0.13573407, 0.00277008, 0.0166205 ])

In [175]:
np.unique(np.array(ATT_LAB.loc[train_idx][['category']]), return_counts=True)

(array(['architectural elements', 'form', 'gastronomy', 'interior',
        'landscape scenery and natural features', 'monuments', 'people',
        'product', 'urban scenery'], dtype=object),
 array([ 71, 101,   9,  25,  30,  69,  49,   1,   6]))

In [179]:
np.unique(np.array(ATT_LAB.fillna('None').loc[val_idx][['category']]), return_counts=True)

(array(['None', 'architectural elements', 'form', 'gastronomy', 'interior',
        'landscape scenery and natural features', 'monuments', 'people',
        'product', 'urban scenery'], dtype=object),
 array([ 45,  85, 168,  14,  27,  99, 135,  94,  13,  15]))

In [180]:
np.unique(np.array(ATT_LAB.fillna('None').loc[test_idx][['category']]), return_counts=True)

(array(['None', 'architectural elements', 'form', 'gastronomy', 'interior',
        'landscape scenery and natural features', 'monuments', 'people',
        'product', 'urban scenery'], dtype=object),
 array([ 36,  92, 182,  15,  26,  81, 135, 100,  15,  13]))

In [187]:
# val_idx / test_idx hold index labels (taken from VAL_LAB.index), so select with .loc,
# as the neighbouring cells do; .iloc would treat them as positions.
len(VAL_LAB.loc[val_idx].loc[VAL_LAB['text_bool']==False]),len(VAL_LAB.loc[test_idx].loc[VAL_LAB['text_bool']==False])

(276, 269)

In [190]:
# val_idx / test_idx hold index labels (taken from VAL_LAB.index), so select with .loc,
# as the neighbouring cells do; .iloc would treat them as positions.
len(VAL_LAB.loc[val_idx].loc[VAL_LAB['text_bool']==True].loc[VAL_LAB['labelled']]),len(VAL_LAB.loc[test_idx].loc[VAL_LAB['text_bool']==True].loc[VAL_LAB['labelled']])

(203, 192)

In [192]:
# val_idx / test_idx hold index labels (taken from VAL_LAB.index), so select with .loc,
# as the neighbouring cells do; .iloc would treat them as positions.
len(VAL_LAB.loc[val_idx].loc[VAL_LAB['text_bool']==True].loc[ATT_LAB['labelled']]),len(VAL_LAB.loc[test_idx].loc[VAL_LAB['text_bool']==True].loc[ATT_LAB['labelled']])

(216, 234)

In [193]:
np.savez('dataset_np/Venice/train_val_test_idx.npz', train_idx=train_idx, val_idx=val_idx, test_idx=test_idx)

In [91]:
np.unique(np.array(VAL_LAB.iloc[node_type==1].loc[VAL_LAB['labelled']][['max_1_col', 'max_2_col', 'max_3_col']]), return_counts=True)

(array(['Criteria i', 'Criteria ii', 'Criteria iii', 'Criteria iv',
        'Criteria ix', 'Criteria v', 'Criteria vi', 'Criteria vii',
        'Criteria viii', 'Criteria x'], dtype=object),
 array([374, 398, 514, 536,   1,  18, 421,   3,   1,   2]))

In [89]:
np.unique(np.array(ATT_LAB.iloc[node_type==1]['category'].fillna('None')), return_counts=True)

(array(['None', 'architectural elements', 'form', 'gastronomy', 'interior',
        'landscape scenery and natural features', 'monuments', 'people',
        'product', 'urban scenery'], dtype=object),
 array([204, 254, 375,  30,  67, 168, 323, 271,  36,  33]))

In [112]:
sum(VAL_LAB['labelled']), sum(ATT_LAB['labelled']), sum(ATT_LAB['labelled'] & VAL_LAB['labelled'])

(25771, 37289, 11569)

In [48]:
np.unique(np.array(VAL_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['max_1_col', 'max_2_col', 'max_3_col']]), return_counts=True)

(array(['Criteria i', 'Criteria ii', 'Criteria iii', 'Criteria iv',
        'Criteria ix', 'Criteria v', 'Criteria vi', 'Criteria vii',
        'Criteria viii', 'Criteria x'], dtype=object),
 array([2463, 4704, 9864, 8578,   19,   54, 8921,   58,   18,   28]))

In [50]:
np.unique(np.array(ATT_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['category']]), return_counts=True)

(array(['architectural elements', 'form', 'gastronomy', 'interior',
        'landscape scenery and natural features', 'monuments', 'people',
        'product', 'urban scenery'], dtype=object),
 array([1501, 2636,  139,  480, 2051, 1507, 2457,  685,  113]))

In [115]:
sum((node_type==1) & (VAL_LAB['labelled'] & ATT_LAB['labelled'])), sum((node_type==1) & (VAL_LAB['labelled'] & ~ATT_LAB['labelled'])),  sum((node_type==1) & (~VAL_LAB['labelled'] & ATT_LAB['labelled'])) , sum((node_type==1) & (~VAL_LAB['labelled'] & ~ATT_LAB['labelled']))

(11569, 14202, 11243, 12809)

In [116]:
sum((node_type==0) & (VAL_LAB['labelled'] & ATT_LAB['labelled'])), sum((node_type==0) & (VAL_LAB['labelled'] & ~ATT_LAB['labelled'])),  sum((node_type==0) & (~VAL_LAB['labelled'] & ATT_LAB['labelled'])) , sum((node_type==0) & (~VAL_LAB['labelled'] & ~ATT_LAB['labelled']))

(0, 0, 14477, 16663)

In [117]:
(14202+11243+14477)/2

19961.0

In [118]:
16663+12809

29472

In [202]:
train_idx = np.array(VAL_LAB.index[(node_type==1) & (VAL_LAB['labelled'] & ATT_LAB['labelled'])])
train_idx

array([    4,     5,     8, ..., 80952, 80953, 80954])

In [203]:
# Validation/test pool: rows labelled in exactly one of the two label tables (exclusive or).
val_test = np.array(
    VAL_LAB.index[VAL_LAB['labelled'] ^ ATT_LAB['labelled']]
)
val_test

array([    0,     1,     7, ..., 80959, 80960, 80962])

In [167]:
import random
# NOTE: identical re-definition of set_seed_everywhere from an earlier cell of this notebook.
def set_seed_everywhere(seed):
    """Apply ``seed`` to NumPy's global random state, then to Python's ``random`` module."""
    seeders = (np.random.seed, random.seed)
    for apply_seed in seeders:
        apply_seed(seed)

In [204]:
set_seed_everywhere(42)
val_test_p = np.random.permutation(val_test)
val_test_p

array([53191,  9792, 28504, ..., 77461,  1718, 31520])

In [205]:
val_idx = val_test_p[:int(len(val_test)*.5)]
len(val_idx)

19961

In [206]:
test_idx = val_test_p[int(len(val_test)*.5):]

In [207]:
np.unique(np.array(VAL_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['max_1_col', 'max_2_col', 'max_3_col']]), return_counts=True)

(array(['Criteria i', 'Criteria ii', 'Criteria iii', 'Criteria iv',
        'Criteria ix', 'Criteria v', 'Criteria vi', 'Criteria vii',
        'Criteria viii', 'Criteria x'], dtype=object),
 array([2463, 4704, 9864, 8578,   19,   54, 8921,   58,   18,   28]))

In [208]:
np.unique(np.array(VAL_LAB.loc[train_idx].loc[VAL_LAB['labelled']][['max_1_col', 'max_2_col', 'max_3_col']]), return_counts=True)

(array(['Criteria i', 'Criteria ii', 'Criteria iii', 'Criteria iv',
        'Criteria ix', 'Criteria v', 'Criteria vi', 'Criteria vii',
        'Criteria viii', 'Criteria x'], dtype=object),
 array([2463, 4704, 9864, 8578,   19,   54, 8921,   58,   18,   28]))

In [245]:
# Share of each criterion among the three top-ranked columns of the rows labelled in both
# tables. Computed from the loaded data; the earlier version divided counts pasted from a
# previous cell's output by a hard-coded 11569, which goes stale when the data changes.
(
    np.unique(
        np.array(VAL_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['max_1_col', 'max_2_col', 'max_3_col']]),
        return_counts=True,
    )[1]
    / (ATT_LAB['labelled'] & VAL_LAB['labelled']).sum()
    / 3
)

array([0.07096551, 0.13553462, 0.2842078 , 0.24715475, 0.00054744,
       0.00155588, 0.25703749, 0.00167113, 0.00051863, 0.00080675])

In [209]:
np.unique(np.array(VAL_LAB.loc[val_idx].loc[VAL_LAB['labelled']][['max_1_col', 'max_2_col', 'max_3_col']]), return_counts=True)

(array(['Criteria i', 'Criteria ii', 'Criteria iii', 'Criteria iv',
        'Criteria ix', 'Criteria v', 'Criteria vi', 'Criteria vii',
        'Criteria viii', 'Criteria x'], dtype=object),
 array([1672, 2845, 5990, 5189,   16,   22, 5482,   40,    3,   17]))

In [210]:
np.unique(np.array(VAL_LAB.loc[test_idx].loc[VAL_LAB['labelled']][['max_1_col', 'max_2_col', 'max_3_col']]), return_counts=True)

(array(['Criteria i', 'Criteria ii', 'Criteria iii', 'Criteria iv',
        'Criteria ix', 'Criteria v', 'Criteria vi', 'Criteria vii',
        'Criteria viii', 'Criteria x'], dtype=object),
 array([1651, 2736, 6028, 5268,   14,   18, 5557,   32,    7,   19]))

In [211]:
np.unique(np.array(ATT_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['category']]), return_counts=True)

(array(['architectural elements', 'form', 'gastronomy', 'interior',
        'landscape scenery and natural features', 'monuments', 'people',
        'product', 'urban scenery'], dtype=object),
 array([1501, 2636,  139,  480, 2051, 1507, 2457,  685,  113]))

In [212]:
np.unique(np.array(ATT_LAB.loc[train_idx][['category']]), return_counts=True)

(array(['architectural elements', 'form', 'gastronomy', 'interior',
        'landscape scenery and natural features', 'monuments', 'people',
        'product', 'urban scenery'], dtype=object),
 array([1501, 2636,  139,  480, 2051, 1507, 2457,  685,  113]))

In [244]:
# Share of each attribute category among the rows labelled in both tables. Computed from
# the loaded data; the earlier version divided counts pasted from a previous cell's output
# by a hard-coded 11569, which goes stale when the data changes.
(
    np.unique(
        np.array(ATT_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['category']]),
        return_counts=True,
    )[1]
    / (ATT_LAB['labelled'] & VAL_LAB['labelled']).sum()
)

array([0.12974328, 0.22785029, 0.01201487, 0.04149019, 0.17728412,
       0.13026191, 0.21237791, 0.05920996, 0.00976748])

In [213]:
np.unique(np.array(ATT_LAB.fillna('None').loc[val_idx][['category']]), return_counts=True)

(array(['None', 'architectural elements', 'form', 'gastronomy', 'interior',
        'landscape scenery and natural features', 'monuments', 'people',
        'product', 'urban scenery'], dtype=object),
 array([1491, 2170, 4274,  396,  995, 3148, 2292, 3944, 1030,  221]))

In [214]:
np.unique(np.array(ATT_LAB.fillna('None').loc[test_idx][['category']]), return_counts=True)

(array(['None', 'architectural elements', 'form', 'gastronomy', 'interior',
        'landscape scenery and natural features', 'monuments', 'people',
        'product', 'urban scenery'], dtype=object),
 array([1491, 2172, 4160,  380,  968, 3190, 2321, 4022, 1054,  203]))

In [215]:
# val_idx / test_idx hold index labels (taken from VAL_LAB.index), so select with .loc,
# as the neighbouring cells do; .iloc would treat them as positions.
len(VAL_LAB.loc[val_idx].loc[VAL_LAB['text_bool']==False]),len(VAL_LAB.loc[test_idx].loc[VAL_LAB['text_bool']==False])

(7236, 7241)

In [216]:
# val_idx / test_idx hold index labels (taken from VAL_LAB.index), so select with .loc,
# as the neighbouring cells do; .iloc would treat them as positions.
len(VAL_LAB.loc[val_idx].loc[VAL_LAB['text_bool']==True].loc[VAL_LAB['labelled']]),len(VAL_LAB.loc[test_idx].loc[VAL_LAB['text_bool']==True].loc[VAL_LAB['labelled']])

(7092, 7110)

In [217]:
# val_idx / test_idx hold index labels (taken from VAL_LAB.index), so select with .loc,
# as the neighbouring cells do; .iloc would treat them as positions.
len(VAL_LAB.loc[val_idx].loc[VAL_LAB['text_bool']==True].loc[ATT_LAB['labelled']]),len(VAL_LAB.loc[test_idx].loc[VAL_LAB['text_bool']==True].loc[ATT_LAB['labelled']])

(5633, 5610)

In [218]:
np.savez('dataset_np/Venice-XL/train_val_test_idx.npz', train_idx=train_idx, val_idx=val_idx, test_idx=test_idx)

## Extracting Training Data Labels for Verification

In [9]:
(ATT_LAB['labelled'] & VAL_LAB['labelled']).sum()

361

In [16]:
df_A = ATT_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['ID', 'category', 'conf']]
df_A

Unnamed: 0,ID,category,conf
1,51871789478,urban scenery,0.899618
28,51847785831,people,0.788871
30,51843895627,monuments,0.912941
34,51844341038,architectural elements,0.807716
35,51844226641,architectural elements,0.763390
...,...,...,...
2874,51241724227,monuments,0.706842
2925,51234000589,form,0.840006
2927,51233219526,people,0.788442
2934,51234292385,form,0.854783


In [17]:
df_B = VAL_LAB[(ATT_LAB['labelled'] & VAL_LAB['labelled'])][['index', 'max_1_col','max_2_col','max_3_col','max_3','conf_3']]
df_B

Unnamed: 0,index,max_1_col,max_2_col,max_3_col,max_3,conf_3
1,51871789478,Criteria ii,Criteria iii,Criteria i,0.769948,0.828654
28,51847785831,Criteria iv,Criteria vi,Criteria iii,0.798588,0.858456
30,51843895627,Criteria iv,Criteria ii,Criteria iii,0.776642,0.832910
34,51844341038,Criteria i,Criteria vi,Criteria iii,0.811616,0.845875
35,51844226641,Criteria i,Criteria vi,Criteria iii,0.810204,0.840927
...,...,...,...,...,...,...
2874,51241724227,Criteria iv,Criteria ii,Criteria i,0.879760,0.887064
2925,51234000589,Criteria i,Criteria iii,Criteria iv,0.749658,0.884200
2927,51233219526,Criteria i,Criteria iii,Criteria iv,0.749658,0.884200
2934,51234292385,Criteria i,Criteria iii,Criteria iv,0.749658,0.884200


In [24]:
# Combine attribute labels (df_A) with value labels (df_B) on the image ID, then attach
# the revised text of each image; the helper 'index' key column is dropped after each join.
df_C = (
    df_A
    .merge(df_B, how='outer', left_on='ID', right_on='index')
    .drop(columns='index')
    .merge(TEX_FEAT[['revised_text', 'index']], how='left', left_on='ID', right_on='index')
    .drop(columns='index')
)

In [25]:
df_C

Unnamed: 0,ID,category,conf,max_1_col,max_2_col,max_3_col,max_3,conf_3,revised_text
0,51871789478,urban scenery,0.899618,Criteria ii,Criteria iii,Criteria i,0.769948,0.828654,"It has tags of adriaticsea, italy, venice"
1,51847785831,people,0.788871,Criteria iv,Criteria vi,Criteria iii,0.798588,0.858456,"It has title of Venice market, the fishmonger 4"
2,51843895627,monuments,0.912941,Criteria iv,Criteria ii,Criteria iii,0.776642,0.832910,NOT ONLY THE SQUERO. The church is dedicated t...
3,51844341038,architectural elements,0.807716,Criteria i,Criteria vi,Criteria iii,0.811616,0.845875,"It has title of San Zanipolo of Venice, chapel"
4,51844226641,architectural elements,0.763390,Criteria i,Criteria vi,Criteria iii,0.810204,0.840927,"It has title of San Zanipolo of Venice, clock"
...,...,...,...,...,...,...,...,...,...
356,51241724227,monuments,0.706842,Criteria iv,Criteria ii,Criteria i,0.879760,0.887064,San Giorgio Maggiore (Church of San Giorgio Ma...
357,51234000589,form,0.840006,Criteria i,Criteria iii,Criteria iv,0.749658,0.884200,"It has tags of biennale, architecture, archite..."
358,51233219526,people,0.788442,Criteria i,Criteria iii,Criteria iv,0.749658,0.884200,"It has tags of biennale, architecture, archite..."
359,51234292385,form,0.854783,Criteria i,Criteria iii,Criteria iv,0.749658,0.884200,"It has tags of biennale, architecture, archite..."


In [26]:
df_C.to_csv('Venezia/data_storage/training_data.csv', sep='\t')

In [28]:
!pip install xlsxwriter

Collecting xlsxwriter
[?25l  Downloading https://files.pythonhosted.org/packages/ef/95/30f6ee57f10232e2055a85c3e4c8db7d38ab5f1349b6cdced85cb8acd5e6/XlsxWriter-3.0.3-py3-none-any.whl (149kB)
[K     |████████████████████████████████| 153kB 3.4MB/s eta 0:00:01
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.0.3


In [29]:
import xlsxwriter

In [43]:
import os
import os.path as osp

In [44]:
os.getcwd()

'/Users/nbai/surfdrive/TUD/Paper/Venice_Flickr'

In [62]:
# Create a new Excel file with one worksheet. The context manager closes the workbook
# when the block exits, including when inserting an image raises.
with xlsxwriter.Workbook('Venezia/data_storage/training_data_verification_1.xlsx') as workbook:
    worksheet = workbook.add_worksheet()

    # Widen the first column so the thumbnails fit.
    worksheet.set_column('A:A', 25)

    # One row per entry of df_C: enlarge the row and place that ID's image in column A.
    for i, ID_image in enumerate(df_C['ID']):
        worksheet.set_row(i, 130)
        worksheet.insert_image(
            i, 0,
            os.path.join(os.getcwd(), 'Venezia', 'data_storage', 'images', '150', f'{ID_image}.jpg'),
            {'x_scale': 1, 'y_scale': 1, 'x_offset': 10, 'y_offset': 10},
        )

In [31]:
df_C['ID'].iloc[0]

51871789478