In [98]:
import os
import random
import math
import json
import yaml
import joblib
from ast import literal_eval

import torch
import pydantic
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import dgl

In [75]:
image_embed_dict = joblib.load('../zillow_data/image_embed.joblib')
keyword_embed_dict = {}
scene_embed_dict = {}

keyword_embed_dictlist = joblib.load('../zillow_data/keyword_embed.joblib')
for dict in tqdm(keyword_embed_dictlist, desc='converting list to dict'):
    dict_key = list(dict.keys())[0]
    keyword_embed_dict[dict_key] = dict[dict_key]

scene_embed_dictlist = joblib.load('../zillow_data/scene_embed.joblib')
for dict in tqdm(scene_embed_dictlist, desc='converting list to dict'):
    dict_key = list(dict.keys())[0]
    scene_embed_dict[dict_key] = dict[dict_key]

modal_dicts = {
    'images': image_embed_dict,
    'keywords': keyword_embed_dict,
    'scenes': scene_embed_dict
}

converting list to dict: 100%|██████████| 1542/1542 [00:00<00:00, 2150138.55it/s]
converting list to dict: 100%|██████████| 18/18 [00:00<00:00, 719023.54it/s]


In [69]:
for modal in modal_dicts:
    print(f'Unique {modal} count: {len(list(modal_dicts[modal].keys()))}')

Unique images count: 82720
Unique keywords count: 1542
Unique scenes count: 18


In [94]:
node_links = pd.read_csv('../zillow_data/NYU_photoboard_file.csv')

# convert string object of image_keyword_hash column back to a list of keyword hashes
node_links['image_keyword_hash'] = node_links['image_keyword_hash'].apply(lambda x: literal_eval(x))
node_links = node_links.rename(columns={'url_hash': 'image_hash', 'image_keyword_hash': 'keyword_hash'})

print('Total rows:', len(node_links))

node_links_keywords_only = node_links[node_links['keyword_hash'].apply(lambda x: x!=[])]
print('Pct of images with at least one keyword:', len(node_links_keywords_only) / len(node_links))
node_links.head()


Total rows: 130065
Pct of images with at least one keyword: 0.2462076653980702


Unnamed: 0,scene_hash,keyword_hash,image_hash,propertyIdhash
0,67e69903823c5a541b29be07e784b7e6,[],8f1c8d0b5b7cc98231cdea9f923d928e,a3d2de7675556553a5f08e4c88d2c228
1,60825cdcaad5cba560a6f63c895e1f3b,"[851112d8c835d58c497101d9886a1348, b10a8c0bede...",f2d2571c7b9a1e124b188f84222be032,6b693c1082a8f8b747d0db60d179b7b2
2,624109635107c411520779cf9c82ad65,[],3feadc281c74873a6e55a4ab83ebd623,c871527ccd30371d69d21faa47008f12
3,624109635107c411520779cf9c82ad65,[],71906fef662e966b21afa3d53b92decd,c871527ccd30371d69d21faa47008f12
4,624109635107c411520779cf9c82ad65,[],163c91537b81f3ad637d9ceb771e2e1d,c871527ccd30371d69d21faa47008f12


In [92]:
test = node_links['scene_hash'].values
test_lens = [len(x) for x in test]
print(np.min(test_lens))


32


In [113]:
def train_val_test_ids(node_ids):
    node_ids_copy = node_ids.copy()
    random.shuffle(node_ids_copy)
    n = len(node_ids_copy)
    train_ct, val_ct = [math.floor(n*0.7), math.ceil(n*0.15)]
    return [node_ids_copy[0:train_ct], node_ids_copy[train_ct:train_ct+val_ct], node_ids_copy[train_ct+val_ct:]]

def split_mask(node_id, split_ids):
    return True if node_id in split_ids else False

def nodes_table(modal, modal_dict):
    node_ids = list(modal_dict.keys())
    nodes = pd.DataFrame({'node_id': list(node_ids)})
    train_ids, val_ids, test_ids = train_val_test_ids(node_ids)

    tqdm.pandas(desc='applying train masks')
    nodes['train_mask'] = nodes['node_id'].progress_apply(lambda x: split_mask(x, train_ids))
    tqdm.pandas(desc='applying val masks')
    nodes['val_mask'] = nodes['node_id'].progress_apply(lambda x: split_mask(x, val_ids))
    tqdm.pandas(desc='applying test masks')
    nodes['test_mask'] = nodes['node_id'].progress_apply(lambda x: split_mask(x, test_ids))
    
    tqdm.pandas(desc=f'formatting {modal} node embeddings')
    nodes['feat'] = nodes['node_id'].progress_apply(lambda x: ', '.join([str(y) for y in modal_dict[x].tolist()]))

    return nodes

nodes_table_modals = pd.DataFrame()
for modal in modal_dicts:
    nodes_table_modals = pd.concat([nodes_table_modals, nodes_table(modal, modal_dicts[modal])])

nodes_table_modals['node_id'].drop_duplicates(inplace=True)

applying train masks: 100%|██████████| 82720/82720 [00:39<00:00, 2075.18it/s]
applying val masks: 100%|██████████| 82720/82720 [00:09<00:00, 8374.65it/s]
applying test masks: 100%|██████████| 82720/82720 [00:09<00:00, 8302.75it/s]
formatting images node embeddings: 100%|██████████| 82720/82720 [00:18<00:00, 4439.27it/s]
applying train masks: 100%|██████████| 1542/1542 [00:00<00:00, 147024.70it/s]
applying val masks: 100%|██████████| 1542/1542 [00:00<00:00, 421425.48it/s]
applying test masks: 100%|██████████| 1542/1542 [00:00<00:00, 411976.35it/s]
formatting keywords node embeddings: 100%|██████████| 1542/1542 [00:00<00:00, 4415.02it/s]
applying train masks: 100%|██████████| 18/18 [00:00<00:00, 119269.31it/s]
applying val masks: 100%|██████████| 18/18 [00:00<00:00, 122560.83it/s]
applying test masks: 100%|██████████| 18/18 [00:00<00:00, 113189.61it/s]
formatting scenes node embeddings: 100%|██████████| 18/18 [00:00<00:00, 4384.04it/s]


In [106]:
print(len())

84280
84275


Unnamed: 0,node_id,train_mask,val_mask,test_mask,feat
0,e8ca9e20156cff1b5166093249a1a808,False,False,True,"-0.019534708932042122, -0.008678311482071877, ..."
1,0b4f1031b484ce309932faab718593e5,True,False,False,"-0.020777437835931778, 0.007536700926721096, -..."
2,54ea244b01d4fe7bcfb8910e1b16e2da,True,False,False,"-0.012318197637796402, 0.002343656960874796, 0..."
3,3fde20991fe86c0b28b70c050c2c7d5b,False,False,True,"0.03275497257709503, 0.01234776247292757, -0.0..."
4,d14b527f68e2b30ced4d2ddc9d18ff83,True,False,False,"-0.027265874668955803, -0.010033353231847286, ..."


In [101]:
'''
Notes about the node_links dataframe:
    - ~130,000 total roows
    - scene_hash is always populated
    - keywords_hash is a list of keyword hashes, usually an empty list (at least one entry for ~25% of images)
'''

def edges_table(node_links):
    edges = pd.DataFrame(columns=['src_id', 'dst_id'])
    for idx, row in tqdm(node_links.iterrows(), desc='parsing data into edges'):
        edges_subset = {'src_id': [], 'dst_id': []}
        edges_subset['src_id'].append(row['image_hash'])
        edges_subset['dst_id'].append(row['scene_hash'])
        for keyword in row['keyword_hash']:
            edges_subset['src_id'].append(row['image_hash'])
            edges_subset['dst_id'].append(keyword)
        edges = pd.concat([edges, pd.DataFrame(edges_subset)])

    return edges

edges = edges_table(node_links)
edges.head()

parsing data into edges: 130065it [01:46, 1224.05it/s]


Unnamed: 0,src_id,dst_id
0,8f1c8d0b5b7cc98231cdea9f923d928e,67e69903823c5a541b29be07e784b7e6
0,f2d2571c7b9a1e124b188f84222be032,60825cdcaad5cba560a6f63c895e1f3b
1,f2d2571c7b9a1e124b188f84222be032,851112d8c835d58c497101d9886a1348
2,f2d2571c7b9a1e124b188f84222be032,b10a8c0bede9eb4ea771b04db3149f28
3,f2d2571c7b9a1e124b188f84222be032,6160972776d990112e5df1ceb938816c


In [109]:
# Store graph csv files

if not os.path.exists('zillow_graph_csv'):
    os.mkdir('zillow_graph_csv')

edges.to_csv('zillow_graph_csv/zillow_edges.csv', index=False)
nodes_table_modals_nodup.to_csv('zillow_graph_csv/zillow_nodes.csv', index=False)

g_metadata = {
    'dataset_name': 'zillow_graph',
    'edge_data': [{'file_name': 'zillow_edges.csv'}],
    'node_data': [{'file_name': 'zillow_nodes.csv'}]
}

with open('zillow_graph_csv/meta.yaml', 'w') as file:
    yaml.dump(g_metadata, file)

In [112]:
test = pd.read_csv('zillow_graph_csv/zillow_nodes.csv')
print(len(test))
test2 = test['node_id'].drop_duplicates()
print(len(test2))

84275
84270


In [108]:
zillow_graph_dataset = dgl.data.CSVDataset('./zillow_graph_csv')
zillow_graph_dataset

DGLError: Node IDs are required to be unique but the following ids are duplicate: ['67518d646b9c9868b149f513ba47af66' '67e69903823c5a541b29be07e784b7e6'
 'a203591a109f718f46c029211a7dc295' 'cff8b1fe936268ff7a363b5dc5f5fdf6'
 'e451bdad985636f1160872c46485b2ff']

In [None]:
g = zillow_graph_dataset[0]
options = {
    'node_color': 'blue',
    'node_size': 10,
    'width': 1,
}
G = dgl.to_networkx(g)
plt.figure(figsize=[15,7])
nx.draw(G, **options)