# Create Edges

This notebook constructs a graph structure among artworks in the WikiArt dataset
based on the artists' schools. Two artworks are connected if they were made by
different artists who belong to the same school. Each artwork (node) is linked to up to 128 neighboring artworks (edges) belonging
to related artists within the same data split (train/val/test).

- **Input:** [`wikiart_full.csv`](https://raw.githubusercontent.com/thefth/ArtSAGENet/main/Dataset/wikiart_full.csv) (metadata file containing artist names, schools, and split information)

- **Output:** `pairs` — list of connected artwork index pairs representing undirected edges

- **Purpose:** To generate the graph connectivity structure used for training or evaluation in
  graph-based models.

In [None]:
import numpy as np
import pandas as pd
import random

from copy import deepcopy
from itertools import cycle
from tqdm.notebook import tqdm

In [None]:
def set_seed(seed):
    """Seed the global random number generators used in this notebook.

    Python's built-in ``random`` module and NumPy's legacy global generator
    are both seeded with the same value.

    Parameters
    ----------
    seed : int
        Value handed to ``random.seed`` and ``np.random.seed``.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once, before any shuffling or sampling happens.
set_seed(42)

In [None]:
# Load dataset
# Metadata CSV hosted in the ArtSAGENet repository; the cells below read its
# `artist_name`, `artist_school` and `mode` columns.
WIKIART_CSV_URL = 'https://raw.githubusercontent.com/thefth/ArtSAGENet/main/Dataset/wikiart_full.csv'
df = pd.read_csv(WIKIART_CSV_URL)

In [None]:
# Build mapping: artist -> sorted list of connected artists
# Two artists are considered connected if they share at least one school.
# `artist_school` holds one or more school names separated by ', '.
#
# Schools are compared as exact tokens. Matching with `str.contains` would
# treat each school name as a regular expression and as a substring, which can
# link unrelated schools or fail on names containing regex metacharacters.
# The result lists are sorted so that the seeded sampling done later does not
# depend on the hash-randomised iteration order of a set of strings.

# One row per distinct (artist, school string) pair; rows without a school or
# without an artist name carry no usable information and are skipped.
artist_school_rows = df[['artist_name', 'artist_school']].dropna().drop_duplicates()

artist2schools = {}   # artist -> set of school tokens
school2artists = {}   # school token -> set of artists
for artist, schools in zip(artist_school_rows['artist_name'],
                           artist_school_rows['artist_school']):
    for school in schools.split(', '):
        school = school.strip()
        if not school:
            continue
        artist2schools.setdefault(artist, set()).add(school)
        school2artists.setdefault(school, set()).add(artist)

connected_artists = {}

# Keys follow the order in which artists first appear in `df`.
for k in tqdm(df['artist_name'].unique().tolist()):

    candidates = set()
    for school in artist2schools.get(k, ()):
        candidates.update(school2artists[school])

    # An artist is never its own neighbour.
    candidates.discard(k)
    connected_artists[k] = sorted(candidates)

In [None]:
# Build mapping: artist -> list of artwork indices for each data split
# This helps select edges within the same split (train/val/test)
#
# The mapping is filled in a single pass over the rows instead of filtering
# the whole frame three times per artist. Row order is preserved, so every
# list holds the same index labels, in the same order, as a boolean-mask
# selection on `df` would give.
artists2artworks = {k: {'train': [], 'val': [], 'test': []} for k in connected_artists}

artwork_rows = zip(df.index.tolist(), df['artist_name'].tolist(), df['mode'].tolist())
for artwork_id, artist, mode in tqdm(artwork_rows, total=df.shape[0]):
    splits = artists2artworks.get(artist)
    # Rows whose artist is unknown or whose mode is not train/val/test are ignored.
    if splits is not None and mode in splits:
        splits[mode].append(artwork_id)

In [None]:
# Edge generation: connections based on shared schools
# For every artwork, sample up to 128 paintings by connected artists that are
# in the same split (mode) and record them as (artwork, neighbour) tuples.
#
# NOTE: the adjacency-list cell further down reassigns `edges`, so this list
# is not what ends up in `pairs`. The sampling here still draws from NumPy's
# global random generator, so it affects the random draws made afterwards.
edges = []

for id_, row in tqdm(df.iterrows(), total=df.shape[0]):

    split = row['mode']

    # Pool of artworks by connected artists within the same split.
    painting_candidates = [
        artwork
        for artist in connected_artists[row['artist_name']]
        if artist in artists2artworks
        for artwork in artists2artworks[artist][split]
    ]

    if not painting_candidates:
        continue

    sample_size = min(len(painting_candidates), 128)
    neighbours = np.random.choice(painting_candidates, sample_size, replace=False)
    edges.extend((id_, neighbour) for neighbour in neighbours)
                
    

In [None]:
# Build an adjacency list representation of the graph for the entire dataset
# Artworks are visited in shuffled order. Each one samples neighbours among
# the artworks of connected artists in the same split, and every sampled link
# is stored on both endpoints, so the lists describe an undirected graph in
# which no artwork has more than MAX_NEIGHBORS neighbours.
MAX_NEIGHBORS = 128

artworks2mode = df['mode'].to_dict()
artworks2artists = df['artist_name'].to_dict()

# Keys are row positions; this matches the index labels used below as long as
# `df` keeps its default 0..n-1 index.
edges = {i: [] for i in range(df.shape[0])}

keys = list(edges.keys())
random.shuffle(keys)

for id_ in tqdm(keys):

    # Links added by earlier artworks may already have filled this node.
    free_slots = MAX_NEIGHBORS - len(edges[id_])
    if free_slots <= 0:
        continue

    split = artworks2mode[id_]

    pool = set()
    for candidate in connected_artists[artworks2artists[id_]]:
        if candidate in artists2artworks:
            pool.update(artists2artworks[candidate][split])

    # Keep only artworks that still have room and are not linked to id_ yet.
    # Sorting makes the sampling below independent of set iteration order, so
    # a fixed seed yields the same graph on every run.
    already_linked = set(edges[id_])
    painting_candidates = sorted(i for i in pool
                                 if len(edges[i]) < MAX_NEIGHBORS and i not in already_linked)

    if not painting_candidates:
        continue

    # .tolist() turns the sampled NumPy integers into plain Python ints, so
    # the adjacency lists hold a single integer type.
    final_candidates = np.random.choice(painting_candidates,
                                        min(len(painting_candidates), free_slots),
                                        replace=False).tolist()

    edges[id_].extend(final_candidates)

    # Mirror each link on the neighbour's side.
    for painting in final_candidates:
        edges[painting].append(id_)

    

In [None]:
# Flatten the adjacency dictionary into a list of [source, target] index pairs
# Every undirected edge is stored in the adjacency lists of both endpoints, so
# it is emitted twice here, once per direction.
# `edges` is only read, never modified, so this cell can be re-run safely and
# the adjacency lists remain available afterwards.
# NOTE(review): if exactly one entry per undirected edge is wanted instead,
# keep only the pairs with k < v_ -- confirm against the code consuming `pairs`.
pairs = []

for k, v in tqdm(edges.items()):

    pairs.extend([k, v_] for v_ in v)