In [15]:
import pickle as pkl
import os 
import sys
import ast
import numpy as np
from xopen import xopen
import json
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops

In [16]:
# Location of the raw CS-TAG node table for this dataset.
RAW_DATA_PATH = '/home/ubuntu/proj/data/CS-TAG'
FOLDER_NAME = 'Book'
file_name = 'Children_Final.csv'

# Load the node table: one row per node.
csv_path = os.path.join(RAW_DATA_PATH, FOLDER_NAME, file_name)
df = pd.read_csv(csv_path)

# The CSV stores each neighbour list as text (e.g. "[20853, 20879]");
# ast.literal_eval turns every cell back into a real Python list.
df["neighbour"] = df["neighbour"].apply(ast.literal_eval)

In [17]:
# Display the loaded node table (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,category,text,label,node_id,neighbour
0,Literature & Fiction,Description: Collection of Poetry; Title: The ...,0,0,"[5472, 14293, 15164, 26542, 33933]"
1,Literature & Fiction,Description: Pop-up book; Title: Pop-Up Book :...,0,1,"[20853, 20879]"
2,Literature & Fiction,"Description: Very good condition, hardly any w...",0,2,[84]
3,Literature & Fiction,Description: Childrens illustrated story inclu...,0,3,[14558]
4,Animals,Description: &#8220;Soft pastel illustrations ...,1,4,[73]
...,...,...,...,...,...
76870,Literature & Fiction,"Description: In its mother's womb, a tiny baby...",0,76870,"[4513, 5366, 8297, 10774, 16299, 27220, 28565,..."
76871,Growing Up & Facts of Life,Description: In Harry Potter and the Goblet of...,2,76871,"[16969, 17068, 17307, 51129, 51263]"
76872,Literature & Fiction,Description: Enjoy the adventures of magic ani...,0,76872,"[2868, 18754, 18758, 18760, 18763, 18765, 1876..."
76873,"Activities, Crafts & Games",Description: Marion Puech is a French illustra...,6,76873,[42087]


In [18]:
# Sanity check: every node_id equals its row's index label. The edge-building
# loop below looks rows up by that label and writes node_id into edge_index,
# while the node texts/labels are stored in row order, so the two must agree.
(df.node_id == df.index).all()

True

In [19]:
# Build the edge list in COO form: for every node, emit one (node_id, neighbour)
# pair per entry of its neighbour list.
#
# The columns are pulled out as plain Python lists once and walked positionally,
# instead of doing a label-based Series lookup (df.node_id[i], df.neighbour[i])
# on every iteration; this is faster and does not depend on the frame having a
# default 0..n-1 index.
node_ids = df.node_id.tolist()
neighbour_lists = df.neighbour.tolist()

edge_index_i, edge_index_j = [], []
for node_id, neighbours in tqdm(zip(node_ids, neighbour_lists), total=len(node_ids)):
    # Repeat the source id once per neighbour so both rows stay aligned.
    edge_index_i.extend([node_id] * len(neighbours))
    edge_index_j.extend(neighbours)

# Row 0 holds the source node ids, row 1 the corresponding neighbour ids.
edge_index = [edge_index_i, edge_index_j]

100%|██████████| 76875/76875 [00:08<00:00, 8964.25it/s] 


In [20]:
# Expect (2, number_of_edges): one row of source ids and one row of neighbour ids.
np.shape(edge_index)

(2, 1554578)

In [21]:

# Assemble the PyG graph object:
#   edge_index       - (2, num_edges) long tensor built from the neighbour lists
#   text_nodes       - raw text of every node, in row order
#   text_node_labels - human-readable category name of every node
#   y                - integer class label of every node
graph = Data(
    edge_index=torch.tensor(edge_index, dtype=torch.long),
    text_nodes=df.text.values.tolist(),
    text_node_labels=df.category.values.tolist(),
    y=torch.tensor(df.label.values.tolist(), dtype=torch.long),
    # No node feature matrix `x` is stored here, so state the node count
    # explicitly rather than leaving PyG to guess it (e.g. from the largest
    # id that happens to appear in edge_index).
    num_nodes=len(df),
)

In [22]:
# Show the Data repr to confirm the size of each stored attribute.
graph

Data(edge_index=[2, 1554578], y=[76875], text_nodes=[76875], text_node_labels=[76875])

In [23]:
def split_indices(n, train_ratio=0.6, validation_ratio=0.2, test_ratio=0.2, seed=None):
    """Randomly partition the indices ``0..n-1`` into train/validation/test sets.

    Args:
        n: Number of items to split.
        train_ratio: Fraction of indices assigned to the training split.
        validation_ratio: Fraction of indices assigned to the validation split.
        test_ratio: Fraction assigned to the test split. It is only used to
            validate that the ratios sum to 1; the test split receives every
            index left over after the train and validation splits are taken.
        seed: Optional seed for a reproducible shuffle. When ``None`` (default)
            NumPy's global random state is used, as before.

    Returns:
        A tuple ``(train_indices, validation_indices, test_indices)`` of NumPy
        integer arrays in shuffled order; together they cover ``0..n-1`` exactly
        once.

    Raises:
        AssertionError: If the three ratios do not sum to 1.
    """
    # Compare with a tolerance: an exact `== 1` rejects valid inputs such as
    # 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999 in floating point.
    ratio_sum = train_ratio + validation_ratio + test_ratio
    assert np.isclose(ratio_sum, 1.0), f"split ratios must sum to 1, got {ratio_sum}"

    # Shuffle the indices 0..n-1 in place.
    indices = np.arange(n)
    if seed is None:
        np.random.shuffle(indices)
    else:
        np.random.default_rng(seed).shuffle(indices)

    # Split sizes are truncated; any rounding remainder ends up in the test split.
    train_end = int(n * train_ratio)
    validation_end = train_end + int(n * validation_ratio)

    train_indices = indices[:train_end]
    validation_indices = indices[train_end:validation_end]
    test_indices = indices[validation_end:]

    return train_indices, validation_indices, test_indices

# Create the train/validation/test node-index splits for this dataset.
# split_indices shuffles with NumPy's global random state, so seed it first:
# otherwise every run of the notebook writes a different set of split files.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

n = len(df)
train_indices, validation_indices, test_indices = split_indices(n)

# Store each split as an ascending list of indices.
train_indices, validation_indices, test_indices = sorted(train_indices), sorted(validation_indices), sorted(test_indices)

In [24]:
# Output location and file stems for the pickled graph and its index splits.
DATA_PATH = "/home/ubuntu/proj/data/graph/node_children"
DATA_NAME = "text_graph_children"  # other stems previously used here: "text_graph_pubmed", "text_graph_aids"
TRAIN_SPLIT_NAME = 'train_index'
VALID_SPLIT_NAME = 'valid_index'
TEST_SPLIT_NAME = 'test_index'

# Make sure the target directory exists so the writes below cannot fail
# with FileNotFoundError on a fresh machine.
os.makedirs(DATA_PATH, exist_ok=True)

# Each object is written to <DATA_PATH>/<stem>.pkl.
artifacts = [
    (DATA_NAME, graph),
    (TRAIN_SPLIT_NAME, train_indices),
    (VALID_SPLIT_NAME, validation_indices),
    (TEST_SPLIT_NAME, test_indices),
]
for stem, obj in artifacts:
    with open(os.path.join(DATA_PATH, f"{stem}.pkl"), 'wb') as f:
        pkl.dump(obj, f)