In [8]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import random
import time
import os

# Wall-clock timestamp; the end of this cell subtracts it to report the total run time.
start_time = time.time()

# Notebook-level graph object. generate_heterogeneous_graph() adds nodes and edges to it
# in place, and the export calls further down are given this same object.
G = nx.MultiDiGraph()  # MultiDiGraph allows multiple directed edges between the same pair of nodes

def generate_heterogeneous_graph(node_types, edge_types, num_nodes, num_edges, node_attributes=None, edge_attributes=None, graph=None):
    """Populate a graph with randomly generated typed nodes and edges.

    Args:
        node_types: Sequence of node type names (surrounding whitespace is stripped).
        edge_types: Sequence of strings of the form ``"Source Relation Target"``;
            single quotes are removed before the string is split on whitespace.
        num_nodes: Total number of nodes; each one is assigned to a uniformly
            random node type.
        num_edges: Total number of edge draws; each one is assigned to a uniformly
            random edge type.
        node_attributes: Optional mapping ``node type -> {attribute name: spec}``,
            where each spec is handed to ``assign_attribute_values``.
        edge_attributes: Optional mapping ``relation -> {attribute name: spec}``.
        graph: Optional object providing ``add_node`` / ``add_edge``. When omitted,
            the notebook-level graph ``G`` is populated, as before.

    Returns:
        The graph that was populated.

    Raises:
        ValueError: If nodes (or edges) are requested but no node (or edge) types
            are given.

    Notes:
        * Node ids are ``<first letter of type><index>``, so two node types that
          start with the same letter produce clashing ids.
        * Repeated ``(source, target, relation)`` draws are dropped, so the graph
          can end up with fewer than ``num_edges`` edges.
        * Self-loops are allowed: source and target may be the same node.
    """
    # None defaults avoid sharing one mutable dict object between calls.
    node_attributes = {} if node_attributes is None else node_attributes
    edge_attributes = {} if edge_attributes is None else edge_attributes
    # Only touch the global when the caller did not supply a graph.
    target_graph = G if graph is None else graph

    if num_nodes > 0 and not node_types:
        raise ValueError("num_nodes > 0 but no node types were given")
    if num_edges > 0 and not edge_types:
        raise ValueError("num_edges > 0 but no edge types were given")

    # Decide how many nodes each type receives.
    node_count = {node_type.strip(): 0 for node_type in node_types}
    node_distribution = [0] * len(node_types)
    for _ in range(num_nodes):
        node_distribution[random.randint(0, len(node_types) - 1)] += 1

    for node_type, type_node_total in zip(node_types, node_distribution):
        node_type = node_type.strip()
        attributes = node_attributes.get(node_type, {})
        # One pre-generated value list per attribute, indexed by node position.
        attr_values = {
            attr_name: assign_attribute_values(attr_info, type_node_total)
            for attr_name, attr_info in attributes.items()
        }

        for i in range(type_node_total):
            node_id = f'{node_type[0]}{i}'
            node_attr = {attr_name: attr_values[attr_name][i] for attr_name in attributes}
            target_graph.add_node(node_id, type=node_type, **node_attr)
            node_count[node_type] += 1

    # Decide how many edge draws each edge type receives.
    edge_set = set()
    edge_distribution = [0] * len(edge_types)
    for _ in range(num_edges):
        edge_distribution[random.randint(0, len(edge_types) - 1)] += 1

    for edge_type, type_edge_total in zip(edge_types, edge_distribution):
        # split() already discards surrounding whitespace from each token.
        source_type, relation, target_type = edge_type.replace("'", "").split()

        source_total = node_count.get(source_type, 0)
        target_total = node_count.get(target_type, 0)
        if source_total == 0 or target_total == 0:
            # An endpoint type may be unknown or may have received zero nodes from
            # the random split; there is nothing to connect, so skip instead of
            # failing inside random.randint(0, -1).
            if type_edge_total:
                print(f"Skipping {type_edge_total} '{relation}' edge(s): no nodes of type '{source_type}' or '{target_type}'")
            continue

        edge_attr_info = edge_attributes.get(relation, {})
        edge_attr_values = {
            attr_name: assign_attribute_values(attr_info, type_edge_total)
            for attr_name, attr_info in edge_attr_info.items()
        }

        for i in range(type_edge_total):
            source_node = f'{source_type[0]}{random.randint(0, source_total - 1)}'
            target_node = f'{target_type[0]}{random.randint(0, target_total - 1)}'

            edge_tuple = (source_node, target_node, relation)
            if edge_tuple in edge_set:
                continue  # keep at most one edge per (source, target, relation)

            edge_attr = {attr_name: edge_attr_values[attr_name][i] for attr_name in edge_attr_info}
            target_graph.add_edge(source_node, target_node, type=relation, **edge_attr)
            edge_set.add(edge_tuple)

    return target_graph

def assign_attribute_values(attr_info, num_values):
    """Generate ``num_values`` random values for one attribute specification.

    Args:
        attr_info: Dict with a ``'method'`` key plus the keys that method needs:
            ``'random_range'`` reads ``'range'`` (two bounds, truncated to int,
            both inclusive); ``'predefined_set'`` reads ``'values'``;
            ``'gaussian_distribution'`` reads ``'gaussian_mean'`` and
            ``'gaussian_std'``.
        num_values: Number of values to generate.

    Returns:
        A list of generated values. Every branch returns a plain list of plain
        Python objects, so the values format the same way regardless of method.
        An unrecognised method yields an empty list.
    """
    method = attr_info['method']
    if method == 'random_range':
        bounds = attr_info['range']
        return [random.randint(int(bounds[0]), int(bounds[1])) for _ in range(num_values)]
    if method == 'predefined_set':
        # Sampling is with replacement.
        return random.choices(attr_info['values'], k=num_values)
    if method == 'gaussian_distribution':
        samples = np.random.normal(attr_info['gaussian_mean'], attr_info['gaussian_std'], num_values)
        # tolist() converts numpy scalars to built-in floats, whose repr() is a
        # bare number (numpy >= 2 scalars repr as "np.float64(...)").
        return samples.tolist()
    return []

def _header_value(line):
    """Return the text after the first colon of a header line, without surrounding whitespace."""
    return line.partition(':')[2].strip()


def _split_attribute_line(line):
    """Split ``name: method, v1, v2, ...`` into ``(name, method, values)``; ``None`` without a ``': '``."""
    parts = line.split(': ')
    if len(parts) < 2:
        return None
    method, *values = parts[1].split(', ')
    return parts[0], method, values


def _to_floats(values):
    """Convert every item to float; return ``None`` if any item is not numeric."""
    try:
        return [float(value) for value in values]
    except ValueError:
        return None


def _parse_node_attribute(line, num_nodes):
    """Parse one node-attribute line into ``(name, spec)``; print a notice and return ``None`` if unusable."""
    fields = _split_attribute_line(line)
    if fields is None:
        print(f"Skipping invalid line: {line}")
        return None
    attr_name, method, values = fields

    if method == 'predefined_set':
        return attr_name, {'method': method, 'values': values, 'num_values': num_nodes}

    if method in ('random_range', 'gaussian_distribution'):
        # Both methods take exactly two numeric parameters.
        numbers = _to_floats(values) if len(values) == 2 else None
        if numbers is None:
            print(f"Skipping invalid {method} attribute: {line}")
            return None
        if method == 'random_range':
            return attr_name, {'method': method, 'range': (numbers[0], numbers[1]), 'num_values': num_nodes}
        return attr_name, {'method': method, 'gaussian_mean': numbers[0], 'gaussian_std': numbers[1], 'num_values': num_nodes}

    print(f"Skipping attribute with unknown method '{method}': {line}")
    return None


def _parse_edge_attribute(line):
    """Parse one edge-attribute line into ``(name, spec)``; print a notice and return ``None`` if unusable."""
    fields = _split_attribute_line(line)
    if fields is None:
        print(f"Skipping invalid line: {line}")
        return None
    attr_name, method, values = fields

    # Edge specs always carry every key; unused ones stay at these defaults.
    spec = {'method': method, 'values': values, 'range': [], 'gaussian_mean': None, 'gaussian_std': None}
    if method == 'random_range':
        numbers = _to_floats(values) if len(values) >= 2 else None
        if numbers is None:
            print(f"Skipping invalid {method} attribute: {line}")
            return None
        spec['range'] = numbers
    elif method == 'gaussian_distribution':
        numbers = _to_floats(values[:2]) if len(values) >= 2 else None
        if numbers is None:
            print(f"Skipping invalid {method} attribute: {line}")
            return None
        spec['gaussian_mean'], spec['gaussian_std'] = numbers
    return attr_name, spec


def read_input_from_file(filename):
    """Read a graph specification from a text file.

    Recognised lines (blank lines are ignored):

    * ``node_types: A,B`` / ``edge_types: A Rel B,...`` - comma-separated lists.
    * ``num_nodes: N`` / ``num_edges: N`` - integers.
    * ``node_attributes:`` / ``edge_attributes:`` - start an attribute section.
    * ``Name:`` inside a section - selects the node type / edge type that the
      following attribute lines belong to.
    * ``attr: method, v1, v2, ...`` - one attribute; ``method`` is
      ``random_range``, ``predefined_set`` or ``gaussian_distribution``.

    The space after the colon of a header line is optional. Malformed attribute
    lines are reported with ``print`` and skipped, for node and edge attributes
    alike.

    Args:
        filename: Path of the specification file.

    Returns:
        Tuple ``(node_types, edge_types, num_nodes, num_edges, node_attributes,
        edge_attributes)``. Type names are returned as split, without stripping.
        Node specs carry ``'num_values'`` set to the ``num_nodes`` value seen so far.

    Raises:
        ValueError: If ``num_nodes`` / ``num_edges`` is not an integer.
    """
    with open(filename, 'r') as file:
        lines = file.readlines()

    node_types = []
    edge_types = []
    num_nodes = 0
    num_edges = 0
    node_attributes = {}
    edge_attributes = {}

    section = None
    current_node_type = None
    current_edge_type = None

    for line in lines:
        line = line.strip()

        if not line:
            continue  # Skip empty lines

        if line.startswith('node_types:'):
            # Drop blank entries such as those produced by a trailing comma.
            node_types = [item for item in _header_value(line).split(',') if item.strip()]
        elif line.startswith('edge_types:'):
            edge_types = [item for item in _header_value(line).split(',') if item.strip()]
        elif line.startswith('num_nodes:'):
            num_nodes = int(_header_value(line))
        elif line.startswith('num_edges:'):
            num_edges = int(_header_value(line))
        elif line == 'node_attributes:':
            section = 'node_attributes'
        elif line == 'edge_attributes:':
            section = 'edge_attributes'
        elif section == 'node_attributes' and line.endswith(':'):
            current_node_type = line[:-1].strip()
            node_attributes[current_node_type] = {}
        elif section == 'edge_attributes' and line.endswith(':'):
            current_edge_type = line[:-1].strip()
            edge_attributes[current_edge_type] = {}
        elif section == 'node_attributes' and current_node_type:
            parsed = _parse_node_attribute(line, num_nodes)
            if parsed is not None:
                node_attributes[current_node_type][parsed[0]] = parsed[1]
        elif section == 'edge_attributes' and current_edge_type:
            parsed = _parse_edge_attribute(line)
            if parsed is not None:
                edge_attributes[current_edge_type][parsed[0]] = parsed[1]

    return node_types, edge_types, num_nodes, num_edges, node_attributes, edge_attributes

def generate_txt_files(graph, output_folder):
    """Write the graph to plain-text files, one file per node type and per edge type.

    Node lines go to ``Node_<type>.txt`` as ``<id> {k:v, ... }``; edge lines go to
    ``Edge_<type>.txt`` (spaces in the type replaced by ``_``) as
    ``<source> : <target> { k:v, ... }``. The ``type`` attribute selects the file
    and is not written as an attribute.

    Args:
        graph: Object whose ``nodes(data=True)`` yields ``(id, attrs)`` and whose
            ``edges(data=True)`` yields ``(source, target, attrs)``; every attrs
            dict must contain ``'type'``.
        output_folder: Directory to write into; created if it does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Handles are keyed by file name, so the name used to open a file is the same
    # one used to look it up, including for edge types that contain spaces.
    open_files = {}

    def _writer(file_name):
        if file_name not in open_files:
            open_files[file_name] = open(os.path.join(output_folder, file_name), 'w')
        return open_files[file_name]

    try:
        # Write nodes to respective files
        for node, attr in graph.nodes(data=True):
            attr_text = ', '.join(f'{k}:{v}' for k, v in attr.items() if k != 'type')
            _writer(f"Node_{attr['type']}.txt").write(f"{node} {{{attr_text} }}\n")

        # Write edges to respective files
        for source, target, attr in graph.edges(data=True):
            attr_text = ', '.join(f'{k}:{v}' for k, v in attr.items() if k != 'type')
            file_name = f"Edge_{attr['type'].replace(' ', '_')}.txt"
            _writer(file_name).write(f"{source} : {target} {{ {attr_text} }}\n")
    finally:
        # Close every handle even if a write above raised.
        for handle in open_files.values():
            handle.close()

###############USAGE############################################
# Read input from file
filename = 'InputFile.txt'
node_types, edge_types, num_nodes, num_edges, node_attributes, edge_attributes = read_input_from_file(filename)

# Generate the heterogeneous graph based on input from file
graph = generate_heterogeneous_graph(node_types, edge_types, num_nodes, num_edges, node_attributes, edge_attributes)

# Export the graph returned by the generator rather than reaching for the global `G`
# (the generator returns that same object, so the written output is unchanged).
output_folder = 'Output'
generate_txt_files(graph, output_folder)
end_time = time.time()
duration = end_time - start_time
print(f"Total time: {duration:.4f} seconds")


Total time: 0.0479 seconds


In [9]:
import os
import networkx as nx

def _cypher_literal(value):
    """Render a Python attribute value as a Cypher literal."""
    # Objects exposing .item() (e.g. numpy scalars) are unwrapped first so repr()
    # yields a bare number instead of something like "np.float64(1.5)".
    item = getattr(value, 'item', None)
    if callable(item):
        try:
            value = item()
        except (TypeError, ValueError):
            pass
    if value is None:
        return 'null'
    if isinstance(value, bool):
        return 'true' if value else 'false'
    return repr(value)


def generate_cypher_files(graph, output_folder):
    """Write the graph as Cypher ``CREATE`` statements, one file per node/edge type.

    Node statements go to ``Node_<type>.txt``; edge statements go to
    ``Edge_<type>.txt`` (spaces in the type replaced by ``_``). Each node gets an
    ``id`` property holding its graph id, followed by its other attributes; the
    ``type`` attribute becomes the label / relationship type and is not written
    as a property. Edge statements locate their endpoints by that ``id`` property.

    Args:
        graph: Object whose ``nodes(data=True)`` yields ``(id, attrs)`` and whose
            ``edges(data=True)`` yields ``(source, target, attrs)``; every attrs
            dict must contain ``'type'``.
        output_folder: Directory to write into; created if it does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Handles are keyed by file name, so the name used to open a file is the same
    # one used to look it up, including for edge types that contain spaces.
    open_files = {}

    def _writer(file_name):
        if file_name not in open_files:
            open_files[file_name] = open(os.path.join(output_folder, file_name), 'w')
        return open_files[file_name]

    try:
        # Write nodes to respective files
        for node, attr in graph.nodes(data=True):
            node_type = attr['type']
            # Build the property list first so a node without extra attributes
            # does not end up with a dangling comma after the id.
            properties = [f"id: '{node}'"]
            properties.extend(f"{k}: {_cypher_literal(v)}" for k, v in attr.items() if k != 'type')
            property_text = ', '.join(properties)
            _writer(f"Node_{node_type}.txt").write(f"CREATE (n:{node_type} {{{property_text}}}) RETURN n;\n")

        # Write edges to respective files
        for source, target, attr in graph.edges(data=True):
            edge_type = attr['type']
            attr_string = ', '.join(f"{k}: {_cypher_literal(v)}" for k, v in attr.items() if k != 'type')
            cypher_statement = f"MATCH (a),(b) WHERE a.id = '{source}' AND b.id = '{target}' CREATE (a)-[r:{edge_type} {{ {attr_string} }}]->(b) RETURN r;\n"
            _writer(f"Edge_{edge_type.replace(' ', '_')}.txt").write(cypher_statement)
    finally:
        # Close every handle even if a write above raised.
        for handle in open_files.values():
            handle.close()

# Example usage
# Relies on `G`, the networkx graph populated by the previous cell, still being
# present in the kernel. Note that this rebinds `output_folder`, which the
# previous cell set to 'Output'.
output_folder = 'CypherFiles'
generate_cypher_files(G, output_folder)