## Iniciar Spark Session

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .getOrCreate()

sc = spark.sparkContext
sc

24/06/16 13:59:40 WARN Utils: Your hostname, tcouso-HP-Pavilion-Laptop-15-cw1xxx resolves to a loopback address: 127.0.1.1; using 192.168.1.94 instead (on interface wlo1)
24/06/16 13:59:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/16 13:59:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Instanciar BBD Neo4j

In [2]:
import neo4j
from neo4j import GraphDatabase

In [10]:
URI = "bolt://localhost:7687"
AUTH = ("neo4j", "123456789")

driver = GraphDatabase.driver(URI, auth=AUTH)
with driver.session() as session:
    try:
        session.run("RETURN 1")
        print("Connection to Neo4j established successfully!")
    except Exception as e:
        print(f"Failed to connect to Neo4j: {e}")

Connection to Neo4j established successfully!


In [12]:
def create_graph(tx, graph_data):
    # Create nodes
    nodes = set()
    for start_node, relationship_type, end_node in graph_data:
        nodes.add(start_node)
        nodes.add(end_node)

    for node in nodes:
        tx.run("MERGE (n:Node {id: $id})", id=node)

    # Create relationships
    for start_node, relationship_type, end_node in graph_data:
        tx.run(
            """
            MATCH (a:Node {id: $start_id})
            MATCH (b:Node {id: $end_id})
            MERGE (a)-[r:RELATIONSHIP {type: $type}]->(b)
            """,
            start_id=start_node, end_id=end_node, type=relationship_type
        )

In [20]:
# Example graph data 

graph_data = [
    (1, 11, 2), (1, 11, 3), (2, 11, 3), (3, 11, 2),
    (3, 11, 4), (4, 11, 1), (4, 11, 2), (4, 11, 3),
    (4, 12, 5), (5, 12, 1), (5, 12, 2), (5, 12, 6)
]

In [13]:

with driver.session() as session:
    try:
        session.execute_write(create_graph, graph_data)
        print("Graph instantiated successfully!")
    except Exception as e:
        print(f"Failed to instantiate the graph: {e}")

Graph instantiated successfully!


## Problema 1

Implementa una función que reciba un grafo en Neo4j y genere una RDD con las aristas de ese grafo.


In [40]:
from collections import namedtuple

Edge = namedtuple("Edge", ["n1", "R", "n2" ])

In [125]:
# Get edges from example graph database

# query = """
#     MATCH (a:Node)-[r:RELATIONSHIP]->(b:Node)
#     RETURN a.id AS start_node, r.type AS relationship_type, b.id AS end_node
#     """

query = """
    MATCH (a:Paper)-[r:CITES]->(b:Paper)
    RETURN a.id AS start_node, b.id AS end_node
    """

with driver.session() as session:
    result = session.run(query)
    edges = [Edge(record["start_node"], "cites", record["end_node"]) for record in result]
    # edges = [Edge(record["start_node"], record["relationship_type"], record["end_node"]) for record in result]

# Create rdd from edges

graph_rdd = sc.parallelize(edges)

graph_rdd.collect()

[Edge(n1=1129442, R='cites', n2=31336),
 Edge(n1=31349, R='cites', n2=31336),
 Edge(n1=686532, R='cites', n2=31336),
 Edge(n1=755217, R='cites', n2=13195),
 Edge(n1=1120731, R='cites', n2=13195),
 Edge(n1=1107312, R='cites', n2=13195),
 Edge(n1=1105116, R='cites', n2=37879),
 Edge(n1=686532, R='cites', n2=31349),
 Edge(n1=137849, R='cites', n2=109323),
 Edge(n1=154134, R='cites', n2=217139),
 Edge(n1=10531, R='cites', n2=31353),
 Edge(n1=1129608, R='cites', n2=31353),
 Edge(n1=1152272, R='cites', n2=31353),
 Edge(n1=31927, R='cites', n2=31353),
 Edge(n1=1124844, R='cites', n2=31353),
 Edge(n1=686532, R='cites', n2=31353),
 Edge(n1=194617, R='cites', n2=31353),
 Edge(n1=1135746, R='cites', n2=31353),
 Edge(n1=1063773, R='cites', n2=31353),
 Edge(n1=1152904, R='cites', n2=31353),
 Edge(n1=43698, R='cites', n2=31353),
 Edge(n1=1107567, R='cites', n2=31353),
 Edge(n1=31336, R='cites', n2=31353),
 Edge(n1=1123576, R='cites', n2=31353),
 Edge(n1=31349, R='cites', n2=31353),
 Edge(n1=1129442,

## Problema 2

* Implementa un programa en PySpark que entregue todos los triángulos (como tuplas de tres nodos) en el grafo usando b3 reducers, donde b es un parámetro. Para esta primera parte puedes asumir que tu grafo solo usa una etiqueta de arista (en el grafo de prueba, esa etiqueta corresponde al numero 11).

In [126]:
# Map procedure

def hash_node(node_id: int, b: int) -> int:

  return hash(node_id) % b


def generate_keys(edge, b):

  hashed_n1 = hash_node(edge.n1, b)
  hashed_n2 = hash_node(edge.n2, b)
  
  return (((hashed_n1, hashed_n2, i), edge) for i in range(b))

In [127]:
# Reduce procedure

# Carrousel shift of keys
# 1, 2, 3 <-> 2, 3, 1
 
def generate_shifts(key):

  b1, b2, b3 = key

  return [
    (b1, b2, b3),
    (b3, b1, b2),
    (b2, b3, b1) 
  ]

def map_to_shifts(key_edge_tuple):
    
  key, edge_value = key_edge_tuple
  
  return [((shifted_key), edge_value) for shifted_key in generate_shifts(key)]


def extract_nodes(edges):

  nodes = set()

  for edge in edges:
      nodes.add(edge.n1)
      nodes.add(edge.n2)

  return tuple(nodes)


In [128]:
b = 50

mapped_graph_rdd = graph_rdd.flatMap(lambda edge: generate_keys(edge, b))
grouped_rdd = mapped_graph_rdd.flatMap(map_to_shifts).groupByKey()
aggregated_rdd = grouped_rdd.mapValues(list)
filtered_rdd = aggregated_rdd.filter(lambda x: len(x[1]) == 3)
node_sets_rdd = filtered_rdd.mapValues(extract_nodes)
queried_triangles = node_sets_rdd.values().distinct()

In [135]:
mapped_graph_rdd.collect()

                                                                                

[((42, 36, 0), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 1), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 2), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 3), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 4), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 5), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 6), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 7), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 8), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 9), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 10), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 11), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 12), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 13), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 14), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 15), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 16), Edge(n1=1129442, R='cites', n2=31336)),
 ((42, 36, 17), Edge(n1=1129442, R='cites', n2=31336)),
 (

## Problema 3

* Asume ahora que recibes un subgrafo como tres arreglos: un arreglo A con las variables, otro L con los tipos de aristas, y una matriz M de tamaño |A| × |L| × |A| que tiene un uno en la posicion (x, R, y) si y solo si (x, R, y) es una arista de tu subgrafo. 
* Implementa un programa en PySpark que reciba un patrón que tiene solo variables, y exactamente cuatro variables, y entregue todos los matches de ese patrón (como tuplas de 4 nodos) en el grafo usando b4 reducers, donde b es un parámetro.

In [None]:
driver.close()