![image](https://user-images.githubusercontent.com/1651790/221876073-61ef4edb-adcd-4f10-b3fc-8ddc24918ea1.png)

In [None]:
# install ngdi in the first run
!pip install ngdi

## Data Intelligence Suite Spark Engine Examples
### read data with spark engine, scan mode

In this example, we are leveraging the Spark Engine of NebulaGraph DI Suite, with the Storage Scan mode.

#### Step 1, get dataframe by scanning the Graph

We will first scan all edges of type `follow` into a dataframe: `df`

In [1]:
from ngdi import NebulaReader

# Spark-engine reader in scan mode: configure a scan over `follow` edges
# with the `degree` property, then read the result into `df`.
reader = NebulaReader(engine="spark")
reader.scan(props="degree", edge="follow")
df = reader.read()

# Preview the first two rows.
df.show(2)

[Stage 0:>                                                          (0 + 1) / 1]

+---------+---------+-----+------+
|   _srcId|   _dstId|_rank|degree|
+---------+---------+-----+------+
|player105|player100|    0|    70|
|player105|player104|    0|    83|
+---------+---------+-----+------+
only showing top 2 rows



                                                                                

#### Step 2, run PageRank Algorithm

In [2]:
# Run PageRank over the scanned edges with max_iter=10 and reset_prob=0.15.
pr_result = df.algo.pagerank(
    max_iter=10,
    reset_prob=0.15,
)

                                                                                

+---------+
|       id|
+---------+
|player108|
|player129|
|player120|
|player103|
|player128|
|player148|
|player117|
|player139|
|player140|
|player134|
|player149|
|player150|
|player125|
|player137|
|player143|
|player101|
|player141|
|player144|
|player102|
|player121|
+---------+
only showing top 20 rows



                                                                                

+-----+------+-----------+----------+
|_rank|degree|     _srcId|    _dstId|
+-----+------+-----------+----------+
|    0|    90|34359738371|         3|
|    0|    90|25769803786|         6|
|    0|    90|34359738369|         6|
|    0|    80| 8589934596|         2|
|    0|    99|25769803784|         2|
|    0|    90|25769803777|         2|
|    0|    90|          1|         4|
|    0|    90|17179869189|         4|
|    0|    90|          4|         1|
|    0|    10| 8589934598|         1|
|    0|    90|17179869189|         1|
|    0|    80| 8589934598|         5|
|    0|    85|25769803786|         5|
|    0|    70|34359738373|         5|
|    0|    95|17179869185|8589934597|
|    0|    95|25769803778|8589934597|
|    0|    99|25769803784|8589934597|
|    0|    90|34359738368|8589934597|
|    0|    85|          5|8589934598|
|    0|    90| 8589934596|8589934598|
+-----+------+-----------+----------+
only showing top 20 rows



                                                                                

#### Step 3, check results of the algorithm


In [3]:
# Show the first 5 rows of the PageRank result.
pr_result.show(5)

+---------+-------------------+
|      _id|           pagerank|
+---------+-------------------+
|player133|0.18601069183310506|
|player126|0.18601069183310506|
|player130| 1.2400712788873671|
|player108|0.18601069183310506|
|player102| 1.6602373739502538|
+---------+-------------------+
only showing top 5 rows



------------------
### read data with spark engine, query mode

In this example, we are leveraging the Spark Engine of NebulaGraph DI Suite, with the Graph Query mode.

#### Step 1, get dataframe by querying the Graph with a Cypher query

We will query up to 100000 edges of type `follow` into a dataframe: `df`

In [4]:
from ngdi import NebulaReader

# Spark-engine reader in query mode: the edges come from a graph query
# rather than a storage scan.
reader = NebulaReader(engine="spark")
query = """
    MATCH ()-[e:follow]->()
    RETURN e LIMIT 100000
"""
reader.query(edge="follow", props="degree", query=query)

# Reading can be slow, so expect a wait here.
df = reader.read()
df.show(2)

+---------+---------+-----+------+
|   _srcId|   _dstId|_rank|degree|
+---------+---------+-----+------+
|player102|player100|    0|    75|
|player102|player101|    0|    75|
+---------+---------+-----+------+
only showing top 2 rows



#### Step 2, run Connected Components Algorithm

In [5]:
# Run connected components on the queried edges, with max_iter set to 10.
cc_result = df.algo.connected_components(max_iter=10)

+---------+
|       id|
+---------+
|player129|
|player120|
|player148|
|player103|
|player128|
|player108|
|player117|
|player150|
|player125|
|player137|
|player139|
|player140|
|player134|
|player149|
|player102|
|player135|
|player147|
|player121|
|player143|
|player101|
+---------+
only showing top 20 rows

+-----+------+-----------+----------+
|_rank|degree|     _srcId|    _dstId|
+-----+------+-----------+----------+
|    0|    90|34359738368|         3|
|    0|    90|25769803781|         6|
|    0|    90|34359738371|         6|
|    0|    80| 8589934592|         1|
|    0|    99|25769803779|         1|
|    0|    90|25769803784|         1|
|    0|    90|          0|         4|
|    0|    90|17179869187|         4|
|    0|    90|          4|         0|
|    0|    10| 8589934594|         0|
|    0|    90|17179869187|         0|
|    0|    80| 8589934594|         2|
|    0|    85|25769803781|         2|
|    0|    70|34359738370|         2|
|    0|    95|17179869189|8589934593|
| 

#### Step 3, check results of the algorithm


In [6]:
# Show the first 5 rows of the connected-components result.
cc_result.show(5)

+---------+---------+
|      _id|       cc|
+---------+---------+
|player115|player129|
|player113|player129|
|player100|player129|
|player129|player129|
|player137|player129|
+---------+---------+
only showing top 5 rows



## Other algorithm examples

In [None]:
# The calls below are intentionally left commented out; uncomment a line to try
# that algorithm on `df`. NOTE(review): names and default arguments are not
# exercised in this notebook -- confirm against the installed ngdi version.
# lpa_result  = df.algo.label_propagation()
# louvain_result = df.algo.louvain()
# k_core_result = df.algo.k_core()
# degree_statics_result = df.algo.degree_statics()
# betweenness_centrality_result = df.algo.betweenness_centrality()
# coefficient_centrality_result = df.algo.coefficient_centrality()
# bfs_result = df.algo.bfs()
# hanp_result = df.algo.hanp()
# jaccard_result = df.algo.jaccard()
# strong_connected_components_result = df.algo.strong_connected_components()
# triangle_count_result = df.algo.triangle_count()