In [1]:
import os
from getpass import getpass

import pandas as pd
from graphdatascience import GraphDataScience

# Never truncate long cell values (URLs, keyword lists) when rendering frames.
pd.set_option("display.max_colwidth", None)

# Connection settings come from the environment so that no credential is
# committed with the notebook. Host and user fall back to the local defaults
# this notebook was written against; the password has no in-source default:
# set NEO4J_PASSWORD (required for headless runs) or type it when prompted.
host = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

# Single client object used by every later cell.
gds = GraphDataScience(host, auth=(user, password))

In [2]:
# Overview of the database: node counts per label and relationship counts
# per type, as reported by apoc.meta.stats().
meta_stats_query = """
CALL apoc.meta.stats()
YIELD labels, relTypesCount
"""
gds.run_cypher(meta_stats_query)

Unnamed: 0,labels,relTypesCount
0,"{'Keyword': 4198, 'Page': 15370}","{'LINKS_TO': 62365, 'HAS_KEYWORD': 26851, 'REDIRECTS': 723}"


In [3]:
# Count Page nodes grouped by the value of their `has_text` property
# (pages without the property are grouped under null).
has_text_query = """
MATCH (p:Page)
RETURN p.has_text AS has_text,
       count(*) AS count
"""
gds.run_cypher(has_text_query)

Unnamed: 0,has_text,count
0,True,9688
1,False,2972
2,,2710


In [4]:
# Among pages with no `has_text` property, list the five with the most
# incoming LINKS_TO / REDIRECTS relationships.
untexted_pages_query = """
MATCH (p:Page)
WHERE p.has_text IS NULL
RETURN p.url AS page,
       count{(p)<-[:LINKS_TO|REDIRECTS]-()} AS links
ORDER BY links DESC
LIMIT 5
"""
gds.run_cypher(untexted_pages_query)

Unnamed: 0,page,links
0,http://localhost:7474,38
1,https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/tag/4.3.0.12,38
2,https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/tag/4.4.0.12,37
3,https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/quickstarts/text-analytics-sdk,37
4,https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html,37


In [5]:
# Count LINKS_TO / REDIRECTS relationships between pages whose target page
# is flagged with is_404 = true.
broken_links_query = """
MATCH (:Page)-[:LINKS_TO|REDIRECTS]->(:Page{is_404:true})
RETURN count(*) AS brokenLinkCount
"""
gds.run_cypher(broken_links_query)

Unnamed: 0,brokenLinkCount
0,241


In [6]:
# Shortest directed path (at most 10 LINKS_TO / REDIRECTS hops) from the
# docs landing page to the Aura console, returned as the list of page URLs
# along the path.
docs_to_console_query = """
MATCH (start:Page {url:"https://neo4j.com/docs"}), 
      (end:Page {url:"https://console.neo4j.io"})
MATCH p=shortestPath((start)-[:LINKS_TO|REDIRECTS*..10]->(end))
RETURN [n in nodes(p) | n.url] AS path
"""
gds.run_cypher(docs_to_console_query)

Unnamed: 0,path
0,"[https://neo4j.com/docs, https://neo4j.com/docs/aura/auradb, https://neo4j.com/docs/aura/auradb/getting-started/create-database, https://console.neo4j.io]"


In [7]:
# Project the page link structure under the graph name "structure".
# `G` is the graph handle used by the algorithm cells; `metadata` holds the
# result returned by the projection call.
G, metadata = gds.graph.project(
    "structure",  # name of the projected graph
    "Page",  # node label to include
    ["LINKS_TO", "REDIRECTS"],  # relationship types to include
)

In [8]:
# Degree centrality streamed with orientation="REVERSE", one row per node
# (nodeId, score).
df = gds.degree.stream(G, orientation="REVERSE")
# Look the node ids up in the database to attach each page's URL.
df["url"] = [node["url"] for node in gds.util.asNodes(df["nodeId"].to_list())]
# Highest score first; rebind instead of sorting in place.
df = df.sort_values("score", ascending=False)
df.head(5)

Unnamed: 0,nodeId,score,url
4261,6174,598.0,https://neo4j.com/developer/kb
8571,11257,391.0,https://neo4j.com/developer-blog/tagged/neo4j
8924,11610,235.0,https://neo4j.com/developer-blog/tagged/graph-database
2120,3040,165.0,https://neo4j.com/graphgists/categories/web-amp-social
2114,3034,165.0,https://neo4j.com/graphgists/categories/finance


In [9]:
# PageRank per node; the generic "score" column is renamed so it does not
# clash with the degree score already present in `df`.
pr_df = gds.pageRank.stream(G).rename(columns={"score": "pagerank"})
# Join degree and PageRank on the node id and rank by PageRank, highest first.
combined_df = df.merge(pr_df, on="nodeId").sort_values("pagerank", ascending=False)

In [10]:
# Show the five top-ranked rows of the combined degree / PageRank frame.
combined_df.head(n=5)

Unnamed: 0,nodeId,score,url,pagerank
0,6174,598.0,https://neo4j.com/developer/kb,38.593852
43,13541,55.0,https://neo4j.com/graphconnect-2018,21.989673
232,14770,21.0,https://neo4j.com/labs/apoc/4.4/graph-querying/node-querying,12.607568
242,18265,21.0,https://neo4j.com/labs/apoc/4.3/graph-querying/node-querying,12.420046
25,856,70.0,https://neo4j.com/docs/operations-manual/5/reference/configuration-settings,12.392406


In [11]:
# Drop the graph projection referenced by `G` now that the link-structure
# analysis is done; the value returned by the call is shown as the cell output.
G.drop()

graphName                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 structure
database                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [12]:
# Five keywords with the most incoming HAS_KEYWORD relationships.
top_keywords_query = """
MATCH (k:Keyword)
RETURN k.name AS keyword,
       count {(k)<-[:HAS_KEYWORD]-()} AS mentions
ORDER BY mentions DESC
LIMIT 5
"""
gds.run_cypher(top_keywords_query)

Unnamed: 0,keyword,mentions
0,node,1194
1,neo4j,983
2,clipboard,868
3,graph,596
4,java,537


In [13]:
# Five most frequent keywords among pages whose URL contains
# "graph-data-science".
gds_keywords_query = """
MATCH (p:Page)-[:HAS_KEYWORD]->(k:Keyword)
WHERE p.url CONTAINS "graph-data-science"
RETURN k.name AS keyword,
       count(*) AS mentions
ORDER BY mentions DESC
LIMIT 5
"""
gds.run_cypher(gds_keywords_query)

Unnamed: 0,keyword,mentions
0,graph,259
1,algorithm,188
2,node,127
3,neo4j graph,88
4,neo4j,76


In [14]:
# Project pages and keywords under the graph name "keywords".
# HAS_KEYWORD is projected with REVERSE orientation, i.e. flipped relative to
# how it is stored in the database.
G, metadata = gds.graph.project(
    "keywords",
    ["Page", "Keyword"],
    {
        "HAS_KEYWORD": {
            "orientation": "REVERSE",
        },
    },
)

In [15]:
# Run node similarity on the projection and write the result back into it as
# CO_OCCUR relationships carrying a "score" property; similarityCutoff is set
# to 0.4.
gds.nodeSimilarity.mutate(
    G,
    similarityCutoff=0.4,
    mutateRelationshipType="CO_OCCUR",
    mutateProperty="score",
)

preProcessingMillis                                                                                                                                                                                                                                                                                                                                                                                                      0
computeMillis                                                                                                                                                                                                                                                                                                                                                                                                          389
mutateMillis                                                                                                                                                                      

In [16]:
# Louvain community detection restricted to Keyword nodes and the CO_OCCUR
# relationships; one row per keyword with its community id.
topic_df = gds.louvain.stream(
    G,
    nodeLabels=["Keyword"],
    relationshipTypes=["CO_OCCUR"],
)
# Attach the keyword text by looking the node ids up in the database.
topic_df["keyword"] = [
    node["name"] for node in gds.util.asNodes(topic_df["nodeId"].to_list())
]
# Per community: number of keywords and the keywords themselves, with the
# five largest communities shown first.
(
    topic_df.groupby("communityId")
    .agg({"keyword": ["size", list]})
    .reset_index()
    .sort_values([("keyword", "size")], ascending=False)
    .head()
)

Unnamed: 0_level_0,communityId,keyword,keyword
Unnamed: 0_level_1,Unnamed: 1_level_1,size,list
634,1250,46,"[santa, chewbacca, galaxy, republic, jedi, christmas eve, new year ’ s day, gandhi jayanti dussehra, caribbean iii, ganesh chaturthi, thanksgiving, makar sankranti republic day, vijayadashami, diwali padwa veterans day, labor day, maha shivratri holi memorial day, christmas day, independence day, boxing day, new year ’ s eve, diwali, president ’ day, martin luther king day, good friday, caribbean, christmas, veterans day, thanksgiving day, maha shivratri, king iii, president ’, padwa, holi memorial day, king iii holiday, reindeer, christmas tree, santa claus, street map, rome, android, tatooine, wookiepedia, alderaan, jorge albarran, santavin, luke skywalker]"
1749,3181,35,"[mar cabra, mossack fonseca, tax haven, panama, panama papers, apache solr, tika, prizesion, offshore leaks, oxwall source, zurich switzerland, paul kuhne, zurich, swiss leaks, hsbc, hsbc leaks, excel source, president of azerbaijan, panamapaper, mahabharataa, president, data science analytics, quantum analytics, azerbaijan, firepower scandal, cablegate, pentagon papers, ilham aliyev, the case, duncan campbell, baku, heydar aliyev, azerbaijan airlines, dubai, azerfo]"
2212,4063,31,"[reddit, avengers : infinity war, evelina gabasova, seattle company, craig walls, pokemon, pokemonpher, tom hiddleston, tom hidton, avengers, the avengers, x - men, node classification learning, marvel universe, hulk, spider - man, thanosbra, s . h . i . e . l ., nick fury, black panther, loki, hawkeye, guardians of the galaxy, thanos, infinity stones, marvel cinematic universe, iron man, captain america, doctor strange, thor, natasha romanoff]"
1712,3074,31,"[netcon, salesforce, anders ekstrom, netconsult, sql database, search engine, musicology, chief technical officer, streaming services, music industry, the orchard, the orchard444j, music distribution, jeremy davies, webflow, stephen o ' grady, media, state, manhattan, marriott marquis times square, tim hanssen, pat patterson, cloudera, streamsetser, hilary mason, jake graham, lauren shin, artificial, change data capture, redmonk, redmon]"
1551,2749,27,"[maharashtra, airtel, tata, vodafone, mobile operator, gujarat, rajasthan, centurylink, business conglomerate, prodapt, at & t, jhaver group, deutsche telekom, verizon, liberty global, windstream, adtran, virgin media, ebay, forresterner, comcast, fortune 100, accorhotels, jpmorgan chase, atpco, forrester, adeo]"


In [17]:
# Drop the graph projection referenced by `G` now that the keyword analysis
# is done; the value returned by the call is shown as the cell output.
G.drop()

graphName                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 keywords
database                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     neo4j
me