In [None]:
!pip install graphdatascience

In [None]:
import os

import pandas as pd
from graphdatascience import GraphDataScience

# Connection settings for the Neo4j instance. Each value can be overridden
# through an environment variable so real credentials do not have to be
# written into the notebook; the fallbacks are the original local values.
host = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "pleaseletmein")

# Client object used by the following cells to send Cypher to the database.
gds = GraphDataScience(host, auth=(user, password))

In [None]:
# Constraints
# One uniqueness constraint per node label, keyed on the property that the
# import cells use in their MERGE / MATCH clauses. `IF NOT EXISTS` makes the
# cell safe to re-run.
constraint_statements = (
    "\nCREATE CONSTRAINT IF NOT EXISTS FOR (a:Article) REQUIRE a.url IS UNIQUE;\n",
    "\nCREATE CONSTRAINT IF NOT EXISTS FOR (a:Author) REQUIRE a.name IS UNIQUE;\n",
    "\nCREATE CONSTRAINT IF NOT EXISTS FOR (t:Tag) REQUIRE t.name IS UNIQUE;\n",
    "\nCREATE CONSTRAINT IF NOT EXISTS FOR (l:List) REQUIRE l.id IS UNIQUE;\n",
)
for statement in constraint_statements:
    gds.run_cypher(statement)


In [None]:
# Import articles
# For every CSV row: merge the Article by url, set its title and date (rows
# without a date fall back to 1990-01-01), merge the Author (or "Unknown" when
# the author column is null) with a WROTE relationship, then merge one Tag per
# entry of the JSON-decoded `tags` column and link it with HAS_TAG.
import_articles_query = """
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/medium/medium_articles.csv" AS row
CALL {
     WITH row
     MERGE (a:Article {url: row.story})
     SET a.title = row.title,
         a.date = date(CASE WHEN row.date <> "" THEN row.date ELSE "1990-01-01" END)
     MERGE (au:Author {name:coalesce(row.author, "Unknown")})
     MERGE (au)-[:WROTE]->(a)
     WITH a, apoc.convert.fromJsonList(row.tags) AS tags
     UNWIND tags AS tag
     MERGE (t:Tag {name:tag})
     MERGE (a)-[:HAS_TAG]->(t)
} IN TRANSACTIONS
"""
gds.run_cypher(import_articles_query)

In [None]:
# Import lists
# For every CSV row: look up the already-imported Article by url, merge the
# List by id and connect the two with IN_LIST. Rows whose article url has no
# matching Article node produce nothing, because the lookup is a MATCH.
import_lists_query = """
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/medium/medium_lists.csv" AS row
CALL {
    WITH row
    MATCH (a:Article {url: row.article})
    MERGE (l:List {id: row.list})
    MERGE (a)-[:IN_LIST]->(l)
} IN TRANSACTIONS
"""
gds.run_cypher(import_lists_query)

Download the embeddings file and unzip it into the same folder as this notebook; the next cell expects to find `medium_embedding_full.csv` there.
The embeddings can be downloaded from the following link:
    https://drive.google.com/file/d/1u7_qkNPT-_3q6KZE55MdOwjngoR6IHzl/view?usp=sharing

In [None]:
# Import openAI embeddings
# Each batch is sent as the `$data` parameter; the query matches the Article
# by url and stores the JSON-decoded list on its `openaiEmbedding` property.
set_embedding_query = (
    "\n"
    "      UNWIND $data AS row \n"
    "      MATCH (a:Article {url:row.url}) \n"
    "      SET a.openaiEmbedding = apoc.convert.fromJsonList(row.embeddings)"
)
batch_size = 1000  # rows sent to the database per request

embeddings = pd.read_csv('medium_embedding_full.csv')
params = embeddings.values
# Rows are read by position: column 0 is sent as the url and column 1 as the
# embedding payload (assumes that is the CSV's column order — TODO confirm).
for offset in range(0, len(params), batch_size):
    chunk = params[offset:offset + batch_size]
    payload = [{'url': record[0], 'embeddings': record[1]} for record in chunk]
    gds.run_cypher(set_embedding_query, {'data': payload})