In [None]:
import os

import pandas as pd
from graphdatascience import GraphDataScience
from neo4j import GraphDatabase

# Connection settings.
# Every value can be overridden through an environment variable so that
# credentials do not have to be edited into (and saved with) the notebook.
# The fallbacks are the original local defaults.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")  # CHANGE TO YOUR PORT NUMBER
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "neo4j12345")   # CHANGE PASSWORD

# Client used by every later cell to send Cypher to the database.
gds = GraphDataScience(uri, auth=(user, password))
gds.set_database(os.environ.get("NEO4J_DATABASE", "neo4j"))

## Load the Recipe Dataset

In [None]:
# Display options: show every column, keep wide frames on one line,
# and never truncate the text inside a cell.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
    ('max_colwidth', None),
):
    pd.set_option(option_name, option_value)

# Preview a single row of the subset CSV exactly as LOAD CSV reads it.
inspect_subset_query = """
LOAD CSV WITH HEADERS FROM "file:///Cleaned_Recipe_Data_Subset.csv" AS row
WITH row
RETURN *
LIMIT 1
"""
gds.run_cypher(inspect_subset_query)

In [None]:
import pandas as pd
import re
import ast
import csv

def clean_json_like_string(json_like_string):
    """Flatten a list-like string into plain comma-separated text.

    The input is expected to be the text of a Python/JSON-style list literal,
    e.g. '["1 c. sugar", "2 tbsp. butter"]'. The items are joined with ', '
    and every character that is not a letter, digit, comma or space is removed.

    Parameters
    ----------
    json_like_string : str or None or float
        The literal to parse. None, NaN and blank strings are treated as
        "no data".

    Returns
    -------
    str
        The cleaned, comma-separated text; '' when there is no data.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ast.literal_eval when the text is not a valid literal.
    """
    # Missing cells arrive as None or as a float NaN (NaN is the only value
    # that is not equal to itself).
    if json_like_string is None:
        return ''
    if isinstance(json_like_string, float) and json_like_string != json_like_string:
        return ''
    # An empty cell has nothing to parse; literal_eval('') would raise.
    if isinstance(json_like_string, str) and not json_like_string.strip():
        return ''

    # Convert JSON-like string to list
    items = ast.literal_eval(json_like_string)

    # A scalar literal (e.g. '"flour"') must be wrapped, otherwise join()
    # would iterate over its characters and return 'f, l, o, u, r'.
    if not isinstance(items, (list, tuple, set, dict)):
        items = [items]

    # Join list items into a comma-separated string; str() keeps join() from
    # failing on non-string items such as numbers.
    joined_items = ', '.join(str(item) for item in items)

    # Remove special characters
    cleaned_string = re.sub('[^A-Za-z0-9, ]+', '', joined_items)
    return cleaned_string

# Load the data from the CSV file.
# dtype=str reads every column as text; missing cells are still loaded as NaN.
file_path = 'C:\\Users\\TimEa\\OneDrive\\Data\\Cooking_Recipes\\RecipeNLG_dataset.csv'
data = pd.read_csv(file_path, dtype=str)

# Convert all text to lowercase.
# The vectorised .str accessor passes missing values through, whereas
# applymap(str.lower) raises TypeError on the first NaN cell (and applymap
# is deprecated in pandas >= 2.1).
data = data.apply(lambda column: column.str.lower())

# Clean the 'ingredients', 'directions' and 'NER' columns.
# na_action='ignore' leaves missing cells as NaN instead of handing them to
# the literal parser.
for list_column in ('ingredients', 'directions', 'NER'):
    data[list_column] = data[list_column].map(clean_json_like_string, na_action='ignore')

# The unnamed first column of the CSV becomes the recipe key.
data = data.rename(columns={'Unnamed: 0': 'Row_ID'})

# Save a limited amount of data
df_subset = data.head(n = 10000)

# Save the cleaned data back to a new CSV file (every field quoted)
cleaned_file_path = 'C:\\Users\\TimEa\\AppData\\Local\\Neo4j\\Relate\\Data\\dbmss\\dbms-d40049e8-260c-4863-8dd4-5be31fef11cd\\Cleaned_Recipe_Data.csv'
data.to_csv(cleaned_file_path, index=False, quoting=csv.QUOTE_ALL)

# NOTE(review): only the subset is written into the DBMS "import" folder;
# confirm that the files read later through file:/// are placed there too.
cleaned_file_path = 'C:\\Users\\TimEa\\AppData\\Local\\Neo4j\\Relate\\Data\\dbmss\\dbms-d40049e8-260c-4863-8dd4-5be31fef11cd\\import\\Cleaned_Recipe_Data_Subset.csv'
df_subset.to_csv(cleaned_file_path, index=False, quoting=csv.QUOTE_ALL)

data.head()


In [None]:
import pandas as pd
from io import StringIO

# Convert the comma-separated strings to lists.
# The NER text was joined with ', ', so a plain split on ',' leaves a leading
# space on every item but the first; each piece is stripped so that ' sugar'
# and 'sugar' do not become two different ingredients. Blank pieces are
# dropped and missing cells become an empty list.
data['CommaSeparatedValues'] = data['NER'].str.split(',').apply(
    lambda pieces: [piece.strip() for piece in pieces if piece.strip()]
    if isinstance(pieces, list) else []
)

# One row per (recipe, ingredient) pair:
#   explode         - expand the lists into separate rows (an empty list yields NaN)
#   rename          - CommaSeparatedValues -> Ingredient
#   dropna          - discard recipes that contributed no ingredient
#   drop_duplicates - an ingredient listed twice in one recipe is kept once
exploded_df = (
    data[['Row_ID', 'CommaSeparatedValues']]
    .explode('CommaSeparatedValues')
    .rename(columns={'CommaSeparatedValues': 'Ingredient'})
    .dropna(subset=['Ingredient'])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Display a sample of the resulting DataFrame rather than printing all of it
exploded_df.head(10)

In [None]:
# Write the recipe/ingredient pairs to Ingredients.csv (index column omitted).
cleaned_file_path = (
    'C:\\Users\\TimEa\\OneDrive\\Data\\Cooking_Recipes\\Ingredients.csv'
)
exploded_df.to_csv(path_or_buf=cleaned_file_path, index=False)

In [None]:
# Keep the first 100000 cleaned recipes and write them out without the index.
# Note that this rebinds df_subset, which an earlier cell set to 10000 rows.
df_subset = data.head(100000)
cleaned_file_path = (
    'C:\\Users\\TimEa\\OneDrive\\Data\\Cooking_Recipes\\Cleaned_Recipe_Data.csv'
)
df_subset.to_csv(path_or_buf=cleaned_file_path, index=False)

# Load Data

In [None]:
# Delete existing data in the database.
# WARNING: this removes every node and relationship in the selected database.
# The delete runs in batches (CALL { ... } IN TRANSACTIONS, the same mechanism
# the load cells use) so that a large graph is not removed inside one huge
# transaction, which can exhaust the transaction memory.
gds.run_cypher("""
MATCH (a)
CALL {
    WITH a
    DETACH DELETE a
} IN TRANSACTIONS OF 10000 ROWS
""")

In [None]:
# Uniqueness constraint on Recipe.Row_ID; IF NOT EXISTS makes the cell safe to re-run.
recipe_constraint_query = """
CREATE CONSTRAINT rowid IF NOT EXISTS FOR (r:Recipe) REQUIRE r.Row_ID IS UNIQUE;
"""
gds.run_cypher(recipe_constraint_query)

In [None]:
# Load the Recipe nodes from the cleaned CSV, 1000 rows per transaction.
#
# The node is merged on its key (Row_ID) only and the remaining properties are
# SET afterwards. Merging on the whole property map fails as soon as any of
# the properties is null (title, link, source and NER are not null-checked),
# and on a re-run with changed values it would try to create a second node
# with the same Row_ID and violate the uniqueness constraint.
# Rows whose Row_ID is missing or not an integer are skipped, because a null
# key cannot be merged.
gds.run_cypher("""
LOAD CSV WITH HEADERS FROM "file:///Cleaned_Recipe_Data.csv" AS row
WITH row
WHERE toInteger(row.Row_ID) IS NOT NULL
  AND row.directions IS NOT NULL
  AND row.ingredients IS NOT NULL
CALL {
    WITH row
    MERGE (r:Recipe {Row_ID: toInteger(row.Row_ID)})
    SET r.title = row.title,
        r.directions = row.directions,
        r.link = row.link,
        r.source = row.source,
        r.ingredients = row.ingredients,
        r.NER = row.NER
} IN TRANSACTIONS OF 1000 ROWS
""")

In [None]:
# Uniqueness constraint on Ingredient.Ingredient_Name; IF NOT EXISTS makes the cell safe to re-run.
ingredient_constraint_query = """
CREATE CONSTRAINT ingredient_name IF NOT EXISTS FOR (r:Ingredient) REQUIRE r.Ingredient_Name IS UNIQUE;
"""
gds.run_cypher(ingredient_constraint_query)

In [None]:
# Create one Ingredient node per distinct name found in Ingredients.csv.
# Rows without an ingredient are skipped; work is committed every 1000 rows.
load_ingredients_query = """
LOAD CSV WITH HEADERS FROM "file:///Ingredients.csv" AS row
WITH row
WHERE row.Ingredient IS NOT NULL 
CALL {
    WITH row
    MERGE (i:Ingredient {Ingredient_Name: row.Ingredient})
} IN TRANSACTIONS OF 1000 ROWS
;

"""
gds.run_cypher(load_ingredients_query)

In [None]:
# Connect every recipe to the ingredients it uses.
# Each CSV row is one (Row_ID, Ingredient) pair; both end nodes must already
# exist. Rows with a missing key are skipped, and the work is committed every
# 1000 rows (like the node loads) instead of in one large transaction.
gds.run_cypher("""
LOAD CSV WITH HEADERS FROM 'file:///Ingredients.csv' AS row
WITH row
WHERE row.Row_ID IS NOT NULL AND row.Ingredient IS NOT NULL
CALL {
    WITH row
    MATCH (r:Recipe {Row_ID: toInteger(row.Row_ID)})
    MATCH (i:Ingredient {Ingredient_Name: row.Ingredient})
    MERGE (r)-[:USES]->(i)
} IN TRANSACTIONS OF 1000 ROWS
""")

## Export into a Neo4j-Friendly Import Version

In [None]:
# Recipe node export: take 10 recipes, keep those with at least one USES
# relationship, and return their properties as columns.
recipe_nodes_query = """
// Recipe Nodes
MATCH (r:Recipe)
WITH distinct r
LIMIT 10

MATCH (r)-[:USES]->(i)
WITH distinct r
RETURN r.Row_ID as Recipe_ID, r.title as Recipe_Title, r.directions as Directions, r.link as Link, r.source as Source, r.ingredients
"""
gds.run_cypher(recipe_nodes_query)

In [None]:
# Ingredient node export for the same 10-recipe sample.
# RETURN DISTINCT is required here: an ingredient used by several of the
# sampled recipes is matched once per recipe, and a node export must list
# each node only once.
gds.run_cypher("""
// Ingredient Nodes
MATCH (r:Recipe)
WITH distinct r
LIMIT 10

MATCH (r)-[:USES]->(i)
RETURN DISTINCT id(i) as Ingredient_ID, trim(i.Ingredient_Name) as Ingredient_Name
""")

In [None]:
# Relationship export: one row per USES relationship of the 10 sampled
# recipes, as (recipe key, internal ingredient id) pairs.
recipe_ingredient_rels_query = """
// Recipe to Ingredient Relationships
MATCH (r:Recipe)
WITH distinct r
LIMIT 10

MATCH (r)-[:USES]->(i)
RETURN r.Row_ID as Recipe_ID, id(i) as Ingredient_ID
"""
gds.run_cypher(recipe_ingredient_rels_query)

## Export into a Neo4j Admin Import Version

In [None]:
# Recipe node export: take 10 recipes, keep those with at least one USES
# relationship, and return their properties as columns.
# NOTE(review): this query is identical to the recipe export in the previous
# section - confirm whether different column headers were intended here.
admin_recipe_nodes_query = """
// Recipe Nodes
MATCH (r:Recipe)
WITH distinct r
LIMIT 10

MATCH (r)-[:USES]->(i)
WITH distinct r
RETURN r.Row_ID as Recipe_ID, r.title as Recipe_Title, r.directions as Directions, r.link as Link, r.source as Source, r.ingredients
"""
gds.run_cypher(admin_recipe_nodes_query)

In [None]:
# Ingredient node export for the 10-recipe sample.
# RETURN DISTINCT is required here: an ingredient used by several of the
# sampled recipes is matched once per recipe, and a node export must list
# each node only once.
gds.run_cypher("""
// Ingredient Nodes
MATCH (r:Recipe)
WITH distinct r
LIMIT 10

MATCH (r)-[:USES]->(i)
RETURN DISTINCT id(i) as Ingredient_ID, trim(i.Ingredient_Name) as Ingredient_Name
""")

In [None]:
# Relationship export: one row per USES relationship of the 10 sampled
# recipes, as (recipe key, internal ingredient id) pairs.
admin_recipe_ingredient_rels_query = """
// Recipe to Ingredient Relationships
MATCH (r:Recipe)
WITH distinct r
LIMIT 10

MATCH (r)-[:USES]->(i)
RETURN r.Row_ID as Recipe_ID, id(i) as Ingredient_ID
"""
gds.run_cypher(admin_recipe_ingredient_rels_query)