# Table of Contents
- Final Neo4j Imports: Cypher or Python import methods
- Energy Nodes and Relationship Processing

# Connections and Functions

In [2]:
import os
import neo4j
import pandas as pd
from IPython.display import display

In [3]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))
session = driver.session(database="neo4j")

In [4]:
# functions
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query) 
    query = "match (node) delete node"
    session.run(query)
    
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    result = session.run(query, **kwargs)
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

# Final Neo4j Imports

Run the commands below in your terminal before you load data into Neo4j. Files pushed into origin/project 8/1/24. - Jason
- sudo cp ~/user/projects/project-3-energy-ownership/code/Data/entity_nodes.csv ~/neo4j_import
- sudo cp ~/user/projects/project-3-energy-ownership/code/Data/energy_project_nodes.csv ~/neo4j_import
- sudo cp ~/user/projects/project-3-energy-ownership/code/Data/energy_project_relationships.csv ~/neo4j_import
- sudo cp ~/user/projects/project-3-energy-ownership/code/Data/entity_relationships.csv ~/neo4j_import

In [1]:
my_neo4j_wipe_out_database()

NameError: name 'my_neo4j_wipe_out_database' is not defined

For reference:

In [7]:
# for reference
print('file:///entity_nodes.csv')
print('file:///energy_project_nodes.csv')
print('file:///energy_project_relationships.csv')
print('file:///entity_relationships.csv')

file:///entity_nodes.csv
file:///energy_project_nodes.csv
file:///energy_project_relationships.csv
file:///entity_relationships.csv


In [11]:
df = pd.read_csv('/user/projects/project-3-energy-ownership/code/Data/entity_nodes.csv')
print('Entity Nodes:', df.shape)
df = pd.read_csv('/user/projects/project-3-energy-ownership/code/Data/energy_project_nodes.csv')
print('Energy Project Nodes:', df.shape)
df = pd.read_csv('/user/projects/project-3-energy-ownership/code/Data/energy_project_relationships.csv')
print('Energy Project Relationships:', df.shape) # 21515; ~4k relationships did not load
df = pd.read_csv('/user/projects/project-3-energy-ownership/code/Data/entity_relationships.csv')
print('Entity Relationships:', df.shape) # 12384; ~4k relationships did not load

Entity Nodes: (12520, 11)
Energy Project Nodes: (25587, 8)
Energy Project Relationships: (25008, 3)
Entity Relationships: (16575, 3)


## METHOD 1: Copy and paste the below into Neo4j browser (python commands below)

// Copy the below directly into Neo4j; delete the first line if the index is already created
CREATE INDEX entity_id_index FOR (e:Entity) ON (e.id);
LOAD CSV WITH HEADERS FROM "file:///entity_nodes.csv" AS row
WITH row WHERE row.ID IS NOT NULL
MERGE (e:Entity {id: row.ID})
SET e.name = row.Name,
    e.entity_type = coalesce(row['Entity Type'], "Unknown"),
    e.legal_entity_type = coalesce(row['Legal Entity Type'], "Unknown"),
    e.publicly_listed = coalesce(row.PubliclyListed, "Unknown"),
    e.country = coalesce(row.Country, "Unknown"),
    e.coal_plant_capacity = coalesce(row['coal plant capacity'],"None"),
    e.gas_plant_capacity = coalesce(row['gas capacity'],"None"),
    e.bio_plant_capacity = coalesce(row['bioenergy plant capacity'],"None"),
    e.coal_mine_capacity = coalesce(row['coal mine capacity'],"None"),
    e.steel_plant_capacity = coalesce(row['coal mine capacity'],"None");

// copy the below directly into neo4j; delete the first line if the index is already created
CREATE INDEX energy_id_index FOR (e:Energy_Project) ON (e.id);
LOAD CSV WITH HEADERS FROM "file:///energy_project_nodes.csv" AS row
WITH row WHERE row.index IS NOT NULL
MERGE (e:Energy_Project {id: row.index})
SET e.name = row.Energy_Project_Node_Name,
    e.energy_project_type = row.Energy_Project_Type,
    e.capacity_mw = toFloat(coalesce(row["Capacity (MW)"], 0)),
    e.status = coalesce(row.Status, "Unknown"),
    e.country = coalesce(row.Country, "Unknown"),
    e.capacity_mtpa = toFloat(coalesce(row["Capacity (Mtpa)"], 0)),
    e.nominal_crude_steel_capacity_ttpa = toFloat(coalesce(row["Nominal crude steel capacity (ttpa)"], 0));

// copy the below directly into neo4j
LOAD CSV WITH HEADERS FROM "file:///energy_project_relationships.csv" AS row
MATCH (f:Energy_Project {id: row.from})
MATCH (t:Entity {id: row.to})
MERGE (f)-[:SHARE {share: toFloat(coalesce(row.share, 0))}]->(t)

// copy the below directly into neo4j
LOAD CSV WITH HEADERS FROM "file:///entity_relationships.csv" AS row
MATCH (f:Entity {id: row.from})
MATCH (t:Entity {id: row.to})
MERGE (f)-[:SHARE {share: toFloat(coalesce(row.share, 0))}]->(t)

## METHOD 2: Run python functions (skip the first two functions if you've already loaded the index)

In [None]:
# create an entity_id_index for faster load and searching
query = """
CREATE INDEX entity_id_index FOR (e:Entity) ON (e.id)
"""

session.run(query)

In [None]:
# create an entity_id_index for faster load and searching
query = """
CREATE INDEX energy_id_index FOR (e:Energy_Project) ON (e.id);
"""

session.run(query)

In [5]:
query = """
LOAD CSV WITH HEADERS FROM "file:///entity_nodes.csv" AS row
WITH row WHERE row.ID IS NOT NULL
MERGE (e:Entity {id: row.ID})
SET e.name = row.Name,
    e.entity_type = coalesce(row['Entity Type'], "Unknown"),
    e.legal_entity_type = coalesce(row['Legal Entity Type'], "Unknown"),
    e.publicly_listed = coalesce(row.PubliclyListed, "Unknown"),
    e.country = coalesce(row.Country, "Unknown"),
    e.coal_plant_capacity = coalesce(row['coal plant capacity'],"None"),
    e.gas_plant_capacity = coalesce(row['gas capacity'],"None"),
    e.bio_plant_capacity = coalesce(row['bioenergy plant capacity'],"None"),
    e.coal_mine_capacity = coalesce(row['coal mine capacity'],"None"),
    e.steel_plant_capacity = coalesce(row['steel plant capacity'],"None");
"""

session.run(query)

<neo4j._sync.work.result.Result at 0x7f42a148dbe0>

In [None]:
query = """
LOAD CSV WITH HEADERS FROM "file:///energy_project_nodes.csv" AS row
WITH row WHERE row.index IS NOT NULL
MERGE (e:Energy_Project {id: row.index})
SET e.name = row.Energy_Project_Node_Name,
    e.energy_project_type = row.Energy_Project_Type,
    e.capacity_mw = toFloat(coalesce(row["Capacity (MW)"], 0)),
    e.status = coalesce(row.Status, "Unknown"),
    e.country = coalesce(row.Country, "Unknown"),
    e.capacity_mtpa = toFloat(coalesce(row["Capacity (Mtpa)"], 0)),
    e.nominal_crude_steel_capacity_ttpa = toFloat(coalesce(row["Nominal crude steel capacity (ttpa)"], 0));
"""

session.run(query)

In [None]:
query = """
LOAD CSV WITH HEADERS FROM "file:///energy_project_relationships.csv" AS row
MATCH (f:Energy_Project {id: row.from})
MATCH (t:Entity {id: row.to})
MERGE (f)-[:SHARE {share: toFloat(coalesce(row.share, 0))}]->(t)
"""

session.run(query)

In [None]:
query = """
LOAD CSV WITH HEADERS FROM "file:///entity_relationships.csv" AS row
MATCH (f:Entity {id: row.from})
MATCH (t:Entity {id: row.to})
MERGE (f)-[:SHARE {share: toFloat(coalesce(row.share, 0))}]->(t)
"""

session.run(query)

# Separate Entity and Energy Project Relationships

- load relationships file; split into entity and energy projects; assigns index for energy projects to allow for steel energy nodes to load correctly

In [5]:
df = pd.read_csv('/user/projects/project-3-energy-ownership/code/Data/relationships.csv')

In [6]:
ep_df = pd.read_csv('/user/projects/project-3-energy-ownership/code/Data/energy_project_nodes.csv')
ep_df = ep_df.reset_index()
ep_df['index'] = 'EP' + ep_df.index.astype(str)
print(ep_df.shape)
ep_df.head()

(25587, 9)


Unnamed: 0,level_0,index,Energy_Project_Node_Name,Energy_Project_Type,Capacity (MW),Status,Country,Capacity (Mtpa),Nominal crude steel capacity (ttpa)
0,0,EP0,48th Street power station 9,Gas or Oil Plant,84.0,operating,United States,,
1,1,EP1,491 E 48th Street power station 9,Gas or Oil Plant,83.5,operating,United States,,
2,2,EP2,6 October power plant 1,Gas or Oil Plant,150.0,operating,Egypt,,
3,3,EP3,6 October power plant 2,Gas or Oil Plant,150.0,operating,Egypt,,
4,4,EP4,6 October power plant 3,Gas or Oil Plant,150.0,operating,Egypt,,


In [5]:
# ep_df.to_csv('/user/projects/project-3-energy-ownership/code/Data/energy_project_nodes.csv', index=False)

In [7]:
merged_relationship = pd.merge(df, ep_df[['index', 'Energy_Project_Node_Name', 'Status']],
                               how='left',
                               left_on='from',
                               right_on='Energy_Project_Node_Name')
print(merged_relationship.shape)
merged_relationship.head()

(41583, 6)


Unnamed: 0,from,to,share,index,Energy_Project_Node_Name,Status
0,E100000001858,E100000000354,100.0,,,
1,E100000000354,E100001000146,100.0,,,
2,E100000001759,E100001011410,,,,
3,E100000003784,E100000000431,100.0,,,
4,E100000000380,E100000000382,100.0,,,


In [9]:
entity_relationships = merged_relationship[merged_relationship.Energy_Project_Node_Name.isna()][['from', 'to', 'share']]
ep_relationships = merged_relationship[~merged_relationship.Energy_Project_Node_Name.isna()][['index', 'to', 'share']]
ep_relationships.columns = ['from', 'to', 'share']
ep_relationships.head()

Unnamed: 0,from,to,share
5414,EP0,E100000002215,100.0
5415,EP1,E100000002215,100.0
5416,EP2,E100000001858,100.0
5417,EP3,E100000001858,100.0
5418,EP4,E100000001858,100.0


In [10]:
# ep_relationships.to_csv('/user/projects/project-3-energy-ownership/code/Data/energy_project_relationships.csv', index=False)
# entity_relationships.to_csv('/user/projects/project-3-energy-ownership/code/Data/entity_relationships.csv', index=False)