In [1]:
from neo4j import GraphDatabase
import os
import pandas as pd

In [2]:
class Neo4jConnection:
    """Thin convenience wrapper around a Neo4j driver.

    Every call opens a short-lived session, runs a single auto-commit
    query and closes the session again. The object can also be used as a
    context manager so the underlying driver is always released::

        with Neo4jConnection(uri, user, password) as conn:
            conn.query("RETURN 1")
    """

    def __init__(self, uri, user, password):
        """Create the driver for ``uri`` using basic ``(user, password)`` auth."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        """Return the connection itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never suppress errors."""
        self.close()

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def query(self, query, parameters=None):
        """Run ``query`` with optional ``parameters`` and return all records as a list.

        The records are materialised before the session is closed so the
        returned list stays usable afterwards.
        """
        with self.driver.session() as session:
            return list(session.run(query, parameters))

    def clear_graph(self):
        """Delete every node and relationship in the database."""
        with self.driver.session() as session:
            # Consume the result explicitly so the statement has finished and
            # any server-side error is raised here rather than being left to
            # the implicit clean-up that happens when the session closes.
            session.run("MATCH (n) DETACH DELETE n").consume()
        

In [3]:
_SPEAKER_FIELDS = ('detail', 'role', 'extra_information')


def _has_value(value) -> bool:
    """Return True unless ``value`` is None, a float NaN, or otherwise falsy."""
    if value is None:
        return False
    if isinstance(value, float) and value != value:  # NaN never equals itself
        return False
    return bool(value)


def _collect_speakers(all_speakers, speakers, hash_value, bank):
    """Fold the speaker list of one row into the ``all_speakers`` accumulator."""
    # A missing cell arrives as None or NaN rather than a list: nothing to collect.
    if not isinstance(speakers, (list, tuple)):
        return

    for speaker in speakers:
        speaker_name = speaker.get('name') if isinstance(speaker, dict) else None
        if not _has_value(speaker_name):
            # A speaker without a name cannot be merged into the graph.
            continue

        info = all_speakers.get(speaker_name)
        if info is None:
            info = {field: speaker.get(field, '') for field in _SPEAKER_FIELDS}
            # Dicts are used as insertion-ordered sets.
            info['hashes'] = {}
            info['banks'] = {}
            all_speakers[speaker_name] = info
        else:
            # Later occurrences may fill fields the first occurrence left
            # empty; a value that is already set is never overwritten.
            for field in _SPEAKER_FIELDS:
                candidate = speaker.get(field, '')
                if not _has_value(info[field]) and _has_value(candidate):
                    info[field] = candidate

        # Each speaker may appear in multiple chunks and banks.
        info['hashes'][hash_value] = None
        if bank is not None:
            info['banks'][bank] = None


def insert_data_from_df(conn: Neo4jConnection, df: pd.DataFrame, link_speakers_to_banks: bool = False):
    """Insert banks, chunks and speakers from ``df`` into the graph.

    Each row must provide the columns ``bank``, ``chunk``, ``hash``,
    ``approximate_tokens_size``, ``summary``, ``full_summary`` and
    ``speakers``. ``speakers`` is a list of dicts with a ``name`` key and
    optional ``detail``, ``role`` and ``extra_information`` keys.

    For every row a ``Chunk`` node is created. When the row has a bank, a
    ``Bank`` node is merged and linked with ``HAS_CHUNK``. After all rows
    are processed, one ``Speaker`` node per distinct name is merged and
    linked with ``SPOKE_IN`` to every chunk the speaker appears in.

    Args:
        conn: Object exposing ``query(cypher, parameters)``.
        df: Source rows, one chunk per row.
        link_speakers_to_banks: When True, additionally merge an
            ``ATTENDED`` relationship from each speaker to every bank the
            speaker appeared under. Defaults to False.

    Rows whose ``bank`` is empty, None or NaN get no bank node. Rows whose
    ``speakers`` is not a list, and speaker entries without a name, are
    skipped instead of raising.
    """
    all_speakers = {}
    for _, row in df.iterrows():
        bank = row['bank'] if _has_value(row['bank']) else None
        chunk = row['chunk']
        hash_value = row['hash']
        tokens_size = row['approximate_tokens_size']
        summary = row['summary']
        full_summary = row['full_summary']
        speakers = row['speakers']

        # Insert bank node only if the bank is not empty
        if bank is not None:
            conn.query("""
            MERGE (b:Bank {name: $bank})
            ON CREATE SET b.full_summary = $full_summary
            ON MATCH SET b.full_summary = CASE WHEN b.full_summary IS NULL OR b.full_summary <> $full_summary THEN $full_summary ELSE b.full_summary END
            """, {"bank": bank, "full_summary": full_summary})

        # Insert chunk node
        conn.query("""
        CREATE (c:Chunk {chunk: $chunk, hash: $hash, tokens_size: $tokens_size, summary: $summary})
        """, {"chunk": chunk, "hash": hash_value, "tokens_size": tokens_size, "summary": summary})

        # Create relationship between Bank and Chunk if the bank is not empty
        if bank is not None:
            conn.query("""
            MATCH (b:Bank {name: $bank}), (c:Chunk {chunk: $chunk, hash: $hash})
            MERGE (b)-[:HAS_CHUNK]->(c)
            """, {"bank": bank, "chunk": chunk, "hash": hash_value})

        # Collect speakers and their relationships
        _collect_speakers(all_speakers, speakers, hash_value, bank)

    # Insert speaker nodes and create relationships between Speakers and Chunks
    for speaker_name, speaker in all_speakers.items():
        # Insert speaker node with additional properties
        conn.query("""
        MERGE (s:Speaker {name: $name})
        ON CREATE SET s.detail = $detail, s.role = $role, s.extra_information = $extra_information
        ON MATCH SET 
            s.detail = CASE WHEN s.detail IS NULL OR s.detail <> $detail THEN $detail ELSE s.detail END,
            s.role = CASE WHEN s.role IS NULL OR s.role <> $role THEN $role ELSE s.role END,
            s.extra_information = CASE WHEN s.extra_information IS NULL OR s.extra_information <> $extra_information THEN $extra_information ELSE s.extra_information END
        """, {
            "name": speaker_name,
            "detail": speaker['detail'],
            "role": speaker['role'],
            "extra_information": speaker['extra_information'],
        })

        # Create relationship between Speaker and their associated Chunks
        for chunk_hash in speaker['hashes']:
            conn.query("""
            MATCH (s:Speaker {name: $name}), (c:Chunk {hash: $hash})
            MERGE (s)-[:SPOKE_IN]->(c)
            """, {"name": speaker_name, "hash": chunk_hash})

        # Optionally create relationship between Speaker and their associated Banks
        if link_speakers_to_banks:
            for bank in speaker['banks']:
                conn.query("""
                MATCH (s:Speaker {name: $name}), (b:Bank {name: $bank})
                MERGE (s)-[:ATTENDED]->(b)
                """, {"name": speaker_name, "bank": bank})

In [4]:
# Connection settings come from the environment so that no secret is stored
# in the notebook. NEO4J_URI and NEO4J_USER fall back to local defaults;
# NEO4J_PASSWORD has no default and must be set.
uri = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD")
if not password:
    raise RuntimeError(
        "Set the NEO4J_PASSWORD environment variable before running this cell."
    )

connection = Neo4jConnection(uri, user, password)

# WARNING: destructive — removes every node and relationship in the target
# database so the import below starts from an empty graph.
connection.clear_graph()

In [5]:
# Read the transcript chunks as JSON Lines (lines=True) and load them into
# the graph.
df_path = os.path.join("dataset", "entities_transcripts.jsonl")
df = pd.read_json(df_path, lines=True)
insert_data_from_df(conn=connection, df=df)

In [6]:
def insert_data_from_bank_df(conn: Neo4jConnection, df: pd.DataFrame):
    """Attach every cell of ``df`` to its bank as a ``HAS_PROPERTY`` relationship.

    Each column label is treated as a bank name and each index label as a
    property name. For every column a ``Bank`` node is merged; for every
    cell a ``Property`` node is merged and linked from the bank, with the
    cell stored in the relationship's ``value`` attribute.

    Args:
        conn: Object exposing ``query(cypher, parameters)``.
        df: Frame with one column per bank and one row per property.
    """
    for bank_name, column in df.items():
        # Make sure the bank exists before anything is attached to it.
        bank_params = {"name": bank_name}
        conn.query("""
        MERGE (b:Bank {name: $name})
        """, bank_params)

        for label, cell in column.items():
            # One shared Property node per row label.
            conn.query("""
            MERGE (p:Property {name: $prop_name})
            """, dict(prop_name=label))

            # The cell value lives on the relationship, not on either node.
            link_params = dict(bank=bank_name, prop_name=label, value=cell)
            conn.query("""
            MATCH (b:Bank {name: $bank}), (p:Property {name: $prop_name})
            MERGE (b)-[r:HAS_PROPERTY]->(p)
            SET r.value = $value
            """, link_params)

In [7]:
# Read the bank attribute table and attach each attribute to its bank in
# the graph.
df_path = os.path.join("dataset", "banks_entities.json")
df = pd.read_json(df_path)
insert_data_from_bank_df(conn=connection, df=df)