In [None]:
from tqdm.auto import tqdm
from humemai.janusgraph import Humemai
from humemai.utils import disable_logger

# Turn logging off for the rest of the notebook (going by the helper's name;
# its implementation is not visible here).
disable_logger()

# Create the Humemai object, start its containers, connect to it, and remove
# all existing data so the notebook starts from an empty state.
# NOTE(review): presumably `warmup_seconds=10` is a wait that gives the
# containers time to come up before `connect()` — confirm against the API.
humemai = Humemai()
humemai.start_containers(warmup_seconds=10)
humemai.connect()
humemai.remove_all_data()

In [None]:
import re
import json
from pprint import pprint

# Read the whole transcript. The encoding is spelled out so the result does not
# depend on the platform's default locale encoding.
with open('example.txt', 'r', encoding='utf-8') as file:
    content = file.read()

# One chunk per line that contains a colon (intended for "Speaker: utterance"
# lines). Note that the pattern accepts a colon anywhere on the line, and lines
# without any colon are dropped. Surrounding whitespace is stripped.
chunks = [
    line.strip() for line in re.findall(r'^.*?:.*$', content, re.MULTILINE)
]

# Show the resulting chunks, one per line.
for chunk in chunks:
    print(chunk)


In [None]:
import torch
import transformers


def get_pipeline(
    model: str = "meta-llama/Llama-3.2-1B-Instruct",
    device: str = "cpu",
    quantization: str = "16bit",
) -> transformers.Pipeline:
    """Get a text generation pipeline with the specified device and quantization.

    Args:
        model (str): The model to use for text generation. Defaults to
            "meta-llama/Llama-3.2-1B-Instruct". Other instruct models such as
            "meta-llama/Llama-3.2-3B-Instruct" or
            "meta-llama/Llama-3.1-8B-Instruct" can be passed as well.
        device (str): The device to run the pipeline on; forwarded as
            ``device_map``. Defaults to "cpu".
        quantization (str): The quantization to apply to the model. One of
            "16bit" (no quantization config), "8bit" (``load_in_8bit``) or
            "4bit" (``load_in_4bit``). Defaults to "16bit".

    Returns:
        transformers.Pipeline: The text generation pipeline.

    Raises:
        ValueError: If ``quantization`` is not one of the supported values.

    """
    quantization_configs = {
        "16bit": None,
        "8bit": {"load_in_8bit": True},
        "4bit": {"load_in_4bit": True},
    }
    # Reject unknown values up front instead of failing later on an unbound
    # local variable.
    if quantization not in quantization_configs:
        supported = ", ".join(repr(name) for name in quantization_configs)
        raise ValueError(
            f"Unsupported quantization {quantization!r}; expected one of {supported}."
        )

    return transformers.pipeline(
        "text-generation",
        # Honor the caller's choice of model rather than a fixed checkpoint.
        model=model,
        model_kwargs={
            "torch_dtype": torch.bfloat16,
            "quantization_config": quantization_configs[quantization],
        },
        device_map=device,
    )

In [None]:
def generate_prompt(history: str, next_text: str) -> list[dict]:
    """
    Generate the prompt for the AI assistant to convert text to a knowledge graph.

    The prompt consists of three chat messages: a system message with the
    extraction instructions and a worked example, a user message carrying the
    history, and a user message carrying the new text. The entity format in the
    instructions uses a ``properties`` object (holding at least ``type``), which
    matches the worked example in the same prompt.

    Args:
        history (str): The history of the knowledge graph extracted so far. It is
            interpolated into an f-string, so a non-string value (e.g. a dict) is
            rendered with its ``str()`` form.
        next_text (str): The new text to convert into a knowledge graph.

    Returns:
        list[dict]: A structured prompt for the AI assistant to build a knowledge graph.
    """
    prompt = [
        {
            "role": "system",
            "content": """
You are an AI assistant that builds knowledge graphs from text. 
For each input, you extract entities and relationships from the provided text 
and convert them into a structured JSON-based knowledge graph.

**Important:** You should extract entities and relations from the new text provided.
If the new text provides updated information about existing entities or relations 
(e.g., age change, new attributes), you should output these entities and relations 
again with the updated information. Do not include entities or relations from the 
previous history that have not changed.

You may use the history to understand context and disambiguate entities.

Your output must follow this JSON format:

```json
{
  "entities": [
    {"label": "Entity1", "properties": {"type": "Type1"}},
    {"label": "Entity2", "properties": {"type": "Type2"}}
  ],
  "relations": [
    {"source": "Entity1", "target": "Entity2", "relation": "RelationName"}
  ]
}
```

Each entity must have a unique label and a `properties` object that contains at
least its `type` (e.g., "Person", "Company", "Object", "Event"). Relations must specify:

- `source`: the label of the originating entity,
- `target`: the label of the connected entity,
- `relation`: the relationship type between the source and target.

## Example:

### Previous Knowledge Graph History:

```json
{
  "entities": [
    {"label": "Sarah", "properties": {"type": "Person"}},
    {"label": "InnovateAI", "properties": {"type": "Company"}}
  ],
  "relations": [
    {"source": "Sarah", "target": "InnovateAI", "relation": "works_at"}
  ]
}

```

### New Text to Process:

"Sarah, now 30 years old, was promoted to Senior Data Scientist at InnovateAI."

### Output Knowledge Graph:

```json
{
  "entities": [
    {"label": "Sarah", "properties": {"type": "Person", "age": 30}},
    {"label": "InnovateAI", "properties": {"type": "Company"}},
    {"label": "Senior Data Scientist", "properties": {"type": "Position"}}
  ],
  "relations": [
    {"source": "Sarah", "target": "InnovateAI", "relation": "works_at"},
    {"source": "Sarah", "target": "Senior Data Scientist", "relation": "holds_position"}
  ]
}
```
Note that even though "Sarah" and "InnovateAI" were already in the history, we included
"Sarah" again with the updated age and added new relations based on the new information.


## Detailed Instructions:

- Extract entities and relations from the new text provided.
- If the new text provides updated information about existing entities or relations, include these in your output.
- Do not include entities or relations from the history that have not changed.
- Use the history for context and to disambiguate entities.
- Ensure the output adheres strictly to the JSON format specified. """,
        },
        {
            "role": "user",
            "content": f"Here is the knowledge graph extracted so far: {history}",
        },
        {
            "role": "user",
            "content": f"Here is the new text to process and incorporate: {next_text}",
        },
    ]

    return prompt

In [None]:
# Use the GPU when one is present; otherwise fall back to the CPU so the cell
# still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
pipeline = get_pipeline("meta-llama/Llama-3.2-1B-Instruct", device, "16bit")

# Graph summary fed back to the model; replaced after every chunk that is
# processed without an error.
history = {"entities": [], "relations": []}

# Property keys that are left out of the history shown to the model.
hidden_properties = ("num_recalled", "event_time", "known_since")

for chunk in tqdm(chunks):

    outputs = pipeline(
        generate_prompt(history, chunk),
        max_new_tokens=1024,
    )
    # Assumes the pipeline returns the chat as a list of messages whose last
    # entry is the model's reply.
    text_content = outputs[0]["generated_text"][-1]["content"]

    json_match = re.search(r"```json\n(.*?)\n```", text_content, re.DOTALL)
    if json_match is None:
        # The reply carries no fenced JSON block, so there is nothing to store.
        print("No ```json block found in the model output; skipping this chunk.")
        print()
        continue

    # Best-effort per chunk: model output is unpredictable, so any failure while
    # parsing or writing is reported and the loop moves on to the next chunk.
    # NOTE(review): when something below raises, `remove_all_short_term()` is
    # not reached for this chunk — confirm whether leftovers should be cleared.
    try:
        json_text = json_match.group(1)  # Extract JSON content
        dict_graph = json.loads(json_text)

        # Write short term vertices
        for entity in dict_graph["entities"]:
            humemai.write_short_term_vertex(
                label=entity["label"], properties=entity["properties"]
            )

        # Write short term edges
        for relation in dict_graph["relations"]:
            head_label = relation["source"]
            head_vertex = humemai.find_vertex_by_label(head_label)[0]
            edge_label = relation["relation"]
            tail_label = relation["target"]
            tail_vertex = humemai.find_vertex_by_label(tail_label)[0]

            humemai.write_short_term_edge(
                head_vertex=head_vertex,
                edge_label=edge_label,
                tail_vertex=tail_vertex,
            )

        short_term_vertices, long_term_vertices, short_term_edges, long_term_edges = (
            humemai.get_working_vertices_and_edges(
                short_term_vertices=humemai.get_all_short_term_vertices(),
                short_term_edges=humemai.get_all_short_term_edges(),
                include_all_long_term=False,
                hops=2,
            )
        )

        # move to the long-term memory
        for vertex in short_term_vertices:
            humemai.move_short_term_vertex(vertex, "episodic")

        for edge in short_term_edges:
            humemai.move_short_term_edge(edge, "episodic")

        # remove all short term vertices and edges
        humemai.remove_all_short_term()

        # Rebuild the history from the long-term vertices and edges returned
        # above, in the same JSON shape the prompt uses.
        entities = [
            {
                "label": vertex.label,
                "properties": {
                    key: val
                    for key, val in humemai.get_vertex_properties(vertex).items()
                    if key not in hidden_properties
                },
            }
            for vertex in long_term_vertices
        ]

        relations = [
            {
                "source": edge.outV.label,
                "relation": edge.label,
                "target": edge.inV.label,
            }
            for edge in long_term_edges
        ]
        history = {"entities": entities, "relations": relations}
        print("history: ", history)
    except Exception as e:
        print(e)
    print()

In [13]:
# List every long-term edge as "head --relation--> tail | Properties: {...}".
stored_edges = humemai.get_all_long_term_edges()
for edge in stored_edges:
    summary = f"{edge.outV.label} --{edge.label}--> {edge.inV.label} | Properties: {humemai.get_edge_properties(edge)}"
    print(summary)

HumemAI --averted--> Crisis | Properties: {'event_time': ['2024-11-20T14:46:35'], 'num_recalled': 0}
HumemAI --retracing--> steps | Properties: {'event_time': ['2024-11-20T14:46:35'], 'num_recalled': 0}
HumemAI --resolution--> Resolution | Properties: {'event_time': ['2024-11-20T14:46:35'], 'num_recalled': 0}
HumemAI --focuses_on--> Keys | Properties: {'num_recalled': 4, 'event_time': ['2024-11-20T14:46:35', '2024-11-20T14:46:35']}
Grace --mentions--> keys | Properties: {'event_time': ['2024-11-20T14:46:35'], 'num_recalled': 5}
Grace --mentions--> Keys | Properties: {'event_time': ['2024-11-20T14:46:35'], 'num_recalled': 3}
Grace --sees--> keys | Properties: {'num_recalled': 5, 'event_time': ['2024-11-20T14:46:35']}
Grace --sees--> Keys | Properties: {'event_time': ['2024-11-20T14:46:35'], 'num_recalled': 3}
Bob --took--> Keys | Properties: {'num_recalled': 5, 'event_time': ['2024-11-20T14:46:35', '2024-11-20T14:46:35', '2024-11-20T14:46:35']}
Bob --took--> Desk | Properties: {'event_t