In [1]:
# pip install python-dotenv
import os
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
import google.generativeai as genai
import pandas as pd
from typing_extensions import TypedDict
import json

class Entity(TypedDict):
    """
    An entity details in the knowledge graph. The entity detials consist of name, class and description.

    Attributes:
    -----------
    name : str
        The human-readable name of the entity.
    class : str
        The classification or type of the entity (e.g., Person, Product, etc.).
    description : str
        A brief description of the entity.
    """
    entity_name: str
    entity_class: str
    description: str

class Relation(TypedDict):
    """
    A class to represent a relationship (triple) in the knowledge graph.

    Attributes:
    -----------
    head : Entity
        The subject or head entity in the relationship.
    predicate : str
        The relationship or predicate that links the head and tail (e.g., Person, Product, etc.).
    tail : Entity
        The object or tail entity in the relationship.
    """
    head: Entity
    predicate: str
    tail: Entity
    
def add_to_database(
    entities: list[Entity],
    triples: list[Relation],
):
    pass



In [2]:
import time
from functools import wraps

def retry(n):
    """
    A decorator to retry a function n times upon failure, and print the number of attempts made.
    
    Parameters:
    -----------
    n : int
        The number of retry attempts.
    
    Returns:
    --------
    Callable
        A decorator that retries the decorated function up to n times.
    """
    def decorator_retry(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            attempts = 0
            while attempts < n:
                try:
                    attempts += 1
                    result = func(*args, **kwargs)
                    print(f"Success on attempt {attempts}")
                    return result
                except Exception as e:
                    print(f"Attempt {attempts} failed: {e}")
                    if attempts == n:
                        raise  # Re-raise the exception if it's the last attempt
                    time.sleep(2)  # Optional delay between retries
        return wrapper
    return decorator_retry


In [10]:
def split_text_dp(text, target):
    # Split the input text into lines
    lines = text.split('\n')
    n = len(lines)

    # Precompute the length of each line
    line_lengths = [len(line) for line in lines]

    # Initialize DP array to store the minimum error at each point
    dp = [float('inf')] * (n + 1)
    dp[0] = 0  # No error at the start

    # Initialize an array to keep track of the splits
    splits = [-1] * (n + 1)

    # Dynamic programming to calculate minimum error
    for i in range(1, n + 1):
        total_length = 0
        for j in range(i, 0, -1):
            total_length += line_lengths[j - 1] + 1  # +1 to account for newline

            # If the total_length exceeds the target, no need to continue this split
            if total_length - 1 > target:
                break

            error = abs(total_length - 1 - target)
            if dp[i] > dp[j - 1] + error:
                dp[i] = dp[j - 1] + error
                splits[i] = j - 1

    # Backtrack to find the optimal split positions
    chunks = []
    i = n
    while i > 0:
        j = splits[i]
        chunk = '\n'.join(lines[j:i])
        chunks.append(chunk)
        i = j

    chunks.reverse()  # Reverse the chunks to get the correct order
    return chunks

# Example Usage
text = 'Title\n1\n2\n3\nline1\nline2 is too long\nline3'
target = 15  # Define your target chunk size
result = split_text_dp(text, target)

for idx, chunk in enumerate(result):
    print(f"Chunk {idx + 1}:\n{chunk}\n")


Chunk 1:
line3



In [3]:
def split_text_dp(text, max_length=500):
    """
    Splits the input text into chunks using dynamic programming to minimize the number of chunks.
    The chunks must not exceed max_length and must split only at the end of paragraphs.

    Parameters:
    -----------
    text : str
        The input text to be split.
    max_length : int
        The maximum length of each chunk (default is 500 characters).

    Returns:
    --------
    list of str
        A list of text chunks that do not exceed max_length.
    """

    # Split the text into paragraphs
    paragraphs = text.split('\n')
    n = len(paragraphs)
    
    # Precompute the length of each paragraph
    lengths = [len(paragraph.strip()) for paragraph in paragraphs]
    
    # Initialize the DP table, where dp[i] represents the minimum number of chunks needed for the first i paragraphs
    dp = [float('inf')] * (n + 1)  # Infinite initially
    dp[0] = 0  # No chunks are needed for 0 paragraphs

    # Initialize a backtracking table to store the best split points
    backtrack = [-1] * (n + 1)

    # Fill the DP table
    for i in range(1, n + 1):
        current_length = 0
        for j in range(i, 0, -1):
            current_length += lengths[j - 1] + (2 if j < i else 0)  # Add 2 for newlines between paragraphs
            if current_length > max_length:
                break
            if dp[j - 1] + 1 < dp[i]:
                dp[i] = dp[j - 1] + 1
                backtrack[i] = j - 1

    # Backtrack to recover the chunks
    chunks = []
    i = n
    while i > 0:
        j = backtrack[i]
        chunks.append("\n".join(paragraphs[j:i]).strip())
        i = j

    # Reverse the chunks to get them in the correct order
    chunks.reverse()

    return chunks


# Test the function
text = """
The Liberty Bell Ruby is a sculpture crafted from the world's largest mined ruby, discovered in East Africa in the 1950s. 
It weighs four pounds, is eight and a half thousand carats (8,500), and is sculpted into a miniature form of the Liberty Bell. 

It has 50 diamonds set in it and is valued at $2 million. 

The ruby was created in 1976 for Beverly Hills-based Kazanjian Brothers jewelry company by sculptor Alfonso de Vivanco for the United States Bicentennial. 
It was made in the same spirit as sapphire busts of presidents that the jeweler's charitable foundation presented to the White House when Dwight D. Eisenhower was president.
"""

chunks = split_text_dp(text, max_length=500)
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i}: ({len(chunk)} characters)\n{chunk}\n")


Chunk 1: (466 characters)
The Liberty Bell Ruby is a sculpture crafted from the world's largest mined ruby, discovered in East Africa in the 1950s. 
It weighs four pounds, is eight and a half thousand carats (8,500), and is sculpted into a miniature form of the Liberty Bell. 

It has 50 diamonds set in it and is valued at $2 million. 

The ruby was created in 1976 for Beverly Hills-based Kazanjian Brothers jewelry company by sculptor Alfonso de Vivanco for the United States Bicentennial.

Chunk 2: (173 characters)
It was made in the same spirit as sapphire busts of presidents that the jeweler's charitable foundation presented to the White House when Dwight D. Eisenhower was president.



In [6]:
import os
import google.generativeai as genai
import pandas as pd

# Set up the API key outside the function
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
# Set up the model and tools (ensure 'add_to_database' is accessible)
model = genai.GenerativeModel(
    model_name='models/gemini-1.5-pro-latest',
    tools=[add_to_database]
)

@retry(1)
def extract_chunk(context, chunk, model):
    """
    Extracts knowledge graph triples (subject, predicate, object) from Wikipedia text using the Gemini API.

    Parameters:
    -----------
    wiki_text : str
        The input text from which to extract the knowledge graph.

    Returns:
    --------
    tuple (pd.DataFrame, pd.DataFrame)
        - A pandas DataFrame containing the extracted knowledge graph triples.
        - A pandas DataFrame containing the extracted entities.
    """
    
    
    # Generate the content
    result = model.generate_content(f"""
    # Context
    {context}
    
    # Input Wikipedia Text:
    {wiki_text}
    
    # Instruction:
    Please extract the entities and relations from the following Wikipedia text to the database.
    Ensure that the entities and relationships are accurate.
    """)
    
    # Parse the function call response
    fc = result.candidates[0].content.parts[0].function_call
    data = type(fc).to_dict(fc)

    # Process entities
    entities_data = data['args']['entities']
    df_entity = pd.DataFrame(entities_data)

    # Process triples
    triples_data = []
    for triple in data['args']['triples']:
        triples_data.append({
            'predicate': triple['predicate'],
            'head': triple['head']['entity_name'],
            'tail': triple['tail']['entity_name']
        })

    df_triple = pd.DataFrame(triples_data)

    return df_triple, df_entity


# Test with sample Wikipedia text
wiki_text = """
The Liberty Bell Ruby is a sculpture crafted from the world's largest mined ruby,[1] discovered in East Africa in the 1950s.[2] 
It weighs four pounds, is eight and a half thousand carats (8,500), and is sculpted into a miniature form of the Liberty Bell. 
It has 50 diamonds set in it and is valued at $2 million.

The ruby was created in 1976 for Beverly Hills-based Kazanjian Brothers jewelry company by sculptor Alfonso de Vivanco for the United States Bicentennial.[3]
It was made in the same spirit as sapphire busts of presidents that the jeweler's charitable foundation presented to the White House when Dwight D. Eisenhower was president.
"""
def extract(wiki_text, context, model, chunk_size=500):
    chunks = split_text_dp(wiki_text, max_length=chunk_size)
    all_triple=[]
    all_entity=[]
    for i, chunk in enumerate(chunks, 1):
        print(f"Chunk {i}: ({len(chunk)} characters)\n{chunk}\n")
        # Call the function
        df_triple, df_entity = extract_chunk(context, chunk, model)
        
        # Display the results
        print('### Triples')
        print(df_triple.to_string())
        print('### Entity')
        print(df_entity.to_string())
        all_entity.append(df_entity)
        all_triple.append(df_triple)
    df_all_triple = pd.concat(all_triple, ignore_index=True)
    df_all_entity = pd.concat(all_entity, ignore_index=True)
    return df_all_triple, df_all_entity

df_all_triple, df_all_entity = extract(wiki_text, {'main topic':'The Liberty Bell Ruby '}, model)

Chunk 1: (473 characters)
The Liberty Bell Ruby is a sculpture crafted from the world's largest mined ruby,[1] discovered in East Africa in the 1950s.[2] 
It weighs four pounds, is eight and a half thousand carats (8,500), and is sculpted into a miniature form of the Liberty Bell. 
It has 50 diamonds set in it and is valued at $2 million.

The ruby was created in 1976 for Beverly Hills-based Kazanjian Brothers jewelry company by sculptor Alfonso de Vivanco for the United States Bicentennial.[3]

Success on attempt 1
### Triples
       predicate               head                        tail
0        made of  Liberty Bell Ruby                        Ruby
1  discovered in               Ruby                 East Africa
2  sculpted into  Liberty Bell Ruby                Liberty Bell
3            has  Liberty Bell Ruby                     Diamond
4    created for  Liberty Bell Ruby          Kazanjian Brothers
5     created by  Liberty Bell Ruby          Alfonso de Vivanco
6    created for  

In [7]:
df_all_triple.info()
df_all_triple

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   predicate  13 non-null     object
 1   head       13 non-null     object
 2   tail       13 non-null     object
dtypes: object(3)
memory usage: 440.0+ bytes


Unnamed: 0,predicate,head,tail
0,made of,Liberty Bell Ruby,Ruby
1,discovered in,Ruby,East Africa
2,sculpted into,Liberty Bell Ruby,Liberty Bell
3,has,Liberty Bell Ruby,Diamond
4,created for,Liberty Bell Ruby,Kazanjian Brothers
5,created by,Liberty Bell Ruby,Alfonso de Vivanco
6,created for,Liberty Bell Ruby,United States Bicentennial
7,created from,Liberty Bell Ruby,ruby
8,discovered in,ruby,East Africa
9,sculpted into,Liberty Bell Ruby,Liberty Bell


In [8]:
df_all_entity.info()
df_all_entity

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   description   13 non-null     object
 1   entity_class  20 non-null     object
 2   entity_name   20 non-null     object
dtypes: object(3)
memory usage: 608.0+ bytes


Unnamed: 0,description,entity_class,entity_name
0,A sculpture crafted from the world\'s largest ...,Sculpture,Liberty Bell Ruby
1,A precious gemstone,Gemstone,Ruby
2,A geographical region in Africa,Location,East Africa
3,An iconic symbol of American independence,Historical Artifact,Liberty Bell
4,A precious gemstone,Gemstone,Diamond
5,A Beverly Hills-based jewelry company,Jewelry Company,Kazanjian Brothers
6,,Sculptor,Alfonso de Vivanco
7,The 200th anniversary of the United States Dec...,Historical Event,United States Bicentennial
8,A precious gemstone,Gemstone,Sapphire
9,34th president of the United States,Person,Dwight D. Eisenhower
