In [1]:
from typing import List, Dict
from tqdm import tqdm

from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator

from langchain.prompts import PromptTemplate
from langchain.schema import BaseOutputParser
from pydantic import BaseModel, ValidationError
from typing import Any, List

from dsm_services.llm.langchain import llmChatModel
# from dask.distributed import Client, LocalCluster

# Shared chat-model client used by the interactive cells below.
# The API key is read from the environment so the secret never lands in the
# notebook source, its outputs, or version control.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated.
import os

llm = llmChatModel(
    model_name="llama3.1:70b",
    api_key=os.environ["DSM_LLM_API_KEY"],  # raises KeyError naming the variable if unset
)

import pandas as pd
# import dask.dataframe as dd
import json
import datetime

# cluster = LocalCluster(ip="0.0.0.0", n_workers=1, threads_per_worker=1)
# client = Client(cluster)
# client

tqdm.pandas()

In [2]:
import time
from functools import wraps

def retry(n):
    """
    Build a decorator that calls a function up to n times until one call succeeds.

    Every call receives the 1-based attempt counter through the extra keyword
    argument ``attempt_number``, so the decorated function must accept it.
    Each outcome (success or failure) is printed.

    Parameters:
    -----------
    n : int
        Maximum number of attempts.

    Returns:
    --------
    Callable
        A decorator. The wrapped function returns the first successful result,
        an empty dict when the n-th attempt also raises, and None when n is
        not positive (the function is then never called).
    """
    def decorator_retry(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, n + 1):
                try:
                    # Hand the attempt counter to the decorated function
                    outcome = func(*args, attempt_number=attempt, **kwargs)
                    print(f"Success on attempt {attempt}")
                    return outcome
                except Exception as err:
                    print(f"Attempt {attempt} failed: {err}")
                    if attempt == n:
                        # Best effort: give up quietly instead of re-raising
                        return {}
        return wrapper
    return decorator_retry

In [3]:
class Entity(BaseModel):
    """
    Details of a single entity in the knowledge graph: its name, its class, and a description of each.

    Attributes:
    -----------
    entity_name : str
        The human-readable name of the entity.
    entity_class : str
        The classification or type of the entity (e.g., Person, Product, etc.).
    entity_description : str
        A brief description of the entity.
    class_description : str
        A brief description of the class or type of the entity.
    """
    entity_name: str
    entity_class: str
    entity_description: str
    class_description: str

class Predicate(BaseModel): 
    """
    Details of a predicate in the knowledge graph: the predicate label and its synonyms.

    Attributes:
    -----------
    label : str
        Predicate label, e.g. "created by", expressing the intended relation between subject and object.
    synonyms : list[str]
        Alternative expressions with the same meaning that can replace the predicate,
        e.g. ["produced by", "developed by", "invented by"] for "created by".
    """
    label: str
    synonyms: list[str]
    
class Relation(BaseModel):
    """
    A relationship (subject-predicate-object triple) in the knowledge graph.

    Attributes:
    -----------
    head : Entity
        The subject or head entity in the relationship.
    predicate : Predicate
        The predicate that links the head to the tail (e.g., "created by").
    tail : Entity
        The object or tail entity in the relationship.
    """
    head: Entity
    predicate: Predicate
    tail: Entity 

In [4]:
from dask.base import tokenize, normalize_token

class EntityRelationOutputParser(BaseOutputParser):
    """
    Turn the raw text returned by the LLM into entity and relation records.

    The response is expected to be a JSON array whose items have the shape of
    a ``Relation`` (``head`` / ``predicate`` / ``tail``).
    """

    def parse(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Parse and validate one LLM response.

        Parameters:
        -----------
        text : str
            Raw model output; a JSON array of relation objects.

        Returns:
        --------
        dict
            ``"entity_df"``: one record per distinct entity name (when a name
            occurs several times the last occurrence wins).
            ``"relations_df"``: one flattened record per relation.
            Both values are lists of plain dicts.

        Raises:
        -------
        ValueError
            If the text cannot be decoded, is not a list of objects, or an
            item fails ``Relation`` validation.
        """
        # Function-scope imports keep this cell runnable on its own.
        import ast
        import json

        # SECURITY: the response is untrusted model output and must never be
        # passed to eval(), which would execute arbitrary code. The prompt asks
        # for JSON, so decode it as JSON; ast.literal_eval is a safe fallback
        # for responses written as a Python literal (single quotes, None, ...).
        try:
            parsed_data = json.loads(text)
        except json.JSONDecodeError:
            try:
                parsed_data = ast.literal_eval(text.strip())
            except (ValueError, SyntaxError, TypeError) as e:
                raise ValueError(f"Parsing error: response is not valid JSON: {e}") from e

        if not isinstance(parsed_data, (list, tuple)):
            raise ValueError(
                f"Parsing error: expected a list of relations, got {type(parsed_data).__name__}"
            )

        try:
            # Validate every item against the Relation schema.
            # TypeError covers items that are not mappings.
            relations = [Relation(**relation) for relation in parsed_data]
        except (ValidationError, TypeError) as e:
            raise ValueError(f"Parsing error: {e}") from e

        # Create unique entity records for head and tail, keyed by name
        entities = {}
        for relation in relations:
            entities[relation.head.entity_name] = relation.head
            entities[relation.tail.entity_name] = relation.tail

        # Build the tables, then hand them back as plain records so the result
        # stays JSON-serialisable for the caller.
        entity_df = pd.DataFrame([entity.dict() for entity in entities.values()])
        relations_df = pd.DataFrame([{
            'subject_name': relation.head.entity_name,
            'subject_name_description': relation.head.entity_description,
            'subject_class': relation.head.entity_class,
            'subject_class_description': relation.head.class_description,
            'predicate': relation.predicate.label,
            'predicate_synonyms': relation.predicate.synonyms,
            'object_name': relation.tail.entity_name,
            'object_name_description': relation.tail.entity_description,
            'object_class': relation.tail.entity_class,
            'object_class_description': relation.tail.class_description
        } for relation in relations])

        return {
            "entity_df": entity_df.to_dict('records'),
            "relations_df": relations_df.to_dict('records')
        }

    def __dask_tokenize__(self):
        # The parser holds no per-instance state, so the token is derived from
        # the class itself.
        return normalize_token(EntityRelationOutputParser)

# Define your prompt template

# Prompt sent to the LLM for every chunk. {context} and {input_text} are the
# two template variables; the doubled braces are escaped literal braces of the
# JSON example. The "synonyms" example is written as a JSON array so that it
# agrees with Predicate.synonyms (list[str]); a model that copies a plain
# string there fails Relation validation.
prompt_template = """
Please extract the entities and relations from the following Wikipedia text to the database.
Ensure that the entities and relationships are accurate.
Given the following input, return the entities and relations as JSON format.

ps. reponse only json data

Context: {context}

Input: {input_text}

Format your output as follows:
[
    {{
        "head": {{
            "entity_name": "name of the head entity",
            "entity_class": "classification of head",
            "entity_description": "brief description",
            "class_description": "brief description of class or type of entity"
        }},
        "predicate": {{
            "label": "relationship description", 
            "synonyms": ["alternative expression 1", "alternative expression 2"]
        }},
        "tail": {{
            "entity_name": "name of the tail entity",
            "entity_class": "classification of tail",
            "entity_description": "brief description",
            "class_description": "brief description of class or type of entity"
        }}
    }}
]

Response:
"""

# Instantiate a PromptTemplate
prompt = PromptTemplate(
    input_variables=["input_text", "context"],
    template=prompt_template
)

# Function to create the full prompt and parse output
@retry(5)
def create_and_parse_output(input_text: str, context: str, llm, attempt_number: int) -> Dict[str, Any]:
    """
    Ask the LLM to extract relations from one text chunk and parse its answer.

    Parameters:
    -----------
    input_text : str
        The text chunk to extract entities and relations from.
    context : str
        Extra context inserted into the prompt (the notebook passes the article URL).
    llm :
        Chat model client; must expose ``invoke(prompt)`` returning an object with ``.content``.
    attempt_number : int
        1-based attempt counter, supplied by the ``retry`` decorator.

    Returns:
    --------
    dict
        The parser result (``"entity_df"`` and ``"relations_df"`` record lists)
        plus ``"attempt_number"``. Because of ``@retry(5)``, callers receive
        an empty dict instead when all five attempts raise.
    """
    # Create the full prompt
    final_prompt = prompt.format(input_text=input_text, context=context)

    # Call the LLM with the prompt and keep only the text of the reply
    response = llm.invoke(final_prompt).content

    # Parse the reply with the custom parser; parsing errors propagate to the
    # retry decorator, which then tries again.
    output = EntityRelationOutputParser().parse(response)
    output['attempt_number'] = attempt_number
    return output

def extract_spo(df):
    """
    Run the LLM extraction on every row and return the frame with a ``spo_data`` column.

    Parameters:
    -----------
    df : pandas.DataFrame
        Must contain the columns ``split_chunk`` (text to analyse) and ``url``
        (passed to the prompt as context).

    Returns:
    --------
    pandas.DataFrame
        A new frame (the input is not modified) with the extra column
        ``spo_data``: the JSON-encoded result of ``create_and_parse_output``
        for each row.

    Raises:
    -------
    RuntimeError
        If the ``DSM_LLM_API_KEY`` environment variable is not set.
    """
    import os  # function-scope import keeps this cell runnable on its own

    # Never hardcode the key: read it from the environment instead.
    api_key = os.environ.get("DSM_LLM_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the DSM_LLM_API_KEY environment variable before calling extract_spo()."
        )

    _llm = llmChatModel(
        model_name="llama3.1:70b",
        api_key=api_key,
    )

    def _extract_row(row):
        # One LLM round trip per row; the result is stored as a JSON string.
        return json.dumps(create_and_parse_output(
            input_text=row['split_chunk'],
            llm=_llm,
            context=row['url']
        ))

    spo_data = df.progress_apply(_extract_row, axis=1)
    # assign() returns a new frame, so a filtered slice passed in by the caller
    # is neither mutated nor the source of a SettingWithCopyWarning.
    return df.assign(spo_data=spo_data)

In [5]:
# Load the article chunk table from parquet, then show its schema and first rows.
df = pd.read_parquet('./output/0_article_chunks.parquet')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20762 entries, 0 to 20761
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   wiki_id      20762 non-null  object
 1   url          20762 non-null  object
 2   chunks       20762 non-null  object
 3   len          20762 non-null  int64 
 4   split_chunk  20762 non-null  object
dtypes: int64(1), object(4)
memory usage: 811.1+ KB


Unnamed: 0,wiki_id,url,chunks,len,split_chunk
0,Q1001313,https://en.wikipedia.org/wiki/Filigree,FiligreeFiligree (also less commonly spelled f...,453,Archaeological finds in ancient Mesopotamia in...
1,Q1001313,https://en.wikipedia.org/wiki/Filigree,FiligreeFiligree (also less commonly spelled f...,496,The Latin words gave filigrana in Italian whic...
2,Q1001313,https://en.wikipedia.org/wiki/Filigree,FiligreeFiligree (also less commonly spelled f...,403,"It was popular as well in Italian, French and ..."
3,Q1001313,https://en.wikipedia.org/wiki/Filigree,FiligreeFiligree (also less commonly spelled f...,496,FiligreeFiligree (also less commonly spelled f...
4,Q1001313,https://en.wikipedia.org/wiki/Filigree,"Egyptian jewellers employed wire, both to lay ...",217,But the feathers and petals of modern Italian ...


In [6]:
# Restrict the run to a small sample of articles.
# Note: the list holds nine URLs, despite the "10" in its name.
art10 = [
    'https://en.wikipedia.org/wiki/Featherwork',
    'https://en.wikipedia.org/wiki/Live_insect_jewelry',
    'https://en.wikipedia.org/wiki/Hymettus',
    'https://en.wikipedia.org/wiki/Tuckahoe_marble',
    'https://en.wikipedia.org/wiki/Kerimasi',
    'https://en.wikipedia.org/wiki/Freshwater_pearl',
    'https://en.wikipedia.org/wiki/Belly_chain',
    'https://en.wikipedia.org/wiki/Temple_ring',
    'https://en.wikipedia.org/wiki/Tahitian_pearl',
]
# .copy() makes the filtered frame independent of the full table, so adding
# columns to it later does not raise pandas' SettingWithCopyWarning.
df = df[df['url'].isin(art10)].copy()
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 63 entries, 2012 to 19014
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   wiki_id      63 non-null     object
 1   url          63 non-null     object
 2   chunks       63 non-null     object
 3   len          63 non-null     int64 
 4   split_chunk  63 non-null     object
dtypes: int64(1), object(4)
memory usage: 3.0+ KB


Unnamed: 0,wiki_id,url,chunks,len,split_chunk
2012,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",116,"Most of the forest is in the north, and much o..."
2013,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",422,It was noted for its thyme honey.[1] Marble ha...
2014,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",286,"In ancient times, the highest point was known ..."
2015,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",317,Hymettus was assigned the status of a protecte...
2016,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",488,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ..."


In [7]:
# Number of chunks per article URL in the filtered sample.
df.url.value_counts()

url
https://en.wikipedia.org/wiki/Tuckahoe_marble        11
https://en.wikipedia.org/wiki/Hymettus               10
https://en.wikipedia.org/wiki/Freshwater_pearl       10
https://en.wikipedia.org/wiki/Tahitian_pearl          8
https://en.wikipedia.org/wiki/Belly_chain             8
https://en.wikipedia.org/wiki/Live_insect_jewelry     6
https://en.wikipedia.org/wiki/Featherwork             4
https://en.wikipedia.org/wiki/Temple_ring             3
https://en.wikipedia.org/wiki/Kerimasi                3
Name: count, dtype: int64

In [8]:
# Summary statistics of the numeric columns (here only `len`).
df.describe()

Unnamed: 0,len
count,63.0
mean,367.206349
std,116.657364
min,44.0
25%,311.5
50%,416.0
75%,451.5
max,493.0


In [9]:
# Smoke test: pick one chunk (position 12 of the filtered frame) and its article URL.
unit_text = df.split_chunk.iloc[12]
url = df.url.iloc[12]
print(unit_text, '\n')

# Run the extraction on that single chunk, using the URL as prompt context.
output = create_and_parse_output(
    input_text=unit_text, 
    llm=llm, 
    context=url
)
# Show the extracted relations as a table.
# Note: when every retry fails, `output` is an empty dict and this lookup raises KeyError.
relations_df = output['relations_df'] 
pd.DataFrame(relations_df)

Temple ringA temple ring is an ornament that hangs from a headdress or in braids of hair in the front area at the side of the head –  i.e.
near a person's anatomical temple.
Temple rings were part of Slavic, Scandinavian and others' medieval women's dress.
Most were made of base metals such as copper alloys or iron, though silver and even gold were occasionally used.
These were known as temple rings because they were worn on the head, near the temples of a woman or a girl. 

Success on attempt 1


Unnamed: 0,subject_name,subject_name_description,subject_class,subject_class_description,predicate,predicate_synonyms,object_name,object_name_description,object_class,object_class_description
0,Temple ring,An ornament that hangs from a headdress or in ...,Ornament,A decorative item,is part of,"[belongs to, is included in]",Slavic women's dress,Medieval women's clothing in Slavic culture,Traditional clothing,Clothing worn for cultural or traditional purp...
1,Temple ring,An ornament that hangs from a headdress or in ...,Ornament,A decorative item,is made of,"[consists of, is composed of]",Base metals,Metals such as copper alloys or iron,Materials,Substances used to make objects
2,Temple ring,An ornament that hangs from a headdress or in ...,Ornament,A decorative item,is worn on,"[is placed on, is located on]",Head,The upper part of the human body,Body part,A part of the human anatomy
3,Temple ring,An ornament that hangs from a headdress or in ...,Ornament,A decorative item,is known as,"[is referred to as, is called]",Temple ring,An ornament that hangs from a headdress or in ...,Ornament,A decorative item


# **Process**

In [10]:
# Distinct article URLs present in the filtered frame.
df.url.unique()

array(['https://en.wikipedia.org/wiki/Hymettus',
       'https://en.wikipedia.org/wiki/Temple_ring',
       'https://en.wikipedia.org/wiki/Tuckahoe_marble',
       'https://en.wikipedia.org/wiki/Featherwork',
       'https://en.wikipedia.org/wiki/Kerimasi',
       'https://en.wikipedia.org/wiki/Freshwater_pearl',
       'https://en.wikipedia.org/wiki/Live_insect_jewelry',
       'https://en.wikipedia.org/wiki/Tahitian_pearl',
       'https://en.wikipedia.org/wiki/Belly_chain'], dtype=object)

In [11]:
# Display the filtered chunk table that is about to be processed.
df

Unnamed: 0,wiki_id,url,chunks,len,split_chunk
2012,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",116,"Most of the forest is in the north, and much o..."
2013,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",422,It was noted for its thyme honey.[1] Marble ha...
2014,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",286,"In ancient times, the highest point was known ..."
2015,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",317,Hymettus was assigned the status of a protecte...
2016,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",488,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ..."
...,...,...,...,...,...
19010,Q811138,https://en.wikipedia.org/wiki/Belly_chain,Belly chainA belly chain or waist chain is the...,430,Belly chainA belly chain or waist chain is the...
19011,Q811138,https://en.wikipedia.org/wiki/Belly_chain,Namboothri men generally wear waist strings ev...,390,"In childcare, they have been included as part ..."
19012,Q811138,https://en.wikipedia.org/wiki/Belly_chain,Namboothri men generally wear waist strings ev...,412,"For cultural reasons, waist chains became a fa..."
19013,Q811138,https://en.wikipedia.org/wiki/Belly_chain,Namboothri men generally wear waist strings ev...,458,Although many boys generally abandon waist cha...


In [12]:
# Run the LLM extraction over every chunk; the result carries a JSON-encoded `spo_data` column.
df_extract_spo = extract_spo(df)

In [13]:
# Persist the raw extraction results so the LLM calls do not have to be repeated.
df_extract_spo.to_parquet('./output/1_df_extract_spo-llm_70b.parquet')

In [14]:
# Display the chunk table together with the new `spo_data` column.
df_extract_spo

Unnamed: 0,wiki_id,url,chunks,len,split_chunk,spo_data
2012,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",116,"Most of the forest is in the north, and much o...","{""entity_df"": [{""entity_name"": ""Hymettus"", ""en..."
2013,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",422,It was noted for its thyme honey.[1] Marble ha...,"{""entity_df"": [{""entity_name"": ""Hymettus"", ""en..."
2014,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",286,"In ancient times, the highest point was known ...","{""entity_df"": [{""entity_name"": ""Hymettus"", ""en..."
2015,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",317,Hymettus was assigned the status of a protecte...,"{""entity_df"": [{""entity_name"": ""Hymettus"", ""en..."
2016,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",488,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...","{""entity_df"": [{""entity_name"": ""Hymettus"", ""en..."
...,...,...,...,...,...,...
19010,Q811138,https://en.wikipedia.org/wiki/Belly_chain,Belly chainA belly chain or waist chain is the...,430,Belly chainA belly chain or waist chain is the...,"{""entity_df"": [{""entity_name"": ""Belly chain"", ..."
19011,Q811138,https://en.wikipedia.org/wiki/Belly_chain,Namboothri men generally wear waist strings ev...,390,"In childcare, they have been included as part ...","{""entity_df"": [{""entity_name"": ""Belly chain"", ..."
19012,Q811138,https://en.wikipedia.org/wiki/Belly_chain,Namboothri men generally wear waist strings ev...,412,"For cultural reasons, waist chains became a fa...","{""entity_df"": [{""entity_name"": ""Waist chains"",..."
19013,Q811138,https://en.wikipedia.org/wiki/Belly_chain,Namboothri men generally wear waist strings ev...,458,Although many boys generally abandon waist cha...,"{""entity_df"": [{""entity_name"": ""Lord Siva"", ""e..."


In [15]:
def extract_relation_df(dataframe):
    """
    Flatten the per-chunk relation lists into one table with one row per relation.

    Parameters:
    -----------
    dataframe : pandas.DataFrame
        Must contain ``relations_data`` (a list of relation records per row)
        and the provenance columns ``wiki_id``, ``url``, ``chunks`` and
        ``split_chunk``.

    Returns:
    --------
    pandas.DataFrame
        The relation records of all rows stacked on top of each other, each
        tagged with the provenance columns of the chunk it came from. The index
        restarts at 0 for every source row. Rows without relations contribute
        nothing; an empty DataFrame is returned when there is nothing to stack.
    """
    provenance_columns = ('wiki_id', 'url', 'chunks', 'split_chunk')

    # Collect the per-row frames and concatenate once at the end; calling
    # pd.concat inside the loop copies the accumulated result on every pass.
    frames = []
    for row in dataframe.to_dict('records'):
        relations = row['relations_data']
        if relations is None or len(relations) == 0:
            # Nothing was extracted for this chunk. Skipping it also keeps the
            # column order independent of where empty rows occur.
            continue
        df_tmp = pd.DataFrame(list(relations))
        for column in provenance_columns:
            df_tmp[column] = row[column]
        frames.append(df_tmp)

    if not frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()
    return pd.concat(frames)

# `spo_data` was written with json.dumps(), so decode it with json.loads():
# eval() would execute the stored text as code and fails on JSON literals such
# as null/true/false. The isinstance guard makes the cell safe to re-run after
# the column has already been decoded.
df_extract_spo['spo_data'] = df_extract_spo['spo_data'].apply(
    lambda x: json.loads(x) if isinstance(x, str) else x
)
# Chunks whose extraction failed on every retry are stored as {} and have no
# 'relations_df' key; treat them as having no relations.
df_extract_spo['relations_data'] = df_extract_spo['spo_data'].apply(
    lambda x: list(x.get('relations_df', []))
)
df_relation = extract_relation_df(df_extract_spo)
df_relation.info()
df_relation.head()

<class 'pandas.core.frame.DataFrame'>
Index: 238 entries, 0 to 2
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   subject_name               238 non-null    object
 1   subject_name_description   238 non-null    object
 2   subject_class              238 non-null    object
 3   subject_class_description  238 non-null    object
 4   predicate                  238 non-null    object
 5   predicate_synonyms         238 non-null    object
 6   object_name                238 non-null    object
 7   object_name_description    238 non-null    object
 8   object_class               238 non-null    object
 9   object_class_description   238 non-null    object
 10  wiki_id                    238 non-null    object
 11  url                        238 non-null    object
 12  chunks                     238 non-null    object
 13  split_chunk                238 non-null    object
dtypes: object(14)
mem

Unnamed: 0,subject_name,subject_name_description,subject_class,subject_class_description,predicate,predicate_synonyms,object_name,object_name_description,object_class,object_class_description,wiki_id,url,chunks,split_chunk
0,Hymettus,A mountain in Greece,Mountain,A natural elevation of the earth's surface,has terrain,"[is composed of, consists of]",Rocky terrain,Terrain characterized by rocks and stones,Geological feature,A natural feature of the earth's surface,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...","Most of the forest is in the north, and much o..."
1,Hymettus,A mountain in Greece,Mountain,A natural elevation of the earth's surface,has terrain,"[is composed of, consists of]",Grassy terrain,Terrain characterized by grass and vegetation,Geological feature,A natural feature of the earth's surface,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...","Most of the forest is in the north, and much o..."
2,Hymettus,A mountain in Greece,Mountain,A natural elevation of the earth's surface,has terrain,"[is composed of, consists of]",Deforested terrain,Terrain characterized by the absence of trees ...,Geological feature,A natural feature of the earth's surface,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...","Most of the forest is in the north, and much o..."
3,Hymettus,A mountain in Greece,Mountain,A natural elevation of the earth's surface,is made of,"[consists of, composed of]",Limestone,A sedimentary rock composed primarily of calci...,Rock type,A natural solid mineral substance,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...","Most of the forest is in the north, and much o..."
0,Hymettus,"A mountain in Greece, known for its thyme hone...",Mountain,A natural elevation of the earth's surface.,is surrounded by,"[is bordered by, has neighboring communities]","Athens, Zografou, Kaisariani, Vyronas, Ilissia...",,Community/City,A populated area with its own administration.,Q1232266,https://en.wikipedia.org/wiki/Hymettus,"HymettusHymettus (/haɪˈmɛtəs/), also Hymettos ...",It was noted for its thyme honey.[1] Marble ha...


In [16]:
# Persist the flattened relation table (one row per extracted triple).
df_relation.to_parquet('./output/1_df_relation.parquet')