<a href="https://colab.research.google.com/github/vifirsanova/AGGILE/blob/main/draft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wikipedia -q

import wikipedia

text = wikipedia.page("Mathematics").content

In [5]:
# Preview only the beginning of the article; echoing all of it floods the cell output.
text[:1000]



In [9]:
!pip install huggingface_hub -q

from huggingface_hub import InferenceClient

TOKEN = 'YOUR TOKEN'
MODEL = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B'
client = InferenceClient(MODEL, token=TOKEN)

In [None]:
N = 10  # number of core entities to extract

# The extraction step targets three kinds of items:
#   - named entities
#   - keywords
#   - concepts
# (Written as comments: a bare string at the end of a cell is echoed as cell output.)

In [13]:
# System prompt for the core-concept request; N sets how many items are asked for.
# NOTE(review): "descibing" is misspelled, but it is part of the text sent to the
# model, so it is reproduced unchanged here.
core_system = (
    f"extract {N} collocations descibing key concepts, "
    "keywords, named entities from the provided source"
)

In [15]:
# Ask the model for the core concepts of the source text.

# Grammar for the reply: a "core_concepts" property holding an array of strings.
core_schema = {"properties": {"core_concepts": {"type": "array", "items": {"type": "string"}}}}

core_response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": core_system},
        {"role": "user", "content": text},
    ],
    response_format={"type": "json", "value": core_schema},
    stream=False,
    max_tokens=1024,
    temperature=0.7,
    top_p=0.1,
)

# Keep only the message content of the first choice; it is parsed in a later cell.
core_concepts = core_response.choices[0].get('message')['content']

In [18]:
import ast
import json

# Parse the model reply into a Python object. The isinstance guard makes the cell safe
# to re-run: once `core_concepts` has been parsed it is left untouched instead of
# being fed to the parser a second time (which raises).
if isinstance(core_concepts, str):
    try:
        # The reply was requested in JSON format, so JSON is the primary parser;
        # it accepts true/false/null, which ast.literal_eval rejects.
        core_concepts = json.loads(core_concepts)
    except json.JSONDecodeError:
        # Fall back to Python-literal parsing for replies that are not strict JSON.
        core_concepts = ast.literal_eval(core_concepts)

In [19]:
# Show the parsed core concepts
core_concepts

{'core_concepts': ['Mathematics',
  'Number Theory',
  'Algebra',
  'Geometry',
  'Calculus',
  'Analysis',
  'Discrete Mathematics',
  'Mathematical Logic',
  'Set Theory',
  'Statistics']}

In [56]:
"""
generate graph network from extracted concepts:
  1. extract related concepts (collocations)
  2. find intersections
  3. generate predicates
  4. build the graph
"""

rel_system = """extract 5-10 most repesentative collocations from the provided source that are related to the provided concept"""

In [57]:
# extract related concepts

def extract_relations(word):
  """Ask the model for collocations in the source text that relate to one concept.

  Args:
    word: the concept to look up; it is sent together with the full `text`.

  Returns:
    The parsed reply. The requested grammar describes a 'related_concepts'
    property holding an array of strings.
  """
  import json  # local import so this cell does not depend on another cell for it

  reply = client.chat.completions.create(
      messages=[
          {"role": "system", "content": rel_system},
          {"role": "user", "content": f"concept = {word}, source = {text}"},
      ],
      response_format={"type": "json",
                       "value": {"properties": {"related_concepts": {"type": "array", "items": {"type": "string"}},}}},
      stream=False,
      max_tokens=512,
      temperature=0.7,
      top_p=0.1,
  ).choices[0].get('message')['content']

  try:
    # JSON was requested, so parse as JSON first (accepts true/false/null).
    return json.loads(reply)
  except json.JSONDecodeError:
    # Keep Python-literal parsing as a fallback for replies that are not strict JSON.
    return ast.literal_eval(reply)

In [58]:
# Query the related collocations of every core concept, keyed by the concept itself.
relations = dict(
    (word, extract_relations(word))
    for word in core_concepts['core_concepts']
)

In [59]:
# Show the related concepts collected for each core concept
relations

{'Mathematics': {'related_concepts': ['Mathematics',
   'Number Theory',
   'Geometry',
   'Algebra',
   'Calculus',
   'Discrete Mathematics',
   'Mathematical Logic',
   'Set Theory',
   'Statistics',
   'Computational Mathematics']},
 'Number Theory': {'related_concepts': ['Number Theory',
   'Mathematics',
   'Algebra',
   'Geometry',
   'Calculus',
   'Analysis',
   'Discrete Mathematics',
   'Mathematical Logic',
   'Set Theory',
   'Statistics']},
 'Algebra': {'related_concepts': ['Mathematics',
   'Algebra',
   'Number Theory',
   'Geometry',
   'Calculus',
   'Discrete Mathematics',
   'Mathematical Logic',
   'Set Theory',
   'Statistics',
   'Computational Mathematics']},
 'Geometry': {'related_concepts': ['Geometry',
   'Mathematics',
   'Shapes',
   'Space',
   'Theorems',
   'Proofs',
   'Euclidean Geometry',
   'Non-Euclidean Geometry',
   'Algebraic Geometry',
   'Differential Geometry']},
 'Calculus': {'related_concepts': ['Mathematics',
   'Calculus',
   'Analysis',
 

In [61]:
"""
generate graph network from extracted concepts:
  1. extract related concepts (collocations) > done
  2. find intersections > done
  3. generate predicates
  4. build the graph
"""

pred_system = """define the relationship between two words: generate a verb or a phrase decribing a relationship between two entities; return a predicate for a knowledge graph triplet"""

In [67]:
# extract predicates

def extract_relations(subj, obj):
  """Ask the model for the predicate linking two entities of a knowledge-graph triplet.

  NOTE(review): this reuses the name of the one-argument `extract_relations(word)`
  defined earlier in the notebook and replaces it once this cell has run. Re-running
  the earlier `relations = ...` cell afterwards requires re-running that earlier
  definition first.

  Args:
    subj: subject entity of the triplet.
    obj: object entity of the triplet.

  Returns:
    The value stored under 'predicate' in the parsed reply.
  """
  import json  # local import so this cell does not depend on another cell for it

  reply = client.chat.completions.create(
      messages=[
          {"role": "system", "content": pred_system},
          {"role": "user", "content": f"what is the relationship between {subj} and {obj}? return a predicate only"},
      ],
      response_format={"type": "json",
                       "value": {"properties": {"predicate": {"type": "string"},}}},
      stream=False,
      max_tokens=512,
      temperature=0.7,
      top_p=0.1,
  ).choices[0].get('message')['content']

  try:
    # JSON was requested, so parse as JSON first (accepts true/false/null).
    parsed = json.loads(reply)
  except json.JSONDecodeError:
    # Keep Python-literal parsing as a fallback for replies that are not strict JSON.
    parsed = ast.literal_eval(reply)
  return parsed['predicate']

In [154]:
# Build one list of subject-predicate-object triplets per core concept.
triplets = dict()

for subj in relations:
  triplets[subj] = list()
  for obj in relations[subj]['related_concepts']:
    # Skip self-references before calling the model, so that no request is spent
    # on a pair that is never stored.
    if subj == obj:
      continue
    triplets[subj].append({'subject': subj,
                           'predicate': extract_relations(subj, obj),
                           'object': obj})

In [155]:
# Show the generated triplets, grouped by subject
triplets

{'Mathematics': [{'subject': 'Mathematics',
   'predicate': 'is a branch of',
   'object': 'Number Theory'},
  {'subject': 'Mathematics',
   'predicate': 'is a branch of',
   'object': 'Geometry'},
  {'subject': 'Mathematics',
   'predicate': 'is a branch of',
   'object': 'Algebra'},
  {'subject': 'Mathematics',
   'predicate': 'is a branch of',
   'object': 'Calculus'},
  {'subject': 'Mathematics',
   'predicate': 'is a branch of',
   'object': 'Discrete Mathematics'},
  {'subject': 'Mathematics',
   'predicate': 'is a branch of',
   'object': 'Mathematical Logic'},
  {'subject': 'Mathematics',
   'predicate': 'is a branch of',
   'object': 'Set Theory'},
  {'subject': 'Mathematics',
   'predicate': 'is a branch of',
   'object': 'Statistics'},
  {'subject': 'Mathematics',
   'predicate': 'is a specialized field of',
   'object': 'Computational Mathematics'}],
 'Number Theory': [{'subject': 'Number Theory',
   'predicate': 'is a branch of',
   'object': 'Mathematics'},
  {'subject': 

In [156]:
import plotly.graph_objects as go
import networkx as nx
from collections import Counter
import random

# Flatten the triplets into (subject, object, predicate) edge tuples
edges = [
    (rel['subject'], rel['object'], rel['predicate'])
    for values in triplets.values()
    for rel in values
]

# Create a networkx graph; nodes are added implicitly with their edges and the
# predicate is stored as the edge label
G = nx.Graph()
for source, target, predicate in edges:
    G.add_edge(source, target, label=predicate)

# Generate positions for nodes using a force-directed layout; the fixed seed keeps
# the positions the same between runs
pos = nx.spring_layout(G, seed=42)

# Extract node data for Plotly
node_labels = list(G.nodes())
node_x = [pos[node][0] for node in node_labels]
node_y = [pos[node][1] for node in node_labels]

# Count connections: every edge tuple contributes one to each of its two endpoints
node_degrees = Counter(node for edge in edges for node in edge[:2])

# Collect line segments and midpoint labels for all edges in a single pass
edge_x = []
edge_y = []
label_x = []
label_y = []
label_text = []

for source, target, predicate in edges:
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    # A None entry breaks the line so consecutive edges are not joined together
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]
    # The predicate label sits at the midpoint of its edge
    label_x.append((x0 + x1) / 2)
    label_y.append((y0 + y1) / 2)
    label_text.append(predicate)

# Create the figure
fig = go.Figure()

# Add edges
fig.add_trace(go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='text',
    mode='lines'
))

# Add nodes with uniform size and labels, coloured by their connection count
fig.add_trace(go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    marker=dict(
        size=25,  # Uniform node size for all nodes
        color=[node_degrees[node] for node in node_labels],
        colorbar=dict(title='Connections')
    ),
    text=node_labels,
    hoverinfo='text',
    textposition='top center',
    textfont=dict(size=13, weight="bold")
))

# Add all predicate labels as one text trace instead of one trace per edge
fig.add_trace(go.Scatter(
    x=label_x, y=label_y,
    mode='text',
    text=label_text,
    textposition='middle center',
    showlegend=False,
    textfont=dict(size=10)
))

# Update layout
fig.update_layout(
    showlegend=False,
    margin=dict(l=0, r=0, t=0, b=0),
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(showgrid=False, zeroline=False),
    title="Force-Directed Graph with Predicate Labels on Nodes"
)

fig.show()

In [157]:
# Export the figure to an HTML file
html_path = "graph_with_predicates.html"
fig.write_html(html_path)