## Pip installs

In [None]:
!pip install langchain_experimental

Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting langchain-community<0.4.0,>=0.3.0 (from langchain_experimental)
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<0.4.0,>=0.3.28 (from langchain_experimental)
  Downloading langchain_core-0.3.66-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain<1.0.0,>=0.3.26 (from langchain-community<0.4.0,>=0.3.0->langchain_experimental)
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community<0.4.0,>=0.3.0->langchain_experimental)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community<0.4.0,>=0.3.0->langchain_experimental)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community<0.4.0,>=0

In [None]:
!pip install langchain_openai

Collecting langchain_openai
  Downloading langchain_openai-0.3.25-py3-none-any.whl.metadata (2.3 kB)
Downloading langchain_openai-0.3.25-py3-none-any.whl (69 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.2/69.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain_openai
Successfully installed langchain_openai-0.3.25


In [None]:
!pip install neo4j

Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.28.1-py3-none-any.whl (312 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.3/312.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.28.1


## Imports

In [None]:
import os
import json
import re
from langchain_experimental.graph_transformers import LLMGraphTransformer
import pandas as pd

## OpenAI API

In [None]:
from langchain_openai import ChatOpenAI
from google.colab import userdata

# Copy the OpenAI key from Colab's userdata store into the environment;
# the extractor cell further down reads it back from os.environ.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

# LangChain chat-model handle configured for gpt-4o.
llm = ChatOpenAI(model='gpt-4o')

## Neo4j Graph

In [None]:
from langchain_community.graphs import Neo4jGraph

In [None]:
from google.colab import userdata
from neo4j import GraphDatabase

# Neo4j connection settings exported as environment variables:
# URI and password come from Colab's userdata store, the username is fixed to "neo4j".
os.environ["NEO4J_URI"] = userdata.get('NEO4J_URI')
os.environ["NEO4J_USERNAME"] = "neo4j"
os.environ["NEO4J_PASSWORD"] = userdata.get('NEO4J_PASSWORD')

## Documents' Chunks

In [None]:
# All chunked documents
paths_chunked_docs = [
    "/content/drive/MyDrive/semantic_chunking_output_1747235215211.json",
    "/content/drive/MyDrive/semantic_chunking_output_1747235218693.json",
    "/content/drive/MyDrive/semantic_chunking_output_1747235227239.json",
    "/content/drive/MyDrive/semantic_chunking_output_1747235231233.json",
    "/content/drive/MyDrive/semantic_chunking_output_1747235235321.json",
]

# Just the buying guides
paths_chunked_buying_guides = [
    "/content/drive/MyDrive/semantic_chunking_output_1747767363915.json",
    "/content/drive/MyDrive/semantic_chunking_output_1747767370777.json",
    "/content/drive/MyDrive/semantic_chunking_output_1747767372008.json",
]


def _load_json_files(paths):
    """Parse every JSON file in ``paths`` and return the parsed payloads in the same order."""
    payloads = []
    for path in paths:
        # Read as UTF-8 explicitly instead of relying on the platform default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            payloads.append(json.load(f))
    return payloads


# One entry per file; each entry is whatever that file's JSON document contains.
chunked_docs = _load_json_files(paths_chunked_docs)
chunked_buying_guides = _load_json_files(paths_chunked_buying_guides)

### Chunked docs

In [None]:
chunked_docs[0][0]

{'post_type': 'article',
 'last_publish_date': 1630506033.0,
 'hero_image_url': 'https://article.images.consumerreports.org/prod/content/dam/CRO-Images-2021/Cars/08Aug/CR-Cars-InlineHero-Tesla-Model-S-blue-driving-8-21',
 'authors': '[{name=Keith Barry, id=bdd50ba6-8caf-4524-82d7-8ec8d6ff3c69, title=null}]',
 'feed_date': 1630506033.0,
 'hide_from_feed': False,
 'title': 'NHTSA Safety Defect Investigation of Tesla Autopilot Crashes',
 'description': 'With NHTSA opening a safety defect investigation into Tesla Autopilot crashes, Consumer Reports says the federal safety agency is looking at whether the technology may be a contributing factor in multiple crashes with emergency vehicles.',
 'url': 'https://www.consumerreports.org/autonomous-driving/nhtsa-safety-defect-investigation-tesla-autopilot-crashes-a6996819019/',
 'article_id': 'a6996819019',
 'model_names': [],
 'excerpt_id': 'a6996819019_1',
 'excerpt': 'NHTSA is looking at whether the technology may be a contributing factor in mu

## Data Ingestion

In [None]:
# The inputs are already chunked, so no text splitter runs here.
# For reference, a token-based chunking step would look like:
# text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
# documents = text_splitter.split_documents(raw_documents[:3])

# To process every chunked document instead, use: documents = chunked_docs

documents = chunked_buying_guides
# Each record's 'excerpt' field is what becomes the Document's page_content below.

In [None]:
documents[0][0]

{'post_type': 'buying-guide',
 'last_publish_date': '',
 'hero_image_url': 'https://article.images.consumerreports.org/prod/content/dam/CRO-Images-2023/08August/Cars/Consumer-Reports-Small-Cars-Buying-Guide-Mazda3-0823-crop',
 'authors': '[{name=Jeff S. Bartlett, id=11cdd45d-c903-4ca7-bccb-0d516e8ea4ab, title=null}]',
 'feed_date': '',
 'hide_from_feed': True,
 'title': 'Best Small Car Buying Guide',
 'description': 'Shopping for a small car? Here is what you need to know to find the right small car for you, whether it is a coupe, a sedan, or a hatchback. ',
 'url': 'https://www.consumerreports.org/cars/small-cars/buying-guide/',
 'article_id': 'a1204546685',
 'model_names': [],
 'excerpt_id': 'a1204546685_0',
 'excerpt': 'For many people, a small car makes sense for local driving, such as commuting and running errands, and for navigating the challenges of urban travel and city parking. But with the market shift toward SUVs, there are fewer small cars to choose from. You may question w

### Flattening the documents

In [None]:
from langchain.schema import Document

# Metadata keys: every field of the first record except the excerpt text itself.
json_fields = [*documents[0][0].keys()]
json_fields.remove("excerpt")


def _record_to_document(record):
    """Wrap one chunk record as a Document: 'excerpt' -> page_content, other fields -> metadata."""
    return Document(
        page_content=record["excerpt"],
        metadata={field: record.get(field) for field in json_fields},
    )


# `documents` is a list of per-file lists, so flatten one level while converting.
# (If `documents` were already a flat list of records, iterate over it directly instead.)
flattened_documents = [
    _record_to_document(record)
    for per_file_records in documents
    for record in per_file_records
]

`flattened_documents` is a flat list of LangChain `Document` objects; each element looks like the one below.

In [None]:
flattened_documents[0]

Document(metadata={'post_type': 'buying-guide', 'last_publish_date': '', 'hero_image_url': 'https://article.images.consumerreports.org/prod/content/dam/CRO-Images-2023/08August/Cars/Consumer-Reports-Small-Cars-Buying-Guide-Mazda3-0823-crop', 'authors': '[{name=Jeff S. Bartlett, id=11cdd45d-c903-4ca7-bccb-0d516e8ea4ab, title=null}]', 'feed_date': '', 'hide_from_feed': True, 'title': 'Best Small Car Buying Guide', 'description': 'Shopping for a small car? Here is what you need to know to find the right small car for you, whether it is a coupe, a sedan, or a hatchback. ', 'url': 'https://www.consumerreports.org/cars/small-cars/buying-guide/', 'article_id': 'a1204546685', 'model_names': [], 'excerpt_id': 'a1204546685_0', 'text_fragment': ':~:text=For%20many,buying%20guide.'}, page_content='For many people, a small car makes sense for local driving, such as commuting and running errands, and for navigating the challenges of urban travel and city parking. But with the market shift toward SUV

## LLM Thinking and Reasoning Layer

### Claude Generated

##### Original Prompt - For Processing the documents

In [None]:
# Earlier single-pass prompt that asks the model to return a JSON array directly.
# NOTE(review): ExpertKnowledgeExtractor.create_extraction_prompt builds its own prompt and
# does not reference this variable in the code visible here — confirm it is still needed.
# NOTE(review): the prompt asks for a "percentage" per item, but the JSON structure it
# prescribes has no field for it.
original_prompt = """
You are an expert at identifying valuable consumer advice and expert knowledge from buying guides.

Your task is to analyze the provided buying guide excerpt and its metadata, then extract specific pieces of expert knowledge, advice, or reasoning that would be valuable to consumers.

Look for:
1. **Technical insights** - Specific technical details that require expertise to know
2. **Performance comparisons** - Data-driven comparisons between products or features
3. **Hidden considerations** - Non-obvious factors that affect purchasing decisions
4. **Professional recommendations** - Advice that comes from testing, research, or industry experience
5. **Value assessments** - Insights about price-performance relationships
6. **Usage scenarios** - Specific use cases where certain features matter most
7. **Quality indicators** - How to identify good vs poor quality products
8. **Maintenance/longevity advice** - Information about durability, care, or lifespan

For each piece of expert knowledge you identify, provide:
- The exact text or paraphrased knowledge
- Your reasoning for why this qualifies as expert knowledge
- The category it falls into (from the list above)
- A percentage of the excerpt that you used to get this knowledge. Calculate this by (words in the chunk of excerpt you used / total words in the excerpt)
- A confidence score (1-10) for how valuable this advice is to consumers

Return your response as a JSON array strictly following this structure:
[\
{"reasoning": "Why this qualifies as expert knowledge", \
"expert_knowledge": "The specific advice or knowledge", \
"category": "One of the 8 categories above", \
"confidence_score": 8, "source_context": \
"Brief context of where this appeared in the text"}\
]

Remember, in the above, things like "expert_knowledge" are key in the key:value pair created for a json.
So, remember all the best practices for creating a json file

If no expert knowledge is found, return an empty array [].
"""

##### Organization Prompt(s)

In [None]:
def create_org_prompt_1(text: str):
  """Build the "organize into JSON" user prompt around ``text``.

  Args:
    text: Free-form model output containing the repeated record fields.

  Returns:
    The prompt string, with ``text`` embedded near the top and a literal JSON
    skeleton describing the expected output at the end.
  """

  # The JSON skeleton's braces are doubled ({{ }}) so the f-string emits them literally;
  # a single "{" would be parsed as a replacement field and make the call fail.
  org_prompt_1 = f"""
  The following text largely contains 5 points over and over again.
  {text}

  ## Task:
  You will be given several grouped records, each containing the following fields:
  - "expert_knowledge"
  - "reasoning"
  - "category"
  - "percentage"
  - "confidence_score"

  Your task is to parse these records **in order** and return a dictionary with the following keys:
  - "expert_knowledge"
  - "reasoning"
  - "category"
  - "confidence_score"
  - "source_context"

  Each key should contain a **list** of all the values extracted from each record in order of appearance.

  Note:
  - The "source_context" should be a short string like `"Record 1"`, `"Record 2"`, etc., indicating the origin of each entry.
  - Ignore the "percentage" field.

  ## Expected Output Format:
  ```json
  {{
    "expert_knowledge": [...],
    "reasoning": [...],
    "category": [...],
    "confidence_score": [...],
    "source_context": ["Record 1", "Record 2", ...]
  }}
  """

  return org_prompt_1

In [None]:
SYSTEM_ORG_PROMPT_1 = "You are a expert bullet points extractor and organizor designed to structure points from text into a json."

In [None]:
def create_org_prompt_2(text: str):
  """Return the second-pass "organize into JSON" prompt with ``text`` embedded.

  ``text`` is inserted once, right after the opening sentence; the rest of the
  prompt is fixed instructions followed by a literal JSON skeleton.
  """
  # Plain template filled with str.format; doubled braces render as literal { }.
  template = """
  The following text largely contains 5 points over and over again.
  {text}

  ## Instructions:
  The repeating points will have headings like the following:
  - "expert_knowledge"
  - "reasoning"
  - "category"
  - "percentage"
  - "confidence_score"

  Your task is to parse these records **in order** and return a dictionary with the following keys:
  - "expert_knowledge"
  - "reasoning"
  - "category"
  - "confidence_score"
  - "source_context"

  Each key should contain a **list** of all the values extracted from each batch of five points with the corresponding heading, in order of appearance.


  ## Expected Output Format:
  ```json
  {{
    "expert_knowledge": [...],
    "reasoning": [...],
    "category": [...],
    "confidence_score": [...],
    "source_context": ["Record 1", "Record 2", ...]
  }}
  """

  return template.format(text=text)

In [None]:
SYSTEM_ORG_PROMPT_2 = "You are a expert JSON extractor designed to organize structured insights from text."

#### Helper Functions and Prompt are here

In [None]:
import openai
import pandas as pd
import json
from typing import List, Dict, Any
import time
from langchain.schema import Document

class ExpertKnowledgeExtractor:
    """Extract expert knowledge from buying-guide excerpts using two chat-completion passes.

    Pass 1 asks the model for free-form numbered points about an excerpt.
    Pass 2 asks the model to reorganize that text into a JSON object whose values
    are parallel lists (see ``KNOWLEDGE_FIELDS``).
    """

    # Keys requested from the organizing pass; each is expected to map to a list.
    KNOWLEDGE_FIELDS = ('expert_knowledge', 'reasoning', 'category', 'confidence_score', 'source_context')

    def __init__(self, api_key: str, model: str = "gpt-4o"):
        """Create the OpenAI client.

        Args:
            api_key: OpenAI API key.
            model: Chat model used for both passes; defaults to the previously
                hard-coded "gpt-4o".
        """
        self.client = openai.OpenAI(api_key=api_key)
        self.model = model

    def create_extraction_prompt(self, excerpt: str, metadata: str) -> str:
        """Build the first-pass user prompt embedding the excerpt text and its metadata string."""
        prompt = f"""
You are an expert at identifying valuable consumer advice and expert knowledge from small excerpts of buying guides.

Your task is to analyze the provided buying guide excerpt and its metadata, and extract exactly *three* specific pieces of expert knowledge, advice, or reasoning that would be valuable to consumers.


### Instructions:

Look for:
1. **Technical insights** - Specific technical details that require expertise to know
2. **Performance comparisons** - Data-driven comparisons between products or features
3. **Hidden considerations** - Non-obvious factors that affect purchasing decisions
4. **Professional recommendations** - Advice that comes from testing, research, or industry experience
5. **Value assessments** - Insights about price-performance relationships
6. **Usage scenarios** - Specific use cases where certain features matter most
7. **Quality indicators** - How to identify good vs poor quality products
8. **Maintenance/longevity advice** - Information about durability, care, or lifespan


For each piece of expert knowledge you identify, provide:
1. The exact text or paraphrased knowledge
2. Your reasoning for why this qualifies as expert knowledge
3. A confidence score (1-10) for how valuable this advice is to consumers


### Input Excerpt and Metadata:

Here is the buying guide excerpt and metadata for you to analyze:
**Excerpt:**
{excerpt}

**Metadata:**
{metadata}


### Output Format:

Return your response by delineating it into the following points, in the same order *strictly* following this structure:
1. "expert_knowledge": "The specific advice or knowledge"
2. "reasoning": "Why this qualifies as expert knowledge"
3. "confidence_score": 8

You should always produce *all of the above three* fields for every piece of expert knowledge you identify.

Remember: Focus on actionable, specific advice that demonstrates expertise - not generic statements.
"""

        return prompt

    def _chat(self, system_prompt: str, user_prompt: str, temperature: float):
        """Run one chat completion and return the text of the first choice."""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=2000
        )
        return response.choices[0].message.content

    @staticmethod
    def _parse_json_object(text: str) -> Dict[str, Any]:
        """Parse the outermost ``{...}`` span of ``text`` as JSON.

        Searching for the brace span also discards any Markdown code fence
        wrapped around the JSON.

        Raises:
            ValueError: if no ``{...}`` span is present (json.JSONDecodeError, a
                ValueError subclass, if the span is not valid JSON).
        """
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if not match:
            raise ValueError("JSON block not found in the response.")
        return json.loads(match.group(0))

    def extract_expert_knowledge(self, document: Document) -> Dict[str, Any]:
        """Extract expert knowledge from a single LangChain Document.

        Returns:
            The organized JSON object, expected to look like::

                {
                  "expert_knowledge": [...],
                  "reasoning": [...],
                  "category": [...],
                  "confidence_score": [...],
                  "source_context": ["Record 1", "Record 2", ...]
                }

            An empty dict is returned (after printing a diagnostic) when either
            API call fails or the organized response cannot be parsed.
        """
        doc_id = document.metadata.get('excerpt_id', 'unknown')
        metadata_str = json.dumps(document.metadata, indent=2, default=str)
        prompt = self.create_extraction_prompt(document.page_content, metadata_str)

        # Pass 1: free-form extraction. No response text exists yet if this fails,
        # so the handler must not try to print one.
        try:
            content = self._chat(
                "You are an expert at identifying valuable consumer advice and expert knowledge.",
                prompt,
                temperature=0.3,
            )
        except Exception as e:
            print(f"API error for document {doc_id}: {e}")
            return {}

        if not content:
            # Nothing to organize, so skip the second API call.
            print(f"API error for document {doc_id}: empty response")
            return {}

        # Pass 2: have the model organize the free-form text, then parse the JSON.
        try:
            org_content = self._chat(
                SYSTEM_ORG_PROMPT_2,
                create_org_prompt_2(content,),
                temperature=0,
            )
            return self._parse_json_object(org_content)
        except Exception as e:
            print(f"Error: {e}")
            print(f"Raw response: {content}")
            return {}

    def process_documents(self, documents: List[Document], delay: float = 1.0) -> pd.DataFrame:
        """Process multiple LangChain Documents and return results as a DataFrame.

        Args:
            documents: Documents to process, in order.
            delay: Seconds to sleep after each document (rate limiting); 0 disables it.

        Returns:
            One row per document: selected metadata, the five knowledge columns
            (lists), and a preview of the page content. A document whose
            extraction failed still gets a row, with empty lists in the
            knowledge columns, so one failure does not abort the batch.
        """
        all_results = []

        for i, doc in enumerate(documents):
            print(f"Processing document {i+1}/{len(documents)} (ID: {doc.metadata.get('excerpt_id', 'unknown')})...")

            # Knowledge dictionary ({} when extraction failed)
            expert_knowledge = self.extract_expert_knowledge(doc)

            # Source information for this document
            result = {
                'excerpt_id': doc.metadata.get('excerpt_id', f'doc_{i}'),
                'article_id': doc.metadata.get('article_id', ''),
                'title': doc.metadata.get('title', 'Unknown'),
                'post_type': doc.metadata.get('post_type', ''),
                'url': doc.metadata.get('url', ''),
                'authors': doc.metadata.get('authors', ''),
                'last_publish_date': doc.metadata.get('last_publish_date', ''),
                'description': doc.metadata.get('description', ''),
                'model_names': str(doc.metadata.get('model_names', [])),
            }

            # .get() instead of indexing: a failed extraction returns {} and the model
            # may omit a key. NOTE(review): the first-pass prompt does not request a
            # category, so that key in particular depends on the organizing pass.
            for field in self.KNOWLEDGE_FIELDS:
                result[field] = expert_knowledge.get(field, [])

            # Keep only a 300-character preview of long excerpts.
            if len(doc.page_content) > 300:
                result['page_content'] = doc.page_content[:300] + '...'
            else:
                result['page_content'] = doc.page_content

            all_results.append(result)

            # Rate limiting
            if delay > 0:
                time.sleep(delay)

        return pd.DataFrame(all_results)

In [None]:
five_strings = ["expert_knowledge", "reasoning", "category", "confidence_score", "source_context"]

#### Main Trigger

##### Batches

In [None]:
# Split the flattened excerpts into fixed-size batches so extraction can be run one batch at a time.
# With the 1656 buying-guide excerpts this yields 24 batches of 69 excerpts each.
# The range is derived from the actual list length so no excerpt is dropped (and no
# empty batch is created) if the number of documents changes.
BATCH_SIZE = 69

batches = [
    flattened_documents[start:start + BATCH_SIZE]
    for start in range(0, len(flattened_documents), BATCH_SIZE)
]

##### Functional Code

In [None]:
# Initialize the extractor with the key exported to the environment earlier
extractor = ExpertKnowledgeExtractor(api_key=os.environ["OPENAI_API_KEY"])

# Process a single batch per run; change the index to process a different batch.
print("Starting expert knowledge extraction...")
results_df = extractor.process_documents(batches[23], delay=1.0) # index 23 = batch 24

Starting expert knowledge extraction...
Processing document 1/69 (ID: a6626544326_8809)...
Processing document 2/69 (ID: a6626544326_9471)...
Processing document 3/69 (ID: a6626544326_9935)...
Processing document 4/69 (ID: a6626544326_10601)...
Processing document 5/69 (ID: a6626544326_11290)...
Processing document 6/69 (ID: a6626544326_12781)...
Processing document 7/69 (ID: a6626544326_13233)...
Processing document 8/69 (ID: a6626544326_13762)...
Processing document 9/69 (ID: a6626544326_14134)...
Processing document 10/69 (ID: a8573149733_5)...
Processing document 11/69 (ID: a8573149733_1236)...
Processing document 12/69 (ID: a8573149733_2901)...
Processing document 13/69 (ID: a8573149733_4856)...
Processing document 14/69 (ID: a8573149733_5455)...
Processing document 15/69 (ID: a8573149733_6030)...
Processing document 16/69 (ID: a8573149733_6706)...
Processing document 17/69 (ID: a8573149733_7190)...
Processing document 18/69 (ID: a8573149733_7598)...
Processing document 19/69 (ID:

##### Saving the results_df to csv

In [None]:
## Save the results_df in raw format for human analysis
# index=False: the default integer index would otherwise be written as an extra
# column that comes back as "Unnamed: 0" when the CSV is read again.
results_df.to_csv("gpt4o_expert_reasoning_Batch24.csv", index=False)

##### Display Results

In [None]:
# Display results
if results_df.empty:
    print("No expert knowledge was extracted from the provided documents.")
else:
    # Each row is one excerpt whose 'expert_knowledge' cell holds a list of items,
    # so the number of pieces is the total list length, not the row count.
    total_pieces = sum(
        len(items) if isinstance(items, list) else 1
        for items in results_df['expert_knowledge']
    )
    print(f"\nExtracted {total_pieces} pieces of expert knowledge from {len(results_df)} excerpts:")
    print("=" * 60)

    for _, row in results_df.iterrows():
        print(f"Document: {row['title']}")
        print(f"Excerpt ID: {row['excerpt_id']}")
        print(f"Knowledge: {row['expert_knowledge']}")
        print(f"Reasoning: {row['reasoning']}")
        print(f"Confidence: {row['confidence_score']}/10")
        print("-" * 40)


Extracted 69 pieces of expert knowledge:
Document: Best Water Filter Buying Guide
Excerpt ID: a6626544326_8809
Knowledge: ['Countertop filters are less likely to clog than pitcher filters or faucet-mounted filters, according to CR’s tests.', 'Basic countertop water filter models use carbon filtration, while more costly units may use reverse osmosis or ultraviolet light to guard against more serious contaminants.', 'Countertop filters are handy for renters who might not have permission to make significant plumbing modifications.']
Reasoning: ["This qualifies as expert knowledge because it is based on Consumer Reports' testing, which provides a data-driven performance comparison between different types of water filters. Such insights help consumers understand the reliability and maintenance needs of different filter types.", 'This is a technical insight that explains the differences in filtration technology and their associated costs. It helps consumers make informed decisions based on 

**Experiment's output^^**

Explanation of the output: every excerpt has a list of Knowledge items, a list of Reasoning entries (hopefully corresponding one-to-one with the Knowledge items), a list of Categories (chosen by the LLM itself, hopefully following the same correspondence), and a list of Confidence scores (again, hopefully with the same correspondence).

Explanation of the fields: <br>

1.   Excerpt ID: the **unique id** of the excerpt the knowledge is taken from
2.   Knowledge: the expert insight gathered from the excerpt
3.   Reasoning: the LLM's reasoning for why this counts as expert knowledge
4.   Category: the category the knowledge belongs to, e.g. "Technical Insights"
5.   Confidence: the LLM's self-assessed confidence score for the knowledge




Explanation of the fields(New): <br>

1.   Excerpt ID: the **unique id** of the excerpt the knowledge is taken from
2.   Knowledge: the expert insight gathered from the excerpt
3.   Reasoning: the LLM's reasoning for why this counts as expert knowledge
4.   Confidence: the LLM's self-assessed confidence score for the knowledge


##### Helper Function - Reading output saved dataframe columns as python lists - function name: safe_literal_eval

In [None]:
import pandas as pd
import ast

def safe_literal_eval(val):
    """Safely turn a CSV cell into a Python list.

    Args:
        val: A cell value — typically the string form of a list, but lists,
            missing values and other scalars are accepted too.

    Returns:
        - a list unchanged;
        - ``[]`` for missing values (None/NaN) and for blank or ``'nan'`` strings;
        - the parsed list for a string of the form ``"[...]"``;
        - a single-item list wrapping the value otherwise (including a
          ``"[...]"`` string that cannot be parsed as a literal).
    """
    # Check for a real list first: pd.isna() on a list returns an element-wise
    # array, and using that array in a boolean test raises ValueError.
    if isinstance(val, list):
        return val

    if isinstance(val, str):
        text = val.strip()
        if text in ('', 'nan'):
            return []
        if text.startswith('[') and text.endswith(']'):
            try:
                return ast.literal_eval(text)
            except (ValueError, SyntaxError):
                # Looks like a list but is not a valid literal: keep the raw text.
                return [text]
        # Not in list format: treat as a single item.
        return [text]

    # Scalar missing values (None, NaN, pd.NA, ...) become an empty list.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return []

    # Any other value is wrapped as a single-item list.
    return [val]

# Define the columns that should be read as lists
list_columns = ["expert_knowledge", "reasoning", "category", "confidence_score", "source_context"]

# Create converters dictionary
converters = {col: safe_literal_eval for col in list_columns}

##### Helper Function - Exploding the dataframe entries

In [None]:
## Explodes the results_df dataframe into a better format for human analysis
# TODO: improve handling of rows whose list columns have different element counts
import pandas as pd

def explode_list_columns(df):
    """
    Explode list columns in dataframe to create individual rows for each list element.

    All list columns are exploded together, so element i of every list in a row
    lands on the same output row. Rows whose list columns differ in length are
    reported first; if pandas refuses to explode them, the call falls back to
    explode_list_columns_robust, which pads the shorter lists with None.

    Parameters:
    df (pd.DataFrame): Input dataframe with list columns

    Returns:
    pd.DataFrame: Expanded dataframe with individual rows for each list element
                  (the input dataframe is not modified)
    """

    # Define the list columns to explode
    list_columns = ['expert_knowledge', 'reasoning', 'category', 'confidence_score', 'source_context']

    # Report any rows whose list columns disagree in length (non-list cells count as 0)
    print("Checking list lengths...")
    for idx, row in df.iterrows():
        lengths = [len(row[col]) if isinstance(row[col], list) else 0 for col in list_columns]

        if len(set(lengths)) > 1:  # If lengths are not all the same
            print(f"Row {idx}: Mismatched lengths - {dict(zip(list_columns, lengths))}")

    try:
        # Explode the columns in ONE call. Exploding them one at a time would repeat the
        # still-unexploded lists on every new row and yield a cartesian product
        # instead of aligned rows.
        df_exploded = df.explode(list_columns)

    except Exception as e:
        print(f"Explode failed: {e}")
        print("Falling back to manual method...")
        return explode_list_columns_robust(df)

    # Reset index to get clean sequential indices
    return df_exploded.reset_index(drop=True)

# Alternative robust approach - handles edge cases better
def explode_list_columns_robust(df):
    """
    Expand each input row into one output row per list position.

    For every row, the five knowledge columns are coerced to lists (lists stay
    as they are, missing values become empty lists, any other value becomes a
    one-item list). The row is then repeated once per position of the longest
    list; columns whose list is shorter get None at the missing positions.
    A row whose lists are all empty is emitted once, unchanged.
    """
    list_columns = ['expert_knowledge', 'reasoning', 'category', 'confidence_score', 'source_context']

    def _as_list(value):
        # Lists pass through, missing values collapse to [], scalars are wrapped.
        if isinstance(value, list):
            return value
        if pd.isna(value) or value is None:
            return []
        return [value]

    expanded_rows = []
    for _, row in df.iterrows():
        per_column = {col: _as_list(row[col]) for col in list_columns}
        longest = max(len(items) for items in per_column.values())
        base_record = row.to_dict()

        if longest == 0:
            # Nothing to expand: keep the row exactly as it is.
            expanded_rows.append(base_record)
            continue

        for position in range(longest):
            record = dict(base_record)
            for col, items in per_column.items():
                # Shorter lists are padded with None.
                record[col] = items[position] if position < len(items) else None
            expanded_rows.append(record)

    return pd.DataFrame(expanded_rows)

# Diagnostic function to help identify issues
def diagnose_list_columns(df):
    """
    Print, for every row, the Python type and element count of each list column.

    A list counts as its length, a missing value as 0 and any other value as 1.
    Rows whose columns disagree on the count are flagged as a mismatch.
    Nothing is returned; the report goes to stdout.
    """
    list_columns = ['expert_knowledge', 'reasoning', 'category', 'confidence_score', 'source_context']

    def _element_count(value):
        # How many output rows this cell would contribute when exploded
        if isinstance(value, list):
            return len(value)
        if pd.isna(value) or value is None:
            return 0
        return 1  # Single value

    print("=== LIST COLUMN DIAGNOSIS ===")
    for idx, row in df.iterrows():
        print(f"\nRow {idx}:")

        types = {col: type(row[col]).__name__ for col in list_columns}
        lengths = {col: _element_count(row[col]) for col in list_columns}

        print(f"  Types: {types}")
        print(f"  Lengths: {lengths}")

        # A single distinct count means every column lines up
        distinct_counts = set(lengths.values())
        if len(distinct_counts) <= 1:
            print(f"  ✅ All lengths match: {list(distinct_counts)[0]}")
        else:
            print("  ⚠️  MISMATCH: Different lengths detected!")

# Usage examples:
# Step 1: Diagnose issues first
# diagnose_list_columns(your_dataframe)

# Step 2: Use the appropriate function
# df_expanded = explode_list_columns_robust(your_dataframe)  # More robust version

##### Better formatted expert knowledge dataframe

In [None]:
# Expand results_df so that every extracted expert-knowledge item gets its own row.
# results_df is loaded by a read_csv cell that appears later in this notebook, so that cell must be run first.
df_expanded = explode_list_columns_robust(results_df)  # More robust version

In [None]:
# Save the expanded frame to disk.
# NOTE(review): the target name is spelled "reasnoning" and has no ".csv" extension - confirm this is the intended file name.
df_expanded.to_csv("gpt4o_expert_reasnoning_df_Experiment")

Unnamed: 0.1,Unnamed: 0,excerpt_id,article_id,title,post_type,url,authors,last_publish_date,description,model_names,expert_knowledge,reasoning,category,confidence_score,source_context,page_content
0,0,a1204546685_0,a1204546685,Best Small Car Buying Guide,buying-guide,https://www.consumerreports.org/cars/small-car...,"[{name=Jeff S. Bartlett, id=11cdd45d-c903-4ca7...",,Shopping for a small car? Here is what you nee...,[],The market shift toward SUVs has reduced the n...,This insight highlights a significant market t...,Hidden considerations,9,Record 1,"For many people, a small car makes sense for l..."
1,0,a1204546685_0,a1204546685,Best Small Car Buying Guide,buying-guide,https://www.consumerreports.org/cars/small-car...,"[{name=Jeff S. Bartlett, id=11cdd45d-c903-4ca7...",,Shopping for a small car? Here is what you nee...,[],Subcompact and compact cars are considered sma...,Breaking down ratings into subcategories provi...,Performance comparisons,8,Record 2,"For many people, a small car makes sense for l..."
2,0,a1204546685_0,a1204546685,Best Small Car Buying Guide,buying-guide,https://www.consumerreports.org/cars/small-car...,"[{name=Jeff S. Bartlett, id=11cdd45d-c903-4ca7...",,Shopping for a small car? Here is what you nee...,[],Cars like the Honda Civic and Mazda3 have incr...,Understanding the evolution of car sizes and f...,Technical insights,7,Record 3,"For many people, a small car makes sense for l..."
3,1,a2608303031_0,a2608303031,Best Convertible Buying Guide,buying-guide,https://www.consumerreports.org/cars/convertib...,"[{name=Jeff S. Bartlett, id=11cdd45d-c903-4ca7...",,Shopping for a convertible? Here is what you n...,[],Modern convertibles require far fewer compromi...,This information highlights technical advancem...,Technical insights,9,Record 1,Driving a convertible on a beautiful day is an...
4,1,a2608303031_0,a2608303031,Best Convertible Buying Guide,buying-guide,https://www.consumerreports.org/cars/convertib...,"[{name=Jeff S. Bartlett, id=11cdd45d-c903-4ca7...",,Shopping for a convertible? Here is what you n...,[],Today's soft tops come with glass rear windows...,This detail about glass rear windows provides ...,Technical insights,8,Record 2,Driving a convertible on a beautiful day is an...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,68,a4218216331_3397,a4218216331,Best Coffee Buying Guide,buying-guide,https://www.consumerreports.org/health/coffee/...,[],,"Shopping for coffee? Read about types, feature...",[],AADenotes the second-largest beans on a Kenyan...,This provides specific technical insight into ...,Technical insights,9,Record 1,Features\nWith the popularity of coffee rising...
245,68,a4218216331_3397,a4218216331,Best Coffee Buying Guide,buying-guide,https://www.consumerreports.org/health/coffee/...,[],,"Shopping for coffee? Read about types, feature...",[],Caffeine content in coffee can vary greatly de...,Understanding the variability in caffeine cont...,Technical insights,8,Record 2,Features\nWith the popularity of coffee rising...
246,68,a4218216331_3397,a4218216331,Best Coffee Buying Guide,buying-guide,https://www.consumerreports.org/health/coffee/...,[],,"Shopping for coffee? Read about types, feature...",[],Decaffeinated coffee processing affects flavor...,The impact of decaffeination on flavor is a hi...,Hidden considerations,8,Record 3,Features\nWith the popularity of coffee rising...
247,68,a4218216331_3397,a4218216331,Best Coffee Buying Guide,buying-guide,https://www.consumerreports.org/health/coffee/...,[],,"Shopping for coffee? Read about types, feature...",[],Fair Trade Certified coffee supports sustainab...,Fair Trade certification involves professional...,Professional recommendations,9,Record 4,Features\nWith the popularity of coffee rising...


In [None]:
# Read the CSV with converters
# `converters` is built from safe_literal_eval in the helper cell further down; it parses the list columns back into Python lists.
results_df = pd.read_csv('/content/gpt4o_expert_reasoning_df_Experiment1.csv', converters=converters)

**Some rows have list fields of different lengths (see the check below)**

In [None]:
# Collect the index of every row whose five list fields do not all hold the same number of elements
skip_idxes = []
for index, row in results_df.iterrows():
  lengths = [
      len(row[field])
      for field in ("expert_knowledge", "reasoning", "category", "confidence_score", "source_context")
  ]

  # More than one distinct length means the fields are out of step for this row
  if len(set(lengths)) > 1:
    skip_idxes.append(index)

##### Reading results_df from saved csv

In [None]:
# Reload the saved results; `converters` applies safe_literal_eval to the list columns so they come back as Python lists.
results_df = pd.read_csv("/content/gpt4o_expert_reasoning_df_Experiment1.csv", converters=converters)

##### Analyses of the LLM output - for better prompting

In [None]:
# Calculating the rows with a mismatch in the length of the lists of the three fields
# (expert_knowledge, reasoning, confidence_score)
skip_idxes = []
for index, row in results_df.iterrows():
  lengths = [
      len(row.expert_knowledge),
      len(row.reasoning),
      len(row.confidence_score),
  ]

  if len(set(lengths)) != 1:
    skip_idxes.append(index)

# Report against the actual number of rows instead of a hard-coded total
print(f"The indices with a mismatch in the count of the elements of the fields: {len(skip_idxes)}, out of {len(results_df)}")

The indices with a mismatch in the count of the elements of the fields: 0, out of 69


In [None]:
# Per-row summary: number of elements in each list field, followed by a separator line
for index, row in results_df.iterrows():
  for label, field in (
      ("Expert Knowledge", "expert_knowledge"),
      ("Reasoning", "reasoning"),
      ("Category", "category"),
      ("Confidence Score", "confidence_score"),
      ("Source Context", "source_context"),
  ):
    print(f"# of elements in {label}: {len(row[field])}")
  print("-" * 40)

# of elements in Expert Knowledge: 3
# of elements in Reasoning: 3
# of elements in Category: 3
# of elements in Confidence Score: 3
# of elements in Source Context: 3
----------------------------------------
# of elements in Expert Knowledge: 6
# of elements in Reasoning: 6
# of elements in Category: 6
# of elements in Confidence Score: 6
# of elements in Source Context: 6
----------------------------------------
# of elements in Expert Knowledge: 5
# of elements in Reasoning: 5
# of elements in Category: 5
# of elements in Confidence Score: 5
# of elements in Source Context: 5
----------------------------------------
# of elements in Expert Knowledge: 6
# of elements in Reasoning: 6
# of elements in Category: 6
# of elements in Confidence Score: 6
# of elements in Source Context: 6
----------------------------------------
# of elements in Expert Knowledge: 4
# of elements in Reasoning: 4
# of elements in Category: 4
# of elements in Confidence Score: 4
# of elements in Source Context

In [None]:
# Calculate a representative list length for every row
# Working assumption - when the fields of a row disagree, take the most frequently occurring length (mode)
# across the five fields and treat it as the length for that row
import statistics

mode_lengths = []
for index, row in results_df.iterrows():
  lengths = [
      len(row.expert_knowledge),
      len(row.reasoning),
      len(row.category),
      len(row.confidence_score),
      len(row.source_context),
  ]

  mode_length = statistics.mode(lengths)
  mode_lengths.append(mode_length)

In [None]:
# Average of the mode lengths
# (raises ZeroDivisionError when mode_lengths is empty)
sum(mode_lengths) / len(mode_lengths)

3.0

##### Making the Prompt Better

Pass 1 - stats:
The indices with a mismatch in the count of the elements of the fields: 12, out of 69 <br>
Average length of the lists: 3.59 <br>

WORKING ASSUMPTION: Limit each excerpt to three expert-knowledge chunks. For excerpts where the model previously extracted fewer than three (e.g. only one), we can sort those out with top-k methods based on their confidence scores.

Pass 2 - stats: The indices with a mismatch in the count of the elements of the fields: 0, out of 69 <br>
Average length of the lists: 3 (especially prompted to be so)



**MAKING THE PROMPT BETTER:**


1.   Fewer fields - instead of five, ask for just three
2.   Limit the number of items it should extract from an excerpt



### Gemini Generated Prompt - Not using

In [None]:
from langchain.prompts import ChatPromptTemplate

# Two-message chat template: a system message describing the classification task and the
# expected JSON shape, and a human message carrying the batch of excerpts.
# The doubled braces {{ }} are literal braces in the rendered prompt; {excerpts} is the
# only template variable.
extraction_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are an AI assistant trained to identify and extract expert knowledge, reasoning, and advice from text excerpts from buying guides of consumer products.
For each excerpt provided, analyze the content and the metadata to determine if it contains specific expert knowledge, logical reasoning, or actionable advice related to the topic of the document.
If you find such content, clearly identify it and explain your reasoning for classifying it as expert knowledge, reasoning, or advice based on the language, context, and structure of the text.
If an excerpt does not contain clear expert knowledge, reasoning, or advice, state that the content is primarily descriptive or informational and explain why you think so.
Format your output as a JSON object with the following structure:
{{
  "analysis": [
    {{
      "excerpt": "The text excerpt...",
      "type": "Expert Knowledge" | "Reasoning" | "Advice" | "Informational",
      "identified_content": "The specific sentence(s) or phrase(s) identified.",
      "llm_reasoning": "Explanation of why this was classified as the given type."
    }},
    ...
  ]
}}
Ensure the JSON is valid and contains an array of analysis results for each provided excerpt."""),
    ("human", "Analyze the following document excerpts:\n{excerpts}")
])

In [None]:
import json

async def analyze_excerpts_with_llm(excerpts, llm_model, batch_size=10):
    """
    Analyzes document excerpts in batches using an LLM to identify expert knowledge,
    reasoning, and advice.

    Each batch is serialised to JSON, substituted into the module-level
    `extraction_prompt`, and sent to the model. Batches whose response is not
    valid JSON, or not a JSON object, are reported on stdout and skipped.

    Args:
        excerpts (list): A list of strings, where each string is a document excerpt.
        llm_model: The initialized Langchain LLM model (e.g., ChatOpenAI).
        batch_size (int): The number of excerpts to process in each batch.

    Returns:
        list: A list of dictionaries, each containing the analysis for an excerpt.
    """
    all_analysis_results = []
    for i in range(0, len(excerpts), batch_size):
        batch = excerpts[i:i + batch_size]
        # Format the batch of excerpts for the prompt
        formatted_batch = json.dumps(batch)

        # `invoke` is synchronous and returns a message object that cannot be awaited;
        # `ainvoke` is the awaitable counterpart.
        response = await llm_model.ainvoke(extraction_prompt.format(excerpts=formatted_batch))

        # Attempt to parse the JSON response
        try:
            analysis_results = json.loads(response.content)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON response for batch starting at index {i}: {e}")
            print("Raw response content:", response.content)
            # The LLM might not return perfect JSON; skip this batch and keep going
            continue

        if isinstance(analysis_results, dict):
            all_analysis_results.extend(analysis_results.get("analysis", []))
        else:
            # Valid JSON but not the requested {"analysis": [...]} object
            print(f"Unexpected JSON structure for batch starting at index {i}: expected an object with an 'analysis' key")
            print("Raw response content:", response.content)

    return all_analysis_results

In [None]:
# Assuming flattened_documents is already loaded as per your notebook code
# Extract the excerpt text from the Document objects
excerpt_texts = [doc.page_content for doc in flattened_documents]

# Run the analysis
# This is an asynchronous function, so you need to await it if running in a script
# In a Colab notebook cell, you can use await directly.
analysis_results = analyze_excerpts_with_llm(excerpt_texts, llm)

# Now 'analysis_results' contains the analysis for each excerpt
# You can process or display the results
for result in analysis_results:
    print(f"Excerpt: {result['excerpt'][:100]}...") # Print first 100 chars
    print(f"Type: {result['type']}")
    print(f"Identified Content: {result['identified_content']}")
    print(f"LLM Reasoning: {result['llm_reasoning']}")
    print("-" * 50)

TypeError: object AIMessage can't be used in 'await' expression

## Exploratory Graph Analysis - Functions

### run_query

In [None]:
def run_query(driver, query):
    """
    Execute `query` in a new session of `driver` and return the records as a DataFrame.

    Each record becomes one row; the column names are the keys reported by the result.
    """
    with driver.session() as session:
        result = session.run(query)
        rows = []
        for record in result:
            rows.append(record.values())
        column_names = result.keys()
        return pd.DataFrame(rows, columns=column_names)

## Graph Construction

### Config - LLMGraphTransformer

In [None]:
from langchain.callbacks import StdOutCallbackHandler
from langchain_core.runnables import RunnableConfig

# Shared run configuration, passed as `config=` to the graph-transformer calls further down.
# The stdout callback handler is what prints the "Entering new ... chain" / "Finished chain." trace;
# the tag labels these runs as graph creation.
config = RunnableConfig(
    callbacks=[StdOutCallbackHandler()],
    tags=["graph_creation"]
)

### LLM Extraction of KG

#### Prompt(s)

##### Default Prompt(s) for LLMGraphTransformer

In [None]:
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Transformer built without a custom prompt, used here only to inspect the library's default prompt
llm_transformer = LLMGraphTransformer(
    llm=llm,
)
# Display the underlying chain; its output starts with the default ChatPromptTemplate
llm_transformer.chain

ChatPromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='# Knowledge Graph Instructions for GPT-4\n## 1. Overview\nYou are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.\nTry to capture as much information from the text as possible without sacrificing accuracy. Do not add any information that is not explicitly mentioned in the text.\n- **Nodes** represent entities and concepts.\n- The aim is to achieve simplicity and clarity in the knowledge graph, making it\naccessible for a vast audience.\n## 2. Labeling Nodes\n- **Consistency**: Ensure you use available types for node labels.\nEnsure you use basic or elementary types for node labels.\n- For example, when you identify an entity representing a person, always label it as **\'person\'**. Avoid using more specific terms like \

In [None]:
# Default System Prompt
# steps[0] of the chain is the prompt template; messages[0] is its system message
deafult_system_prompt = llm_transformer.chain.steps[0].messages[0].prompt.template
print(deafult_system_prompt)

# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
Try to capture as much information from the text as possible without sacrificing accuracy. Do not add any information that is not explicitly mentioned in the text.
- **Nodes** represent entities and concepts.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it
accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use available types for node labels.
Ensure you use basic or elementary types for node labels.
- For example, when you identify an entity representing a person, always label it as **'person'**. Avoid using more specific terms like 'mathematician' or 'scientist'.- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
- **Relationships** represent connections between entities or concep

In [None]:
# Default Human Prompt
# messages[1] of the same prompt template is the human message; it contains the {input} variable
default_human_prompt = llm_transformer.chain.steps[0].messages[1].prompt.template
print(default_human_prompt)

 Tip: Make sure to answer in the correct format and do not include any explanations. Use the given format to extract information from the following input: {input}


#### Fetching Expert Knowledge and the Reasoning dataframes for the prompt

In [None]:
import pandas as pd

##### Helper Function - Reading output saved dataframe columns as python lists - function name: safe_literal_eval

In [None]:
import pandas as pd
import ast

def safe_literal_eval(val):
    """
    Convert a CSV cell into a Python list without raising on bad input.

    Args:
        val: The raw cell value - usually the string form of a list
            (e.g. "['a', 'b']"), but lists, scalars and missing values are
            accepted as well.

    Returns:
        list: The value itself if it already is a list; the parsed list for a
        string in ``[...]`` form; ``[]`` for missing values, empty or
        whitespace-only strings and the string ``'nan'``; otherwise a
        single-item list wrapping the (stripped) value. A ``[...]`` string that
        cannot be parsed is returned as a single-item list holding that string.
    """
    # Lists are checked first: pd.isna() on a list returns an element-wise array,
    # whose truth value is ambiguous for more than one element.
    if isinstance(val, list):
        return val

    if not isinstance(val, str):
        # Only scalars can be "missing"; anything else is wrapped as a single item
        if pd.api.types.is_scalar(val) and pd.isna(val):
            return []
        return [val]

    # Remove extra whitespace before looking at the content
    text = val.strip()
    if text in ('', 'nan'):
        return []

    if text.startswith('[') and text.endswith(']'):
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # If parsing fails, return as single-item list
            return [text]

    # If it's not in list format, return as single-item list
    return [text]

# Define the columns that should be read as lists
list_columns = ["expert_knowledge", "reasoning", "category", "confidence_score", "source_context"]

# Create converters dictionary
# (maps every list column to safe_literal_eval; passed as `converters=` to pd.read_csv)
converters = {col: safe_literal_eval for col in list_columns}

##### Other Stuff

In [None]:
# So that Knowledge and Reasoning df(s) are read in lists instead of strings
# (this cell repeats the list_columns / converters definitions from the helper cell; it needs safe_literal_eval to exist)
# Define the columns that should be read as lists
list_columns = ["expert_knowledge", "reasoning", "category", "confidence_score", "source_context"]

# Create converters dictionary
converters = {col: safe_literal_eval for col in list_columns}

In [None]:
# Every column expected in the knowledge/reasoning CSVs
all_columns_in_the_knowledge_reasoning_df = ['excerpt_id', 'article_id', 'title', 'post_type', 'url', 'authors',
       'last_publish_date', 'description', 'model_names', 'expert_knowledge',
       'reasoning', 'category', 'confidence_score', 'source_context',
       'page_content']

# Columns carried forward (each listed once); everything else ends up in columns_to_drop
columns_to_keep = ['expert_knowledge', 'reasoning', 'excerpt_id', 'article_id', 'confidence_score']
columns_to_drop = [col for col in all_columns_in_the_knowledge_reasoning_df if col not in columns_to_keep]

In [None]:
# Load the first two batches of LLM-extracted expert knowledge / reasoning.
# index_col=0 uses the first CSV column as the index; `converters` applies safe_literal_eval to the list columns.
knowledge_reasoning_df_Batch1 = pd.read_csv("/content/drive/MyDrive/Copy of gpt4o_expert_reasoning_Batch1.csv", index_col=0, converters=converters)
knowledge_reasoning_df_Batch2 = pd.read_csv("/content/drive/MyDrive/Copy of gpt4o_expert_reasoning_Batch2.csv", index_col=0, converters=converters)

##### Exploding Columns

In [None]:
# Exploding the list columns to better pass the data to the extract prompt function
# Only these three columns are exploded together; category and source_context stay as lists.
# NOTE(review): exploding several columns at once presumably requires equal list lengths per row - confirm against the mismatch check above.
list_columns_to_explode = ['expert_knowledge', 'reasoning', 'confidence_score']
exploded_knowledge_reasoning_df_Batch1 = knowledge_reasoning_df_Batch1.explode(list_columns_to_explode)
exploded_knowledge_reasoning_df_Batch2 = knowledge_reasoning_df_Batch2.explode(list_columns_to_explode)

In [None]:
# Removing unnecessary columns from the exploded dataframes.
# Reassigning with errors="ignore" (instead of an in-place drop) keeps this cell re-runnable:
# a second run would otherwise raise KeyError because the columns are already gone.
exploded_knowledge_reasoning_df_Batch1 = exploded_knowledge_reasoning_df_Batch1.drop(columns=columns_to_drop, errors="ignore")
exploded_knowledge_reasoning_df_Batch2 = exploded_knowledge_reasoning_df_Batch2.drop(columns=columns_to_drop, errors="ignore")

#### Adding the Expert Knowledge, Reasoning & Confidence Score as Metadata

In [None]:
# "krc" = Knowledge, Reasoning and Confidence score (in that order).
# Build copies of the source documents whose metadata also carries the krc values of the matching excerpt.
import copy

## Only the first two batches are read - for now.
# The non-exploded frames are used here, so a matching row is looked up per excerpt_id.
krc_1 = knowledge_reasoning_df_Batch1
krc_2 = knowledge_reasoning_df_Batch2
krc_dfs = [krc_1, krc_2]

excerpt_ids = []
docs_to_convert_to_kg = []
for doc in flattened_documents:
  excerpt_id = doc.metadata["excerpt_id"]
  for krc in krc_dfs:
    # Skip batches that do not contain this excerpt
    if excerpt_id not in krc["excerpt_id"].values:
      continue
    excerpt_ids.append(excerpt_id)

    krc_series = krc.loc[krc["excerpt_id"] == excerpt_id, ["expert_knowledge", "reasoning", "confidence_score"]]
    doc_dummy = copy.deepcopy(doc)  # leave the original document untouched
    # Take the value from the first matching row of the batch
    for field in ("expert_knowledge", "reasoning", "confidence_score"):
      doc_dummy.metadata[field] = krc_series[field].values[0]

    docs_to_convert_to_kg.append(doc_dummy)


In [None]:
# docs_to_convert_to_kg holds the documents matched against the first two batches (their ids are in excerpt_ids)
# Sanity check: show the reasoning metadata attached to the first document
docs_to_convert_to_kg[0].metadata["reasoning"]

### Experiment: With Knowledge & Reasoning as metadata -  no prompt

In [None]:
# Creating and adding to the graph
# NOTE(review): Neo4jGraph is called with no arguments, so the connection details presumably come from the environment - confirm they are set before running.
graph = Neo4jGraph()

  graph = Neo4jGraph()


In [None]:
# Convert the metadata-enriched documents to graph documents with whatever prompt llm_transformer currently holds.
# llm_transformer must already exist in the kernel: the recorded run of this cell failed with a NameError because it did not.
no_prompt_just_metadata_batch = await llm_transformer.aconvert_to_graph_documents(docs_to_convert_to_kg, config=config)

NameError: name 'llm_transformer' is not defined

In [None]:
# Storing to neo4j
# NOTE(review): baseEntityLabel / include_source presumably add a shared base label to entity nodes and
# link them to their source document - confirm in the Neo4jGraph.add_graph_documents documentation.
graph.add_graph_documents(
    no_prompt_just_metadata_batch,
    baseEntityLabel=True,
    include_source=True)

### Experiment: With Prompt - Enhanced Documents

In [None]:
# Creating a graph connection for this experiment (rebinds `graph` from the previous experiment)
graph = Neo4jGraph()

## Verify in Neo4j (by running a query) whether this actually points at a fresh graph instance

  graph = Neo4jGraph()


#### System Prompt Template

In [None]:
def extract_system_prompt():
  """
  Return the system prompt for knowledge-graph extraction from buying guides.

  The prompt is assembled from plain lines so that the model receives clean
  Markdown: headings start at the beginning of a line and no source-code
  quoting characters leak into the text.

  Returns:
      str: The system prompt. It contains no template variables.
  """
  prompt_lines = [
      "## 1. Overview",
      "You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.",
      "Try to capture as much information of products from these articles and buying guides as possible without sacrificing accuracy.",
      "Do not add any information that is not explicitly mentioned in the text.",
      "",
      "- **Nodes** represent entities and concepts.",
      "- The aim is to mirror expert reasoning present in these articles and buying guides through this knowledge graph.",
      "",
      "## 2. Labeling Nodes",
      "- **Consistency**: Ensure you use available types for node labels.",
      "",
      "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.",
      "",
      "- **Relationships** represent connections between entities or concepts.",
      "Ensure consistency and generality in relationship types when constructing the knowledge graphs.",
      "Try to focus on getting relationships that indicated expert reasoning, advice or information such as 'INCREASES_RISK_OF', 'SIGNIFICANTLY_REDUCES'.",
      "Try to avoid general relationship types - eg.: entity1 'CONTAINS' entity2, is a very general relationship and should be avoided.",
      "Make sure to use general and timeless relationship types!",
      "",
      "## 3. Factual Compliance",
      "- All the information from the text is objective and contain facts. You should not add any information to it.",
  ]

  base_system_prompt = "\n".join(prompt_lines)

  return base_system_prompt


#### User Prompt Template

In [None]:
def extract_user_prompt():
  """
  Return the human-message template for the enhanced-document experiment.

  The template exposes exactly one variable, {input}, the same variable the
  default LLMGraphTransformer human prompt uses, so the document text is
  substituted under "INPUT TEXT TO ANALYZE". The enhanced documents built by
  prepare_documents_for_kg_prompt already embed the AI-generated guidance in
  their page content, which is why the guidance heading here has no body.

  Returns:
      str: A prompt template string (not an f-string result) with the single
      placeholder {input}.
  """
  prompt_lines = [
      "Your task is to extract entities and relationships from the given excerpt of text."
      " You should take hints about what kind of nodes and relationships you should be extracting"
      " from this excerpt from the text under the EXPERT KNOWLEDGE AND REASONING GUIDANCE section;"
      " Remember everything under this is AI generated and thus should NOT be used as exact facts.",
      "",
      "",
      "### INPUT TEXT TO ANALYZE:",
      # Template variable filled in by the graph transformer with the document text
      "{input}",
      "",
      "",
      "### EXPERT KNOWLEDGE AND REASONING GUIDANCE (AI-Generated, Use with Caution):",
      "",
      "",
      "",
      "### EXTRACTION INSTRUCTIONS:",
      "1. **Primary Focus**: Extract entities and relationships directly evident in the text",
      "2. **Secondary Guidance**: Use the AI insights above to identify potentially important patterns. Use these to get insights into what relationships to extract.",
      "3. **Confidence Weighting**: Give more attention to higher-confidence insights (7+ scores)",
      "4. **Verification**: If insights contradict from what you observe in the text, trust the text.",
      "",
      "",
      "Tip: Make sure to answer in the correct format and do not include any explanations.",
  ]

  user_prompt = "\n".join(prompt_lines)

  return user_prompt

#### Preparing enhanced documents for the Prompt

In [None]:
# Combine content with relevant metadata context

def prepare_documents_for_kg_prompt(documents, max_units=3):
  """
  Build "enhanced" documents whose page content combines the excerpt with its
  AI-generated expert-knowledge guidance and the extraction instructions.

  Args:
      documents: Documents whose metadata holds the parallel lists
          "expert_knowledge", "reasoning" and "confidence_score".
      max_units (int | None): Maximum number of guidance items to include per
          document. Defaults to 3; None includes every item.

  Returns:
      list: One new Document per input document, with the enhanced text as
      page_content and the original metadata plus 'prepared_for_kg': True.
      Documents with fewer than `max_units` items simply list the items they
      have instead of failing.

  Raises:
      KeyError: If a document lacks one of the three metadata lists.
  """
  prepared_docs = []
  for doc in documents:

    # Pair the three parallel lists; zip stops at the shortest, so ragged lists cannot raise IndexError
    expert_units = list(zip(
        doc.metadata["expert_knowledge"],
        doc.metadata["reasoning"],
        doc.metadata["confidence_score"],
    ))
    if max_units is not None:
      expert_units = expert_units[:max_units]

    guidance = "".join(
        f"    {number}.Expert Knowledge: {knowledge}\n"
        f"      Reasoning of why this is considered Expert Knowledge: {reasoning}\n"
        f"      Confidence Score (Out of 10): {confidence}\n\n\n"
        for number, (knowledge, reasoning, confidence) in enumerate(expert_units, start=1)
    )

    enhanced_content = (
        "### Content:\n"
        f"    {doc.page_content}\n\n\n"
        "    ### EXPERT KNOWLEDGE AND REASONING GUIDANCE (AI-Generated, Use with Caution):\n\n"
        f"{guidance}"
        "    ### EXTRACTION INSTRUCTIONS:\n"
        "    1. **Primary Focus**: Extract entities and relationships directly evident in the text\n"
        "    2. **Secondary Guidance**: Use the AI insights above to identify potentially important patterns. Use these to get insights into what relationships to extract.\n"
        "    3. **Confidence Weighting**: Give more attention to higher-confidence insights (7+ scores)\n"
        "    4. **Verification**: If insights contradict from what you observe in the text, trust the text."
    )

    # Create new document with enhanced content
    enhanced_doc = Document(
        page_content=enhanced_content,
        metadata={
            **doc.metadata,
            'prepared_for_kg': True
        }
    )

    prepared_docs.append(enhanced_doc)

  return prepared_docs



#### THE Prompt

In [None]:
# Time to create prompt
from langchain_core.prompts import ChatPromptTemplate

# Both helpers return plain strings.
# extract_system_prompt / extract_user_prompt are each defined twice in this notebook;
# whichever definition cell ran last is the one used here.
system_prompt = extract_system_prompt()
user_prompt = extract_user_prompt()

# Wrap them in a ChatPromptTemplate
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", user_prompt)          # 'human' = user role
])


#### Trigger

In [None]:
# Changing the docs to enhanced docs to facilitate the prompt
# (each document's page_content gains the expert-knowledge guidance and extraction instructions)
enhanced_docs_to_convert_to_kg = prepare_documents_for_kg_prompt(docs_to_convert_to_kg)

In [None]:
# llm transformer object
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Rebinds llm_transformer (earlier built with the default prompt) to an instance that uses the custom prompt
llm_transformer = LLMGraphTransformer(llm=llm, prompt=prompt)

In [None]:
# Converting to graph documents
# `config` attaches the stdout callback handler, which is why this cell prints a long chain trace
with_prompt_enhanced_documents_batch = await llm_transformer.aconvert_to_graph_documents(enhanced_docs_to_convert_to_kg, config=config, )


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1m> Finished chain.[0m


[1m> Entering new RunnableParallel<parsed,parsing_error> chain...[0m


[1m> Entering new RunnableAssign<parsed,parsing_error> chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RunnableLambda chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RunnableParallel<parsed,parsing_error> chain...[0m


[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnableLambda chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[

In [None]:
# Storing to neo4j
# The conversion cell above stores its result in with_prompt_enhanced_documents_batch
graph.add_graph_documents(
    with_prompt_enhanced_documents_batch,
    baseEntityLabel=True,
    include_source=True)

#### Exporting the KG

In [None]:
# Stream GraphML back to the client (cloud-safe)
# Passing null as the file name together with stream: true makes the procedure return the GraphML in `data`
result = graph.query("""
    CALL apoc.export.graphml.all(
      null,
      {useTypes: true, stream: true}
    ) YIELD data
    RETURN data
""")
graphml_str = result[0]["data"]
# Write with an explicit encoding so non-ASCII text in the graph does not depend on the platform default
with open("graph.graphml", "w", encoding="utf-8") as f:
    f.write(graphml_str)


#### Re-importing (run this)

In [None]:
## Run the following in Neo4jBrowser after importing the graphml files

# CALL apoc.import.graphml("your_graph_file.graphml",{readLabels:true});
# (if you streamed to a local file, upload it to the server’s import/ folder first or pass stream:true again).

### Experiment: With Prompt - Metadata utilization - No Enhanced Documents

#### System Prompt Template

In [None]:
def extract_system_prompt():
  """
  Return the system prompt for the metadata-utilization experiment.

  Compared with the earlier system prompt in this notebook, this one adds a
  relevance rule for nodes and a section explaining how the AI-generated
  expert-knowledge guidance may be used. The prompt is assembled from plain
  lines so that headings start at the beginning of a line and no source-code
  quoting characters leak into the text.

  Returns:
      str: The system prompt. It contains no template variables.
  """
  prompt_lines = [
      "## 1. Overview",
      "You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.",
      "Try to capture as much information in different domains about consumer products from these articles and buying guides as possible without sacrificing accuracy.",
      "Do not add any information that is not explicitly mentioned in the text.",
      "",
      "- **Nodes** represent entities and concepts.",
      "- The aim is to mirror expert reasoning present in these articles and buying guides of consumer products.",
      "- This Knowledge Graph should be like a expert advisor on consumer products.",
      "",
      "## 2. Labeling Nodes",
      "- **Consistency**: Ensure you use available types for node labels.",
      "- **Relevance**: Only use nodes that serve a high relevance when it comes to correctly mapping the expert knowledge about the consumer products.",
      "",
      "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.",
      "",
      "- **Relationships** represent connections between entities or concepts.",
      "Ensure consistency and generality in relationship types when constructing the knowledge graphs.",
      "Get relationships that indicate expert reasoning, advice or information such as 'INCREASES_RISK_OF', 'SIGNIFICANTLY_REDUCES'.",
      "Avoid general nature language relationship types - eg.: entity1 'MENTIONS' entity2. If a relationships does not mirror the expert reasoning, it should be ignored.",
      "",
      "## 3. Factual Compliance",
      "- All the information from the text is objective and contain facts. You should not add any information to it.",
      "",
      "## 4. AI generated Expert Knowledge and Guidance",
      '- Any text provided below a heading "EXPERT KNOWLEDGE AND REASONING GUIDANCE (AI-Generated, Use with Caution)" is AI generated',
      "and thus should only be used as an indicative information on how and what to create as nodes and relationships.",
      "Remember the text IS NOT the actual text from which you have to derive nodes and relationships. It is meant to be only used to supplement the process of what nodes and relationships to form.",
  ]

  base_system_prompt = "\n".join(prompt_lines)

  return base_system_prompt


#### User Prompt Template

In [None]:
def extract_user_prompt():
  """Return the human-message template used for knowledge-graph extraction.

  The returned string is intended to be wrapped in a ``ChatPromptTemplate``.
  It exposes one template variable, ``{input}``, which marks the place where
  the text of the document being converted is inserted.

  Returns:
    str: the prompt template.
  """

  # Deliberately a plain string, not an f-string: `{input}` must survive as a
  # template variable for ChatPromptTemplate. LLMGraphTransformer invokes its
  # chain with the document text under the key `input`; the previous literal
  # `['text']` was sent to the model verbatim, so the document itself never
  # reached the prompt.
  # NOTE(review): `['Expert_Units']` below is still a literal. Nothing visible in
  # this notebook substitutes doc.metadata["Expert_Units"] into the prompt, so the
  # per-document guidance is not delivered yet - confirm how it should be injected.
  user_prompt = """

  Your task is to extract entities and relationships from the given excerpt of text.\
  You should recognize patterns about what kind of nodes and relationships you should be extracting from under the EXPERT KNOWLEDGE AND REASONING GUIDANCE section.\n


  ### INPUT TEXT TO ANALYZE:
  {input}

  ### EXPERT KNOWLEDGE AND REASONING GUIDANCE (AI-Generated, Use with Caution):
  Below are three points. Each containing three pointers - Expert Knowledge, Reasoning and Confidence Score. Expert Knowledge is text from the excerpt itself.\
  It is what has been identified as Expert Knowledge by AI and Reasoning contains the reason AI think so and the Confidence Score, the confidence it has in this finding.\
  Remember everything under this is AI generated and thus should NOT be used as exact facts but instead used as indicator for what relationships should look like in the Knowledge Graph.

  ['Expert_Units']


  ### EXTRACTION INSTRUCTIONS:
  1. **Primary Focus**: Extract entities and relationships directly evident in the text.
  2. **Secondary Guidance**: Use the AI generated Expert Knowledge and Reasoning guidance above to identify potentially important patterns. Use these to get insights into what relationships to extract.
  3. **Confidence Weighting**: Give more attention to higher-confidence insights (7+ scores)
  4. **Verification**: If insights contradict from what you observe in the text, trust the text.


  Tip: Make sure to answer in the correct format and do not include any explanations.
  """

  return user_prompt

#### Without the enhanced document - metadata -> user_prompt

In [None]:
def format_expert_units(doc):
    """Render a document's expert-knowledge metadata as a numbered text block.

    Reads the parallel lists ``expert_knowledge``, ``reasoning`` and
    ``confidence_score`` from ``doc.metadata`` and emits one numbered entry per
    expert-knowledge item, separated by blank lines.

    Args:
        doc: any object with a dict-like ``metadata`` attribute.

    Returns:
        str: the formatted entries, or ``""`` when there is no expert knowledge.
        A reasoning or confidence value that is missing for an entry is
        rendered as ``N/A`` instead of raising ``IndexError``.
    """
    # `or []` covers both a missing key and a key explicitly set to None.
    experts = doc.metadata.get("expert_knowledge") or []
    reasons = doc.metadata.get("reasoning") or []
    confidences = doc.metadata.get("confidence_score") or []

    def _value_at(values, index):
        # The three lists are expected to be parallel, but may be shorter.
        return values[index] if index < len(values) else "N/A"

    formatted = [
        f"{i+1}. Expert Knowledge: \"{expert}\"\n   Reasoning: {_value_at(reasons, i)}\n   Confidence_score: {_value_at(confidences, i)}"
        for i, expert in enumerate(experts)
    ]

    return "\n\n".join(formatted)


for doc in docs_to_convert_to_kg:
  doc.metadata["Expert_Units"] = format_expert_units(doc)



In [None]:
# Checking if the format is right!
print( docs_to_convert_to_kg[0].metadata["Expert_Units"] )

1. Expert Knowledge: "With the market shift toward SUVs, there are fewer small cars to choose from, and some brands like Ford, General Motors, and Toyota have moved away from small cars, instead creating new hatchback models with SUV-inspired styling and increased interior space."
   Reasoning: This insight highlights a significant market trend and informs consumers about the evolving landscape of small cars, which is crucial for making informed purchasing decisions. Understanding which brands are moving away from traditional small cars and how they are adapting their offerings can help consumers anticipate future availability and design changes.
   Confidence_score: 9

2. Expert Knowledge: "Cars like the Honda Civic and Mazda3 have increased in size over the years, often boasting rear-seat room and amenities you might have expected in a midsized car a few years ago."
   Reasoning: This provides a performance comparison and technical insight into how specific models have evolved, offer

#### THE Prompt

In [None]:
# Time to create prompt
from langchain_core.prompts import ChatPromptTemplate

system_prompt = extract_system_prompt()
user_prompt = extract_user_prompt()

# Wrap them in a ChatPromptTemplate
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", user_prompt)          # 'human' = user role
])


#### Trigger

In [None]:
graph = Neo4jGraph()

In [None]:
# llm transformer object
from langchain_experimental.graph_transformers import LLMGraphTransformer

llm_transformer = LLMGraphTransformer(llm=llm, prompt=prompt)

In [None]:
# Converting to graph documents
with_prompt_metadata_utilization_batch = await llm_transformer.aconvert_to_graph_documents(docs_to_convert_to_kg, config=config, )


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RunnableWithFallbacks chain...[0m


[1m> Entering new RunnableWithFallbacks chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RunnableAssign<parsed,parsing_error> chain...[0m


[1m> Entering new RunnableParallel<parsed,parsing_error> chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RunnableWithFallbacks chain...[0m

[1m> Finished chain.[0m



In [None]:
# Storing to neo4j
graph.add_graph_documents(
    with_prompt_metadata_utilization_batch,
    baseEntityLabel=True,
    include_source=True)

#### Pictorial Analysis

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("/content/without_enhDoc_with_prompt_node_freq.csv")
wc = make_viz(df)
wc.to_file("without_enhDoc_with_prompt_node_freq.png")

df = pd.read_csv("/content/without_enhDoc_with_prompt_rels_freq.csv")
wc = make_viz(df)
wc.to_file("without_enhDoc_with_prompt_rels_freq.png")



<wordcloud.wordcloud.WordCloud at 0x7e059b80cd90>

#### Exploratory Graph Analysis

In [None]:
## Driver
password = userdata.get("NEO4J_PASSWORD")
uri = userdata.get("NEO4J_URI")
auth = ("neo4j", password)
driver = GraphDatabase.driver(uri, auth=auth)

##### 1. Most frequent node labels

In [None]:
# 10 most frequent node labels.
# labels(n) is unwound, so a node carrying several labels is counted once per label.
query = """
MATCH (n)
UNWIND labels(n) AS label
RETURN label, count(*) AS frequency
ORDER BY frequency DESC
LIMIT 10
"""

run_query(driver, query)

Unnamed: 0,label,frequency
0,Document,139
1,__Entity__,136
2,Concept,42
3,Feature,27
4,Product,22
5,Entity,11
6,Attribute,8
7,Benefit,5
8,Type,4
9,Component,4


##### 2. Most frequent relationship types

In [None]:
# 10 most frequent relationship types (one count per relationship instance).
# NOTE(review): MENTIONS edges are presumably the Document -> entity links created by
# add_graph_documents(include_source=True) rather than model-extracted relations - confirm.
query = """
MATCH ()-[r]->()
RETURN type(r) AS relationship_type, count(*) AS frequency
ORDER BY frequency DESC
LIMIT 10
"""

run_query(driver, query)

Unnamed: 0,relationship_type,frequency
0,MENTIONS,279
1,HAS_FEATURE,16
2,CONTAINS,9
3,ENHANCES,6
4,INCLUDES,5
5,SIGNIFICANTLY_REDUCES,4
6,MADE_OF,4
7,INCLUDED_IN,4
8,INCREASES_RISK_OF,3
9,ASSOCIATED_WITH,3


#### Deleting the current instance of KG

#### Exporting the KG

In [None]:
# Stream GraphML back to the client (cloud-safe)
# Passing `null` as the file name together with `stream: true` makes APOC return
# the document in the `data` column instead of writing it on the server.
result = graph.query("""
    CALL apoc.export.graphml.all(
      null,
      {useTypes: true, stream: true}
    ) YIELD data
    RETURN data
""")
graphml_str = result[0]["data"]
# GraphML is XML and the graph text may contain non-ASCII characters, so pin the
# encoding instead of relying on the platform default.
with open("graph.graphml", "w", encoding="utf-8") as f:
    f.write(graphml_str)


#### Re-importing (run this)

In [None]:
## Run the following in Neo4jBrowser after importing the graphml files

# CALL apoc.import.graphml("your_graph_file.graphml",{readLabels:true});
# (if you streamed to a local file, upload it to the server’s import/ folder first or pass stream:true again).

### Experiment: With Prompt - Metadata utilization - No Enhanced Documents - Better Prompt

#### System Prompt Template

In [None]:
def extract_system_prompt():
  """Return the system message that frames the knowledge-graph extraction task.

  The prompt is made of four numbered sections (overview, node labelling,
  accuracy, compliance). Each section is stored as a list of lines; lines are
  joined with a single newline and every section is closed by three newlines.

  Returns:
    str: the complete system prompt.
  """
  sections = [
    [
      "## 1. Overview",
      "You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.",
      "Your goal is to capture expert reasoning and product knowledge while maintaining strict accuracy.",
      "- **Nodes** represent entities and concepts.",
      "- The aim is to mirror expert reasoning present in these articles and buying guides of consumer products.",
    ],
    [
      "## 2. Labeling Nodes",
      "- **Consistency**: Ensure you use available types for node labels.",
      "- **Relevance**: Only use nodes that serve a high relevance when it comes to correctly mapping the expert knowledge about the consumer products.",
      "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.",
      # The trailing space before the line break is part of the original prompt.
      "- **Relationships** represent connections between entities or concepts. ",
      "Ensure consistency and generality in relationship types when constructing the knowledge graphs.",
      "Get relationships that are expert reasoning, advice, information or features such as 'INCREASES_RISK_OF', 'SIGNIFICANTLY_REDUCES'.",
      "Avoid general nature language relationship types - eg.: you should avoid \"entity1 'MENTIONS' entity2\".",
    ],
    [
      "## 3. Extreme accuracy to the text",
      "- You have to maintain Extreme Accuracy when deriving nodes and relationships, any entity added to the graph should be 100% accurate to the information in the text.",
    ],
    [
      "## 4. Strict Compliance",
      "- Adhere to the rules strictly. Non-compliance will result in termination.",
    ],
  ]

  section_break = "\n\n\n"
  return "".join("\n".join(lines) + section_break for lines in sections)


#### User Prompt Template

In [None]:
def extract_user_prompt():
  """Return the human-message template for the "better prompt" experiment.

  The returned string is intended to be wrapped in a ``ChatPromptTemplate``.
  It exposes one template variable, ``{input}``, which marks the place where
  the text of the document being converted is inserted.

  Returns:
    str: the prompt template.
  """

  # Deliberately a plain string, not an f-string: `{input}` must survive as a
  # template variable for ChatPromptTemplate. LLMGraphTransformer invokes its
  # chain with the document text under the key `input`; the previous literal
  # `['text']` was sent to the model verbatim, so the document itself never
  # reached the prompt.
  # NOTE(review): `['Expert_Units']` below is still a literal. Nothing visible in
  # this notebook substitutes doc.metadata["Expert_Units"] into the prompt, so the
  # per-document guidance is not delivered yet - confirm how it should be injected.
  user_prompt = """

  Your task is to extract entities and relationships from the given excerpt of text.\
  You should recognize patterns about what kind of nodes and relationships you should be extracting from under the EXPERT KNOWLEDGE AND REASONING GUIDANCE section.\n


  ## Instructions
  - The text under the INPUT MAIN TEXT TO ANALYZE heading is the ONLY source of truth used to derive Nodes and Relationships from.
  - The text under the heading EXPERT KNOWLEDGE AND REASONING GUIDANCE (AI-Generated, Use with Caution) is AI generated and thus can't be relied upon totally. \
  You will use this text as supplementary information to the text under the INPUT MAIN TEXT TO ANALYZE heading,\
  and use the information to make better decisions as to what Nodes and Relationships to make
  - Remember, under no circumstance should the text under the heading EXPERT KNOWLEDGE AND REASONING GUIDANCE (AI-Generated, Use with Caution)\
  be used as Node ID(s) or Relationship ID(s).


  ### INPUT MAIN TEXT TO ANALYZE:
  {input}

  ### EXPERT KNOWLEDGE AND REASONING GUIDANCE (AI-Generated, Use with Caution):
  Below are three points. Each containing three pointers - Expert Knowledge, Reasoning and Confidence Score. Expert Knowledge is text from the excerpt itself.\
  It is what has been identified as Expert Knowledge by AI and Reasoning contains the reason AI think so and the Confidence Score, the confidence it has in this finding.\
  Remember everything under this is AI generated and thus should NOT be used as exact facts but instead used as indicator for what relationships should look like in the Knowledge Graph.

  ['Expert_Units']


  Tip: Make sure to answer in the correct format and do not include any explanations.
  """

  return user_prompt

#### Without the enhanced document - metadata -> user_prompt

In [None]:
def format_expert_units(doc):
    """Render a document's expert-knowledge metadata as a numbered text block.

    Reads the parallel lists ``expert_knowledge``, ``reasoning`` and
    ``confidence_score`` from ``doc.metadata`` and emits one numbered entry per
    expert-knowledge item, separated by blank lines.

    Args:
        doc: any object with a dict-like ``metadata`` attribute.

    Returns:
        str: the formatted entries, or ``""`` when there is no expert knowledge.
        A reasoning or confidence value that is missing for an entry is
        rendered as ``N/A`` instead of raising ``IndexError``.
    """
    # `or []` covers both a missing key and a key explicitly set to None.
    experts = doc.metadata.get("expert_knowledge") or []
    reasons = doc.metadata.get("reasoning") or []
    confidences = doc.metadata.get("confidence_score") or []

    def _value_at(values, index):
        # The three lists are expected to be parallel, but may be shorter.
        return values[index] if index < len(values) else "N/A"

    formatted = [
        f"{i+1}. Expert Knowledge: \"{expert}\"\n   Reasoning: {_value_at(reasons, i)}\n   Confidence_score: {_value_at(confidences, i)}"
        for i, expert in enumerate(experts)
    ]

    return "\n\n".join(formatted)


for doc in docs_to_convert_to_kg:
  doc.metadata["Expert_Units"] = format_expert_units(doc)



In [None]:
# Checking if the format is right!
print( docs_to_convert_to_kg[0].metadata["Expert_Units"] )

1. Expert Knowledge: "With the market shift toward SUVs, there are fewer small cars to choose from, and some brands like Ford, General Motors, and Toyota have moved away from small cars, instead creating new hatchback models with SUV-inspired styling and increased interior space."
   Reasoning: This insight highlights a significant market trend and informs consumers about the evolving landscape of small cars, which is crucial for making informed purchasing decisions. Understanding which brands are moving away from traditional small cars and how they are adapting their offerings can help consumers anticipate future availability and design changes.
   Confidence_score: 9

2. Expert Knowledge: "Cars like the Honda Civic and Mazda3 have increased in size over the years, often boasting rear-seat room and amenities you might have expected in a midsized car a few years ago."
   Reasoning: This provides a performance comparison and technical insight into how specific models have evolved, offer

#### THE Prompt

In [None]:
# Time to create prompt
from langchain_core.prompts import ChatPromptTemplate

system_prompt = extract_system_prompt()
user_prompt = extract_user_prompt()

# Wrap them in a ChatPromptTemplate
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", user_prompt)          # 'human' = user role
])


#### Trigger

In [None]:
graph = Neo4jGraph()

  graph = Neo4jGraph()


In [None]:
# llm transformer object
from langchain_experimental.graph_transformers import LLMGraphTransformer

llm_transformer = LLMGraphTransformer(llm=llm, prompt=prompt)

In [None]:
# Converting to graph documents
with_prompt_metadata_utilization_thinking_batch = llm_transformer.convert_to_graph_documents(docs_to_convert_to_kg, config=config, )


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new ChatPromptTemplate chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RunnableParallel<raw> chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RunnableWithFallbacks chain...[0m


[1m> Entering new RunnableAssign<parsed,parsing_error> chain...[0m


[1m> Entering new RunnableParallel<parsed,parsing_error> chain...[0m


[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnableLambda chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RunnableLambda chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RunnableLambda chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new ChatPromptTemplate chain...[0m

[

#### Adding the documents to the graph

In [None]:
# Storing to neo4j
graph.add_graph_documents(
    with_prompt_metadata_utilization_thinking_batch,
    baseEntityLabel=True,
    include_source=True)

#### Deleting the old graph if necessary

In [None]:
graph.query("MATCH (n) DETACH DELETE n")
graph.refresh_schema()

## WordClouds for KG

In [None]:
from wordcloud import WordCloud
import pandas as pd

def make_viz(df):
  """Build a word cloud from a two-column frequency table.

  Args:
    df: DataFrame whose first column holds the words (labels) and whose
      second column holds their frequencies; any further columns are ignored.

  Returns:
    The ``WordCloud`` produced by ``generate_from_frequencies``.
  """
  word_col, count_col = df.columns[0], df.columns[1]

  # word -> frequency mapping, the input generate_from_frequencies is given
  weights = {word: count for word, count in zip(df[word_col], df[count_col])}

  cloud = WordCloud(
    background_color="white",
    colormap="tab20",  # pick any matplotlib colormap
    width=800,
    height=400,
  )
  return cloud.generate_from_frequencies(weights)

In [None]:
# Node Labels and Frequencies
df = pd.read_csv("/content/newest_with_prompt_nodes_freq.csv")            # columns: label, frequency

wc = make_viz(df)

wc.to_file("newest_with_prompt_Nodes_Labels_andFreq_pic.png")

<wordcloud.wordcloud.WordCloud at 0x7de8d1a75e10>

In [None]:
# Relationship Labels and Frequencies
df = pd.read_csv("/content/newest_with_prompt_rels_freq.csv")

wc = make_viz(df)

wc.to_file("newer_with_prompt_Relationship_Labels_andFreq_pic.png")

<wordcloud.wordcloud.WordCloud at 0x7de8d1a89950>

## Graph Analytics Code

In [None]:
import pandas as pd

In [None]:
rels_desc_degree = pd.read_csv("export_rels.csv", )
rels_desc_degree.head(15)

Unnamed: 0,relationshipType,relationshipCount
0,MENTIONS,1899
1,INCLUDES,78
2,HAS_FEATURE,75
3,AVAILABLE_AT,65
4,USES,40
5,HAS,39
6,FEATURE,37
7,MANUFACTURES,34
8,PRODUCES,30
9,HAS_ATTRIBUTE,27


In [None]:
import networkx as nx

In [None]:
# Get raw node and relationship data using Cypher
nodes = graph.query("MATCH (n) RETURN id(n) AS id, labels(n) AS labels, properties(n) AS properties")
edges = graph.query("MATCH (n)-[r]->(m) RETURN id(n) AS source, id(m) AS target, type(r) AS type, properties(r) AS properties")



In [None]:
import networkx as nx

# Mirror the Neo4j query results (`nodes`, `edges`) into an in-memory networkx graph.
G = nx.DiGraph()  # or nx.Graph() if you want undirected
# NOTE(review): a DiGraph keeps a single edge per (source, target) pair, so several
# relationships between the same two nodes would collapse into one - confirm this is acceptable.

# Add nodes
# Keyed by the Neo4j id; labels and all node properties become node attributes.
# NOTE(review): a node property literally named "labels" would clash with the keyword above.
for node in nodes:
    G.add_node(node['id'], labels=node['labels'], **node['properties'])

# Add edges
# The relationship type and its properties become edge attributes.
for edge in edges:
    G.add_edge(edge['source'], edge['target'], type=edge['type'], **edge['properties'])

In [None]:
# Centrality
deg_cent = nx.degree_centrality(G)

# PageRank
pagerank = nx.pagerank(G)

# Community detection (requires undirected graph)
from networkx.algorithms.community import greedy_modularity_communities
communities = list(greedy_modularity_communities(G.to_undirected()))

In [None]:
# Nodes
len( deg_cent.keys() )

1873

In [None]:
# Pagerank: It measures the relative importance of nodes in a graph based on the link structure.
# Basically like the google pagerank

first_few_keys = list(pagerank.keys())[0:100]

for key in first_few_keys:
  print(key, pagerank[key] )

0 0.0003340222209956841
1 0.0008772348662374848
2 0.0010220059991501392
3 0.0015896876898918012
4 0.0004939269733312078
5 0.0006838794714434215
6 0.0003695377505979669
7 0.00044858763971743014
8 0.00044858763971743014
9 0.0003340222209956841
10 0.0010361454034590485
11 0.002118563253223896
12 0.0004418612136563918
13 0.0003817074075945532
14 0.0011151952925785118
15 0.0004418612136563918
16 0.0004527890689186326
17 0.0004418612136563918
18 0.0003340222209956841
19 0.00035296383678356825
20 0.00035296383678356825
21 0.0021685317614836557
22 0.00035296383678356825
23 0.00035296383678356825
24 0.00036967702718464245
25 0.0004773530595168093
26 0.00035296383678356825
27 0.00035296383678356825
28 0.00035296383678356825
29 0.00035296383678356825
30 0.0006532040650829995
31 0.001091732787985955
32 0.0003340222209956841
33 0.0003695377505979669
34 0.0006838794714434215
35 0.001168738903072773
36 0.0006838794714434215
37 0.000563469526288972
38 0.000563469526288972
39 0.000563469526288972
40 0.

In [None]:
# PageRank scores ranked from highest to lowest
sorted_items_descending = sorted(
    pagerank.items(),
    key=lambda pair: pair[1],   # sort on the score, not the node id
    reverse=True,
)
sorted_pagerank_descending = dict(sorted_items_descending)

# Keep the 50 top-ranked node ids; a later cell looks up their labels.
first_few_keys = [node_id for node_id, _ in sorted_items_descending[:50]]

for key in first_few_keys:
  print(key, sorted_pagerank_descending[key])


1357 0.005657885036485669
1277 0.00529414492042103
298 0.004949849795673022
1527 0.003958630049279745
917 0.0033021876298521763
1574 0.00329864106684469
1352 0.0032002883688836163
1647 0.002855796733634977
1355 0.0027834033048869814
1361 0.0027715647950195535
1292 0.0027688426673019954
566 0.0025874421990392626
1156 0.0024168931283215316
1324 0.0023420399719091835
1325 0.0023420399719091835
1309 0.002315070493609424
737 0.002274834309906
243 0.002246856973860584
1614 0.002196010773938606
1470 0.002184261254852447
21 0.0021685317614836557
11 0.002118563253223896
492 0.0020482503872869574
631 0.0019912383321486213
632 0.0019912383321486213
53 0.0019764833411989677
1407 0.0019446039948941445
610 0.001942348995424047
63 0.0019305917451891265
1278 0.0018531989143462122
1279 0.0018531989143462122
1280 0.0018531989143462122
685 0.0018508806147553344
834 0.0018043051952955091
624 0.0018015587593161755
287 0.0017972955078843073
1313 0.001788763000077837
1706 0.0017380125891040827
835 0.00168629

In [None]:
for node in first_few_keys:
  print(node, G.nodes[node]['labels'] )

1357 ['__Entity__', 'Concept', 'Financial aspect']
1277 ['__Entity__', 'Technology', 'Product']
298 ['__Entity__', 'Vehicle', 'Category']
1527 ['__Entity__', 'Product', 'Object', 'Concept']
917 ['__Entity__', 'Attribute', 'Concept']
1574 ['__Entity__', 'Concept']
1352 ['__Entity__', 'Product', 'Object', 'Concept']
1647 ['__Entity__', 'Product', 'Device']
1355 ['__Entity__', 'Organization']
1361 ['__Entity__', 'Concept']
1292 ['__Entity__', 'Product']
566 ['__Entity__', 'Component']
1156 ['__Entity__', 'Concept']
1324 ['__Entity__', 'Product', 'Product line']
1325 ['__Entity__', 'Product']
1309 ['__Entity__', 'Product']
737 ['__Entity__', 'Event']
243 ['__Entity__', 'Organization', 'Company', 'Brand']
1614 ['__Entity__', 'Product', 'Object']
1470 ['__Entity__', 'Product', 'Object', 'Device']
21 ['__Entity__', 'Organization', 'Entity']
11 ['__Entity__', 'Organization']
492 ['__Entity__', 'Vehicle']
631 ['__Entity__', 'Product']
632 ['__Entity__', 'Company']
53 ['__Entity__', 'Organizatio

# Not Yet

## Prompt-Based Extraction - Not Yet

In [None]:
# Example system prompt for prompt-based (JSON) extraction.
# NOTE(review): this rebinds `prompt`, which earlier cells bind to a ChatPromptTemplate;
# running this cell and then re-creating LLMGraphTransformer(llm=llm, prompt=prompt)
# would pass a plain string instead - consider a distinct name.
prompt = """
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph. Your task is to identify the entities and relations specified in the user prompt from a given text and produce the output in JSON format. This output should be a list of JSON objects, with each object containing the following keys:

- "head": The text of the extracted entity, which must match one of the types specified in the user prompt.
- "head_type": The type of the extracted head entity, selected from the specified list of types.
- "relation": The type of relation between the "head" and the "tail," chosen from the list of allowed relations.
- "tail": The text of the entity representing the tail of the relation.
- **"tail_type"**: The type of the tail entity, also selected from the provided list of types.

Extract as many entities and relationships as possible.

**Entity Consistency**: Ensure consistency in entity representation. If an entity, like "John Doe," appears multiple times in the text under different names or pronouns (e.g., "Joe," "he"), use the most complete identifier consistently. This consistency is essential for creating a coherent and easily understandable knowledge graph.

**Important Notes**:
- Do not add any extra explanations or text.
"""

## Tool-Based Construction - Not Yet

In [None]:
class Node(BaseNode):
    """Graph node schema: a ``BaseNode`` with described ``id``/``label`` fields and optional properties.

    NOTE(review): ``BaseNode``, ``Field``, ``Optional``, ``List``, ``enum_values`` and
    ``Property`` are not defined or imported in this cell; ``Property`` is only
    defined in a later cell, so this cell cannot run first as written.
    """
    # Required; the description asks for a name or human-readable identifier.
    id: str = Field(..., description="Name or human-readable unique identifier")
    # Required; the description advertises the values held in `enum_values`.
    label: str = Field(..., description=f"Available options are {enum_values}")
    # Key/value properties attached to the node.
    properties: Optional[List[Property]]

In [None]:
class Relationship(BaseRelationship):
    """Graph relationship schema: source node, target node, relationship type and optional properties.

    NOTE(review): every described field interpolates the same ``enum_values`` name,
    for node labels and for the relationship type alike - presumably each should
    list its own set of options; confirm.
    """
    # Identifier and label of the node the relationship starts from.
    source_node_id: str
    source_node_label: str = Field(..., description=f"Available options are {enum_values}")
    # Identifier and label of the node the relationship points to.
    target_node_id: str
    target_node_label: str = Field(..., description=f"Available options are {enum_values}")
    # Relationship type.
    type: str = Field(..., description=f"Available options are {enum_values}")
    # Key/value properties attached to the relationship.
    properties: Optional[List[Property]]

In [None]:
class Property(BaseModel):
    """A single property consisting of key and value"""
    # NOTE(review): Node and Relationship reference this class in cells placed before
    # this one, so it has to be defined first for those cells to run.
    # Property name; the description advertises the values held in `enum_values`.
    key: str = Field(..., description=f"Available options are {enum_values}")
    # Property value, always carried as a string.
    value: str

## Defining the Graph Schema - Not Yet

In [None]:
from langchain_experimental.graph_transformers import LLMGraphTransformer

no_schema = LLMGraphTransformer(llm=llm)

In [None]:
data = await no_schema.aconvert_to_graph_documents(documents)

## What does a Graph Schema look like

### Defining Allowed Nodes - Not Yet



In [None]:
allowed_nodes = ["Person", "Organization", "Location", "Award", "ResearchField"]
nodes_defined = LLMGraphTransformer(llm=llm, allowed_nodes=allowed_nodes)
data = await allowed_nodes.aconvert_to_graph_documents(documents)

### ...And, Defining allowed relationships - Not Yet




In [None]:
allowed_nodes = ["Person", "Organization", "Location", "Award", "ResearchField"]
allowed_relationships = ["SPOUSE", "AWARD", "FIELD_OF_RESEARCH", "WORKS_AT", "IN_LOCATION"]
rels_defined = LLMGraphTransformer(
  llm=llm,
  allowed_nodes=allowed_nodes,
  allowed_relationships=allowed_relationships
)
data = await rels_defined.aconvert_to_graph_documents(documents)