# Exercise 3 - Information Extraction with PyDI (Solution)

This notebook provides complete solutions for the information extraction exercise using PyDI.

In [1]:
import pandas as pd
import numpy as np
import sys
import os
import re
import json

from pydantic import BaseModel, Field
from typing import Optional
from dotenv import load_dotenv
from pathlib import Path

# Add PyDI to path
sys.path.append('../../../')

# Import PyDI information extraction modules
from PyDI.informationextraction import RegexExtractor, CodeExtractor, ExtractorPipeline, InformationExtractionEvaluator, LLMExtractor
from PyDI.informationextraction.rules import built_in_rules
from PyDI.io.loaders import load_json

# Import evaluation utilities
sys.path.append('../Task/')

NLTK not available. Advanced tokenization features will be limited.


## Information Extraction with PyDI

We'll demonstrate extracting from the product title with regex, and from the description with an LLM. 

In [2]:
gpu_path = Path('../Task/input/gpu_products.json')
gpu_df = load_json(gpu_path, add_index=False)
print(f'GPU dataset shape: {gpu_df.shape}')
display(gpu_df.head())


GPU dataset shape: (10, 12)


Unnamed: 0,id,name,brand,model,chipset,memory_gb,memory_type,clock_speed_mhz,tdp_w,launch_date,price_usd,description
0,gpu-001,NVIDIA GeForce RTX 4070 Ti,NVIDIA,RTX 4070 Ti,AD104,12,GDDR6X,2310,285,2023-01-05,799,"GeForce RTX 4070 Ti with 12GB GDDR6X, boost up..."
1,gpu-002,AMD Radeon RX 7800 XT,AMD,RX 7800 XT,Navi 32,16,GDDR6,2124,263,2023-09-06,499,"Radeon RX 7800 XT with 16 GB GDDR6, boost ~2.4..."
2,gpu-003,NVIDIA GeForce RTX 4090,NVIDIA,RTX 4090,AD102,24,GDDR6X,2235,450,2022-10-12,1599,Flagship RTX 4090 (24GB GDDR6X). Boost ~2.5 GH...
3,gpu-004,AMD Radeon RX 7600,AMD,RX 7600,Navi 33,8,GDDR6,2250,165,2023-05-25,269,RX 7600 with 8 GB GDDR6. Boost ~2.6 GHz. TBP ~...
4,gpu-005,NVIDIA GeForce RTX 4060,NVIDIA,RTX 4060,AD107,8,GDDR6,2460,115,2023-06-29,299,RTX 4060 8GB. Boost around 2.5 GHz. 115W TGP. ...


In [3]:
# create our test df
title_df = gpu_df[['id','name']]
title_df.head(3)

Unnamed: 0,id,name
0,gpu-001,NVIDIA GeForce RTX 4070 Ti
1,gpu-002,AMD Radeon RX 7800 XT
2,gpu-003,NVIDIA GeForce RTX 4090


In [None]:
# Regex extraction from product title (name)
regex_rules = {
    'brand_from_title': {
        'source_column': 'name',
        'pattern': r'\b(NVIDIA|AMD|Intel)\b',
        'group': 1
    },
    'model_from_title': {
        'source_column': 'name',
        'pattern': r'\b(RX\s?\d{3,4}\s?(?:XT|XTX)?)\b',
        'group': 1
    }
}
regex_extractor = RegexExtractor(regex_rules, default_source='name')
regex_gpu_df = regex_extractor.extract(title_df)
display(regex_gpu_df.head())


Unnamed: 0,id,name,brand_from_title,model_from_title
0,gpu-001,NVIDIA GeForce RTX 4070 Ti,NVIDIA,
1,gpu-002,AMD Radeon RX 7800 XT,AMD,RX 7800 XT
2,gpu-003,NVIDIA GeForce RTX 4090,NVIDIA,
3,gpu-004,AMD Radeon RX 7600,AMD,RX 7600
4,gpu-005,NVIDIA GeForce RTX 4060,NVIDIA,


In [None]:
# Evaluate regex extraction vs. gold (brand, model)

pred_eval_df = regex_gpu_df.rename(columns={
    'brand_from_title':'brand',
    'model_from_title':'model',
})
attributes = ['brand','model']

ie_eval = InformationExtractionEvaluator()

regex_eval_results = ie_eval.evaluate(
    predictions_df=pred_eval_df,
    gold_df=gpu_df,
    pred_id_column='id',
    gold_id_column='id',
    attributes=attributes, 
)

print(json.dumps(regex_eval_results, indent=2))


{
  "micro": {
    "precision": 1.0,
    "recall": 0.75,
    "f1": 0.8571428571428571,
    "accuracy": 0.75,
    "accuracy_overall": 0.75
  },
  "macro": {
    "precision": 1.0,
    "recall": 0.75,
    "f1": 0.8333333333333333,
    "accuracy": 0.75,
    "accuracy_overall": 0.75
  },
  "attributes": {
    "brand": {
      "counts": {
        "VC": 10,
        "VW": 0,
        "VN": 0,
        "NV": 0,
        "NN": 0
      },
      "metrics": {
        "precision": 1.0,
        "recall": 1.0,
        "f1": 1.0,
        "accuracy": 1.0,
        "accuracy_overall": 1.0
      },
      "rule": "exact_match"
    },
    "model": {
      "counts": {
        "VC": 5,
        "VW": 0,
        "VN": 5,
        "NV": 0,
        "NN": 0
      },
      "metrics": {
        "precision": 1.0,
        "recall": 0.5,
        "f1": 0.6666666666666666,
        "accuracy": 0.5,
        "accuracy_overall": 0.5
      },
      "rule": "exact_match"
    }
  },
  "num_evaluated_records": 10,
  "total_counts": {

  - VC — Correct extraction (TP): Gold has a value and the prediction has the same value.
  - VW — Wrong value (FP): Gold has a value and the prediction has a different value.
  - VN — Missed extraction (FN): Gold has a value but the prediction is missing.
  - NV — Spurious extraction (FP): Gold is missing but the prediction produced a value.
  - NN — Both missing (TN): Neither gold nor prediction has a value.



# Information extraction with LLMs

### Groq API Keys

Next, we need an API key from [groq.com](https://groq.com/) to use a powerful opensource LLMs for free. Groq offers a free tier allowing for API access with rate limits You can register here if you do not have an account yet: https://huggingface.co/login

After registering, you can create your key [here](https://console.groq.com/keys)

![image.png](groc_limits.png)


Once you have your keys, you have two options to use them in this notebook:

1. For directly inputting the tokens via copy/pasting:

In [6]:
# import getpass
# import os

# if not os.getenv("GROQ_API_KEY"):
#     os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your token: ")

# if not os.getenv("TAVILY_API_KEY"):
#     os.environ["TAVILY_API_KEY"] = getpass.getpass("Enter your token: ")

2. For loading the tokens from an .env file:

In [7]:
from dotenv import load_dotenv
import os

load_dotenv()

if os.getenv("GROQ_API_KEY"):
    print('Groq API Key loaded successfully')
else:
    print('Groq API Key loading failed, please make sure the .env file exists and the spelling is correct')

if os.getenv("TAVILY_API_KEY"):
    print('Tavily API Key loaded successfully')
else:
    print('Tavily API Key loading failed, please make sure the .env file exists and the spelling is correct')

Groq API Key loading failed, please make sure the .env file exists and the spelling is correct
Tavily API Key loading failed, please make sure the .env file exists and the spelling is correct


Once you have your keys, you have two options to use them in this notebook:

1. For directly inputting the tokens via copy/pasting:

In [8]:
# import getpass
# import os

# if not os.getenv("GROQ_API_KEY"):
#     os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your token: ")


2. For loading the tokens from an .env file:

In [9]:
from dotenv import load_dotenv
import os

load_dotenv()

if os.getenv("GROQ_API_KEY"):
    print('Groq API Key loaded successfully')
else:
    print('Groq API Key loading failed, please make sure the .env file exists and the spelling is correct')


Groq API Key loading failed, please make sure the .env file exists and the spelling is correct


In [10]:
#%pip install -qU langchain-groq

In [11]:
# Lets setup our chat model
#from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI

# Check for OpenAI API key
api_key = os.getenv('OPENAI_API_KEY')
if api_key:
    print("✅ OPENAI_API_KEY found in environment")
    print(f"   Key starts with: {api_key[:10]}...")
else:
    print("❌ OPENAI_API_KEY not found in environment")
    print("   Set it with: os.environ['OPENAI_API_KEY'] = 'your-api-key'")
    print("   Or export OPENAI_API_KEY='your-api-key' in your shell")


# Initialize OpenAI chat model
chat_model = ChatOpenAI(
    model="gpt-5-nano",  
    max_tokens=500,        # Reasonable limit for structured output
    temperature=0.0,      # Deterministic output
    reasoning_effort="minimal",  
)

chat_model.invoke("How are you doing today?")

✅ OPENAI_API_KEY found in environment
   Key starts with: sk-proj-qH...


AIMessage(content="I'm doing well, thanks! How can I help you today?", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 22, 'prompt_tokens': 12, 'total_tokens': 34, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-5-nano-2025-08-07', 'system_fingerprint': None, 'id': 'chatcmpl-CEy6W4YRj8QdCiVpoaAw3t5TNLRwa', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--bea77a01-4dc5-4c7e-bae6-238a53dcacfc-0', usage_metadata={'input_tokens': 12, 'output_tokens': 22, 'total_tokens': 34, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [12]:
# Lets load our json schema that we created in the previous exercise
schema_path = Path('../Task/input/gpu_product_schema.json')
with schema_path.open('r', encoding='utf-8') as f:
    schema_dict = json.load(f)

print(json.dumps(schema_dict, indent=2))

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://example.org/schemas/gpu_product.schema.json",
  "title": "GPUProduct",
  "description": "Schema for a single discrete graphics card (GPU) product entry used in Exercise 02/03. Includes identification, specs, pricing and a free\u2011text description.",
  "type": "object",
  "properties": {
    "name": {
      "type": "string",
      "description": "Human\u2011readable product title as seen on listings (e.g., 'NVIDIA GeForce RTX 4070 Ti')."
    },
    "brand": {
      "type": "string",
      "enum": [
        "NVIDIA",
        "AMD",
        "Intel",
        "Other"
      ],
      "description": "GPU vendor/brand name."
    },
    "model": {
      "type": "string",
      "description": "Product model designation used in marketing (e.g., 'RTX 4070 Ti', 'RX 7800 XT')."
    },
    "chipset": {
      "type": "string",
      "description": "GPU die/codename or family identifier (e.g., 'AD104', 'Navi 32')."
    },

In [13]:
llm_extractor = LLMExtractor(
    chat_model=chat_model,
    source_column='description',
    system_prompt='You are an expert at extracting information from product descriptions. Extract a JSON object that matches the provided schema.',
    schema=schema_dict,
    debug=True,
)

In [14]:
description_df = gpu_df[['id','description']]
description_df.head(3)

Unnamed: 0,id,description
0,gpu-001,"GeForce RTX 4070 Ti with 12GB GDDR6X, boost up..."
1,gpu-002,"Radeon RX 7800 XT with 16 GB GDDR6, boost ~2.4..."
2,gpu-003,Flagship RTX 4090 (24GB GDDR6X). Boost ~2.5 GH...


In [15]:
llm_extraction_results = llm_extractor.extract(description_df)
llm_extraction_results.head(3)

Unnamed: 0,id,description,name,brand,model,chipset,memory_gb,memory_type,clock_speed_mhz,tdp_w,launch_date,price_usd
0,gpu-001,"GeForce RTX 4070 Ti with 12GB GDDR6X, boost up...",GeForce RTX 4070 Ti,NVIDIA,RTX 4070 Ti,,12.0,GDDR6X,2600.0,285.0,,799.0
1,gpu-002,"Radeon RX 7800 XT with 16 GB GDDR6, boost ~2.4...",Radeon RX 7800 XT,AMD,7800 XT,,16.0,GDDR6,2400.0,260.0,,499.0
2,gpu-003,Flagship RTX 4090 (24GB GDDR6X). Boost ~2.5 GH...,RTX 4090,NVIDIA,Flagship,,24.0,GDDR6X,2500.0,450.0,,1599.0


In [None]:
# Evaluate regex extraction vs. gold (brand, model)
attributes = ['brand','model']

llm_eval_results = ie_eval.evaluate(
    predictions_df=llm_extraction_results,
    gold_df=gpu_df,
    pred_id_column='id',
    gold_id_column='id',
    attributes=attributes, 
)


=== LLM Extraction (description) ===
Micro: {'precision': 0.6, 'recall': 1.0, 'f1': 0.75, 'accuracy': 0.6, 'accuracy_overall': 0.6}
Macro: {'precision': 0.6, 'recall': 1.0, 'f1': 0.704, 'accuracy': 0.6, 'accuracy_overall': 0.6}
Counts: {'Correct (VC)': 12, 'Wrong value (VW)': 8, 'Missed (VN)': 0, 'Spurious (NV)': 0, 'Both missing (NN)': 0}


Unnamed: 0,attribute,precision,recall,f1,accuracy,accuracy_overall,rule
0,brand,0.9,1.0,0.947,0.9,0.9,exact_match
1,model,0.3,1.0,0.462,0.3,0.3,exact_match


## Final Evaluation Summary

Compare micro/macro metrics and attribute-level results for regex (title) and LLM (description) extraction.


In [26]:
def _pretty_counts(counts):
    return {
        'Correct (VC)': counts.get('VC', 0),
        'Wrong value (VW)': counts.get('VW', 0),
        'Missed (VN)': counts.get('VN', 0),
        'Spurious (NV)': counts.get('NV', 0),
        'Both missing (NN)': counts.get('NN', 0),
    }

def summarize_eval(name, results):
    print(f'=== {name} ===')
    micro = results.get('micro', {})
    macro = results.get('macro', {})
    print('Micro:', {k: round(v, 3) for k, v in micro.items()})
    print('Macro:', {k: round(v, 3) for k, v in macro.items()})
    print('Counts:', _pretty_counts(results.get('total_counts', {})))
    attrs = results.get('attributes', {})
    if attrs:
        rows = []
        for attr, info in attrs.items():
            m = info.get('metrics', {})
            rows.append({
                'attribute': attr,
                **{k: round(v, 3) for k, v in m.items()},
                'rule': info.get('rule'),
            })
        df = pd.DataFrame(rows).sort_values('attribute')
        display(df)

if 'regex_eval_results' in globals():
    summarize_eval('Regex Extraction (title)', regex_eval_results)
else:
    print('Regex evaluation results not found — run regex cells above.')

if 'llm_eval_results' in globals():
    summarize_eval('LLM Extraction (description)', llm_eval_results)
else:
    print('LLM evaluation results not found — run LLM cells above or install dependencies.')


=== Regex Extraction (title) ===
Micro: {'precision': 0.6, 'recall': 1.0, 'f1': 0.75, 'accuracy': 0.6, 'accuracy_overall': 0.6}
Macro: {'precision': 0.6, 'recall': 1.0, 'f1': 0.704, 'accuracy': 0.6, 'accuracy_overall': 0.6}
Counts: {'Correct (VC)': 12, 'Wrong value (VW)': 8, 'Missed (VN)': 0, 'Spurious (NV)': 0, 'Both missing (NN)': 0}


Unnamed: 0,attribute,precision,recall,f1,accuracy,accuracy_overall,rule
0,brand,0.9,1.0,0.947,0.9,0.9,exact_match
1,model,0.3,1.0,0.462,0.3,0.3,exact_match


=== LLM Extraction (description) ===
Micro: {'precision': 0.6, 'recall': 1.0, 'f1': 0.75, 'accuracy': 0.6, 'accuracy_overall': 0.6}
Macro: {'precision': 0.6, 'recall': 1.0, 'f1': 0.704, 'accuracy': 0.6, 'accuracy_overall': 0.6}
Counts: {'Correct (VC)': 12, 'Wrong value (VW)': 8, 'Missed (VN)': 0, 'Spurious (NV)': 0, 'Both missing (NN)': 0}


Unnamed: 0,attribute,precision,recall,f1,accuracy,accuracy_overall,rule
0,brand,0.9,1.0,0.947,0.9,0.9,exact_match
1,model,0.3,1.0,0.462,0.3,0.3,exact_match


In [27]:
llm_eval_results = ie_eval.evaluate(
    predictions_df=llm_extraction_results,
    gold_df=gpu_df,
    pred_id_column='id',
    gold_id_column='id',
)
summarize_eval('LLM Extraction (description)', llm_eval_results)

=== LLM Extraction (description) ===
Micro: {'precision': 0.625, 'recall': 0.714, 'f1': 0.667, 'accuracy': 0.5, 'accuracy_overall': 0.5}
Macro: {'precision': 0.518, 'recall': 0.618, 'f1': 0.545, 'accuracy': 0.5, 'accuracy_overall': 0.5}
Counts: {'Correct (VC)': 55, 'Wrong value (VW)': 33, 'Missed (VN)': 22, 'Spurious (NV)': 0, 'Both missing (NN)': 0}


Unnamed: 0,attribute,precision,recall,f1,accuracy,accuracy_overall,rule
10,brand,0.9,1.0,0.947,0.9,0.9,exact_match
6,chipset,0.0,0.0,0.0,0.0,0.0,exact_match
0,clock_speed_mhz,0.0,0.0,0.0,0.0,0.0,exact_match
5,description,0.6,1.0,0.75,0.6,0.6,exact_match
2,launch_date,0.0,0.0,0.0,0.0,0.0,exact_match
1,memory_gb,1.0,1.0,1.0,1.0,1.0,exact_match
8,memory_type,1.0,0.8,0.889,0.8,0.8,exact_match
7,model,0.3,1.0,0.462,0.3,0.3,exact_match
4,name,0.0,0.0,0.0,0.0,0.0,exact_match
3,price_usd,1.0,1.0,1.0,1.0,1.0,exact_match
