# Exercise 3 - Information Extraction with PyDI (Solution)

This notebook provides complete solutions for the information extraction exercise using PyDI.

In [1]:
import pandas as pd
import numpy as np
import sys
import os
import re
import json
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

from PyDI.informationextraction import LLMExtractor
from pydantic import BaseModel, Field
from typing import Optional
    


load_dotenv()

# Add PyDI to path
sys.path.append('../../../')

# Import PyDI information extraction modules
from PyDI.informationextraction import RegexExtractor, CodeExtractor, ExtractorPipeline
from PyDI.informationextraction.rules import built_in_rules

# Import evaluation utilities
sys.path.append('../Task/')
from evaluation import load_jsonl_targets, evaluate_predictions, print_evaluation_results

NLTK not available. Advanced tokenization features will be limited.


## Task 3.1 Solution: Load and Explore the Dataset

In [2]:
# Load the target data from JSONL format
targets_df = load_jsonl_targets('../Task/input/oa-mine_test.jsonl')

# Display basic information about the dataset
print(f"Dataset shape: {targets_df.shape}")
print(f"Columns: {list(targets_df.columns)}")
print("\nFirst few examples:")
targets_df.head()

Dataset shape: (491, 50)
Columns: ['input', 'category', 'Brand', 'Gender', 'Model name', 'Shoe type', 'Color', 'Size', 'Material', 'Sport', 'Supplement type', 'Dosage', 'Health benefit', 'Net content', 'Item form', 'Pack size', 'Diet', 'Supply size', 'Age', 'Flavor', 'Specialty', 'Administration type', 'Roast type', 'Caffeine content', 'Country', 'Machine type', 'Power source', 'Operation mode', 'Firmness', 'Animal', 'Cap', 'Sub brand', 'Cereal type', 'Container', 'Protection level', 'Scent', 'Skin type', 'Volume', 'Hair type', 'Benefit', 'Specific uses', 'Load size', 'Reusability', 'Mask type', 'Closure type', 'Age range', 'Package type', 'Layer', 'Certified grade', 'Tea variety']

First few examples:


Unnamed: 0,input,category,Brand,Gender,Model name,Shoe type,Color,Size,Material,Sport,...,Specific uses,Load size,Reusability,Mask type,Closure type,Age range,Package type,Layer,Certified grade,Tea variety
0,Diesel Men's Exposure High-Top Sneaker,Shoes,Diesel,Men's,Exposure,High-Top Sneaker,,,,,...,,,,,,,,,,
1,"Florsheim Men's Milano Slip-On Loafer,Burgundy...",Shoes,Florsheim,Men's,Milano,Slip-On Loafer,Burgundy,10 D US,,,...,,,,,,,,,,
2,Finn Comfort Women's 2051-014099,Shoes,Finn Comfort,Women's,2051-014099,,,,,,...,,,,,,,,,,
3,Pearl Izumi Men's X-Alp Low Shoe-49-Black/Moon...,Shoes,Pearl Izumi,Men's,X-Alp Low,,Black/Moonlight,49,,,...,,,,,,,,,,
4,G.H. Bass & Co. Women's Margie Sandal,Shoes,G.H. Bass & Co.,Women's,Margie,Sandal,,,,,...,,,,,,,,,,


In [3]:
# Examine the distribution of categories
print("Category distribution:")
print(targets_df['category'].value_counts())

# Look at available attributes
attribute_cols = [col for col in targets_df.columns if col not in ['input', 'category']]
print(f"\nAvailable attributes: {attribute_cols}")

# Check attribute coverage
for attr in attribute_cols:
    non_null_count = targets_df[attr].notna().sum()
    print(f"{attr}: {non_null_count}/{len(targets_df)} ({100*non_null_count/len(targets_df):.1f}%)")

Category distribution:
category
Vitamin              50
Coffee               50
Toothbrush           50
Breakfast Cereal     50
Sunscreen            50
Safety Mask          50
Tea                  50
Shoes                48
Laundry Detergent    48
Conditioner          45
Name: count, dtype: int64

Available attributes: ['Brand', 'Gender', 'Model name', 'Shoe type', 'Color', 'Size', 'Material', 'Sport', 'Supplement type', 'Dosage', 'Health benefit', 'Net content', 'Item form', 'Pack size', 'Diet', 'Supply size', 'Age', 'Flavor', 'Specialty', 'Administration type', 'Roast type', 'Caffeine content', 'Country', 'Machine type', 'Power source', 'Operation mode', 'Firmness', 'Animal', 'Cap', 'Sub brand', 'Cereal type', 'Container', 'Protection level', 'Scent', 'Skin type', 'Volume', 'Hair type', 'Benefit', 'Specific uses', 'Load size', 'Reusability', 'Mask type', 'Closure type', 'Age range', 'Package type', 'Layer', 'Certified grade', 'Tea variety']
Brand: 473/491 (96.3%)
Gender: 66/491 (13.4

In [4]:
# Look at some example product descriptions
print("Sample product descriptions with their attributes:")
for i in range(3):
    print(f"\n{i+1}. {targets_df['input'].iloc[i]}")
    for attr in attribute_cols:
        value = targets_df[attr].iloc[i]
        if pd.notna(value):
            print(f"   {attr}: {value}")

Sample product descriptions with their attributes:

1. Diesel Men's Exposure High-Top Sneaker
   Brand: Diesel
   Gender: Men's
   Model name: Exposure
   Shoe type: High-Top Sneaker

2. Florsheim Men's Milano Slip-On Loafer,Burgundy,10 D US
   Brand: Florsheim
   Gender: Men's
   Model name: Milano
   Shoe type: Slip-On Loafer
   Color: Burgundy
   Size: 10 D US

3. Finn Comfort Women's 2051-014099
   Brand: Finn Comfort
   Gender: Women's
   Model name: 2051-014099


In [5]:
# Now lets create our dataframe with just the input column
working_df = targets_df[['input']]
working_df.head()

Unnamed: 0,input
0,Diesel Men's Exposure High-Top Sneaker
1,"Florsheim Men's Milano Slip-On Loafer,Burgundy..."
2,Finn Comfort Women's 2051-014099
3,Pearl Izumi Men's X-Alp Low Shoe-49-Black/Moon...
4,G.H. Bass & Co. Women's Margie Sandal


## Task 3.2 Solution: Basic Regex-Based Extraction

In [6]:
# Define comprehensive regex rules for extracting product attributes
regex_rules = {
    "Brand": {
        "source_column": "input",
        "pattern": [
            r"^([A-Z][A-Za-z\s&\.]+?)\s+(?:Men's|Women's|Boys'|Girls'|Mens|Womens|for|\d)",
            r"^([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*?)\s+",  # Capitalized words at start
        ],
        "group": 1,
        "postprocess": "strip"
    },
    "Gender": {
        "source_column": "input",
        "pattern": r"\b(Men's|Women's|Boys'|Girls'|Mens|Womens|Male|Female)\b",
        "group": 1,
        "flags": re.IGNORECASE,
        "postprocess": "strip"  
    },
    "Size": {
        "source_column": "input", 
        "pattern": [
            r"\b(\d+(?:\.\d+)?\s*[A-Z]*\s*US)\b",  # US sizes
            r"\b(\d+(?:\.\d+)?)\s*([ABCDEFGHIJK]*)\s*(US|UK|EU)\b",  # International sizes
            r"\b(Size\s+\d+(?:\.\d+)?)\b",  # "Size 10" format
            r"\b(\d+(?:\.\d+)?)$",  # Numbers at end
        ],
        "flags": re.IGNORECASE
    },
    "Color": {
        "source_column": "input",
        "pattern": r"\b(Black|White|Brown|Red|Blue|Green|Yellow|Orange|Purple|Pink|Gray|Grey|Tan|Beige|Navy|Burgundy|Khaki|Silver|Gold)\b",
        "flags": re.IGNORECASE,
        "postprocess": "strip"  
    }
}

# Create the RegexExtractor
regex_extractor = RegexExtractor(
    rules=regex_rules,
    default_source="input",
    debug=False
)

print("RegexExtractor created with rules for:", list(regex_rules.keys()))
print("\nNote: Removed 'title' postprocess to avoid 'Unknown transformation' errors")

RegexExtractor created with rules for: ['Brand', 'Gender', 'Size', 'Color']

Note: Removed 'title' postprocess to avoid 'Unknown transformation' errors


In [7]:
# Apply the regex extractor to the dataset
regex_results = regex_extractor.extract(working_df.copy())

# Display some results
print("Regex extraction results (first 10 rows):")
cols_to_show = ['input', 'Brand', 'Gender', 'Size', 'Color']
print(regex_results[cols_to_show].head(10).to_string())

# Show extraction success rates
print("\nExtraction success rates:")
for attr in ['Brand', 'Gender', 'Size', 'Color']:
    success_rate = regex_results[attr].notna().sum() / len(regex_results) * 100
    print(f"{attr}: {success_rate:.1f}% ({regex_results[attr].notna().sum()}/{len(regex_results)})")

Regex extraction results (first 10 rows):
                                                           input            Brand   Gender     Size     Color
0                         Diesel Men's Exposure High-Top Sneaker           Diesel    Men's     None      None
1         Florsheim Men's Milano Slip-On Loafer,Burgundy,10 D US        Florsheim    Men's  10 D US  Burgundy
2                               Finn Comfort Women's 2051-014099     Finn Comfort  Women's   014099      None
3            Pearl Izumi Men's X-Alp Low Shoe-49-Black/Moonlight      Pearl Izumi    Men's     None     Black
4                          G.H. Bass & Co. Women's Margie Sandal  G.H. Bass & Co.  Women's     None      None
5                                Stacy Adams Men's Raynor Oxford      Stacy Adams    Men's     None      None
6  Fila Men's Hometown Extra-M, Black/White/Vintage Red, 10 M US             Fila    Men's  10 M US     Black
7        Cole Haan Men's Howland Penny, T.Moro Crocodile, 8 M US        Cole H

### Evaluation of Regex Extraction

In [8]:
# Evaluate regex extraction performance
regex_attributes = ['Brand', 'Gender', 'Size', 'Color']     
regex_eval = evaluate_predictions(regex_results, targets_df, regex_attributes)

print("=== REGEX EXTRACTION EVALUATION ===")
print_evaluation_results(regex_eval)

# Store for comparison later
regex_evaluation = regex_eval

=== REGEX EXTRACTION EVALUATION ===
INFORMATION EXTRACTION EVALUATION RESULTS

--- MICRO SCORES (Overall Performance) ---
Precision: 41.36%
Recall:    12.42%
F1 Score:  19.11%

--- MACRO SCORES (Average Across Attributes) ---
Precision: 57.96%
Recall:    12.42%
F1 Score:  17.77%

--- ATTRIBUTE-LEVEL RESULTS ---

Brand:
  Precision: 32.63%
  Recall:    25.25%
  F1 Score:  28.47%
  Counts: VC=124, VW=256, VN=111, NV=0, NN=0

Gender:
  Precision: 93.88%
  Recall:    9.37%
  F1 Score:  17.04%
  Counts: VC=46, VW=3, VN=442, NV=0, NN=0

Size:
  Precision: 66.67%
  Recall:    5.70%
  F1 Score:  10.51%
  Counts: VC=28, VW=14, VN=449, NV=0, NN=0

Color:
  Precision: 38.66%
  Recall:    9.37%
  F1 Score:  15.08%
  Counts: VC=46, VW=73, VN=372, NV=0, NN=0

--- TOTAL COUNTS ---
Valid Correct (VC):      244
Valid Wrong (VW):        346
Valid Missing (VN):      1374
Invalid Extra (NV):      0
No Target/Prediction:    0


## Task 3.3 Solution: Custom Code-Based Extraction

In [9]:
# Define custom extraction functions
def extract_gender(text):
    """Extract gender information from product text."""
    text_lower = text.lower()
    
    if "men's" in text_lower or "mens" in text_lower:
        return "Men's"
    elif "women's" in text_lower or "womens" in text_lower:
        return "Women's"
    elif "boys'" in text_lower or "boys" in text_lower:
        return "Boys'"
    elif "girls'" in text_lower or "girls" in text_lower:
        return "Girls'"
    elif "unisex" in text_lower:
        return "Unisex"
    
    return None

def extract_shoe_type(text):
    """Extract shoe type from product description."""
    text_lower = text.lower()
    
    shoe_types = {
        'sneaker': ['sneaker', 'tennis shoe', 'athletic shoe'],
        'boot': ['boot', 'bootie'],
        'sandal': ['sandal', 'flip-flop'],
        'loafer': ['loafer', 'slip-on'],
        'oxford': ['oxford'],
        'pump': ['pump'],
        'flat': ['flat', 'ballet flat'],
        'heel': ['heel', 'high heel'],
        'moccasin': ['moccasin'],
        'clog': ['clog']
    }
    
    for shoe_type, patterns in shoe_types.items():
        for pattern in patterns:
            if pattern in text_lower:
                return shoe_type.title()
    
    # Check for "high-top" or "low-top"
    if "high-top" in text_lower:
        return "High-Top Sneaker"
    elif "low-top" in text_lower:
        return "Low-Top Sneaker"
    
    return None

def extract_size(text):
    """Extract size information from product text."""
    import re
    
    # Look for various size patterns
    patterns = [
        r'(\d+(?:\.\d+)?\s*[A-Z]*\s*US)',  # "10 D US", "9.5 US"
        r'(\d+(?:\.\d+)?)-([A-Z]+)',       # "10-D", "9.5-EE"
        r'Size\s+(\d+(?:\.\d+)?)',         # "Size 10"
        r',(\d+(?:\.\d+)?(?:\s*[A-Z]*)?)\s*$',  # ",10" at end
        r'-(\d+(?:\.\d+)?)(?:-|$)',        # "-10" or "-10-"
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).strip()
    
    return None

def extract_model_name(text):
    """Extract model name from product text."""
    import re
    
    # Remove brand name if it's at the beginning
    text_parts = text.split()
    if len(text_parts) < 3:
        return None
    
    # Skip brand (first word) and gender (if present)
    start_idx = 1
    if len(text_parts) > 1 and text_parts[1].lower() in ['men\'s', 'women\'s', 'boys\'', 'girls\'']:
        start_idx = 2
    
    # Look for model name (usually 1-3 capitalized words)
    model_parts = []
    for i in range(start_idx, min(start_idx + 3, len(text_parts))):
        word = text_parts[i]
        # Stop at common shoe descriptors
        if word.lower() in ['shoe', 'boot', 'sneaker', 'sandal', 'loafer', 'pump', 'flat', 'heel']:
            break
        if re.match(r'^[A-Z]', word):  # Starts with capital
            model_parts.append(word)
        elif word.isdigit() or '-' in word:  # Model numbers
            model_parts.append(word)
        else:
            break
    
    return ' '.join(model_parts) if model_parts else None


In [10]:
# Define code extraction rules
code_rules = {
    "Gender": {"source_column": "input", "function": extract_gender},
    "Shoe type": {"source_column": "input", "function": extract_shoe_type},
    "Size": {"source_column": "input", "function": extract_size},
    "Model name": {"source_column": "input", "function": extract_model_name}
}

# Create and apply CodeExtractor
code_extractor = CodeExtractor(
    functions=code_rules,
    default_source="input",
    debug=False
)

code_results = code_extractor.extract(working_df.copy())

print("Code extraction results (first 2 rows):")
cols_to_show = ['input', 'Gender', 'Shoe type', 'Size', 'Model name']
display_df = code_results[cols_to_show].head(2)
for idx, row in display_df.iterrows():
    print(f"\n{idx+1}. {row['input'][:60]}...")
    for col in cols_to_show[1:]:
        if pd.notna(row[col]):
            print(f"   {col}: {row[col]}")

# Show extraction success rates
print("\nCode extraction success rates:")
for attr in ['Gender', 'Shoe type', 'Size', 'Model name']:
    success_rate = code_results[attr].notna().sum() / len(code_results) * 100
    print(f"{attr}: {success_rate:.1f}% ({code_results[attr].notna().sum()}/{len(code_results)})")

Code extraction results (first 2 rows):

1. Diesel Men's Exposure High-Top Sneaker...
   Gender: Men's
   Shoe type: Sneaker
   Model name: Exposure High-Top

2. Florsheim Men's Milano Slip-On Loafer,Burgundy,10 D US...
   Gender: Men's
   Shoe type: Loafer
   Size: 10 D US
   Model name: Milano Slip-On Loafer,Burgundy,10

Code extraction success rates:
Gender: 12.2% (60/491)
Shoe type: 4.3% (21/491)
Size: 12.8% (63/491)
Model name: 92.7% (455/491)


### Evaluation of Code Extraction

In [11]:
# Evaluate code extraction performance
code_attributes = ['Gender', 'Shoe type', 'Size', 'Model name']  
code_eval = evaluate_predictions(code_results, targets_df, code_attributes)

print("=== CODE EXTRACTION EVALUATION ===")
print_evaluation_results(code_eval)

=== CODE EXTRACTION EVALUATION ===
INFORMATION EXTRACTION EVALUATION RESULTS

--- MICRO SCORES (Overall Performance) ---
Precision: 11.69%
Recall:    3.56%
F1 Score:  5.46%

--- MACRO SCORES (Average Across Attributes) ---
Precision: 36.03%
Recall:    3.56%
F1 Score:  6.37%

--- ATTRIBUTE-LEVEL RESULTS ---

Gender:
  Precision: 55.00%
  Recall:    6.72%
  F1 Score:  11.98%
  Counts: VC=33, VW=27, VN=431, NV=0, NN=0

Shoe type:
  Precision: 47.62%
  Recall:    2.04%
  F1 Score:  3.91%
  Counts: VC=10, VW=11, VN=470, NV=0, NN=0

Size:
  Precision: 41.27%
  Recall:    5.30%
  F1 Score:  9.39%
  Counts: VC=26, VW=37, VN=428, NV=0, NN=0

Model name:
  Precision: 0.22%
  Recall:    0.20%
  F1 Score:  0.21%
  Counts: VC=1, VW=454, VN=36, NV=0, NN=0

--- TOTAL COUNTS ---
Valid Correct (VC):      70
Valid Wrong (VW):        529
Valid Missing (VN):      1365
Invalid Extra (NV):      0
No Target/Prediction:    0


## Task 3.4 Solution: Combining Extractors with Pipeline

In [12]:
# Create an ExtractorPipeline combining regex and code extractors
pipeline = ExtractorPipeline([regex_extractor, code_extractor])

# Apply the pipeline
pipeline_results = pipeline.run(working_df.copy())

print("Pipeline extraction results (first 5 rows):")
extracted_cols = ['Brand', 'Gender', 'Model name', 'Shoe type', 'Size', 'Color']
display_cols = ['input'] + [col for col in extracted_cols if col in pipeline_results.columns]

for idx, row in pipeline_results[display_cols].head(3).iterrows():
    print(f"\n{idx+1}. {row['input'][:80]}...")
    for col in display_cols[1:]:
        if pd.notna(row[col]):
            print(f"   {col}: {row[col]}")

# Show overall extraction success rates
print("\nOverall pipeline extraction success rates:")
for attr in display_cols[1:]:
    if attr in pipeline_results.columns:
        success_rate = pipeline_results[attr].notna().sum() / len(pipeline_results) * 100
        print(f"{attr}: {success_rate:.1f}% ({pipeline_results[attr].notna().sum()}/{len(pipeline_results)})")

Pipeline extraction results (first 5 rows):

1. Diesel Men's Exposure High-Top Sneaker...
   Brand: Diesel
   Gender: Men's
   Model name: Exposure High-Top
   Shoe type: Sneaker

2. Florsheim Men's Milano Slip-On Loafer,Burgundy,10 D US...
   Brand: Florsheim
   Gender: Men's
   Model name: Milano Slip-On Loafer,Burgundy,10
   Shoe type: Loafer
   Size: 10 D US
   Color: Burgundy

3. Finn Comfort Women's 2051-014099...
   Brand: Finn Comfort
   Gender: Men's
   Model name: Comfort Women's 2051-014099
   Size: 014099

Overall pipeline extraction success rates:
Brand: 77.4% (380/491)
Gender: 12.2% (60/491)
Model name: 92.7% (455/491)
Shoe type: 4.3% (21/491)
Size: 12.8% (63/491)
Color: 24.2% (119/491)


### Evaluation

In [13]:
# Prepare data for evaluation
# Get the list of attribute columns that exist in both predicted and target data
target_attributes = [col for col in targets_df.columns if col not in ['input']]
predicted_attributes = [col for col in pipeline_results.columns if col not in ['input']]

# Find common attributes for evaluation
common_attributes = list(set(target_attributes) & set(predicted_attributes))
print(f"Attributes available for evaluation: {common_attributes}")

# Ensure both dataframes have the same length and index
eval_targets = targets_df.copy().reset_index(drop=True)
eval_predictions = pipeline_results.copy().reset_index(drop=True)

# Add missing columns with None values
for attr in target_attributes:
    if attr not in eval_predictions.columns:
        eval_predictions[attr] = None

# Run evaluation
results = evaluate_predictions(eval_predictions, eval_targets, common_attributes)

# Print results
print_evaluation_results(results)

Attributes available for evaluation: ['Size', 'Brand', 'Shoe type', 'Gender', 'Model name', 'Color']
INFORMATION EXTRACTION EVALUATION RESULTS

--- MICRO SCORES (Overall Performance) ---
Precision: 21.86%
Recall:    8.15%
F1 Score:  11.87%

--- MACRO SCORES (Average Across Attributes) ---
Precision: 35.90%
Recall:    8.15%
F1 Score:  11.51%

--- ATTRIBUTE-LEVEL RESULTS ---

Size:
  Precision: 41.27%
  Recall:    5.30%
  F1 Score:  9.39%
  Counts: VC=26, VW=37, VN=428, NV=0, NN=0

Brand:
  Precision: 32.63%
  Recall:    25.25%
  F1 Score:  28.47%
  Counts: VC=124, VW=256, VN=111, NV=0, NN=0

Shoe type:
  Precision: 47.62%
  Recall:    2.04%
  F1 Score:  3.91%
  Counts: VC=10, VW=11, VN=470, NV=0, NN=0

Gender:
  Precision: 55.00%
  Recall:    6.72%
  F1 Score:  11.98%
  Counts: VC=33, VW=27, VN=431, NV=0, NN=0

Model name:
  Precision: 0.22%
  Recall:    0.20%
  F1 Score:  0.21%
  Counts: VC=1, VW=454, VN=36, NV=0, NN=0

Color:
  Precision: 38.66%
  Recall:    9.37%
  F1 Score:  15.08%


## Task 3.5 Solution: Analysis and Improvement

In [14]:
# Analyze the results
print("=== DETAILED ANALYSIS ===")

# 1. Which attributes had the best/worst performance?
print("\n1. Attribute Performance Ranking:")
attr_scores = [(attr, res['scores']['f1']) for attr, res in results['attribute_results'].items()]
attr_scores.sort(key=lambda x: x[1], reverse=True)

for attr, f1_score in attr_scores:
    counts = results['attribute_results'][attr]['counts']
    print(f"   {attr}: F1={f1_score:.1f}% (VC:{counts['VC']}, VW:{counts['VW']}, VN:{counts['VN']})")

# 2. Look at some examples where extraction failed
print("\n2. Failed Extraction Examples:")
worst_attr = attr_scores[-1][0] if attr_scores else common_attributes[0]
print(f"\nAnalyzing failures for '{worst_attr}':")

failed_examples = []
for i in range(len(eval_predictions)):
    target_val = eval_targets[worst_attr].iloc[i] if worst_attr in eval_targets.columns else None
    pred_val = eval_predictions[worst_attr].iloc[i] if worst_attr in eval_predictions.columns else None
    
    # Cases where we had a target but missed it (VN) or got it wrong (VW)
    if pd.notna(target_val):
        if pd.isna(pred_val):  # Missed (VN)
            failed_examples.append((i, eval_targets['input'].iloc[i], target_val, pred_val, 'MISSED'))
        elif str(target_val).strip().lower() != str(pred_val).strip().lower():  # Wrong (VW)
            failed_examples.append((i, eval_targets['input'].iloc[i], target_val, pred_val, 'WRONG'))

# Show first few failures
for i, (idx, input_text, target, pred, error_type) in enumerate(failed_examples[:5]):
    print(f"\n   Example {i+1} ({error_type}):")
    print(f"   Text: {input_text}")
    print(f"   Target: {target}")
    print(f"   Predicted: {pred}")

=== DETAILED ANALYSIS ===

1. Attribute Performance Ranking:
   Brand: F1=28.5% (VC:124, VW:256, VN:111)
   Color: F1=15.1% (VC:46, VW:73, VN:372)
   Gender: F1=12.0% (VC:33, VW:27, VN:431)
   Size: F1=9.4% (VC:26, VW:37, VN:428)
   Shoe type: F1=3.9% (VC:10, VW:11, VN:470)
   Model name: F1=0.2% (VC:1, VW:454, VN:36)

2. Failed Extraction Examples:

Analyzing failures for 'Model name':

   Example 1 (WRONG):
   Text: Diesel Men's Exposure High-Top Sneaker
   Target: Exposure
   Predicted: Exposure High-Top

   Example 2 (WRONG):
   Text: Florsheim Men's Milano Slip-On Loafer,Burgundy,10 D US
   Target: Milano
   Predicted: Milano Slip-On Loafer,Burgundy,10

   Example 3 (WRONG):
   Text: Finn Comfort Women's 2051-014099
   Target: 2051-014099
   Predicted: Comfort Women's 2051-014099

   Example 4 (WRONG):
   Text: Pearl Izumi Men's X-Alp Low Shoe-49-Black/Moonlight
   Target: X-Alp Low
   Predicted: Izumi Men's X-Alp

   Example 5 (WRONG):
   Text: G.H. Bass & Co. Women's Margie Sand

## Bonus Task 3.6 Solution: LLM-Based Extraction (Optional)

In [15]:
# Check for OpenAI API key
api_key = os.getenv('OPENAI_API_KEY')
if api_key:
    print("✅ OPENAI_API_KEY found in environment")
    print(f"   Key starts with: {api_key[:10]}...")
else:
    print("❌ OPENAI_API_KEY not found in environment")
    print("   Set it with: os.environ['OPENAI_API_KEY'] = 'your-api-key'")
    print("   Or export OPENAI_API_KEY='your-api-key' in your shell")


# Initialize OpenAI chat model
chat_model = ChatOpenAI(
    model="gpt-5-nano",  
    max_tokens=500,        # Reasonable limit for structured output
    temperature=0.0,      # Deterministic output
    reasoning_effort="minimal",  
)

print(f"✅ Configured {chat_model.model_name} with temperature={chat_model.temperature}")

✅ OPENAI_API_KEY found in environment
   Key starts with: sk-proj-qH...


✅ Configured gpt-5-nano with temperature=None


In [16]:
class Product(BaseModel):
    Brand: Optional[str] = Field(None, description="Product brand or manufacturer")
    Gender: Optional[str] = Field(None, description="Target gender (Men's, Women's, etc.)")
    model_name: Optional[str] = Field(None, alias="Model name", description="Product model or name")
    shoe_type: Optional[str] = Field(None, alias="Shoe type", description="Type of shoe (sneaker, boot, sandal, etc.)")
    Color: Optional[str] = Field(None, description="Primary color of the product")  
    Size: Optional[str] = Field(None, description="Size information")
    
    class Config:
        populate_by_name = True  # Allow using both field name and alias
    
# Create LLM extractor with corrected schema
llm_extractor = LLMExtractor(
    chat_model=chat_model,
    schema=Product,
    source_column="input",
    system_prompt="Extract product attributes from the description. Return JSON with these exact field names: Brand, Gender, 'Model name', 'Shoe type', Color, Size.",
)

llm_results = llm_extractor.extract(working_df.head(5))    

In [17]:
# rename llm_results col model_name to Model name
llm_results.rename(columns={'model_name': 'Model name'}, inplace=True)
llm_results.rename(columns={'shoe_type': 'Shoe type'}, inplace=True)

In [18]:
target_attributes = [col for col in targets_df.columns if col not in ['input']]
predicted_attributes = [col for col in llm_results.columns if col not in ['input']]

# Find common attributes for evaluation
common_attributes = list(set(target_attributes) & set(predicted_attributes))
print(f"\nEvaluating on attributes: {common_attributes}")

if common_attributes:
    llm_eval = evaluate_predictions(llm_results, targets_df.head(5), common_attributes)
    print_evaluation_results(llm_eval)
else:
    print("No matching attributes found for evaluation!")


Evaluating on attributes: ['Size', 'Brand', 'Shoe type', 'Gender', 'Model name', 'Color']
INFORMATION EXTRACTION EVALUATION RESULTS

--- MICRO SCORES (Overall Performance) ---
Precision: 65.22%
Recall:    50.00%
F1 Score:  56.60%

--- MACRO SCORES (Average Across Attributes) ---
Precision: 70.83%
Recall:    50.00%
F1 Score:  56.08%

--- ATTRIBUTE-LEVEL RESULTS ---

Size:
  Precision: 100.00%
  Recall:    40.00%
  F1 Score:  57.14%
  Counts: VC=2, VW=0, VN=3, NV=0, NN=0

Brand:
  Precision: 100.00%
  Recall:    100.00%
  F1 Score:  100.00%
  Counts: VC=5, VW=0, VN=0, NV=0, NN=0

Shoe type:
  Precision: 25.00%
  Recall:    20.00%
  F1 Score:  22.22%
  Counts: VC=1, VW=3, VN=1, NV=0, NN=0

Gender:
  Precision: 20.00%
  Recall:    20.00%
  F1 Score:  20.00%
  Counts: VC=1, VW=4, VN=0, NV=0, NN=0

Model name:
  Precision: 80.00%
  Recall:    80.00%
  F1 Score:  80.00%
  Counts: VC=4, VW=1, VN=0, NV=0, NN=0

Color:
  Precision: 100.00%
  Recall:    40.00%
  F1 Score:  57.14%
  Counts: VC=2,

## 3.7 Open Schema data extraction with LLM


In [19]:
# Create LLM extractor with corrected schema
open_llm_extractor = LLMExtractor(
    chat_model=chat_model,
    source_column="input",
    system_prompt="Extract product attributes from the description. Return JSON with these exact field names",
)


test_df = targets_df.copy().head(5)  
open_llm_results = open_llm_extractor.extract(test_df[['input']].copy())


In [20]:
open_llm_results

Unnamed: 0,input,extracted
0,Diesel Men's Exposure High-Top Sneaker,"{""brand"": ""Diesel"", ""product_type"": ""Sneaker"",..."
1,"Florsheim Men's Milano Slip-On Loafer,Burgundy...","{""brand"": ""Florsheim"", ""product_type"": ""Slip-O..."
2,Finn Comfort Women's 2051-014099,"{""product_name"": ""Finn Comfort Women's 2051-01..."
3,Pearl Izumi Men's X-Alp Low Shoe-49-Black/Moon...,"{""brand"": ""Pearl Izumi"", ""product_line"": ""X-Al..."
4,G.H. Bass & Co. Women's Margie Sandal,"{""brand"": ""G.H. Bass & Co."", ""product_type"": ""..."


In [21]:
# As the LLM returns a string, we need to parse the string into a dictionary that we can then convert to a dataframe
open_llm_results = pd.concat(
    [
        open_llm_results.drop(columns=['extracted']),
        open_llm_results['extracted'].apply(lambda x: pd.Series(json.loads(x) if isinstance(x, str) else {}))
    ],
    axis=1
)

In [22]:
open_llm_results

Unnamed: 0,input,brand,product_type,style,gender,model,color,size,product_name,model_number,product_line
0,Diesel Men's Exposure High-Top Sneaker,Diesel,Sneaker,High-Top,Men,,,,,,
1,"Florsheim Men's Milano Slip-On Loafer,Burgundy...",Florsheim,Slip-On Loafer,,Men,Milano,Burgundy,10 D,,,
2,Finn Comfort Women's 2051-014099,Finn Comfort,,,Women's,,,,Finn Comfort Women's 2051-014099,2051-014099,
3,Pearl Izumi Men's X-Alp Low Shoe-49-Black/Moon...,Pearl Izumi,,,Men's,Low Shoe,Black/Moonlight,49,,,X-Alp
4,G.H. Bass & Co. Women's Margie Sandal,G.H. Bass & Co.,Sandal,,Women,Margie,,,,,
