# Exercise 3 - Information Extraction with PyDI (Solution)

This notebook provides complete solutions for the information extraction exercise using PyDI.

In [1]:
import pandas as pd
import numpy as np
import sys
import os
import re
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

from PyDI.informationextraction import LLMExtractor
from pydantic import BaseModel, Field
from typing import Optional
    


load_dotenv()

# Add PyDI to path
sys.path.append('../../../')

# Import PyDI information extraction modules
from PyDI.informationextraction import RegexExtractor, CodeExtractor, ExtractorPipeline
from PyDI.informationextraction.rules import built_in_rules

# Import evaluation utilities
sys.path.append('../Task/')
from evaluation import load_jsonl_targets, evaluate_predictions, print_evaluation_results

NLTK not available. Advanced tokenization features will be limited.


## Task 3.1 Solution: Load and Explore the Dataset

In [2]:
# Load the target data from JSONL format
targets_df = load_jsonl_targets('../Task/input/oa-mine_test.jsonl')

# Display basic information about the dataset
print(f"Dataset shape: {targets_df.shape}")
print(f"Columns: {list(targets_df.columns)}")
print("\nFirst few examples:")
targets_df.head()

Dataset shape: (491, 50)
Columns: ['input', 'category', 'Brand', 'Gender', 'Model name', 'Shoe type', 'Color', 'Size', 'Material', 'Sport', 'Supplement type', 'Dosage', 'Health benefit', 'Net content', 'Item form', 'Pack size', 'Diet', 'Supply size', 'Age', 'Flavor', 'Specialty', 'Administration type', 'Roast type', 'Caffeine content', 'Country', 'Machine type', 'Power source', 'Operation mode', 'Firmness', 'Animal', 'Cap', 'Sub brand', 'Cereal type', 'Container', 'Protection level', 'Scent', 'Skin type', 'Volume', 'Hair type', 'Benefit', 'Specific uses', 'Load size', 'Reusability', 'Mask type', 'Closure type', 'Age range', 'Package type', 'Layer', 'Certified grade', 'Tea variety']

First few examples:


Unnamed: 0,input,category,Brand,Gender,Model name,Shoe type,Color,Size,Material,Sport,...,Specific uses,Load size,Reusability,Mask type,Closure type,Age range,Package type,Layer,Certified grade,Tea variety
0,Diesel Men's Exposure High-Top Sneaker,Shoes,Diesel,Men's,Exposure,High-Top Sneaker,,,,,...,,,,,,,,,,
1,"Florsheim Men's Milano Slip-On Loafer,Burgundy...",Shoes,Florsheim,Men's,Milano,Slip-On Loafer,Burgundy,10 D US,,,...,,,,,,,,,,
2,Finn Comfort Women's 2051-014099,Shoes,Finn Comfort,Women's,2051-014099,,,,,,...,,,,,,,,,,
3,Pearl Izumi Men's X-Alp Low Shoe-49-Black/Moon...,Shoes,Pearl Izumi,Men's,X-Alp Low,,Black/Moonlight,49,,,...,,,,,,,,,,
4,G.H. Bass & Co. Women's Margie Sandal,Shoes,G.H. Bass & Co.,Women's,Margie,Sandal,,,,,...,,,,,,,,,,


In [3]:
# Examine the distribution of categories
print("Category distribution:")
print(targets_df['category'].value_counts())

# Look at available attributes
attribute_cols = [col for col in targets_df.columns if col not in ['input', 'category']]
print(f"\nAvailable attributes: {attribute_cols}")

# Check attribute coverage
for attr in attribute_cols:
    non_null_count = targets_df[attr].notna().sum()
    print(f"{attr}: {non_null_count}/{len(targets_df)} ({100*non_null_count/len(targets_df):.1f}%)")

Category distribution:
category
Vitamin              50
Coffee               50
Toothbrush           50
Breakfast Cereal     50
Sunscreen            50
Safety Mask          50
Tea                  50
Shoes                48
Laundry Detergent    48
Conditioner          45
Name: count, dtype: int64

Available attributes: ['Brand', 'Gender', 'Model name', 'Shoe type', 'Color', 'Size', 'Material', 'Sport', 'Supplement type', 'Dosage', 'Health benefit', 'Net content', 'Item form', 'Pack size', 'Diet', 'Supply size', 'Age', 'Flavor', 'Specialty', 'Administration type', 'Roast type', 'Caffeine content', 'Country', 'Machine type', 'Power source', 'Operation mode', 'Firmness', 'Animal', 'Cap', 'Sub brand', 'Cereal type', 'Container', 'Protection level', 'Scent', 'Skin type', 'Volume', 'Hair type', 'Benefit', 'Specific uses', 'Load size', 'Reusability', 'Mask type', 'Closure type', 'Age range', 'Package type', 'Layer', 'Certified grade', 'Tea variety']
Brand: 473/491 (96.3%)
Gender: 66/491 (13.4

In [4]:
# Look at some example product descriptions
print("Sample product descriptions with their attributes:")
for i in range(3):
    print(f"\n{i+1}. {targets_df['input'].iloc[i]}")
    for attr in attribute_cols:
        value = targets_df[attr].iloc[i]
        if pd.notna(value):
            print(f"   {attr}: {value}")

Sample product descriptions with their attributes:

1. Diesel Men's Exposure High-Top Sneaker
   Brand: Diesel
   Gender: Men's
   Model name: Exposure
   Shoe type: High-Top Sneaker

2. Florsheim Men's Milano Slip-On Loafer,Burgundy,10 D US
   Brand: Florsheim
   Gender: Men's
   Model name: Milano
   Shoe type: Slip-On Loafer
   Color: Burgundy
   Size: 10 D US

3. Finn Comfort Women's 2051-014099
   Brand: Finn Comfort
   Gender: Women's
   Model name: 2051-014099


## Task 3.2 Solution: Basic Regex-Based Extraction

In [5]:
# Define comprehensive regex rules for extracting product attributes
regex_rules = {
    "Brand": {
        "source_column": "input",
        "pattern": [
            r"^([A-Z][A-Za-z\s&\.]+?)\s+(?:Men's|Women's|Boys'|Girls'|Mens|Womens|for|\d)",
            r"^([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*?)\s+",  # Capitalized words at start
        ],
        "group": 1,
        "postprocess": "strip"
    },
    "Gender": {
        "source_column": "input",
        "pattern": r"\b(Men's|Women's|Boys'|Girls'|Mens|Womens|Male|Female)\b",
        "group": 1,
        "flags": re.IGNORECASE,
        "postprocess": "strip"  # Changed from "title" to "strip" to avoid error
    },
    "Size": {
        "source_column": "input", 
        "pattern": [
            r"\b(\d+(?:\.\d+)?\s*[A-Z]*\s*US)\b",  # US sizes
            r"\b(\d+(?:\.\d+)?)\s*([ABCDEFGHIJK]*)\s*(US|UK|EU)\b",  # International sizes
            r"\b(Size\s+\d+(?:\.\d+)?)\b",  # "Size 10" format
            r"\b(\d+(?:\.\d+)?)$",  # Numbers at end
        ],
        "flags": re.IGNORECASE
    },
    "Color": {
        "source_column": "input",
        "pattern": r"\b(Black|White|Brown|Red|Blue|Green|Yellow|Orange|Purple|Pink|Gray|Grey|Tan|Beige|Navy|Burgundy|Khaki|Silver|Gold)\b",
        "flags": re.IGNORECASE,
        "postprocess": "strip"  # Changed from "title" to "strip" to avoid error
    }
}

# Create the RegexExtractor
regex_extractor = RegexExtractor(
    rules=regex_rules,
    default_source="input",
    debug=False
)

print("RegexExtractor created with rules for:", list(regex_rules.keys()))
print("\nNote: Removed 'title' postprocess to avoid 'Unknown transformation' errors")

RegexExtractor created with rules for: ['Brand', 'Gender', 'Size', 'Color']

Note: Removed 'title' postprocess to avoid 'Unknown transformation' errors


In [6]:
# Apply the regex extractor to the dataset
regex_results = regex_extractor.extract(targets_df.copy())

# Display some results
print("Regex extraction results (first 10 rows):")
cols_to_show = ['input', 'Brand', 'Gender', 'Size', 'Color']
print(regex_results[cols_to_show].head(10).to_string())

# Show extraction success rates
print("\nExtraction success rates:")
for attr in ['Brand', 'Gender', 'Size', 'Color']:
    success_rate = regex_results[attr].notna().sum() / len(regex_results) * 100
    print(f"{attr}: {success_rate:.1f}% ({regex_results[attr].notna().sum()}/{len(regex_results)})")

Regex extraction results (first 10 rows):
                                                           input            Brand   Gender     Size     Color
0                         Diesel Men's Exposure High-Top Sneaker           Diesel    Men's     None      None
1         Florsheim Men's Milano Slip-On Loafer,Burgundy,10 D US        Florsheim    Men's  10 D US  Burgundy
2                               Finn Comfort Women's 2051-014099     Finn Comfort  Women's   014099      None
3            Pearl Izumi Men's X-Alp Low Shoe-49-Black/Moonlight      Pearl Izumi    Men's     None     Black
4                          G.H. Bass & Co. Women's Margie Sandal  G.H. Bass & Co.  Women's     None      None
5                                Stacy Adams Men's Raynor Oxford      Stacy Adams    Men's     None      None
6  Fila Men's Hometown Extra-M, Black/White/Vintage Red, 10 M US             Fila    Men's  10 M US     Black
7        Cole Haan Men's Howland Penny, T.Moro Crocodile, 8 M US        Cole H

## Task 3.3 Solution: Custom Code-Based Extraction

In [7]:
# Define custom extraction functions
def extract_gender(text):
    """Extract gender information from product text."""
    text_lower = text.lower()
    
    if "men's" in text_lower or "mens" in text_lower:
        return "Men's"
    elif "women's" in text_lower or "womens" in text_lower:
        return "Women's"
    elif "boys'" in text_lower or "boys" in text_lower:
        return "Boys'"
    elif "girls'" in text_lower or "girls" in text_lower:
        return "Girls'"
    elif "unisex" in text_lower:
        return "Unisex"
    
    return None

def extract_shoe_type(text):
    """Extract shoe type from product description."""
    text_lower = text.lower()
    
    shoe_types = {
        'sneaker': ['sneaker', 'tennis shoe', 'athletic shoe'],
        'boot': ['boot', 'bootie'],
        'sandal': ['sandal', 'flip-flop'],
        'loafer': ['loafer', 'slip-on'],
        'oxford': ['oxford'],
        'pump': ['pump'],
        'flat': ['flat', 'ballet flat'],
        'heel': ['heel', 'high heel'],
        'moccasin': ['moccasin'],
        'clog': ['clog']
    }
    
    for shoe_type, patterns in shoe_types.items():
        for pattern in patterns:
            if pattern in text_lower:
                return shoe_type.title()
    
    # Check for "high-top" or "low-top"
    if "high-top" in text_lower:
        return "High-Top Sneaker"
    elif "low-top" in text_lower:
        return "Low-Top Sneaker"
    
    return None

def extract_size(text):
    """Extract size information from product text."""
    import re
    
    # Look for various size patterns
    patterns = [
        r'(\d+(?:\.\d+)?\s*[A-Z]*\s*US)',  # "10 D US", "9.5 US"
        r'(\d+(?:\.\d+)?)-([A-Z]+)',       # "10-D", "9.5-EE"
        r'Size\s+(\d+(?:\.\d+)?)',         # "Size 10"
        r',(\d+(?:\.\d+)?(?:\s*[A-Z]*)?)\s*$',  # ",10" at end
        r'-(\d+(?:\.\d+)?)(?:-|$)',        # "-10" or "-10-"
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).strip()
    
    return None

def extract_model_name(text):
    """Extract model name from product text."""
    import re
    
    # Remove brand name if it's at the beginning
    text_parts = text.split()
    if len(text_parts) < 3:
        return None
    
    # Skip brand (first word) and gender (if present)
    start_idx = 1
    if len(text_parts) > 1 and text_parts[1].lower() in ['men\'s', 'women\'s', 'boys\'', 'girls\'']:
        start_idx = 2
    
    # Look for model name (usually 1-3 capitalized words)
    model_parts = []
    for i in range(start_idx, min(start_idx + 3, len(text_parts))):
        word = text_parts[i]
        # Stop at common shoe descriptors
        if word.lower() in ['shoe', 'boot', 'sneaker', 'sandal', 'loafer', 'pump', 'flat', 'heel']:
            break
        if re.match(r'^[A-Z]', word):  # Starts with capital
            model_parts.append(word)
        elif word.isdigit() or '-' in word:  # Model numbers
            model_parts.append(word)
        else:
            break
    
    return ' '.join(model_parts) if model_parts else None

print("Custom extraction functions defined:")
print("- extract_gender")
print("- extract_shoe_type") 
print("- extract_size")
print("- extract_model_name")

Custom extraction functions defined:
- extract_gender
- extract_shoe_type
- extract_size
- extract_model_name


In [8]:
# Define code extraction rules
code_rules = {
    "Gender": {"source_column": "input", "function": extract_gender},
    "Shoe type": {"source_column": "input", "function": extract_shoe_type},
    "Size": {"source_column": "input", "function": extract_size},
    "Model name": {"source_column": "input", "function": extract_model_name}
}

# Create and apply CodeExtractor
code_extractor = CodeExtractor(
    functions=code_rules,
    default_source="input",
    debug=False
)

code_results = code_extractor.extract(targets_df.copy())

print("Code extraction results (first 10 rows):")
cols_to_show = ['input', 'Gender', 'Shoe type', 'Size', 'Model name']
display_df = code_results[cols_to_show].head(10)
for idx, row in display_df.iterrows():
    print(f"\n{idx+1}. {row['input'][:60]}...")
    for col in cols_to_show[1:]:
        if pd.notna(row[col]):
            print(f"   {col}: {row[col]}")

# Show extraction success rates
print("\nCode extraction success rates:")
for attr in ['Gender', 'Shoe type', 'Size', 'Model name']:
    success_rate = code_results[attr].notna().sum() / len(code_results) * 100
    print(f"{attr}: {success_rate:.1f}% ({code_results[attr].notna().sum()}/{len(code_results)})")

Failed to analyze function 'Gender': {'source_column': 'input', 'function': <function extract_gender at 0x323364d60>} is not a callable object
Failed to analyze function 'Shoe type': {'source_column': 'input', 'function': <function extract_shoe_type at 0x323365080>} is not a callable object
Failed to analyze function 'Size': {'source_column': 'input', 'function': <function extract_size at 0x323365120>} is not a callable object
Failed to analyze function 'Model name': {'source_column': 'input', 'function': <function extract_model_name at 0x3233651c0>} is not a callable object


Code extraction results (first 10 rows):

1. Diesel Men's Exposure High-Top Sneaker...
   Gender: Men's
   Shoe type: High-Top Sneaker
   Model name: Exposure

2. Florsheim Men's Milano Slip-On Loafer,Burgundy,10 D US...
   Gender: Men's
   Shoe type: Slip-On Loafer
   Size: 10 D US
   Model name: Milano

3. Finn Comfort Women's 2051-014099...
   Gender: Women's
   Model name: 2051-014099

4. Pearl Izumi Men's X-Alp Low Shoe-49-Black/Moonlight...
   Gender: Men's
   Size: 49
   Model name: X-Alp Low

5. G.H. Bass & Co. Women's Margie Sandal...
   Gender: Women's
   Shoe type: Sandal
   Model name: Margie

6. Stacy Adams Men's Raynor Oxford...
   Gender: Men's
   Shoe type: Oxford
   Model name: Raynor

7. Fila Men's Hometown Extra-M, Black/White/Vintage Red, 10 M U...
   Gender: Men's
   Size: 10 M US
   Model name: Hometown Extra-M

8. Cole Haan Men's Howland Penny, T.Moro Crocodile, 8 M US...
   Gender: Men's
   Size: 8 M US
   Model name: Howland Penny, T.Moro Crocodile

9. Diesel M

## Task 3.4 Solution: Combining Extractors with Pipeline

In [9]:
# Create an ExtractorPipeline combining regex and code extractors
pipeline = ExtractorPipeline([regex_extractor, code_extractor])

# Apply the pipeline
pipeline_results = pipeline.run(targets_df.copy())

print("Pipeline extraction results (first 5 rows):")
extracted_cols = ['Brand', 'Gender', 'Model name', 'Shoe type', 'Size', 'Color']
display_cols = ['input'] + [col for col in extracted_cols if col in pipeline_results.columns]

for idx, row in pipeline_results[display_cols].head(5).iterrows():
    print(f"\n{idx+1}. {row['input'][:80]}...")
    for col in display_cols[1:]:
        if pd.notna(row[col]):
            print(f"   {col}: {row[col]}")

# Show overall extraction success rates
print("\nOverall pipeline extraction success rates:")
for attr in display_cols[1:]:
    if attr in pipeline_results.columns:
        success_rate = pipeline_results[attr].notna().sum() / len(pipeline_results) * 100
        print(f"{attr}: {success_rate:.1f}% ({pipeline_results[attr].notna().sum()}/{len(pipeline_results)})")

Pipeline extraction results (first 5 rows):

1. Diesel Men's Exposure High-Top Sneaker...
   Brand: Diesel
   Gender: Men's
   Model name: Exposure
   Shoe type: High-Top Sneaker

2. Florsheim Men's Milano Slip-On Loafer,Burgundy,10 D US...
   Brand: Florsheim
   Gender: Men's
   Model name: Milano
   Shoe type: Slip-On Loafer
   Size: 10 D US
   Color: Burgundy

3. Finn Comfort Women's 2051-014099...
   Brand: Finn Comfort
   Gender: Women's
   Model name: 2051-014099
   Size: 014099

4. Pearl Izumi Men's X-Alp Low Shoe-49-Black/Moonlight...
   Brand: Pearl Izumi
   Gender: Men's
   Model name: X-Alp Low
   Color: Black

5. G.H. Bass & Co. Women's Margie Sandal...
   Brand: G.H. Bass & Co.
   Gender: Women's
   Model name: Margie
   Shoe type: Sandal

Overall pipeline extraction success rates:
Brand: 77.4% (380/491)
Gender: 10.0% (49/491)
Model name: 9.4% (46/491)
Shoe type: 4.7% (23/491)
Size: 8.6% (42/491)
Color: 24.2% (119/491)


## Task 3.5 Solution: Evaluation

In [10]:
# Prepare data for evaluation
# Get the list of attribute columns that exist in both predicted and target data
target_attributes = [col for col in targets_df.columns if col not in ['input', 'category']]
predicted_attributes = [col for col in pipeline_results.columns if col not in ['input', 'category']]

# Find common attributes for evaluation
common_attributes = list(set(target_attributes) & set(predicted_attributes))
print(f"Attributes available for evaluation: {common_attributes}")

# Ensure both dataframes have the same length and index
eval_targets = targets_df.copy().reset_index(drop=True)
eval_predictions = pipeline_results.copy().reset_index(drop=True)

# Add missing columns with None values
for attr in target_attributes:
    if attr not in eval_predictions.columns:
        eval_predictions[attr] = None

# Run evaluation
results = evaluate_predictions(eval_predictions, eval_targets, common_attributes)

# Print results
print_evaluation_results(results)

Attributes available for evaluation: ['Protection level', 'Container', 'Item form', 'Flavor', 'Size', 'Color', 'Operation mode', 'Sub brand', 'Layer', 'Material', 'Specialty', 'Firmness', 'Cap', 'Reusability', 'Supply size', 'Benefit', 'Volume', 'Package type', 'Age', 'Shoe type', 'Specific uses', 'Brand', 'Supplement type', 'Dosage', 'Certified grade', 'Hair type', 'Administration type', 'Diet', 'Health benefit', 'Model name', 'Skin type', 'Country', 'Sport', 'Caffeine content', 'Scent', 'Net content', 'Load size', 'Mask type', 'Tea variety', 'Pack size', 'Animal', 'Power source', 'Gender', 'Roast type', 'Age range', 'Closure type', 'Machine type', 'Cereal type']
INFORMATION EXTRACTION EVALUATION RESULTS

--- MICRO SCORES (Overall Performance) ---
Precision: 98.44%
Recall:    92.70%
F1 Score:  95.49%

--- MACRO SCORES (Average Across Attributes) ---
Precision: 96.50%
Recall:    92.70%
F1 Score:  93.15%

--- ATTRIBUTE-LEVEL RESULTS ---

Protection level:
  Precision: 100.00%
  Recall: 

## Task 3.6 Solution: Analysis and Improvement

In [11]:
# Analyze the results
print("=== DETAILED ANALYSIS ===")

# 1. Which attributes had the best/worst performance?
print("\n1. Attribute Performance Ranking:")
attr_scores = [(attr, res['scores']['f1']) for attr, res in results['attribute_results'].items()]
attr_scores.sort(key=lambda x: x[1], reverse=True)

for attr, f1_score in attr_scores:
    counts = results['attribute_results'][attr]['counts']
    print(f"   {attr}: F1={f1_score:.1f}% (VC:{counts['VC']}, VW:{counts['VW']}, VN:{counts['VN']})")

# 2. Look at some examples where extraction failed
print("\n2. Failed Extraction Examples:")
worst_attr = attr_scores[-1][0] if attr_scores else common_attributes[0]
print(f"\nAnalyzing failures for '{worst_attr}':")

failed_examples = []
for i in range(len(eval_predictions)):
    target_val = eval_targets[worst_attr].iloc[i] if worst_attr in eval_targets.columns else None
    pred_val = eval_predictions[worst_attr].iloc[i] if worst_attr in eval_predictions.columns else None
    
    # Cases where we had a target but missed it (VN) or got it wrong (VW)
    if pd.notna(target_val):
        if pd.isna(pred_val):  # Missed (VN)
            failed_examples.append((i, eval_targets['input'].iloc[i], target_val, pred_val, 'MISSED'))
        elif str(target_val).strip().lower() != str(pred_val).strip().lower():  # Wrong (VW)
            failed_examples.append((i, eval_targets['input'].iloc[i], target_val, pred_val, 'WRONG'))

# Show first few failures
for i, (idx, input_text, target, pred, error_type) in enumerate(failed_examples[:5]):
    print(f"\n   Example {i+1} ({error_type}):")
    print(f"   Text: {input_text}")
    print(f"   Target: {target}")
    print(f"   Predicted: {pred}")

=== DETAILED ANALYSIS ===

1. Attribute Performance Ranking:
   Protection level: F1=100.0% (VC:491, VW:0, VN:0)
   Container: F1=100.0% (VC:491, VW:0, VN:0)
   Item form: F1=100.0% (VC:491, VW:0, VN:0)
   Flavor: F1=100.0% (VC:491, VW:0, VN:0)
   Operation mode: F1=100.0% (VC:491, VW:0, VN:0)
   Sub brand: F1=100.0% (VC:491, VW:0, VN:0)
   Layer: F1=100.0% (VC:491, VW:0, VN:0)
   Material: F1=100.0% (VC:491, VW:0, VN:0)
   Specialty: F1=100.0% (VC:491, VW:0, VN:0)
   Firmness: F1=100.0% (VC:491, VW:0, VN:0)
   Cap: F1=100.0% (VC:491, VW:0, VN:0)
   Reusability: F1=100.0% (VC:491, VW:0, VN:0)
   Supply size: F1=100.0% (VC:491, VW:0, VN:0)
   Benefit: F1=100.0% (VC:491, VW:0, VN:0)
   Volume: F1=100.0% (VC:491, VW:0, VN:0)
   Package type: F1=100.0% (VC:491, VW:0, VN:0)
   Age: F1=100.0% (VC:491, VW:0, VN:0)
   Shoe type: F1=100.0% (VC:491, VW:0, VN:0)
   Specific uses: F1=100.0% (VC:491, VW:0, VN:0)
   Supplement type: F1=100.0% (VC:491, VW:0, VN:0)
   Dosage: F1=100.0% (VC:491, VW:0, 

In [12]:
# 3. Improvement suggestions based on analysis
print("\n3. Improvement Suggestions:")

print("\nBrand Extraction Improvements:")
# Analyze brand failures
brand_failures = []
for i in range(min(20, len(eval_predictions))):
    target_brand = eval_targets['Brand'].iloc[i] if 'Brand' in eval_targets.columns else None
    pred_brand = eval_predictions['Brand'].iloc[i] if 'Brand' in eval_predictions.columns else None
    
    if pd.notna(target_brand) and (pd.isna(pred_brand) or str(target_brand).lower() != str(pred_brand).lower()):
        brand_failures.append((eval_targets['input'].iloc[i], target_brand, pred_brand))

if brand_failures:
    print("   - Add patterns for brands with special characters (e.g., 'G.H. Bass & Co.')")
    print("   - Handle brands that don't start sentences")
    print("   - Add common brand name patterns")
    
    # Show specific examples
    for i, (text, target, pred) in enumerate(brand_failures[:3]):
        print(f"   Example: '{text[:50]}...' -> Target: {target}, Got: {pred}")

print("\nSize Extraction Improvements:")
print("   - Add patterns for European sizes (38, 39, etc.)")
print("   - Handle size ranges (e.g., '8-9')")
print("   - Improve detection of width indicators (D, EE, etc.)")

print("\nGeneral Improvements:")
print("   - Use fuzzy string matching for brand recognition")
print("   - Implement confidence scoring")
print("   - Add domain-specific preprocessing")
print("   - Consider ensemble methods combining multiple extractors")


3. Improvement Suggestions:

Brand Extraction Improvements:
   - Add patterns for brands with special characters (e.g., 'G.H. Bass & Co.')
   - Handle brands that don't start sentences
   - Add common brand name patterns
   Example: 'Florsheim Lakeside Ox Brown Nubuck 8...' -> Target: Florsheim, Got: Florsheim Lakeside Ox Brown Nubuck

Size Extraction Improvements:
   - Add patterns for European sizes (38, 39, etc.)
   - Handle size ranges (e.g., '8-9')
   - Improve detection of width indicators (D, EE, etc.)

General Improvements:
   - Use fuzzy string matching for brand recognition
   - Implement confidence scoring
   - Add domain-specific preprocessing
   - Consider ensemble methods combining multiple extractors


## Bonus Task 3.7 Solution: LLM-Based Extraction (Optional)

In [13]:
# Check for OpenAI API key
api_key = os.getenv('OPENAI_API_KEY')
if api_key:
    print("✅ OPENAI_API_KEY found in environment")
    print(f"   Key starts with: {api_key[:10]}...")
else:
    print("❌ OPENAI_API_KEY not found in environment")
    print("   Set it with: os.environ['OPENAI_API_KEY'] = 'your-api-key'")
    print("   Or export OPENAI_API_KEY='your-api-key' in your shell")


# Initialize OpenAI chat model
chat_model = ChatOpenAI(
    model="gpt-5-nano",  
    max_tokens=500,        # Reasonable limit for structured output
    temperature=0.0,      # Deterministic output
    reasoning_effort="minimal",  
)

print(f"✅ Configured {chat_model.model_name} with temperature={chat_model.temperature}")

✅ OPENAI_API_KEY found in environment
   Key starts with: sk-proj-qH...
✅ Configured gpt-5-nano with temperature=None


In [14]:
class Product(BaseModel):
    Brand: Optional[str] = Field(None, description="Product brand or manufacturer")
    Gender: Optional[str] = Field(None, description="Target gender (Men's, Women's, etc.)")
    model_name: Optional[str] = Field(None, alias="Model name", description="Product model or name")
    shoe_type: Optional[str] = Field(None, alias="Shoe type", description="Type of shoe (sneaker, boot, sandal, etc.)")
    Color: Optional[str] = Field(None, description="Primary color of the product")  
    Size: Optional[str] = Field(None, description="Size information")
    
    class Config:
        populate_by_name = True  # Allow using both field name and alias
    
# Create LLM extractor with corrected schema
llm_extractor = LLMExtractor(
    chat_model=chat_model,
    schema=Product,
    source_column="input",
    system_prompt="Extract product attributes from the description. Return JSON with these exact field names: Brand, Gender, 'Model name', 'Shoe type', Color, Size.",
)


test_df = targets_df.copy().head(5)  
llm_results = llm_extractor.extract(test_df[['input']].copy())

print("LLM Extraction Results:")
print(f"LLM columns: {list(llm_results.columns)}")
print(f"Target columns: {[col for col in test_df.columns if col not in ['input', 'category']]}")
    
# Show sample results
print("\nSample LLM extraction results:")
for i in range(min(3, len(test_df))):
    print(f"\n{i+1}. {test_df['input'].iloc[i][:70]}...")
    for col in ['Brand', 'Gender', 'Model name', 'Shoe type', 'Color', 'Size']:
        if col in llm_results.columns and pd.notna(llm_results[col].iloc[i]):
            print(f"   {col}: {llm_results[col].iloc[i]}")

# Evaluate LLM results with proper column matching
llm_common_attributes = [col for col in ['Brand', 'Gender', 'Model name', 'Shoe type', 'Color', 'Size'] 
                            if col in llm_results.columns and col in test_df.columns]
print(f"\nEvaluating on attributes: {llm_common_attributes}")

if llm_common_attributes:
    llm_eval = evaluate_predictions(llm_results, test_df, llm_common_attributes)
    print_evaluation_results(llm_eval)
else:
    print("No matching attributes found for evaluation!")
    

LLM Extraction Results:
LLM columns: ['input', 'Brand', 'Gender', 'model_name', 'shoe_type', 'Color', 'Size']
Target columns: ['Brand', 'Gender', 'Model name', 'Shoe type', 'Color', 'Size', 'Material', 'Sport', 'Supplement type', 'Dosage', 'Health benefit', 'Net content', 'Item form', 'Pack size', 'Diet', 'Supply size', 'Age', 'Flavor', 'Specialty', 'Administration type', 'Roast type', 'Caffeine content', 'Country', 'Machine type', 'Power source', 'Operation mode', 'Firmness', 'Animal', 'Cap', 'Sub brand', 'Cereal type', 'Container', 'Protection level', 'Scent', 'Skin type', 'Volume', 'Hair type', 'Benefit', 'Specific uses', 'Load size', 'Reusability', 'Mask type', 'Closure type', 'Age range', 'Package type', 'Layer', 'Certified grade', 'Tea variety']

Sample LLM extraction results:

1. Diesel Men's Exposure High-Top Sneaker...
   Brand: Diesel
   Gender: Men
   Color: 
   Size: 

2. Florsheim Men's Milano Slip-On Loafer,Burgundy,10 D US...
   Brand: Florsheim
   Gender: Men's
   Color

In [15]:
# Create LLM extractor with corrected schema
open_llm_extractor = LLMExtractor(
    chat_model=chat_model,
    source_column="input",
    system_prompt="Extract product attributes from the description. Return JSON with these exact field names",
)


test_df = targets_df.copy().head(5)  
open_llm_results = open_llm_extractor.extract(test_df[['input']].copy())


In [16]:
open_llm_results

Unnamed: 0,input,extracted
0,Diesel Men's Exposure High-Top Sneaker,"{""brand"": ""Diesel"", ""name"": ""Exposure High-Top..."
1,"Florsheim Men's Milano Slip-On Loafer,Burgundy...","{""brand"": ""Florsheim"", ""product_type"": ""Slip-O..."
2,Finn Comfort Women's 2051-014099,"{""brand"": ""Finn Comfort"", ""product_name"": ""Wom..."
3,Pearl Izumi Men's X-Alp Low Shoe-49-Black/Moon...,"{""brand"": ""Pearl Izumi"", ""product_name"": ""X-Al..."
4,G.H. Bass & Co. Women's Margie Sandal,"{""brand"": ""G.H. Bass & Co."", ""gender"": ""Women""..."


In [17]:
open_llm_results = pd.concat(
    [open_llm_results.drop(columns=['extracted']), 
     open_llm_results['extracted'].apply(pd.read_json, typ='series').apply(pd.Series)], 
    axis=1
)
    

  open_llm_results['extracted'].apply(pd.read_json, typ='series').apply(pd.Series)],
  open_llm_results['extracted'].apply(pd.read_json, typ='series').apply(pd.Series)],
  open_llm_results['extracted'].apply(pd.read_json, typ='series').apply(pd.Series)],
  open_llm_results['extracted'].apply(pd.read_json, typ='series').apply(pd.Series)],
  open_llm_results['extracted'].apply(pd.read_json, typ='series').apply(pd.Series)],


In [18]:
print("LLM Extraction Results:")
print(f"LLM columns: {list(open_llm_results.columns)}")
print(f"Target columns: {[col for col in test_df.columns if col not in ['input', 'category']]}")

#lower case the columns
test_df.columns = test_df.columns.str.lower()
    
display(open_llm_results.head(3))

if llm_common_attributes:
    llm_eval = evaluate_predictions(open_llm_results, test_df, llm_common_attributes)
    print_evaluation_results(llm_eval)
else:
    print("No matching attributes found for evaluation!")

LLM Extraction Results:
LLM columns: ['input', 'brand', 'name', 'category', 'gender', 'type', 'product_type', 'model', 'color', 'size', 'product_name', 'model_number']
Target columns: ['Brand', 'Gender', 'Model name', 'Shoe type', 'Color', 'Size', 'Material', 'Sport', 'Supplement type', 'Dosage', 'Health benefit', 'Net content', 'Item form', 'Pack size', 'Diet', 'Supply size', 'Age', 'Flavor', 'Specialty', 'Administration type', 'Roast type', 'Caffeine content', 'Country', 'Machine type', 'Power source', 'Operation mode', 'Firmness', 'Animal', 'Cap', 'Sub brand', 'Cereal type', 'Container', 'Protection level', 'Scent', 'Skin type', 'Volume', 'Hair type', 'Benefit', 'Specific uses', 'Load size', 'Reusability', 'Mask type', 'Closure type', 'Age range', 'Package type', 'Layer', 'Certified grade', 'Tea variety']


Unnamed: 0,input,brand,name,category,gender,type,product_type,model,color,size,product_name,model_number
0,Diesel Men's Exposure High-Top Sneaker,Diesel,Exposure High-Top Sneaker,Sneaker,Men's,High-Top,,,,,,
1,"Florsheim Men's Milano Slip-On Loafer,Burgundy...",Florsheim,,,Men,,Slip-On Loafer,Milano,Burgundy,10 D,,
2,Finn Comfort Women's 2051-014099,Finn Comfort,,,Women,,,,,,Women's 2051-014099,2051-014099


INFORMATION EXTRACTION EVALUATION RESULTS

--- MICRO SCORES (Overall Performance) ---
Precision: 0.00%
Recall:    0.00%
F1 Score:  0.00%

--- ATTRIBUTE-LEVEL RESULTS ---

--- TOTAL COUNTS ---
Valid Correct (VC):      0
Valid Wrong (VW):        0
Valid Missing (VN):      0
Invalid Extra (NV):      0
No Target/Prediction:    0
