# Set input- & output-files and API-Key

In [None]:
inputFilePath=""
outputFilePath=""
APIKey=""
extractionPrompt="""
Goal:
Extract from the provided article the following entities:
1. AthletesAndTeams: List individuals and teams affiliated with Red Bull. List any aliases or variations of the team names and correct any spelling mistakes. If someone is known by a nickname, use nickname instead of name.
2. Disciplines: Capture every mention of competitive sports & e-sports disciplines. Consider both full names and common abbreviations.
3. Events: Identify any formally named tournaments, championships, or events (e.g.: “League of Legends World Championship”).

Additional Instructions:
- Translate all Discipline- and Event names to English
- Search entire text (including background or historical references) for all explicit and implicit references to the above categories.
- Return exactly one JSON object containing the keys “AthletesAndTeams”, “Disciplines”, and “Events”. If any of categories not mentioned, provide empty array for that key.
- Do only include mentions from the article, not from the instruction.

Output single JSON object with these exact keys, containing a list of strings:
{ 
"AthletesAndTeams": [],
"Disciplines": [],
"Events": []
}

Article:
"""
gptModel="gpt-4.1-mini"
modelTemperature=0.5
rerunCount=0
rerunAnalysisPrompt="""From web-articles extractions below, make sure all entries English, no duplicates, names spelled correctly. Return single JSON object with same keys as inputs."""

# Helper classes

### Article Processor class

In [2]:
import os
import sys
import json
import re
import logging
import unicodedata
import glob
from typing import Optional, Dict, List, Any

try:
    from bs4 import BeautifulSoup
except ImportError:
    # If BeautifulSoup is not available, fallback to regex-based HTML removal.
    BeautifulSoup = None

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)

class ArticleProcessor:
    """Handle article loading, cleaning, and processing operations"""
    
    @staticmethod
    def load_article(file_path: str) -> Optional[Dict[str, Any]]:
        """Load the JSON file from the given path with proper error handling."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                return data
        except FileNotFoundError:
            logging.error(f"Error: The file '{file_path}' was not found.")
            return None
        except json.JSONDecodeError as e:
            logging.error(f"Error: Failed to parse JSON file '{file_path}': {e}")
            return None
    
    @staticmethod
    def remove_html_tags(text: str) -> str:
        """Remove HTML tags from text using BeautifulSoup if available, else fallback to regex."""
        if BeautifulSoup is not None:
            # Use BeautifulSoup for robust HTML tag removal
            soup = BeautifulSoup(text, "html.parser")
            return soup.get_text(separator=" ", strip=True)
        else:
            # Fallback regex to remove <...> tags
            html_tag_pattern = re.compile(r'<[^>]+>')
            return html_tag_pattern.sub('', text)
    
    @staticmethod
    def clean_text(text: str) -> str:
        """Clean the text by applying multiple cleaning steps"""
        # Remove HTML tags
        text = ArticleProcessor.remove_html_tags(text)
        
        # Normalize Unicode (standardizes quotes, dashes, spaces, etc.)
        text = unicodedata.normalize("NFKC", text)
        
        # Remove extra whitespace and newlines
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Remove emojis using a Unicode-aware regex
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags
            u"\U00002702-\U000027B0"  # other symbols
            u"\U000024C2-\U0001F251"  # enclosed characters
            "]+", flags=re.UNICODE)
        text = emoji_pattern.sub('', text)
        
        # Remove hyperlinks (URLs)
        link_pattern = re.compile(r'http\S+|www\.\S+')
        text = link_pattern.sub('', text)
        
        # Remove hashtag symbols (keep the text) and Twitter handles (@username)
        text = re.sub(r'#', '', text)
        text = re.sub(r'@\w+', '', text)

        # Final whitespace normalization (in case removals left extra spaces)
        text = re.sub(r'\s+', ' ', text).strip()

        # Sanity check: warn if the cleaned text is suspiciously short
        if len(text) < 20:
            logging.warning("Cleaned text is very short; please verify that essential content was not removed.")

        return text
    
    @staticmethod
    def prepare_article(data: Dict[str, Any], print_result: bool = False) -> Dict[str, str]:
        """Extract and clean fields from the JSON article."""
        headline = ArticleProcessor.clean_text(data.get('headline', ''))
        article_body = ArticleProcessor.clean_text(data.get('articleBody', ''))
        date_published = ArticleProcessor.clean_text(data.get('datePublished', ''))

        if print_result:
            logging.info(f"Headline: {headline}")
            logging.info(f"Article Body: {article_body}")
            logging.info(f"Date Published: {date_published}\n")

        return {
            "headline": headline,
            "articleBody": article_body,
            "datePublished": date_published
        }
    
    @staticmethod
    def get_json_files(directory_path: str) -> List[str]:
        """Get all JSON files from the specified directory"""
        if not os.path.exists(directory_path):
            print(f"Directory '{directory_path}' does not exist")
            return []
            
        # Find all .json files in the directory
        json_files = glob.glob(os.path.join(directory_path, "*.json"))
        
        if not json_files:
            print(f"No JSON files found in '{directory_path}'")
            return []
            
        print(f"Found {len(json_files)} JSON files in '{directory_path}'")
        return json_files

### GPT Client class

In [3]:
from openai import OpenAI
from typing import Optional, Dict, List, Any
import time
import re
import json

class GptClient:
    """Handle GPT API interactions and response processing"""
    
    def __init__(self, api_key: str, model: str, temperature: float = 0.5):
        self.api_key = api_key
        self.model = model
        self.temperature = temperature
    
    def call_api(self, prompt: str, print_output: bool = False, retries: int = 3, backoff: float = 2.0) -> Optional[str]:
        """Call GPT API with error handling and exponential backoff retries"""
        attempt = 0
        while attempt < retries:
            try:
                client = OpenAI(api_key=self.api_key)
                response = client.responses.create(
                    model=self.model,
                    input=prompt,
                    temperature=self.temperature
                )
                if print_output:
                    print(response.output_text)
                return response.output_text
            except Exception as e:
                attempt += 1
                wait_time = backoff ** attempt
                print(f"API call error ({attempt}/{retries}): {e}. Retrying in {wait_time} seconds...")
                if attempt < retries:
                    time.sleep(wait_time)
                else:
                    print("Maximum retries reached, skipping this article.")
                    return None
    
    @staticmethod
    def process_output(gpt_output_str: Optional[str]) -> Optional[Dict[str, Any]]:
        """Process GPT output with validation"""
        if not gpt_output_str:
            return None

        # Extract JSON part from potential code blocks or extra text
        json_match = re.search(r'\{[\s\S]*\}', gpt_output_str)
        if json_match:
            gpt_output_str = json_match.group(0)
            
        try:
            data = json.loads(gpt_output_str)
            return data
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            return None
    
    def extract_entities(self, article_text: str, extraction_prompt: str, print_output: bool = False) -> Optional[Dict[str, Any]]:
        """Extract entities from an article using the extraction prompt"""
        prompt = extraction_prompt + article_text
        output = self.call_api(prompt, print_output)
        return self.process_output(output)
    
    def consolidate_results(self, extraction_results: List[Dict[str, Any]], consolidation_prompt: str) -> Optional[Dict[str, Any]]:
        """Consolidate multiple extraction results"""
        consolidated_input = json.dumps(extraction_results, indent=2)
        full_prompt = consolidation_prompt + "\n\n" + consolidated_input
        consolidated_output = self.call_api(full_prompt)
        return self.process_output(consolidated_output)

### Data output helper

In [4]:
import csv
import json
import os
from typing import Dict, List, Set, Optional, Tuple

class CsvHandler:
    """Handle CSV operations with context management"""
    def __init__(self, filename: str, fieldnames: List[str]):
        self.filename = filename
        self.fieldnames = fieldnames
        self._existing_entries: Optional[Set[Tuple[str, str, str]]] = None

    @property
    def existing_entries(self) -> Set[Tuple[str, str, str]]:
        """Lazily load existing entries as tuples of (file_path, headline, date_published)"""
        if self._existing_entries is None:
            self._existing_entries = self._load_existing_entries()
        return self._existing_entries

    def _load_existing_entries(self) -> Set[Tuple[str, str, str]]:
        if not os.path.exists(self.filename):
            return set()
        with open(self.filename, mode='r', encoding='utf-8') as f:
            return {(row['FileName'], row['Headline'], row['DatePublished']) 
                   for row in csv.DictReader(f)}

    def write_extractions(self, extractions: List[Dict]) -> None:
        """Write new extractions to CSV file"""
        new_extractions = [
            ext for ext in extractions
            if (ext.get('FileName'), ext.get('Headline'), ext.get('DatePublished')) 
               not in self.existing_entries
        ]

        if not new_extractions:
            print("No new entries to write")
            return

        mode = 'a' if os.path.exists(self.filename) else 'w'
        with open(self.filename, mode=mode, newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=self.fieldnames)
            if mode == 'w':
                writer.writeheader()
            writer.writerows(new_extractions)

        print(f"Wrote {len(new_extractions)} new entries")

# Main

### Process articles

In [5]:
# Initialize processor and client
article_processor = ArticleProcessor()
gpt_client = GptClient(api_key=APIKey, model=gptModel, temperature=modelTemperature)

# Get all JSON files from input directory
json_files = article_processor.get_json_files(inputFilePath)
json_files

Found 5 JSON files in 'webarticles_sample'


['webarticles_sample/1b48aa76-5de9-462a-993a-e7f85ea3e9ab.json',
 'webarticles_sample/0c1eefe7-d4ec-4344-9ded-32088d62f6b7.json',
 'webarticles_sample/1abfca25-f7f7-4e6c-9cb0-32fd0768d4c7.json',
 'webarticles_sample/0bdb0fa7-7269-47d0-aabc-5e5941cda28c.json',
 'webarticles_sample/1b597488-5461-4660-a4cd-6e10c5216bcc.json']

In [6]:
# Process articles

# Initialize structures to hold processed data
processed_articles = []
extraction_results = []

# Process each file
for file_path in json_files:
    print(f"Processing {file_path}")
    
    # Load and prepare article
    data = ArticleProcessor.load_article(file_path)
    if data is None:
        print(f"Skipping {file_path} due to loading error")
        continue
        
    # Clean and prepare article content
    cleaned_article = ArticleProcessor.prepare_article(data, print_result=True)
    
    # Store article data for processing
    processed_articles.append({
        "file_path": file_path,
        "headline": cleaned_article["headline"],
        "article_body": cleaned_article["articleBody"],
        "date_published": cleaned_article["datePublished"]
    })
    
print(f"Successfully processed {len(processed_articles)} articles")

2025-04-15 14:54:49,631 [INFO] Headline: The future of 'high-speed' rail in America? Here's what it was like taking Brightline in Florida
2025-04-15 14:54:49,632 [INFO] Article Body: I'm sitting on a brand new train in Florida, cruising at up to 124 miles per hour, as I begin writing this review. Brightline launched its new Orlando to South Florida service in September. I decided to take one of the first trains from Orlando International Airport (MCO) to Miami to see what the new service was like in one of the coach cabins on what's called a "Smart Fare." The Florida-based private company runs (somewhat) high-speed trains from Orlando International Airport's Terminal C in Central Florida to Miami, covering 235 miles between 3 and 3 and 1/2 hours, depending on South Florida station stops. Brightline describes the company as "... the boldest private infrastructure project in the nation and marks the revitalization of Henry Flagler's original vision for the Florida East Coast Railway." In

Processing webarticles_sample/1b48aa76-5de9-462a-993a-e7f85ea3e9ab.json
Processing webarticles_sample/0c1eefe7-d4ec-4344-9ded-32088d62f6b7.json
Processing webarticles_sample/1abfca25-f7f7-4e6c-9cb0-32fd0768d4c7.json
Processing webarticles_sample/0bdb0fa7-7269-47d0-aabc-5e5941cda28c.json
Processing webarticles_sample/1b597488-5461-4660-a4cd-6e10c5216bcc.json
Successfully processed 5 articles


### Extract entities using GPT

In [7]:
# Extract entities using GPT
for article in processed_articles:
    print(f"Extracting data from: {os.path.basename(article['file_path'])}")
    
    # First extraction run
    output_dict = gpt_client.extract_entities(article['article_body'], extractionPrompt)
    
    # Check if extraction failed
    if not output_dict:
        print(f"Failed to extract entities for {os.path.basename(article['file_path'])}")
        continue
    
    # If rerunCount > 0, do multiple runs and consolidate results
    if rerunCount > 0:
        print(f"Performing {rerunCount} additional runs for validation...")
        all_extraction_results = [output_dict]  # Store first run
        
        # Perform additional runs
        for run in range(rerunCount):
            print(f"Run {run+1}/{rerunCount}...")
            rerun_dict = gpt_client.extract_entities(article['article_body'], extractionPrompt)
            if rerun_dict:
                all_extraction_results.append(rerun_dict)
            else:
                print(f"Failed to get GPT response for run {run+1}")
            
            # Add a small delay to prevent API rate limiting
            time.sleep(0.5)
        
        # Consolidate results using rerunAnalysisPrompt if we have multiple results
        if len(all_extraction_results) > 1:
            print("Consolidating results from multiple runs...")
            
            # Call GPT to consolidate results
            consolidated_dict = gpt_client.consolidate_results(all_extraction_results, rerunAnalysisPrompt)
            if consolidated_dict:
                output_dict = consolidated_dict  # Replace with consolidated results
                print("Successfully consolidated multiple extraction runs")
            else:
                print("Failed to get consolidated output, using first run results")
    
    # Extract entities from the result
    athletes_teams = output_dict.get("AthletesAndTeams", output_dict.get("RedBullAthletesAndTeams", []))
    disciplines = output_dict.get("Disciplines", [])
    events = output_dict.get("Events", [])
    
    print(athletes_teams, disciplines, events)

    # Convert to arrays if they're not already
    if isinstance(athletes_teams, str) and athletes_teams:
        athletes_teams = [athletes_teams]
    elif not isinstance(athletes_teams, list):
        athletes_teams = []
        
    if isinstance(disciplines, str) and disciplines:
        disciplines = [disciplines]
    elif not isinstance(disciplines, list):
        disciplines = []
        
    if isinstance(events, str) and events:
        events = [events]
    elif not isinstance(events, list):
        events = []
    
    # Store results
    extraction_results.append({
        "file_path": article['file_path'],
        "headline": article['headline'],
        "date_published": article['date_published'],
        "athletes_teams": athletes_teams,
        "disciplines": disciplines,
        "events": events
    })
    
    # Add a small delay to prevent API rate limiting
    time.sleep(0.5)

print(f"Successfully extracted data from {len(extraction_results)} articles")

Extracting data from: 1b48aa76-5de9-462a-993a-e7f85ea3e9ab.json


2025-04-15 14:54:50,639 [INFO] HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


[] [] []
Extracting data from: 0c1eefe7-d4ec-4344-9ded-32088d62f6b7.json


2025-04-15 14:54:53,230 [INFO] HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


['Red Bull Racing', 'Red Bull', 'Sebastian Vettel', 'Max Verstappen', 'Williams', 'Alpine', 'Alpine F1 Team', 'Renault', 'Aston Martin', 'McLaren', 'Jaguar'] ['Formula 1', 'F1'] ['Formula 1 Dutch Grand Prix 2023']
Extracting data from: 1abfca25-f7f7-4e6c-9cb0-32fd0768d4c7.json


2025-04-15 14:54:54,705 [INFO] HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


['Wings Team (Red Bull)'] [] []
Extracting data from: 0bdb0fa7-7269-47d0-aabc-5e5941cda28c.json


2025-04-15 14:54:57,713 [INFO] HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


['Max Verstappen', 'Red Bull', 'Adrian Newey', 'Pierre Waché', 'Christian Horner', 'Sebastian Vettel', 'Nigel Mansell', 'Williams-Renault', 'BMW-Sauber', 'Ford (Jim Farley)', 'Alinghi', 'Bobby Rahal', 'Mario Andretti'] ['Formula One', 'F1', 'Grand Prix racing', 'Motorsport', 'IndyCar', 'GT racing', 'Cycling', 'Rugby', 'E-sports', 'MotoGP', 'Track car racing', 'Supercar racing', 'Submarine racing'] ['Brazilian Grand Prix', 'Rugby World Cup', 'Las Vegas Grand Prix', 'League of Legends World Championship', 'IndyCar race in Las Vegas 1984']
Extracting data from: 1b597488-5461-4660-a4cd-6e10c5216bcc.json


2025-04-15 14:55:00,001 [INFO] HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


['Red Bull'] [] []
Successfully extracted data from 5 articles


### Write the data to the output file

In [8]:
# Initialize CSV handler
csv_handler = CsvHandler(
    outputFilePath,
    ["FileName", "Headline", "DatePublished", "AthletesAndTeams", "Disciplines", "Events"]
)

# Prepare final data for CSV
csv_extractions = []
for result in extraction_results:
    # Skip entries where all extraction fields are empty
    if not result['athletes_teams'] and not result['disciplines'] and not result['events']:
        print(f"Skipping {os.path.basename(result['file_path'])} - no entities extracted")
        continue
        
    # Ensure lists contain only strings before joining
    athletes_teams_str = ", ".join([str(item) for item in result['athletes_teams']]) if result['athletes_teams'] else ""
    disciplines_str = ", ".join([str(item) for item in result['disciplines']]) if result['disciplines'] else ""
    events_str = ", ".join([str(item) for item in result['events']]) if result['events'] else ""
    
    extraction = {
        "FileName": os.path.basename(result['file_path']),
        "Headline": result['headline'],
        "DatePublished": result['date_published'],
        "AthletesAndTeams": athletes_teams_str,
        "Disciplines": disciplines_str,
        "Events": events_str
    }
    
    csv_extractions.append(extraction)

# Write to CSV
if csv_extractions:
    csv_handler.write_extractions(csv_extractions)
    print(f"Data written to {outputFilePath}")
else:
    print("No data to write to CSV")

Skipping 1b48aa76-5de9-462a-993a-e7f85ea3e9ab.json - no entities extracted
Wrote 4 new entries
Data written to extractions.csv
