LLM Curation Pipeline

This notebook parses `extracted_text.txt`, runs Mistral Large to (1) verify ACL injury,
(2) classify text source, and (3) detect surgery/partial/associated injuries (if verified).
Results are saved to CSV for the evaluation notebook.

In [None]:
# If needed, install dependencies
# !pip install -r requirements.txt

import os
import re
import time
import concurrent.futures
from typing import Iterable, Optional

import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, precision_score
from tqdm.notebook import tqdm
from mistralai import Mistral, SDKError

pd.set_option("display.max_colwidth", None)

In [None]:
# Load variables from a local `.env` file (if one exists) so that the
# configuration cell can read MISTRAL_API_KEY through os.getenv.
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Configuration (only edit here)
# Every tunable value used by the later cells is defined in this cell.

# I/O
# INPUT_FILE is the delimited text file read by parse_text_file;
# OUTPUT_CSV is where the final table is written (its folder is created on save).
INPUT_FILE = "extracted_text.txt"
OUTPUT_CSV = "results/appraisal_results_01.csv"

# Model/inference
# MODEL_NAME is passed to the Mistral chat endpoint; INJURY_NAME is interpolated
# into the prompts as the injury description.
MODEL_NAME  = "mistral-large-latest"
INJURY_NAME = "anterior cruciate ligament (ACL) rupture"
# Rows whose `tokens` value is above this limit are dropped before inference.
# NOTE(review): presumably chosen to stay inside the model's context window — confirm.
max_tokens  = 127_000

# Optional skips (might be necessary if specific rows or links are causing issues)
# SKIP_INDICES: row index labels removed from the table before processing.
# SKIP_LINK_PREFIXES: rows whose link starts with one of these prefixes get no
# LLM calls and have all predictions set to 0.
SKIP_INDICES: Iterable[int] = [] # e.g., [915]
SKIP_LINK_PREFIXES: Iterable[str] = [] # e.g., ["https://uaf.ua/article"]

# Small-sample mode for quick tests (set to None to process all)
# Applied after the filters above, so it keeps the first LIMIT remaining rows.
LIMIT: Optional[int] = None # e.g., 10 to process only first 10 rows

# API key
# Create a `.env` file in the project root:
# MISTRAL_API_KEY=your_key_here
# and let `python-dotenv` load it automatically.
# The key is never hard-coded; a missing or empty value stops the notebook here
# instead of failing later on the first API call.
API_KEY = os.getenv("MISTRAL_API_KEY")
if not API_KEY:
    raise RuntimeError("Missing MISTRAL_API_KEY in environment.")


In [None]:
# Parser

def parse_text_file(file_path):
    """Parse the delimited extraction file into a list of entry dictionaries.

    The file content is split on the delimiter made of the line ``END_ENTRY``
    followed by a line containing only ``*``. A chunk is kept only when it
    contains ``START_ENTRY`` and a header of the form
    ``INDEX=<int>, PLAYER=<name>, TOKENS=<int>``; any other chunk is ignored.

    Every field is read from the current chunk only. When an optional part is
    missing, an explicit default is stored instead of a value left over from a
    previously parsed entry: ``''`` for ``link`` and ``text``, ``None`` for each
    of the five ``human_*`` labels.

    Args:
        file_path: Path of the UTF-8 text file to parse.

    Returns:
        A list of dicts with the keys ``Index``, ``player``, ``tokens``,
        ``link``, ``text``, ``human_injury``, ``human_text_class``,
        ``human_surgery``, ``human_partial`` and ``human_associated_injuries``,
        in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Order matches the capture groups of the HUMAN_* pattern below.
    human_fields = (
        'human_injury',
        'human_text_class',
        'human_surgery',
        'human_partial',
        'human_associated_injuries',
    )

    entries = []
    for raw_entry in content.split('END_ENTRY\n*\n'):
        # Skip empty chunks (e.g. the remainder after the final delimiter).
        if 'START_ENTRY' not in raw_entry:
            continue

        # The header is mandatory: without it the entry cannot be identified.
        header = re.search(r'INDEX=(\d+), PLAYER=([^,]+), TOKENS=(\d+)', raw_entry)
        if header is None:
            continue

        link_match = re.search(r'LINK=(.+)\n\*', raw_entry)
        text_match = re.search(
            r'LINK=.*?\*\n(.*?)\n\*\nHUMAN_INJURY', raw_entry, re.DOTALL
        )
        human_match = re.search(
            r'HUMAN_INJURY=(\d+), HUMAN_TEXT_CLASS=(\d+), HUMAN_SURGERY=(\d+), '
            r'HUMAN_PARTIAL=(\d+), HUMAN_ASSOCIATED_INJURIES=(\d+)',
            raw_entry,
        )

        entry = {
            'Index': int(header.group(1)),
            'player': header.group(2).strip(),
            'tokens': int(header.group(3)),
            # Empty strings keep downstream string concatenation working.
            'link': link_match.group(1).strip() if link_match else '',
            'text': text_match.group(1).strip() if text_match else '',
        }

        if human_match:
            labels = [int(value) for value in human_match.groups()]
        else:
            # No human annotation in this chunk: mark as missing rather than
            # reusing the labels of the previous entry.
            labels = [None] * len(human_fields)
        entry.update(zip(human_fields, labels))

        entries.append(entry)
    return entries


In [None]:
# Read every entry from the input file
parsed_data = parse_text_file(INPUT_FILE)

# Build the working table: one row per entry, keyed by the parsed 'Index' value,
# with the five Mistral decision columns initialised to 0
injuries_to_verify = (
    pd.DataFrame(parsed_data)
    .set_index('Index')
    .assign(
        mistral_injury=0,
        mistral_text_class=0,
        mistral_surgery=0,
        mistral_partial=0,
        mistral_associated_injuries=0,
    )
)

# Remove every row label that has a text with more than max_tokens tokens
over_limit_labels = injuries_to_verify.index[injuries_to_verify['tokens'] > max_tokens]
injuries_to_verify = injuries_to_verify.drop(index=over_limit_labels)

# Optional: remove the user-listed index labels that are actually present
if SKIP_INDICES:
    labels_to_skip = [label for label in SKIP_INDICES if label in injuries_to_verify.index]
    injuries_to_verify = injuries_to_verify.drop(index=labels_to_skip, errors="ignore")

# Optional: keep only the first LIMIT remaining rows for a quick test run
if LIMIT is not None:
    injuries_to_verify = injuries_to_verify.head(int(LIMIT)).copy()

injuries_to_verify.head()


In [None]:
# Mistral integration

# Single shared API client, authenticated with the key read in the configuration cell.
client = Mistral(api_key=API_KEY)
# Alias of MODEL_NAME; run_mistral_prompt uses it as the default value of its
# `model` parameter, which is bound when that function is defined.
model = MODEL_NAME

def run_mistral_prompt(prompt, model=model, max_retries=8, call_timeout=15):
    """Send one user message to the chat endpoint and return the reply text.

    Each attempt runs in a worker thread so the caller can stop waiting after
    ``call_timeout`` seconds. A timed-out attempt is retried immediately; an
    ``SDKError`` is retried after a pause that starts at 4 seconds and doubles
    after every such error.

    Args:
        prompt: Full prompt text, sent as the only user message.
        model: Model identifier passed to ``client.chat.complete``.
        max_retries: Maximum number of attempts.
        call_timeout: Seconds to wait for each attempt before giving up on it.

    Returns:
        The content of the first choice's message, stripped of surrounding
        whitespace.

    Raises:
        RuntimeError: If none of the ``max_retries`` attempts succeeded.
    """
    messages = [{"role": "user", "content": prompt}]
    delay = 4
    print(">> Starting API call for prompt.")
    for attempt in range(1, max_retries + 1):
        # The executor is managed by hand on purpose: leaving a `with` block
        # calls shutdown(wait=True), which would block until a hung request
        # finishes and make the timeout below ineffective.
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(client.chat.complete, model=model, messages=messages)
            response = future.result(timeout=call_timeout)
            result = response.choices[0].message.content.strip()
            print(">> API call succeeded on attempt", attempt)
            return result
        except concurrent.futures.TimeoutError:
            print(f">> API call timed out after {call_timeout} seconds (retry {attempt}/{max_retries})...")
        except SDKError as e:
            error_str = str(e)
            if "429" in error_str:  # rate limit
                error_type = "Rate limit (429)"
            elif "502" in error_str:  # mistral server issue
                error_type = "Bad Gateway (502)"
            else:
                error_type = "Other"
            if attempt < max_retries:
                print(f">> {error_type} detected. Waiting {delay} seconds (retry {attempt}/{max_retries})...")
                time.sleep(delay)
                delay *= 2  # exponential backoff
            else:
                # Sleeping before raising would only delay the failure.
                print(f">> {error_type} detected on final attempt ({attempt}/{max_retries}).")
        finally:
            # Do not wait for the worker: a running request cannot be cancelled,
            # so a timed-out call is left to finish in the background.
            executor.shutdown(wait=False)
    raise RuntimeError("Maximum attempts reached.")

    
def get_valid_response(prompt, valid_responses, max_attempts=5):
    """Query the model until it answers with one of the accepted strings.

    Args:
        prompt: Prompt forwarded unchanged to ``run_mistral_prompt``.
        valid_responses: Container of accepted answers; the stripped reply is
            checked with ``in``.
        max_attempts: How many replies to request before giving up.

    Returns:
        The first stripped reply found in ``valid_responses``, or the string
        ``"0"`` when every attempt produced an invalid reply.
    """
    for attempt_no in range(1, max_attempts + 1):
        candidate = run_mistral_prompt(prompt).strip()
        if candidate in valid_responses:
            return candidate
        print(f">> Invalid response received: {candidate}. Requesting new response (attempt {attempt_no}/{max_attempts})...")

    # Best-effort fallback: an unusable answer is recorded as the negative class.
    print(">> Maximum attempts reached. Returning default value '0'.")
    return "0"


In [None]:
def mistral_verifier(text, player, injury):
    """Ask the model whether ``text`` confirms that ``player`` sustained ``injury``.

    Args:
        text: Source text, appended to the instructions after a blank line.
        player: Player name interpolated into the instructions.
        injury: Injury description interpolated into the instructions.

    Returns:
        ``1`` or ``0`` as an int, converted from the reply accepted by
        ``get_valid_response``.
    """
    allowed_answers = {"0", "1"}
    # NOTE: the final "Respond with '1'" sentence names the ACL injury literally
    # instead of interpolating `injury`.
    instructions = f"""
As a sports medicine researcher, you need to analyze text that may or may not be specifically about soccer injuries. Your task is to determine if the following text clearly indicates that the player named {player} sustained an injury described as {injury}. Consider the following guidelines:
- Player Identification: Confirm the player's full name in the context. Be cautious of players with the same first or last name but different last or first names. Verify the context to ensure the correct player is being referenced.
- Injury Confirmation: Look for explicit mentions or strong implications of the injury type. Check for words or phrases that confirm the occurrence of the injury, such as "diagnosed with," "suffered from," "sidelined due to," or similar expressions that unambiguously relate to {injury}.
- Contextual Clarity: Ensure that the injury description is not hypothetical or speculative. The context should directly attribute the injury to the player mentioned.
- Disambiguation: If multiple players or injuries are mentioned, ensure clarity in the text that directly ties the specific injury to the specified player.
Respond with '1' if the text clearly confirms that {player} had the injury described as an anterior cruciate ligament or ACL rupture or tear. Respond with '0' if the text does not provide clear and direct confirmation.
The only allowed responses are 0 and 1.
"""
    full_prompt = "\n\n".join((instructions, text))
    answer = get_valid_response(full_prompt, allowed_answers)
    return int(answer)

def mistral_text_classifier(text):
    """Ask the model to classify the source type of ``text``.

    Args:
        text: Source text, appended to the instructions after a blank line.

    Returns:
        An int from 0 to 4 using the coding listed in the instructions
        (0 press article, 1 athlete interview, 2 official announcement,
        3 social media, 4 other).
    """
    allowed_answers = {"0", "1", "2", "3", "4"}
    # Plain (non-f) string: the instructions contain no placeholders.
    instructions = """
As a sports medicine researcher, your task is to classify the provided text into one of the following categories based on its content:
    'press article': 0,
    'athlete interview': 1,
    'official club/federation announcement': 2,
    'social media': 3,
    'other': 4.
Use the following guidelines to ensure accurate classification:
- 'press article': if the text appears to be extracted from a news outlet, newspaper, or online news source, including reports that reference or summarize official announcements but are not directly published by a club or federation.
- 'athlete interview': if the text contains direct quotes from the athlete or is structured as an interview.
- 'official club/federation announcement': if the text is directly sourced from the club’s or federation’s official website, press release, or an official statement.
- 'social media': if the text originates from social media platforms such as Twitter, Instagram, or Facebook.
- 'other': for any content that does not fit the above categories.
Provide only the corresponding numeric code (0, 1, 2, 3, or 4) as your response.
The only allowed responses are 0, 1, 2, 3, or 4.
"""
    full_prompt = "\n\n".join((instructions, text))
    answer = get_valid_response(full_prompt, allowed_answers)
    return int(answer)

def mistral_surgery_discriminator(text, player, injury):
    """Ask the model whether ``text`` confirms surgery (done or scheduled) for the injury.

    Args:
        text: Source text, appended to the instructions after a blank line.
        player: Player name interpolated into the instructions.
        injury: Injury description interpolated into the instructions.

    Returns:
        ``1`` or ``0`` as an int, converted from the reply accepted by
        ``get_valid_response``.
    """
    allowed_answers = {"0", "1"}
    instructions = f"""
As a sports medicine researcher, you are tasked with analyzing text extracted from a website regarding a soccer player named {player}, who may have sustained a specific injury described as {injury}. Your goal is to determine if the text confirms that {player} either underwent or was scheduled to undergo surgery specifically for this injury. Consider the following guidelines:
- Surgical Confirmation: Look for explicit mentions or strong implications that {player} has either undergone surgery, is scheduled for surgery, or is confirmed to require surgery for the {injury}. Phrases such as "underwent surgery," "scheduled for surgery," "required an operation," or similar expressions should be present.
- Timing and Certainty: Distinguish between past, confirmed surgeries and future or scheduled ones. Disregard statements that merely speculate or suggest surgery as a possibility.
- Injury-Specific Reference: Ensure that the surgery mentioned is directly tied to the {injury} in question.
- Player Identification: Verify that the text specifically refers to {player}.
- Contextual Clarity: Ensure that the context unambiguously connects the surgical treatment to the specified injury.
Respond with '1' if the text clearly indicates that {player} underwent or is scheduled to undergo surgery for the {injury}. Respond with '0' otherwise.
The only allowed responses are 0 and 1.
"""
    full_prompt = "\n\n".join((instructions, text))
    answer = get_valid_response(full_prompt, allowed_answers)
    return int(answer)

def mistral_partial_discriminator(text, player, injury):
    """Ask the model whether ``text`` describes a partial (not complete) rupture.

    Args:
        text: Source text, appended to the instructions after a blank line.
        player: Player name interpolated into the instructions.
        injury: Injury description interpolated into the instructions.

    Returns:
        ``1`` or ``0`` as an int, converted from the reply accepted by
        ``get_valid_response``.
    """
    allowed_answers = {"0", "1"}
    instructions = f"""
As a sports medicine researcher, you are tasked with analyzing soccer injuries. Review the provided text, which discusses a soccer player named {player} and a specific injury described as {injury}. Your goal is to determine if the text clearly indicates that {player} suffered a partial, but not complete, rupture of the {injury}. Consider the following guidelines:
- Partial vs. Complete Rupture: Look for explicit mentions or strong implications that {player} suffered a "partial rupture," "incomplete tear," or similar phrasing. The text should indicate that the injury is not a "complete rupture" or "full tear."
- Exclusion of Complete Rupture: If the text explicitly describes a complete rupture or full tear, respond with '0'.
- Player Identification: Verify that the text specifically refers to {player}.
- Injury-Specific Reference: Confirm that the text discusses the specific injury in question.
Respond with '1' if the text clearly indicates that {player} suffered a partial, but not complete, rupture of the {injury}. Respond with '0' otherwise.
The only allowed responses are 0 and 1.
"""
    full_prompt = "\n\n".join((instructions, text))
    answer = get_valid_response(full_prompt, allowed_answers)
    return int(answer)

def mistral_associated_injuries_discriminator(text, player, injury):
    """Ask the model whether ``text`` reports additional knee injuries alongside ``injury``.

    Args:
        text: Source text, appended to the instructions after a blank line.
        player: Player name interpolated into the instructions.
        injury: Injury description interpolated into the instructions.

    Returns:
        ``1`` or ``0`` as an int, converted from the reply accepted by
        ``get_valid_response``.
    """
    allowed_answers = {"0", "1"}
    instructions = f"""
As a sports medicine researcher, your task is to analyze the text scraped from a website about a soccer player named {player}, who sustained an injury described as {injury}. Your goal is to determine if the text clearly indicates that {player} had additional associated injuries, specifically related to the knee, alongside the {injury}. Consider the following guidelines:
- Identification of Associated Injuries: Look for explicit mentions of other knee injuries such as meniscal tears, medial or lateral collateral ligament injuries, or cartilage damage.
- Clear Attribution: Ensure that the additional injuries are specifically attributed to {player} in the context of the same injury incident.
- Player Identification: Verify that the text is specifically discussing {player}.
- Specificity of Injuries: Confirm that the text discusses injuries associated with the knee and directly related to the {injury}.
Respond with '1' if the text clearly indicates that {player} had other associated knee injuries in addition to the {injury}. Respond with '0' otherwise.
The only allowed responses are 0 and 1.
"""
    full_prompt = "\n\n".join((instructions, text))
    answer = get_valid_response(full_prompt, allowed_answers)
    return int(answer)


In [None]:
# Run the LLM checks row by row and store the decisions in the mistral_* columns.
# The verifier and the text classifier run for every processed row; the three
# follow-up checks (surgery, partial, associated injuries) run only when the
# verifier answered 1.
prediction_columns = [
    'mistral_injury',
    'mistral_text_class',
    'mistral_surgery',
    'mistral_partial',
    'mistral_associated_injuries',
]

for row in tqdm(injuries_to_verify.itertuples(), total=len(injuries_to_verify), desc='Processing all entries...'):
    print(f"\n=== Processing row {row.Index} for player: {row.player} ===")
    player = row.player
    injury = INJURY_NAME # same string as before by default
    text = 'LINK=' + row.link + ' TEXT=' + row.text
    tokens = row.tokens
    print(f">> Tokens for this row: {tokens}")
    
    # `<=` matches the filter applied when the table was built, which only drops
    # rows with MORE than max_tokens tokens; a row with exactly max_tokens is valid.
    if tokens <= max_tokens:
        # Optional general skip based on link prefixes (user-controlled)
        if SKIP_LINK_PREFIXES and any(str(row.link).startswith(p) for p in SKIP_LINK_PREFIXES):
            print(">> Link matches a skip prefix; skipping LLM calls and setting predictions to 0.")
            veredict = 0
            text_classification = 0
            surgery = 0
            partial = 0
            associated_injuries = 0
        else:
            print(">> Calling verifier...")
            veredict = mistral_verifier(text, player, injury)
            print(">> Veredict:", veredict)
            
            print(">> Calling text classifier...")
            text_classification = mistral_text_classifier(text)
            print(">> Text Classification:", text_classification)
            
            if veredict == 1:
                print(">> Veredict positive; calling surgery discriminator...")
                surgery = mistral_surgery_discriminator(text, player, injury)
                print(">> Surgery:", surgery)
                
                print(">> Calling partial discriminator...")
                partial = mistral_partial_discriminator(text, player, injury)
                print(">> Partial:", partial)
                
                print(">> Calling associated injuries discriminator...")
                associated_injuries = mistral_associated_injuries_discriminator(text, player, injury)
                print(">> Associated Injuries:", associated_injuries)
            else:
                surgery = 0
                partial = 0
                associated_injuries = 0
                print(">> Veredict negative; skipping further checks.")
    else:
        # Defensive guard for rows that were not removed by the token filter.
        print(">> Token count exceeds maximum; skipping API calls.")
        # The marker below is a string. Convert the (integer) prediction columns
        # to object dtype explicitly first, instead of relying on pandas'
        # deprecated implicit upcasting when an incompatible value is assigned.
        for column in prediction_columns:
            injuries_to_verify[column] = injuries_to_verify[column].astype(object)
        veredict = "Skipped"
        text_classification = "Skipped"
        surgery = "Skipped"
        partial = "Skipped"
        associated_injuries = "Skipped"
        
    row_predictions = (veredict, text_classification, surgery, partial, associated_injuries)
    for column, value in zip(prediction_columns, row_predictions):
        injuries_to_verify.at[row.Index, column] = value


In [None]:
# Save csv

# Create the output folder if needed. When OUTPUT_CSV is a bare filename its
# directory part is an empty string, and os.makedirs("") raises, so a folder is
# only created when the path actually contains one.
output_dir = os.path.dirname(OUTPUT_CSV)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

injuries_to_verify.to_csv(OUTPUT_CSV, index=True)
print(f"Saved to {OUTPUT_CSV}")
