In [1]:
import mysql.connector
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import unicodedata
import re
from datetime import timedelta, datetime
from rapidfuzz import fuzz
from sklearn.impute import KNNImputer
from typing import Dict
from sklearn.preprocessing import OneHotEncoder
import pickle
import torch
import torch.nn as nn
from pathlib import Path
import joblib
from difflib import SequenceMatcher
from typing import List, Tuple, Dict, Optional

In [2]:
pd.set_option("display.max_columns", None)

Run etl.ipynb

In [3]:
load_dotenv()

# Event data
conn = mysql.connector.connect(
    host=os.getenv("DB_HOST"),
    user=os.getenv("DB_USER"),
    database=os.getenv("DB_NAME"),
    password=os.getenv("DB_PASSWORD")
    )

cursor = conn.cursor()

query = ("SELECT * FROM events")

cursor.execute(query)

columns = [desc[0] for desc in cursor.description]

rows = cursor.fetchall()

df_events = pd.DataFrame(rows, columns=columns)

cursor.close()
conn.close()

# Convert to datetime
df_events['event_date'] = pd.to_datetime(df_events['event_date'], format="%Y-%m-%d")
# Drop duplicates
df_events[df_events.duplicated(keep=False)]
df_events = df_events.drop_duplicates()
# Drop where fights are null
df_events = df_events[~((df_events['fighter_red'].isna()) & (df_events['fighter_blue'].isna()))]
# Drop where winner is null
df_events = df_events[~df_events[['winner']].isnull().all(axis=1)]


# Fill stances
df_fighters_red = df_events[['fighter_red', 'stance_red']].rename(
    columns={'fighter_red': 'fighter', 'stance_red': 'stance'}
)
df_fighters_blue = df_events[['fighter_blue', 'stance_blue']].rename(
    columns={'fighter_blue': 'fighter', 'stance_blue': 'stance'}
)

# Combine into one fighter dataframe
df_fighters = pd.concat([df_fighters_red, df_fighters_blue], ignore_index=True)

# Drop duplicate fighters (keep their first known stance)
df_fighters = df_fighters.drop_duplicates(subset=['fighter'], keep='first')

# Count unique stances across fighters
stance_counts = df_fighters['stance'].value_counts(dropna=True)

top_stances = stance_counts.index[:1].tolist()

df_events['stance_red'] = df_events['stance_red'].fillna(top_stances[0])
df_events['stance_blue'] = df_events['stance_blue'].fillna(top_stances[0])


# Convert to int
for col in df_events.select_dtypes(include=['number']).columns:
    df_events[col] = df_events[col].astype('int64')
    
# Change winner column to bolean
df_events['winner'] = (df_events['winner'] == df_events['fighter_red']).astype(int)

# Normalize weight classes
def clean_text(val: str) -> str:
    if not isinstance(val, str):
        return val
    val = val.strip().lower()  # lowercase + trim
    val = re.sub(r"\s+", " ", val)  # collapse multiple spaces
    val = val.replace("womens", "women's")  # normalize missing apostrophe
    val = val.replace("women ", "women's ") # normalize if missing "'s"
    return val

# master map of cleaned values → canonical
mapping = {
    "lightweight": "Lightweight",
    "welterweight": "Welterweight",
    "middleweight": "Middleweight",
    "featherweight": "Featherweight",
    "bantamweight": "Bantamweight",
    "heavyweight": "Heavyweight",
    "light heavyweight": "Light Heavyweight",
    "flyweight": "Flyweight",
    "women's strawweight": "Women's Strawweight",
    "women's flyweight": "Women's Flyweight",
    "women's bantamweight": "Women's Bantamweight",
    "open weight": "Open Weight",
    "catch weight": "Catch Weight"
}

def normalize_weight_class(val):
    # clean first
    cleaned = clean_text(val)

    # if it matches exactly after cleaning
    if cleaned in mapping:
        return mapping[cleaned]
    
    # fuzzy keyword matching for tournament-style
    if "heavyweight" in cleaned and "light" not in cleaned:
        return "Heavyweight"
    elif "lightweight" in cleaned and "feather" not in cleaned:
        return "Lightweight"
    elif "middleweight" in cleaned:
        return "Middleweight"
    elif "featherweight" in cleaned:
        return "Featherweight"
    elif "bantamweight" in cleaned:
        # if it’s a women’s version
        if "women" in cleaned:
            return "Women's Bantamweight"
        return "Bantamweight"
    elif "flyweight" in cleaned:
        if "women" in cleaned:
            return "Women's Flyweight"
        return "Flyweight"
    elif "strawweight" in cleaned:
        return "Women's Strawweight"
    
    # default
    return "Open Weight"

df_events["weight_class"] = df_events["weight_class"].apply(normalize_weight_class)

df_events.drop(columns=['id'], axis=1, inplace=True)

df_events = df_events.reset_index(drop=True)

# Normalize names
def normalize_name(name):

    if pd.isna(name):
        return ""
    
    # Convert to string if not already
    name = str(name)

    # Remove extra whitespace
    #name = name.strip()

    # Normalize unicode characters
    name = unicodedata.normalize('NFKD', name)
    name = ''.join(c for c in name if not unicodedata.combining(c))

    # Convert to lower
    name = name.lower()

    # Remove apostrophes and replace with nothing or space
    #name = re.sub(r"'", "", name)

    # Remove periods and other punctuation
    name = re.sub(r'[^\w\s-]', '', name)

    # Normalize spaces (multiple spaces to single space)
    name = re.sub(r'\s+', ' ', name)

    words = name.split()

    return ' '.join(words).strip()

df_events[["fighter_red", "fighter_blue"]] = df_events[["fighter_red", "fighter_blue"]].map(normalize_name)

events_cleaned = df_events.copy()

In [4]:
# Stats data
conn = mysql.connector.connect(
    host=os.getenv("DB_HOST"),
    user=os.getenv("DB_USER"),
    database=os.getenv("DB_NAME"),
    password=os.getenv("DB_PASSWORD")
    )

cursor = conn.cursor()

query = ("SELECT * FROM stats")

cursor.execute(query)

columns = [desc[0] for desc in cursor.description]

rows = cursor.fetchall()

df_stats = pd.DataFrame(rows, columns=columns)

cursor.close()
conn.close()

# Drop duplicate names
df_stats = df_stats.assign(nan_count=df_stats.isnull().sum(axis=1)) \
            .sort_values(['name', 'nan_count']) \
            .drop_duplicates('name', keep='first') \
            .drop('nan_count', axis=1)

# Keep necessary columns
df_stats = df_stats[['name', 'octagon_debut', 'height', 'weight', 'reach',
    'leg_reach', 'sig_strikes_landed_per_minute', 'sig_strikes_absorbed_per_minute', 
    'takedowns_avg', 'submission_avg', 'knockdown_avg', 'fight_time_avg']]

# Fill with null the 0 values
df_stats[['height', 'weight']] = df_stats[['height', 'weight']].replace(0, np.nan)

# Fill with 0
df_stats[['sig_strikes_landed_per_minute', 'sig_strikes_absorbed_per_minute','takedowns_avg',
        'submission_avg','knockdown_avg']] = df_stats[['sig_strikes_landed_per_minute', 
                                        'sig_strikes_absorbed_per_minute','takedowns_avg','submission_avg',
                                        'knockdown_avg']].fillna(0)

# Fill null
df_stats[['fight_time_avg']] = df_stats[['fight_time_avg']].fillna('00:00')

# Convert time to seconds
df_stats['fight_time_avg'] = pd.to_timedelta('00:' + df_stats['fight_time_avg']).dt.total_seconds().astype(int)

# Impute missing values
# Load the fitted imputer
with open("../models/knn_imputer_stats.pkl", "rb") as f:
    imputer = pickle.load(f)
missing_cols = df_stats.columns[df_stats.isnull().any()]

# Impute using the existing fitted imputer
df_stats[missing_cols] = imputer.transform(df_stats[missing_cols])

# Normalize names
df_stats["name"] = df_stats["name"].map(normalize_name)

stats_cleaned = df_stats.copy()

Run feature_engineering.ipynb

In [5]:
# Fuzzy match
def create_fuzzy_mapping(event_names, stats_names, threshold=85):
    """
    Create a mapping dictionary from events fighter names to stats names
    using fuzzy matching.
    """
    mapping = {}

    for event_name in event_names:
        if pd.isna(event_name):
            continue

        best_match = None
        best_score = 0

        for stats_name in stats_names:
            if pd.isna(stats_name):
                continue

            # Calculate fuzzy match scores
            ratio = fuzz.ratio(event_name, stats_name)
            token_sort_ratio = fuzz.token_sort_ratio(event_name, stats_name)
            token_set_ratio = fuzz.token_set_ratio(event_name, stats_name)
            score = max(ratio, token_sort_ratio, token_set_ratio)
            
            if score >= threshold and score > best_score:
                best_match = stats_name
                best_score = score

        if best_match:
            mapping[event_name] = best_match
        else:
            mapping[event_name] = event_name # Keep original if no match found

    return mapping

# Get unique fighter names from events
blue_fighters = events_cleaned['fighter_blue'].dropna().unique()
red_fighters = events_cleaned['fighter_red'].dropna().unique()
all_event_fighters = set(blue_fighters) | set(red_fighters)

# Get unique names from stats
stats_names = stats_cleaned['name'].dropna().unique()

# Create fuzzy mapping
print("Creating fuzzy mapping...")
fuzzy_mapping = create_fuzzy_mapping(all_event_fighters, stats_names, threshold=85)

# Apply fuzzy mapping to create new columns for merging
events_cleaned['fighter_blue_mapped'] = events_cleaned['fighter_blue'].map(fuzzy_mapping)
events_cleaned['fighter_red_mapped'] = events_cleaned['fighter_red'].map(fuzzy_mapping)

# Fill NaN values with original names (for cases where mapping didn't work)
events_cleaned['fighter_blue_mapped'] = events_cleaned['fighter_blue_mapped'].fillna(events_cleaned['fighter_blue'])
events_cleaned['fighter_red_mapped'] = events_cleaned['fighter_red_mapped'].fillna(events_cleaned['fighter_red'])

# Now perform the merges using the mapped names
# Merge stats for fighter_blue
df_merged = events_cleaned.merge(
    stats_cleaned,
    how='left',
    left_on='fighter_blue_mapped',
    right_on='name',
    suffixes=('', '_drop')
)

# Rename fighter_blue stats columns with _blue suffix (excluding the original 'name')
cols_to_rename_blue = [col for col in stats_cleaned.columns if col != 'name']
df_merged.rename(columns={col: f"{col}_blue" for col in cols_to_rename_blue}, inplace=True)
df_merged.drop(columns=['name', 'fighter_blue_mapped'], inplace=True)

# Merge stats for fighter_red
df_merged = df_merged.merge(
    stats_cleaned,
    how='left',
    left_on='fighter_red_mapped',
    right_on='name',
    suffixes=('', '_drop')
)

# Rename fighter_red stats columns with _red suffix
cols_to_rename_red = [col for col in stats_cleaned.columns if col != 'name']
df_merged.rename(columns={col: f"{col}_red" for col in cols_to_rename_red}, inplace=True)
df_merged.drop(columns=['name', 'fighter_red_mapped'], inplace=True)

# Fill octagon_debut with min value of the event_date of the fighter
df_merged['octagon_debut_blue'] = pd.to_datetime(df_merged['octagon_debut_blue'], errors='coerce')
df_merged['octagon_debut_red'] = pd.to_datetime(df_merged['octagon_debut_red'], errors='coerce')

fighters_long = pd.concat([
    events_cleaned[['fighter_red', 'event_date']].rename(columns={'fighter_red': 'fighter'}),
    events_cleaned[['fighter_blue', 'event_date']].rename(columns={'fighter_blue': 'fighter'})
], ignore_index=True)

fighter_debuts = fighters_long.groupby('fighter')['event_date'].min().reset_index()
fighter_debuts.rename(columns={'event_date': 'octagon_debut'}, inplace=True)

df_merged = df_merged.merge(
    fighter_debuts, left_on='fighter_red', right_on='fighter', how='left'
)

df_merged.loc[df_merged['octagon_debut_red'].isna(), 'octagon_debut_red'] = \
    df_merged.loc[df_merged['octagon_debut_red'].isna(), 'octagon_debut']

df_merged.drop(columns='fighter', inplace=True)
df_merged.drop(columns='octagon_debut', inplace=True)

df_merged = df_merged.merge(
    fighter_debuts, left_on='fighter_blue', right_on='fighter', how='left'
)

df_merged.loc[df_merged['octagon_debut_blue'].isna(), 'octagon_debut_blue'] = \
    df_merged.loc[df_merged['octagon_debut_blue'].isna(), 'octagon_debut']

df_merged.drop(columns='fighter', inplace=True)
df_merged.drop(columns='octagon_debut', inplace=True)

# Impute missing values
# Load the fitted imputer
with open("../models/knn_imputer_feature_engineering.pkl", "rb") as f:
    imputer = pickle.load(f)
missing_cols = df_merged.columns[df_merged.isnull().any()]

df_merged[missing_cols] = imputer.fit_transform(df_merged[missing_cols])

# Convert to datetime
df_merged['event_date'] = pd.to_datetime(df_merged['event_date'], errors='coerce')

df_merged.sort_values(by=['event_date'], inplace=True)
df_merged.reset_index(drop=True, inplace=True)

Creating fuzzy mapping...


Input names to predict and find matches

In [6]:
class FighterNameMatcher:
    def __init__(self, dataframe: pd.DataFrame):
        """
        Initialize the matcher with a dataframe containing fighter names.
        
        Args:
            dataframe: DataFrame with 'fighter_red' and 'fighter_blue' columns
        """
        self.df = dataframe
        self.unique_fighters = self._get_unique_fighters()
        
    def _get_unique_fighters(self) -> List[str]:
        """Extract unique fighter names from both red and blue columns."""
        red_fighters = self.df['fighter_red'].dropna().unique()
        blue_fighters = self.df['fighter_blue'].dropna().unique()
        
        # Combine and get unique names
        all_fighters = list(set(list(red_fighters) + list(blue_fighters)))
        return sorted(all_fighters)
    
    def _normalize_name(self, name: str) -> str:
        """
        Normalize fighter name for better matching.
        
        Args:
            name: Raw fighter name
            
        Returns:
            Normalized name
        """
        if pd.isna(name):
            return ""
            
        # Convert to lowercase and remove extra spaces
        normalized = str(name).lower().strip()
        
        # Remove common prefixes/suffixes and special characters
        normalized = re.sub(r'\b(jr|sr|iii|ii|iv)\b\.?', '', normalized)
        normalized = re.sub(r'[^\w\s-]', '', normalized)  # Keep letters, numbers, spaces, hyphens
        normalized = re.sub(r'\s+', ' ', normalized).strip()  # Normalize spaces
        
        return normalized
    
    def _calculate_similarity(self, name1: str, name2: str) -> float:
        """
        Calculate similarity between two names using multiple methods.
        
        Args:
            name1, name2: Names to compare
            
        Returns:
            Similarity score (0-1, higher is better)
        """
        norm_name1 = self._normalize_name(name1)
        norm_name2 = self._normalize_name(name2)
        
        if not norm_name1 or not norm_name2:
            return 0.0
        
        # Exact match after normalization
        if norm_name1 == norm_name2:
            return 1.0
        
        # SequenceMatcher for overall similarity
        seq_similarity = SequenceMatcher(None, norm_name1, norm_name2).ratio()
        
        # Check for partial matches (useful for nicknames or name variations)
        words1 = set(norm_name1.split())
        words2 = set(norm_name2.split())
        
        if words1 and words2:
            # Jaccard similarity for word overlap
            intersection = len(words1.intersection(words2))
            union = len(words1.union(words2))
            jaccard_similarity = intersection / union if union > 0 else 0
            
            # Weighted combination
            similarity = 0.7 * seq_similarity + 0.3 * jaccard_similarity
        else:
            similarity = seq_similarity
        
        return similarity
    
    def find_best_matches(self, input_name: str, threshold: float = 0.6, top_k: int = 3) -> List[Tuple[str, float]]:
        """
        Find best matching fighter names.
        
        Args:
            input_name: Name to search for
            threshold: Minimum similarity threshold
            top_k: Number of top matches to return
            
        Returns:
            List of (fighter_name, similarity_score) tuples
        """
        similarities = []
        
        for fighter in self.unique_fighters:
            similarity = self._calculate_similarity(input_name, fighter)
            if similarity >= threshold:
                similarities.append((fighter, similarity))
        
        # Sort by similarity (descending) and return top k
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_k]
    
    def get_fighter_match(self, input_name: str, auto_select_threshold: float = 0.95) -> Dict:
        """
        Get fighter match with user interaction for ambiguous cases.
        
        Args:
            input_name: Name to search for
            auto_select_threshold: Threshold for automatic selection
            
        Returns:
            Dictionary with match results
        """
        matches = self.find_best_matches(input_name)
        
        if not matches:
            return {
                'status': 'no_match',
                'input_name': input_name,
                'matched_name': None,
                'similarity': 0.0,
                'suggestions': []
            }
        
        best_match, best_score = matches[0]
        
        # Auto-select if confidence is high enough
        if best_score >= auto_select_threshold:
            return {
                'status': 'exact_match',
                'input_name': input_name,
                'matched_name': best_match,
                'similarity': best_score,
                'suggestions': matches
            }
        
        # Return suggestions for user selection
        return {
            'status': 'suggestions',
            'input_name': input_name,
            'matched_name': None,
            'similarity': best_score,
            'suggestions': matches
        }
    
    def interactive_match(self, input_name: str) -> Optional[str]:
        """
        Interactive matching with user prompts - always shows top 3 unless 100% match.
        
        Args:
            input_name: Name to search for
            
        Returns:
            Selected fighter name or None if no match
        """
        matches = self.find_best_matches(input_name, threshold=0.3, top_k=3)
        
        if not matches:
            print(f"✗ No matches found for '{input_name}'")
            return None
        
        best_match, best_score = matches[0]
        
        # Only auto-select if 100% match (similarity = 1.0)
        if best_score == 1.0:
            print(f"✓ Perfect match found: '{best_match}'")
            return best_match
        
        # Always show suggestions if not 100% match
        print(f"\nSuggestions for '{input_name}':")
        for i, (name, score) in enumerate(matches, 1):
            print(f"{i}. {name.title()} (similarity: {score:.2%})")
        
        print(f"{len(matches) + 1}. None of the above")
        
        while True:
            try:
                choice = input(f"Select option (1-{len(matches) + 1}): ").strip()
                choice_idx = int(choice) - 1
                
                if choice_idx == len(matches):
                    print("No fighter selected.")
                    return None
                elif 0 <= choice_idx < len(matches):
                    selected_name = matches[choice_idx][0]
                    print(f"✓ Selected: '{selected_name.title()}'")
                    return selected_name
                else:
                    print("Invalid choice. Please try again.")
            except ValueError:
                print("Invalid input. Please enter a number.")
    
    def filter_dataframe(self, fighter_red: str, fighter_blue: str, interactive: bool = True) -> pd.DataFrame:
        """
        Filter dataframe based on fighter names with fuzzy matching.
        
        Args:
            fighter_red: Red fighter name
            fighter_blue: Blue fighter name
            interactive: Whether to use interactive matching
            
        Returns:
            Filtered dataframe
        """
        # Get matched names
        if interactive:
            print("=== Matching Red Fighter ===")
            matched_red = self.interactive_match(fighter_red)
            
            print("\n=== Matching Blue Fighter ===")
            matched_blue = self.interactive_match(fighter_blue)
        else:
            # Non-interactive mode - auto-select best matches
            red_result = self.get_fighter_match(fighter_red)
            blue_result = self.get_fighter_match(fighter_blue)
            
            matched_red = red_result['matched_name'] if red_result['status'] == 'exact_match' else None
            matched_blue = blue_result['matched_name'] if blue_result['status'] == 'exact_match' else None
        
        # Filter dataframe
        names = []
        if matched_red:
            names.append(matched_red)
        if matched_blue:
            names.append(matched_blue)
        
        if not names:
            print("No valid fighter names found. Returning empty dataframe.")
            return pd.DataFrame()
        
        print(f"\n=== Filtering with names: {names} ===")
        
        df_filtered = self.df[
            (self.df['fighter_red'].isin(names)) |
            (self.df['fighter_blue'].isin(names))
        ]
        
        print(f"Found {len(df_filtered)} matching records.")
        return df_filtered

    def get_fighter_input(self, corner_color: str, corner_emoji: str) -> Optional[str]:
        """
        Get fighter input with retry functionality.
        
        Args:
            corner_color: Color name for display (RED/BLUE)
            corner_emoji: Emoji for display (🔴/🔵)
            
        Returns:
            Matched fighter name or None if user chooses to skip
        """
        while True:
            print(f"\nEnter the {corner_color} corner fighter name:")
            fighter_input = input(f"{corner_color.title()} fighter: ").strip()
            
            if not fighter_input:
                retry = input("No fighter name entered. Try again? (y/n): ").strip().lower()
                if retry not in ['y', 'yes']:
                    return None
                continue
            
            print(f"\n{corner_emoji} Searching for {corner_color.title()} fighter: '{fighter_input}'")
            matched_fighter = self.interactive_match(fighter_input)
            
            if matched_fighter:
                return matched_fighter
            else:
                print(f"\n❌ Could not match '{fighter_input}'")
                retry = input(f"Would you like to try a different name for the {corner_color.title()} fighter? (y/n): ").strip().lower()
                if retry not in ['y', 'yes']:
                    return None

    def run_fighter_matching(self) -> pd.DataFrame:
        """
        Run the complete interactive fighter matching process with retry functionality.
        
        Returns:
            Filtered dataframe with matched fighters
        """
        print("=== UFC Fighter Matching System ===")
        print(f"Available fighters: {len(self.unique_fighters)} unique fighters in database")
        
        while True:
            # Get red fighter
            matched_red = self.get_fighter_input("RED", "🔴")
            if not matched_red:
                print("Red fighter is required to continue.")
                restart = input("\nWould you like to start over? (y/n): ").strip().lower()
                if restart not in ['y', 'yes']:
                    print("Exiting fighter matching.")
                    return pd.DataFrame()
                continue
            
            # Get blue fighter
            matched_blue = self.get_fighter_input("BLUE", "🔵")
            if not matched_blue:
                print("Blue fighter is required to continue.")
                restart = input("\nWould you like to start over? (y/n): ").strip().lower()
                if restart not in ['y', 'yes']:
                    print("Exiting fighter matching.")
                    return pd.DataFrame()
                continue
            
            # Both fighters matched successfully
            break
        
        # Save matched names to the instance for retrieval
        self.matched_names = [matched_red, matched_blue]
        
        # Filter dataframe
        names = [matched_red, matched_blue]
        print(f"\n=== Final Selection ===")
        print(f"🔴 Red Fighter: {matched_red.title()}")
        print(f"🔵 Blue Fighter: {matched_blue.title()}")
        
        df_filtered = self.df[
            (self.df['fighter_red'].isin(names)) |
            (self.df['fighter_blue'].isin(names))
        ]
        
        print(f"\n✅ Found {len(df_filtered)} matching fight records in database.")
        
        if len(df_filtered) > 0:
            print(f"Dataframe shape: {df_filtered.shape}")
        else:
            print("No fight records found for these fighters.")
        
        return df_filtered
    
    def get_matched_names(self) -> List[str]:
        """
        Get the matched fighter names as a list.
        
        Returns:
            List with [red_fighter, blue_fighter] or empty list if no matches
        """
        if hasattr(self, 'matched_names'):
            return self.matched_names.copy()
        else:
            return []

# Ready-to-use functions for your workflow
def match_fighters_and_filter(df_merged):
    """
    Main function to run fighter matching on your dataframe.
    
    Args:
        df_merged: Your dataframe with 'fighter_red' and 'fighter_blue' columns
        
    Returns:
        Filtered dataframe with selected fighters
    """
    global _current_matcher  # Store matcher globally to access names later
    _current_matcher = FighterNameMatcher(df_merged)
    return _current_matcher.run_fighter_matching()

def get_fighter_names():
    """
    Get the matched fighter names as a list [red_fighter, blue_fighter].
    Must be called after match_fighters_and_filter().
    
    Returns:
        List with names[0] = red fighter, names[1] = blue fighter
    """
    if '_current_matcher' in globals():
        names = _current_matcher.get_matched_names()
        if names:
            print(f"Fighter names: {names}")
            print(f"Red fighter (names[0]): {names[0]}")
            print(f"Blue fighter (names[1]): {names[1]}")
            return names
        else:
            print("No matched fighters found. Run match_fighters_and_filter() first.")
            return []
    else:
        print("No matcher found. Run match_fighters_and_filter() first.")
        return []

In [7]:
df_filtered = match_fighters_and_filter(df_merged)

=== UFC Fighter Matching System ===
Available fighters: 2586 unique fighters in database

Enter the RED corner fighter name:

🔴 Searching for Red fighter: 'alex pere'

Suggestions for 'alex pere':
1. Alex Perez (similarity: 76.32%)
2. Alex Pereira (similarity: 70.00%)
3. Alex Caceres (similarity: 63.33%)
4. None of the above
✓ Selected: 'Alex Pereira'

Enter the BLUE corner fighter name:

🔵 Searching for Blue fighter: 'ankalaev'

Suggestions for 'ankalaev':
1. Magomed Ankalaev (similarity: 61.67%)
2. Sean Alvarez (similarity: 42.00%)
3. Sultan Aliev (similarity: 42.00%)
4. None of the above
✓ Selected: 'Magomed Ankalaev'

=== Final Selection ===
🔴 Red Fighter: Alex Pereira
🔵 Blue Fighter: Magomed Ankalaev

✅ Found 23 matching fight records in database.
Dataframe shape: (23, 75)


In [8]:
names = get_fighter_names()

Fighter names: ['alex pereira', 'magomed ankalaev']
Red fighter (names[0]): alex pereira
Blue fighter (names[1]): magomed ankalaev


Create a new row with the fight to predict

In [9]:
# Define columns for each fighter
red_columns = [
    'sig_strikes_landed_per_minute_red', 'sig_strikes_absorbed_per_minute_red', 
    'takedowns_avg_red', 'submission_avg_red', 'knockdown_avg_red', 'fight_time_avg_red',
    'stance_red', 'octagon_debut_red', 'height_red', 'weight_red', 'reach_red', 'leg_reach_red'
]

blue_columns = [
    'sig_strikes_landed_per_minute_blue', 'sig_strikes_absorbed_per_minute_blue', 
    'takedowns_avg_blue', 'submission_avg_blue', 'knockdown_avg_blue', 'fight_time_avg_blue',
    'stance_blue', 'octagon_debut_blue', 'height_blue', 'weight_blue', 'reach_blue', 'leg_reach_blue'
]

# Get the last row where diego lopes was fighter_red
last_red_row = df_merged[df_merged['fighter_red'] == names[0]].iloc[-1] if len(df_merged[df_merged['fighter_red'] == names[0]]) > 0 else None

# Get the last row where jean silva was fighter_blue  
last_blue_row = df_merged[df_merged['fighter_blue'] == names[1]].iloc[-1] if len(df_merged[df_merged['fighter_blue'] == names[1]]) > 0 else None

# Create new row data
new_row_data = {
    'event_date': pd.to_datetime('today').date(),
    'fighter_red': names[0],  # 'diego lopes'
    'fighter_blue': names[1], # 'jean silva'
}

# Copy red fighter stats from their last fight as red fighter
if last_red_row is not None:
    for col in red_columns:
        if col in df_merged.columns:
            new_row_data[col] = last_red_row[col]

# Copy blue fighter stats from their last fight as blue fighter
if last_blue_row is not None:
    for col in blue_columns:
        if col in df_merged.columns:
            new_row_data[col] = last_blue_row[col]
    # Also copy weight_class from blue fighter's last fight
    if 'weight_class' in df_merged.columns:
        new_row_data['weight_class'] = last_blue_row['weight_class']

# Add the new row
df_filtered = pd.concat([df_filtered, pd.DataFrame([new_row_data])], ignore_index=True)

# Ensure event_date is datetime
df_filtered['event_date'] = pd.to_datetime(df_filtered['event_date'], errors='coerce').dt.date

df_filtered.tail(1)

Unnamed: 0,event_date,event_name,fighter_red,fighter_blue,round,time,weight_class,win_method,winner,stance_red,stance_blue,knockdowns_red,knockdowns_blue,sig_attempts_red,sig_attempts_blue,sig_strikes_red,sig_strikes_blue,total_strikes_attempts_red,total_strikes_attempts_blue,total_strikes_red,total_strikes_blue,sub_attempts_red,sub_attempts_blue,takedowns_red,takedowns_blue,takedown_attempts_red,takedown_attempts_blue,control_time_red,control_time_blue,head_strikes_red,head_strikes_blue,head_attempts_red,head_attempts_blue,body_strikes_red,body_strikes_blue,body_attempts_red,body_attempts_blue,leg_strikes_red,leg_strikes_blue,leg_attempts_red,leg_attempts_blue,distance_red,distance_blue,distance_attempts_red,distance_attempts_blue,clinch_strikes_red,clinch_strikes_blue,clinch_attempts_red,clinch_attempts_blue,ground_strikes_red,ground_strikes_blue,ground_attempts_red,ground_attempts_blue,octagon_debut_blue,height_blue,weight_blue,reach_blue,leg_reach_blue,sig_strikes_landed_per_minute_blue,sig_strikes_absorbed_per_minute_blue,takedowns_avg_blue,submission_avg_blue,knockdown_avg_blue,fight_time_avg_blue,octagon_debut_red,height_red,weight_red,reach_red,leg_reach_red,sig_strikes_landed_per_minute_red,sig_strikes_absorbed_per_minute_red,takedowns_avg_red,submission_avg_red,knockdown_avg_red,fight_time_avg_red
23,2025-09-25,,alex pereira,magomed ankalaev,,,Open Weight,,,Orthodox,Orthodox,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-03-17,75.0,205.0,75.0,46.0,4.0,2.0,0.8,0.0,0.56,751.0,2021-11-06,76.0,205.0,79.0,44.0,5.0,4.0,0.11,0.23,0.8,717.0


Create average value per stat feature

In [10]:
red_fights = df_filtered[['event_date','event_name','fighter_red','round','time','weight_class','win_method','winner','stance_red',
                        'knockdowns_red','sig_attempts_red','sig_strikes_red','total_strikes_attempts_red','total_strikes_red',
                        'sub_attempts_red','takedowns_red','takedown_attempts_red','control_time_red','head_strikes_red','head_attempts_red',
                        'body_strikes_red','body_attempts_red','leg_strikes_red','leg_attempts_red','distance_red','distance_attempts_red',
                        'clinch_strikes_red','clinch_attempts_red','ground_strikes_red','ground_attempts_red',]].copy()

# Rename columns but keep fighter_red as fighter_name to avoid conflict
red_fights.columns = ['fighter_name' if col == 'fighter_red' else col.replace('_red', '') if '_red' in col else col for col in red_fights.columns]
red_fights.columns = [col.replace('_red', '') if '_red' in col else col for col in red_fights.columns]
red_fights['corner'] = 'red'

blue_fights = df_filtered[['event_date','event_name','fighter_blue','round','time','weight_class','win_method','winner','stance_blue',
                        'knockdowns_blue','sig_attempts_blue','sig_strikes_blue','total_strikes_attempts_blue','total_strikes_blue',
                        'sub_attempts_blue','takedowns_blue','takedown_attempts_blue','control_time_blue','head_strikes_blue','head_attempts_blue',
                        'body_strikes_blue','body_attempts_blue','leg_strikes_blue','leg_attempts_blue','distance_blue','distance_attempts_blue',
                        'clinch_strikes_blue','clinch_attempts_blue','ground_strikes_blue','ground_attempts_blue',]].copy()

blue_fights.columns = ['fighter_name' if col == 'fighter_blue' else col.replace('_blue', '') if '_blue' in col else col for col in blue_fights.columns]
blue_fights.columns = [col.replace('_blue', '') if '_blue' in col else col for col in blue_fights.columns]
blue_fights['corner'] = 'blue'

red_fights['opponent'] = df_filtered['fighter_blue']
blue_fights['opponent'] = df_filtered['fighter_red']

all_fights = pd.concat([red_fights, blue_fights]).sort_values(['fighter_name', 'event_date'])

def calculate_fighter_features(df):

    df = df.sort_values('event_date')

    # Calculate expanding/rolling features (using previous fights only)
    df['avg_rounds'] = df['round'].expanding().mean().shift(1)
    df['avg_time'] = df['time'].expanding().mean().shift(1)
    df['avg_knockdowns'] = df['knockdowns'].expanding().mean().shift(1)
    df['avg_sig_attempts'] = df['sig_attempts'].expanding().mean().shift(1)
    df['avg_sig_strikes'] = df['sig_strikes'].expanding().mean().shift(1)
    df['avg_total_strikes_attempts'] = df['total_strikes_attempts'].expanding().mean().shift(1)
    df['avg_total_strikes'] = df['total_strikes'].expanding().mean().shift(1)
    df['avg_sub_attempts'] = df['sub_attempts'].expanding().mean().shift(1)
    df['avg_takedowns'] = df['takedowns'].expanding().mean().shift(1)
    df['avg_takedown_attempts'] = df['takedown_attempts'].expanding().mean().shift(1)
    df['avg_head_strikes'] = df['head_strikes'].expanding().mean().shift(1)
    df['avg_head_attempts'] = df['head_attempts'].expanding().mean().shift(1)
    df['avg_body_strikes'] = df['body_strikes'].expanding().mean().shift(1)
    df['avg_body_attempts'] = df['body_attempts'].expanding().mean().shift(1)
    df['avg_leg_strikes'] = df['leg_strikes'].expanding().mean().shift(1)
    df['avg_leg_attempts'] = df['leg_attempts'].expanding().mean().shift(1)
    df['avg_distance'] = df['distance'].expanding().mean().shift(1)
    df['avg_distance_attempts'] = df['distance_attempts'].expanding().mean().shift(1)
    df['avg_clinch_strikes'] = df['clinch_strikes'].expanding().mean().shift(1)
    df['avg_clinch_attempts'] = df['clinch_attempts'].expanding().mean().shift(1)
    df['avg_ground_strikes'] = df['ground_strikes'].expanding().mean().shift(1)
    df['avg_ground_attempts'] = df['ground_attempts'].expanding().mean().shift(1)
    
    return df

fighter_features = all_fights.groupby('fighter_name').apply(calculate_fighter_features, include_groups=False)

# Reset index first
fighter_features = fighter_features.reset_index()

# Separate red and blue features
red_features = fighter_features[fighter_features['corner'] == 'red'].copy()
blue_features = fighter_features[fighter_features['corner'] == 'blue'].copy()

# Add suffixes to feature columns
feature_cols = [col for col in fighter_features.columns if col.startswith('avg_')]

red_rename = {col: f'{col}_red' for col in feature_cols}
blue_rename = {col: f'{col}_blue' for col in feature_cols}

red_features = red_features.rename(columns=red_rename)
blue_features = blue_features.rename(columns=blue_rename)

# Merge back to original dataframe
df_filtered = df_filtered.merge(
    red_features[['fighter_name', 'event_date', 'opponent'] + list(red_rename.values())],
    left_on=['fighter_red', 'event_date', 'fighter_blue'],
    right_on=['fighter_name', 'event_date', 'opponent'],
    how='left'
).merge(
    blue_features[['fighter_name', 'event_date', 'opponent'] + list(blue_rename.values())],
    left_on=['fighter_blue', 'event_date', 'fighter_red'],
    right_on=['fighter_name', 'event_date', 'opponent'],
    how='left',
    suffixes=('', '_blue_temp')
)

# Clean up duplicate columns
df_filtered = df_filtered.drop(['fighter_name', 'fighter_name_blue_temp', 'opponent', 
                            'opponent_blue_temp'], axis=1, errors='ignore')

# Drop duplicates based on event_date, fighter_red, and fighter_blue
df_filtered = df_filtered.drop_duplicates(subset=['event_date', 'fighter_red', 'fighter_blue'])

# Reset the index
df_filtered = df_filtered.reset_index(drop=True)

Create new features

In [11]:
def create_fighter_record_features(df):
    """
    Create win/loss tracking features for UFC fighters while avoiding data leakage.
    
    Parameters:
    df: DataFrame with UFC fight data
    
    Returns:
    DataFrame with additional features for fighter records
    """
    # Create a copy to avoid modifying the original
    df_copy = df.copy()
    
    # Ensure event_date is datetime
    df_copy['event_date'] = pd.to_datetime(df_copy['event_date'])
    
    # Sort by date to process fights chronologically
    df_copy = df_copy.sort_values('event_date').reset_index(drop=True)
    
    # Initialize tracking dictionaries for each fighter
    fighter_wins = {}
    fighter_losses = {}
    fighter_total_fights = {}
    
    # Initialize the new columns
    df_copy['wins_before_red'] = 0
    df_copy['losses_before_red'] = 0
    df_copy['total_fights_before_red'] = 0
    df_copy['wins_before_blue'] = 0
    df_copy['losses_before_blue'] = 0
    df_copy['total_fights_before_blue'] = 0
    
    # Process each fight chronologically
    for idx, row in df_copy.iterrows():
        red_fighter = row['fighter_red']
        blue_fighter = row['fighter_blue']
        winner = row['winner']
        
        # Get current records BEFORE this fight (to avoid data leakage)
        wins_before_red = fighter_wins.get(red_fighter, 0)
        losses_before_red = fighter_losses.get(red_fighter, 0)
        red_total_before = fighter_total_fights.get(red_fighter, 0)
        
        wins_before_blue = fighter_wins.get(blue_fighter, 0)
        losses_before_blue = fighter_losses.get(blue_fighter, 0)
        blue_total_before = fighter_total_fights.get(blue_fighter, 0)
        
        # Assign the records before this fight
        df_copy.at[idx, 'wins_before_red'] = wins_before_red
        df_copy.at[idx, 'losses_before_red'] = losses_before_red
        df_copy.at[idx, 'total_fights_before_red'] = red_total_before
        df_copy.at[idx, 'wins_before_blue'] = wins_before_blue
        df_copy.at[idx, 'losses_before_blue'] = losses_before_blue
        df_copy.at[idx, 'total_fights_before_blue'] = blue_total_before
        
        # Update records AFTER processing this fight
        # Red fighter
        if red_fighter not in fighter_wins:
            fighter_wins[red_fighter] = 0
            fighter_losses[red_fighter] = 0
            fighter_total_fights[red_fighter] = 0
        
        # Blue fighter
        if blue_fighter not in fighter_wins:
            fighter_wins[blue_fighter] = 0
            fighter_losses[blue_fighter] = 0
            fighter_total_fights[blue_fighter] = 0
        
        # Update based on fight result
        if winner == 1:  # Red fighter wins
            fighter_wins[red_fighter] += 1
            fighter_losses[blue_fighter] += 1
        else:  # Blue fighter wins
            fighter_wins[blue_fighter] += 1
            fighter_losses[red_fighter] += 1
        
        # Update total fights for both fighters
        fighter_total_fights[red_fighter] += 1
        fighter_total_fights[blue_fighter] += 1
    
    return df_copy

def create_recent_performance_features(df, recent_fights=3):
    """
    Create features based on recent fight performance
    This is more complex and requires tracking recent results
    """
    df_copy = df.copy()
    df_copy['event_date'] = pd.to_datetime(df_copy['event_date'])
    df_copy = df_copy.sort_values('event_date').reset_index(drop=True)
    
    # Track recent results for each fighter
    fighter_recent_results = {}  # Will store list of recent results for each fighter
    
    # Initialize columns
    df_copy[f'wins_last_{recent_fights}_red'] = 0
    df_copy[f'wins_last_{recent_fights}_blue'] = 0
    df_copy[f'fights_last_{recent_fights}_red'] = 0
    df_copy[f'fights_last_{recent_fights}_blue'] = 0
    
    for idx, row in df_copy.iterrows():
        red_fighter = row['fighter_red']
        blue_fighter = row['fighter_blue']
        winner = row['winner']
        
        # Initialize if first time seeing fighter
        if red_fighter not in fighter_recent_results:
            fighter_recent_results[red_fighter] = []
        if blue_fighter not in fighter_recent_results:
            fighter_recent_results[blue_fighter] = []
        
        # Get recent form BEFORE this fight
        red_recent = fighter_recent_results[red_fighter][-recent_fights:]
        blue_recent = fighter_recent_results[blue_fighter][-recent_fights:]
        
        # Calculate recent performance
        df_copy.at[idx, f'wins_last_{recent_fights}_red'] = sum(red_recent)
        df_copy.at[idx, f'wins_last_{recent_fights}_blue'] = sum(blue_recent)
        df_copy.at[idx, f'fights_last_{recent_fights}_red'] = len(red_recent)
        df_copy.at[idx, f'fights_last_{recent_fights}_blue'] = len(blue_recent)
        
        # Update recent results AFTER processing this fight
        if winner == 1:  # Red wins
            fighter_recent_results[red_fighter].append(1)
            fighter_recent_results[blue_fighter].append(0)
        else:  # Blue wins
            fighter_recent_results[red_fighter].append(0)
            fighter_recent_results[blue_fighter].append(1)
    
    return df_copy

def create_win_ratio_record(df):
    """
    Create additional derived features from win/loss records
    """
    df_processed = df.copy()
    
    # Win percentage (handle division by zero)
    df_processed['win_pct_before_red'] = np.where(
        df_processed['total_fights_before_red'] > 0,
        (df_processed['wins_before_red'] / df_processed['total_fights_before_red']).round(3),
        0
    )
    
    df_processed['win_pct_before_blue'] = np.where(
        df_processed['total_fights_before_blue'] > 0,
        (df_processed['wins_before_blue'] / df_processed['total_fights_before_blue']).round(3),
        0
    )
        
    return df_processed

def create_days_since_debut_features(df):
    """
    Calculate days since each fighter's debut, avoiding data leakage.
    
    Parameters:
    df: DataFrame with UFC fight data (must be sorted by event_date)
    
    Returns:
    DataFrame with days_since_debut features added
    """
    df_copy = df.copy()
    
    # Ensure event_date is datetime and data is sorted
    df_copy['event_date'] = pd.to_datetime(df_copy['event_date'])
    df_copy = df_copy.sort_values('event_date').reset_index(drop=True)
    
    # Track each fighter's debut date
    fighter_debut_dates = {}
    
    # Initialize the new columns
    df_copy['days_since_debut_red'] = 0
    df_copy['days_since_debut_blue'] = 0
    
    # Process each fight chronologically
    for idx, row in df_copy.iterrows():
        red_fighter = row['fighter_red']
        blue_fighter = row['fighter_blue']
        current_date = row['event_date']
        
        # Check if this is the fighter's debut (first time we see them)
        if red_fighter not in fighter_debut_dates:
            # This is red fighter's debut
            fighter_debut_dates[red_fighter] = current_date
            df_copy.at[idx, 'days_since_debut_red'] = 0
        else:
            # Calculate days since debut
            days_since = (current_date - fighter_debut_dates[red_fighter]).days
            df_copy.at[idx, 'days_since_debut_red'] = days_since
        
        if blue_fighter not in fighter_debut_dates:
            # This is blue fighter's debut
            fighter_debut_dates[blue_fighter] = current_date
            df_copy.at[idx, 'days_since_debut_blue'] = 0
        else:
            # Calculate days since debut
            days_since = (current_date - fighter_debut_dates[blue_fighter]).days
            df_copy.at[idx, 'days_since_debut_blue'] = days_since
    
    return df_copy

def create_days_since_last_win_features(df):
    """
    Calculate days since each fighter's last win, avoiding data leakage.
    
    Parameters:
    df: DataFrame with UFC fight data (must be sorted by event_date)
    
    Returns:
    DataFrame with days_since_last_win features added
    """
    df_copy = df.copy()
    
    # Ensure event_date is datetime and data is sorted
    df_copy['event_date'] = pd.to_datetime(df_copy['event_date'])
    df_copy = df_copy.sort_values('event_date').reset_index(drop=True)
    
    # Track each fighter's last win date
    fighter_last_win_dates = {}
    
    # Initialize the new columns
    df_copy['days_since_last_win_red'] = np.nan  # NaN for fighters who never won
    df_copy['days_since_last_win_blue'] = np.nan
    
    # Process each fight chronologically
    for idx, row in df_copy.iterrows():
        red_fighter = row['fighter_red']
        blue_fighter = row['fighter_blue']
        current_date = row['event_date']
        winner = row['winner']
        
        # Calculate days since last win BEFORE this fight (to avoid data leakage)
        if red_fighter in fighter_last_win_dates:
            days_since = (current_date - fighter_last_win_dates[red_fighter]).days
            df_copy.at[idx, 'days_since_last_win_red'] = days_since
        else:
            # Fighter has never won before, or this is their first fight
            df_copy.at[idx, 'days_since_last_win_red'] = np.nan
        
        if blue_fighter in fighter_last_win_dates:
            days_since = (current_date - fighter_last_win_dates[blue_fighter]).days
            df_copy.at[idx, 'days_since_last_win_blue'] = days_since
        else:
            # Fighter has never won before, or this is their first fight
            df_copy.at[idx, 'days_since_last_win_blue'] = np.nan
        
        # Update last win dates AFTER processing this fight
        if winner == 1:  # Red fighter wins
            fighter_last_win_dates[red_fighter] = current_date
        elif winner == 0:  # Blue fighter wins
            fighter_last_win_dates[blue_fighter] = current_date
    
    # Add derived features
    
    # Binary indicators
    #df_copy['red_never_won'] = df_copy['days_since_last_win_red'].isna().astype(int)
    #df_copy['blue_never_won'] = df_copy['days_since_last_win_blue'].isna().astype(int)
    
    df_copy['recent_winner_red'] = (df_copy['days_since_last_win_red'] <= 365).astype(int)  # Won within last year
    df_copy['recent_winner_blue'] = (df_copy['days_since_last_win_blue'] <= 365).astype(int)
    
    # Handle NaN values for modeling (replace with a large number or separate indicator)
    # Replace NaN with a large number (e.g., 9999 days =~ 27.3 years) to indicate "never won"
    df_copy['days_since_last_win_red'] = df_copy['days_since_last_win_red'].fillna(9999)
    df_copy['days_since_last_win_blue'] = df_copy['days_since_last_win_blue'].fillna(9999)
    
    return df_copy

def create_wins_last_year_features(df):
    """
    Calculate wins in the last 365 days for each fighter, avoiding data leakage.
    
    Parameters:
    df: DataFrame with UFC fight data (must be sorted by event_date)
    
    Returns:
    DataFrame with wins_last_year features added
    """
    df_copy = df.copy()
    
    # Ensure event_date is datetime and data is sorted
    df_copy['event_date'] = pd.to_datetime(df_copy['event_date'])
    df_copy = df_copy.sort_values('event_date').reset_index(drop=True)
    
    # Track fight history for each fighter (date, result pairs)
    fighter_fight_history = {}
    
    # Initialize the new columns
    df_copy['wins_last_365_days_red'] = 0
    df_copy['wins_last_365_days_blue'] = 0
    df_copy['fights_last_365_days_red'] = 0
    df_copy['fights_last_365_days_blue'] = 0
    
    # Process each fight chronologically
    for idx, row in df_copy.iterrows():
        red_fighter = row['fighter_red']
        blue_fighter = row['fighter_blue']
        current_date = row['event_date']
        winner = row['winner']
        
        # Initialize fighter history if first time seeing them
        if red_fighter not in fighter_fight_history:
            fighter_fight_history[red_fighter] = []
        if blue_fighter not in fighter_fight_history:
            fighter_fight_history[blue_fighter] = []
        
        # Calculate cutoff date for last 365 days (before current fight to avoid leakage)
        cutoff_date = current_date - timedelta(days=365)
        
        # Count wins and total fights in last 365 days for red fighter
        red_recent_fights = [
            fight for fight in fighter_fight_history[red_fighter] 
            if fight['date'] > cutoff_date
        ]
        red_wins_365 = sum(1 for fight in red_recent_fights if fight['won'])
        red_fights_365 = len(red_recent_fights)
        
        # Count wins and total fights in last 365 days for blue fighter
        blue_recent_fights = [
            fight for fight in fighter_fight_history[blue_fighter] 
            if fight['date'] > cutoff_date
        ]
        blue_wins_365 = sum(1 for fight in blue_recent_fights if fight['won'])
        blue_fights_365 = len(blue_recent_fights)
        
        # Assign the counts BEFORE this fight (avoiding data leakage)
        df_copy.at[idx, 'wins_last_365_days_red'] = red_wins_365
        df_copy.at[idx, 'wins_last_365_days_blue'] = blue_wins_365
        df_copy.at[idx, 'fights_last_365_days_red'] = red_fights_365
        df_copy.at[idx, 'fights_last_365_days_blue'] = blue_fights_365
        
        # Update fight history AFTER processing this fight
        # Add red fighter's result
        red_won = (winner == 1)
        fighter_fight_history[red_fighter].append({
            'date': current_date,
            'won': red_won
        })
        
        # Add blue fighter's result  
        blue_won = (winner == 0)
        fighter_fight_history[blue_fighter].append({
            'date': current_date,
            'won': blue_won
        })
    
    # Add derived features
    # Win rate in last 365 days
    '''
    df_copy['win_rate_last_365_days_red'] = np.where(
        df_copy['fights_last_365_days_red'] > 0,
        df_copy['wins_last_365_days_red'] / df_copy['fights_last_365_days_red'],
        0
    )
    
    df_copy['win_rate_last_365_days_blue'] = np.where(
        df_copy['fights_last_365_days_blue'] > 0,
        df_copy['wins_last_365_days_blue'] / df_copy['fights_last_365_days_blue'],
        0
    )
    '''

    df_copy['undefeated_last_year_red'] = (
        (df_copy['fights_last_365_days_red'] > 0) & 
        (df_copy['wins_last_365_days_red'] == df_copy['fights_last_365_days_red'])
    ).astype(int)
    
    df_copy['undefeated_last_year_blue'] = (
        (df_copy['fights_last_365_days_blue'] > 0) & 
        (df_copy['wins_last_365_days_blue'] == df_copy['fights_last_365_days_blue'])
    ).astype(int)
    
    return df_copy

def create_win_lose_streak_features(df):
    """
    Calculate current win/lose streaks for each fighter, avoiding data leakage.
    
    Parameters:
    df: DataFrame with UFC fight data (must be sorted by event_date)
    
    Returns:
    DataFrame with win/lose streak features added
    """
    df_copy = df.copy()
    
    # Ensure event_date is datetime and data is sorted
    df_copy['event_date'] = pd.to_datetime(df_copy['event_date'])
    df_copy = df_copy.sort_values('event_date').reset_index(drop=True)
    
    # Track streak information for each fighter
    fighter_streaks = {}
    
    # Initialize the new columns
    df_copy['win_streak_red'] = 0
    df_copy['win_streak_blue'] = 0
    df_copy['lose_streak_red'] = 0
    df_copy['lose_streak_blue'] = 0
    
    # Process each fight chronologically
    for idx, row in df_copy.iterrows():
        red_fighter = row['fighter_red']
        blue_fighter = row['fighter_blue']
        winner = row['winner']
        
        # Initialize fighter streak tracking if first time seeing them
        if red_fighter not in fighter_streaks:
            fighter_streaks[red_fighter] = {
                'current_win_streak': 0,
                'current_lose_streak': 0,
                'last_result': None  # 'win', 'loss', or None
            }
        
        if blue_fighter not in fighter_streaks:
            fighter_streaks[blue_fighter] = {
                'current_win_streak': 0,
                'current_lose_streak': 0,
                'last_result': None
            }
        
        # Get current streaks BEFORE this fight (to avoid data leakage)
        win_streak_red = fighter_streaks[red_fighter]['current_win_streak']
        lose_streak_red = fighter_streaks[red_fighter]['current_lose_streak']
        win_streak_blue = fighter_streaks[blue_fighter]['current_win_streak']
        lose_streak_blue = fighter_streaks[blue_fighter]['current_lose_streak']
        
        # Assign streaks before this fight
        df_copy.at[idx, 'win_streak_red'] = win_streak_red
        df_copy.at[idx, 'lose_streak_red'] = lose_streak_red
        df_copy.at[idx, 'win_streak_blue'] = win_streak_blue
        df_copy.at[idx, 'lose_streak_blue'] = lose_streak_blue
        
        # Update streaks AFTER processing this fight
        if winner == 1:  # Red fighter wins
            # Red fighter wins - update win streak, reset lose streak
            fighter_streaks[red_fighter]['current_win_streak'] += 1
            fighter_streaks[red_fighter]['current_lose_streak'] = 0
            fighter_streaks[red_fighter]['last_result'] = 'win'
            
            # Blue fighter loses - update lose streak, reset win streak
            fighter_streaks[blue_fighter]['current_lose_streak'] += 1
            fighter_streaks[blue_fighter]['current_win_streak'] = 0
            fighter_streaks[blue_fighter]['last_result'] = 'loss'
            
        elif winner == 0:  # Blue fighter wins
            # Blue fighter wins - update win streak, reset lose streak
            fighter_streaks[blue_fighter]['current_win_streak'] += 1
            fighter_streaks[blue_fighter]['current_lose_streak'] = 0
            fighter_streaks[blue_fighter]['last_result'] = 'win'
            
            # Red fighter loses - update lose streak, reset win streak
            fighter_streaks[red_fighter]['current_lose_streak'] += 1
            fighter_streaks[red_fighter]['current_win_streak'] = 0
            fighter_streaks[red_fighter]['last_result'] = 'loss'
    
    # Add derived features
    
    # Momentum indicators
    df_copy['on_win_streak_red'] = (df_copy['win_streak_red'] >= 1).astype(int)
    df_copy['on_win_streak_blue'] = (df_copy['win_streak_blue'] >= 1).astype(int)
    #df_copy['red_on_lose_streak'] = (df_copy['lose_streak_red'] >= 1).astype(int)
    #df_copy['blue_on_lose_streak'] = (df_copy['lose_streak_blue'] >= 1).astype(int)
    
    # Long streak indicators (3+ wins/losses)
    df_copy['long_win_streak_red'] = (df_copy['win_streak_red'] >= 3).astype(int)
    df_copy['long_win_streak_blue'] = (df_copy['win_streak_blue'] >= 3).astype(int)
    #df_copy['red_long_lose_streak'] = (df_copy['lose_streak_red'] >= 3).astype(int)
    #df_copy['blue_long_lose_streak'] = (df_copy['lose_streak_blue'] >= 3).astype(int)
    
    return df_copy

# Example usage with your dataset
def process_ufc_data(df):
    """
    Complete pipeline to add all fighter record features
    """
    print("Processing UFC data for feature engineering...")
    print(f"Original dataset shape: {df.shape}")
    
    # Step 1: Create win/loss features
    df_with_records = create_fighter_record_features(df)
    print("✓ Added win/loss tracking features")

    # Step 2: Add a win ratio
    df_processed = create_win_ratio_record(df_with_records)
    print("✓ Added win ratio features")
    
    # Step 3: Add recent form features
    df_processed = create_recent_performance_features(df_processed, recent_fights=3)
    print("✓ Added recent performance features")

    # Step 4: Days since debut
    df_processed = create_days_since_debut_features(df_processed)
    print("✓ Added days since debut features")

    # Step 5: Add days since last win
    df_processed = create_days_since_last_win_features(df_processed)
    print("✓ Added days since last win/loss features")

    # Step 6: Add wins in last calendar year
    df_processed = create_wins_last_year_features(df_processed)
    print("✓ Added calendar year features")

    # Step 7: Add win/loss streak
    df_final = create_win_lose_streak_features(df_processed)
    print("✓ Added win/loss streak features")
    
    print(f"Final dataset shape: {df_final.shape}")
    print(f"Added {df_final.shape[1] - df.shape[1]} new features")
    
    return df_final

df_processed = process_ufc_data(df_filtered)

Processing UFC data for feature engineering...
Original dataset shape: (24, 119)
✓ Added win/loss tracking features
✓ Added win ratio features
✓ Added recent performance features
✓ Added days since debut features
✓ Added days since last win/loss features
✓ Added calendar year features
✓ Added win/loss streak features
Final dataset shape: (24, 151)
Added 32 new features


Temporal Feature Engineering

In [12]:
def get_fighter_stats(df: pd.DataFrame, fighter_name: str, fight_index: int) -> Dict:
    """
    Get the stats for a specific fighter in a specific fight, regardless of red/blue corner
    """
    try:
        row = df.iloc[fight_index]
        
        if row['fighter_red'] == fighter_name:
            corner = 'red'
        elif row['fighter_blue'] == fighter_name:
            corner = 'blue'
        else:
            return None
        
        # Define the key stats we want to extract (focusing on most important ones)
        stat_names = [
            'sig_strikes', 'sig_attempts', 'knockdowns', 'takedowns', 'takedown_attempts', 
            'total_strikes', 'total_strikes_attempts','sub_attempts', 'control_time',
            'head_strikes', 'head_attempts'
        ]
        
        stats = {}
        for stat in stat_names:
            col_name = f"{stat}_{corner}"
            stats[stat] = row.get(col_name, np.nan)
        
        # Also get fight outcome and method
        stats['won'] = 1 if row['winner'] == fighter_name else 0
        stats['win_method'] = row.get('win_method', np.nan)
        stats['round'] = row.get('round', np.nan)
        stats['event_date'] = row.get('event_date')
    except (IndexError, KeyError) as e:
        print(f"Error getting stats for {fighter_name} at index {fight_index}: {e}")
        return None
    
    return stats

def calculate_essential_ema_features(df: pd.DataFrame, alpha: float = 0.3) -> pd.DataFrame:
    """
    Calculate EMAs for only the most important performance metrics
    Features end with _red or _blue
    """
    result_df = df.copy()
    
    # Get all unique fighters
    all_fighters = set(df['fighter_red'].dropna().unique()) | set(df['fighter_blue'].dropna().unique())
    
    # Focus on essential stats only
    essential_stats = ['sig_strikes', 'sig_attempts', 'knockdowns', 'takedowns', 'takedown_attempts', 
            'total_strikes', 'total_strikes_attempts', 'head_strikes', 'head_attempts']
    
    # Initialize EMA columns
    for corner in ['red', 'blue']:
        for stat in essential_stats:
            result_df[f"{stat}_ema_{corner}"] = np.nan
            if 'attempts' not in stat:
                result_df[f"{stat}_success_rate_ema_{corner}"] = np.nan
    
    # Calculate EMAs for each fighter
    for fighter in all_fighters:
        if pd.isna(fighter):
            continue
        
        # Get all fights for this fighter in chronological order
        fighter_fights = []
        for idx, row in df.iterrows():
            if row['fighter_red'] == fighter or row['fighter_blue'] == fighter:
                stats = get_fighter_stats(df, fighter, df.index.get_loc(idx))
                if stats:
                    fighter_fights.append({
                        'index': idx,
                        'corner': 'red' if row['fighter_red'] == fighter else 'blue',
                        'stats': stats
                    })
        
        if len(fighter_fights) <= 1:
            continue
        
        # Calculate EMAs for this fighter
        fighter_emas = {}
        
        for i, fight in enumerate(fighter_fights):
            if i == 0:
                # First fight - initialize EMAs with current values
                for stat in essential_stats:
                    if pd.notna(fight['stats'][stat]):
                        fighter_emas[f"{stat}_ema"] = fight['stats'][stat]
                        
                        # Initialize success rate EMAs
                        if 'attempts' not in stat and f"{stat}_attempts" in fight['stats']:
                            attempts = fight['stats'][f"{stat}_attempts"]
                            if attempts > 0:
                                fighter_emas[f"{stat}_success_rate_ema"] = fight['stats'][stat] / attempts
                            else:
                                fighter_emas[f"{stat}_success_rate_ema"] = 0
                continue
            
            # Update EMAs using previous fight's values (avoid data leakage)
            prev_fight = fighter_fights[i-1]
            current_corner = fight['corner']
            
            for stat in essential_stats:
                ema_key = f"{stat}_ema"
                success_rate_key = f"{stat}_success_rate_ema"
                
                # Update volume EMA
                if ema_key in fighter_emas and pd.notna(prev_fight['stats'][stat]):
                    new_ema = alpha * prev_fight['stats'][stat] + (1 - alpha) * fighter_emas[ema_key]
                    fighter_emas[ema_key] = new_ema
                    result_df.loc[fight['index'], f"{stat}_ema_{current_corner}"] = fighter_emas[ema_key]
                
                # Update success rate EMA
                if 'attempts' not in stat and f"{stat}_attempts" in prev_fight['stats']:
                    prev_attempts = prev_fight['stats'][f"{stat}_attempts"]
                    if prev_attempts > 0:
                        prev_success_rate = prev_fight['stats'][stat] / prev_attempts
                        
                        if success_rate_key in fighter_emas:
                            new_success_ema = alpha * prev_success_rate + (1 - alpha) * fighter_emas[success_rate_key]
                            fighter_emas[success_rate_key] = new_success_ema
                            result_df.loc[fight['index'], f"{stat}_success_rate_ema_{current_corner}"] = fighter_emas[success_rate_key]
    
    return result_df

def calculate_essential_rolling_features(df: pd.DataFrame, window: int = 5) -> pd.DataFrame:
    """
    Calculate rolling features for essential metrics only, using 5-fight window
    Features end with _red or _blue
    """
    result_df = df.copy()
    
    # Get all unique fighters
    all_fighters = set(df['fighter_red'].dropna().unique()) | set(df['fighter_blue'].dropna().unique())
    
    # Essential stats only
    essential_stats = ['sig_strikes', 'sig_attempts',
                    'takedowns', 'takedown_attempts', 'control_time',
                    'knockdowns', 'total_strikes', 'total_strikes_attempts']
    
    # Initialize rolling columns
    for corner in ['red', 'blue']:
        for stat in essential_stats:
            result_df[f"{stat}_roll_{window}_{corner}"] = np.nan
            if stat in ['sig_strikes', 'takedowns']:  # Only for stats with success rates
                result_df[f"{stat}_success_rate_roll_{window}_{corner}"] = np.nan
    
    # Calculate rolling features for each fighter
    for fighter in all_fighters:
        if pd.isna(fighter):
            continue
        
        # Get all fights for this fighter
        fighter_fights = []
        for idx, row in df.iterrows():
            if row['fighter_red'] == fighter or row['fighter_blue'] == fighter:
                stats = get_fighter_stats(df, fighter, df.index.get_loc(idx))
                if stats:
                    fighter_fights.append({
                        'index': idx,
                        'corner': 'red' if row['fighter_red'] == fighter else 'blue',
                        'stats': stats
                    })
        
        if len(fighter_fights) <= 1:
            continue
        
        # Create series for each stat for this fighter
        fighter_stats_series = {}
        indices = []
        
        for fight in fighter_fights:
            indices.append(fight['index'])
            for stat in essential_stats:
                if stat not in fighter_stats_series:
                    fighter_stats_series[stat] = []
                fighter_stats_series[stat].append(fight['stats'][stat])
        
        # Convert to pandas series
        for stat in essential_stats:
            fighter_stats_series[stat] = pd.Series(fighter_stats_series[stat], index=indices)
        
        # Calculate rolling features
        for stat in essential_stats:
            if stat in fighter_stats_series:
                series = fighter_stats_series[stat]
                
                # Rolling mean (shifted by 1 to avoid data leakage)
                rolling_mean = series.rolling(window=window, min_periods=1).mean().shift(1)
                
                # Apply to correct corner for each fight
                for fight in fighter_fights:
                    idx = fight['index']
                    corner = fight['corner']
                    
                    if idx in rolling_mean.index:
                        result_df.loc[idx, f"{stat}_roll_{window}_{corner}"] = rolling_mean.loc[idx]
                
                # Calculate success rate rolling features for applicable stats
                if stat == 'sig_strikes':
                    attempts_stat = 'sig_attempts'
                elif stat == 'takedowns':
                    attempts_stat = 'takedown_attempts'
                else:
                    continue
                
                if attempts_stat in fighter_stats_series:
                    success_series = fighter_stats_series[stat] / (fighter_stats_series[attempts_stat] + 1e-8)
                    rolling_success = success_series.rolling(window=window, min_periods=1).mean().shift(1)
                    
                    for fight in fighter_fights:
                        idx = fight['index']
                        corner = fight['corner']
                        
                        if idx in rolling_success.index:
                            result_df.loc[idx, f"{stat}_success_rate_roll_{window}_{corner}"] = rolling_success.loc[idx]
    
    return result_df

def calculate_essential_momentum_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate only the most valuable momentum features that aren't redundant
    with existing win/loss tracking. Features end with _red or _blue
    """
    result_df = df.copy()
    
    # Get all unique fighters
    all_fighters = set(df['fighter_red'].dropna().unique()) | set(df['fighter_blue'].dropna().unique())
    
    # Initialize momentum columns (only the essential ones)
    for corner in ['red', 'blue']:
        result_df[f'performance_trend_{corner}'] = np.nan          # Shows trajectory
        result_df[f'finish_momentum_{corner}'] = np.nan           # How they win
        result_df[f'dominance_momentum_{corner}'] = np.nan        # Quality of wins
    
    for fighter in all_fighters:
        if pd.isna(fighter):
            continue
        
        # Get all fights for this fighter
        fighter_fights = []
        for idx, row in df.iterrows():
            if row['fighter_red'] == fighter or row['fighter_blue'] == fighter:
                stats = get_fighter_stats(df, fighter, df.index.get_loc(idx))
                if stats:
                    fighter_fights.append({
                        'index': idx,
                        'corner': 'red' if row['fighter_red'] == fighter else 'blue',
                        'stats': stats
                    })
        
        if len(fighter_fights) <= 2:  # Need at least 3 fights for trends
            continue
        
        # Extract win/loss sequence
        win_sequence = [fight['stats']['won'] for fight in fighter_fights]
        
        # 1. Performance Trend (linear trend of recent results)
        performance_trend = []
        for i in range(len(win_sequence)):
            if i < 2:
                performance_trend.append(np.nan)
            else:
                recent_results = win_sequence[max(0, i-4):i]  # Last 5 fights before current
                if len(recent_results) >= 3:
                    x = np.arange(len(recent_results))
                    try:
                        trend = np.polyfit(x, recent_results, 1)[0]  # Slope
                        performance_trend.append(trend)
                    except:
                        performance_trend.append(np.nan)
                else:
                    performance_trend.append(np.nan)
        
        # 2. Finish Momentum (recent finish rate)
        finish_momentum = []
        for i, fight in enumerate(fighter_fights):
            if i == 0:
                finish_momentum.append(np.nan)
                continue
            
            # Look at previous fights
            recent_fights = fighter_fights[max(0, i-5):i]  # Last 5 fights before current
            wins = [f for f in recent_fights if f['stats']['won'] == 1]
            
            if len(wins) > 0:
                finishes = [f for f in wins if pd.notna(f['stats']['win_method']) and 
                        f['stats']['win_method'] in ['Submission', 'KO/TKO', 'TKO']]
                finish_rate = len(finishes) / len(wins)
                finish_momentum.append(finish_rate)
            else:
                finish_momentum.append(0.0)
        
        # 3. Dominance Momentum (how decisively they win)
        dominance_momentum = []
        for i, fight in enumerate(fighter_fights):
            if i == 0:
                dominance_momentum.append(np.nan)
                continue
            
            # Look at previous fights
            recent_fights = fighter_fights[max(0, i-3):i]  # Last 3 fights before current
            dominance_scores = []
            
            for f in recent_fights:
                if f['stats']['won'] == 1:
                    # Score based on how they won
                    if pd.notna(f['stats']['round']) and f['stats']['round'] == 1:
                        dominance_scores.append(1.0)  # First round finish
                    elif pd.notna(f['stats']['win_method']) and f['stats']['win_method'] in ['Submission', 'KO/TKO', 'TKO']:
                        dominance_scores.append(0.8)  # Later round finish
                    else:
                        dominance_scores.append(0.6)  # Decision win
                else:
                    dominance_scores.append(0.0)  # Loss
            
            if len(dominance_scores) > 0:
                dominance_momentum.append(np.mean(dominance_scores))
            else:
                dominance_momentum.append(np.nan)
        
        # Apply momentum features to the dataframe
        for i, fight in enumerate(fighter_fights):
            idx = fight['index']
            corner = fight['corner']
            
            result_df.loc[idx, f'performance_trend_{corner}'] = performance_trend[i]
            result_df.loc[idx, f'finish_momentum_{corner}'] = finish_momentum[i]
            result_df.loc[idx, f'dominance_momentum_{corner}'] = dominance_momentum[i]
    
    return result_df

def engineer_essential_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Main function to apply essential temporal feature engineering only
    All features end with _red or _blue
    """
    print("Calculating essential EMA features...")
    df = calculate_essential_ema_features(df, alpha=0.3)
    
    print("Calculating essential rolling window features (5-fight window)...")
    df = calculate_essential_rolling_features(df, window=5)
    
    print("Calculating essential momentum features...")
    df = calculate_essential_momentum_features(df)
    
    print("Essential feature engineering complete!")
    
    # Print summary of new features
    temporal_cols = [col for col in df.columns if any(x in col for x in 
                    ['_ema_', '_roll_', '_momentum_', '_trend_'])]
    print(f"Added {len(temporal_cols)} new temporal features:")
    
    feature_types = {
        'EMA': [col for col in temporal_cols if '_ema_' in col],
        'Rolling': [col for col in temporal_cols if '_roll_' in col], 
        'Momentum': [col for col in temporal_cols if '_momentum_' in col or '_trend_' in col]
    }
    
    for ftype, cols in feature_types.items():
        print(f"  {ftype}: {len(cols)} features")
    
    return df

def handle_temporal_nans(df: pd.DataFrame) -> pd.DataFrame:
    """
    Handle NaN values in temporal features based on their specific context
    """
    df_filled = df.copy()
    
    # Get all temporal columns
    temporal_cols = [col for col in df.columns if any(x in col for x in 
                    ['_ema_', '_roll_', '_momentum_', '_trend_'])]
    
    print(f"Handling NaN values in {len(temporal_cols)} temporal features...")
    
    # Get all fighters
    all_fighters = set(df['fighter_red'].dropna().unique()) | set(df['fighter_blue'].dropna().unique())
    
    # 1. Handle early career NaNs (fighters with insufficient history)
    print("1. Filling early career NaNs...")
    for fighter in all_fighters:
        if pd.isna(fighter):
            continue
            
        fighter_fights = df_filled[
            (df_filled['fighter_red'] == fighter) | 
            (df_filled['fighter_blue'] == fighter)
        ].sort_values('event_date')
        
        if len(fighter_fights) <= 1:
            continue
        
        # Forward fill from first valid value for each temporal feature
        for col in temporal_cols:
            if col in fighter_fights.columns:
                fighter_values = fighter_fights[col].copy()
                first_valid_idx = fighter_values.first_valid_index()
                
                if first_valid_idx is not None:
                    first_valid_value = fighter_values.loc[first_valid_idx]
                    # Fill earlier NaNs with first valid value
                    early_fights = fighter_fights[fighter_fights.index < first_valid_idx]
                    for early_idx in early_fights.index:
                        if pd.isna(df_filled.loc[early_idx, col]):
                            df_filled.loc[early_idx, col] = first_valid_value
    
    # 2. Fill momentum features with contextually meaningful defaults
    print("2. Filling momentum feature NaNs...")
    
    # Performance trends: 0 (no trend)
    trend_cols = [col for col in temporal_cols if 'trend' in col]
    for col in trend_cols:
        df_filled[col] = df_filled[col].fillna(0.0)
    
    # Finish momentum: 0 (no finishing history)
    finish_cols = [col for col in temporal_cols if 'finish_momentum' in col]
    for col in finish_cols:
        df_filled[col] = df_filled[col].fillna(0.0)
    
    # Dominance momentum: Use conservative estimate (0.3 = low dominance)
    dominance_cols = [col for col in temporal_cols if 'dominance_momentum' in col]
    for col in dominance_cols:
        df_filled[col] = df_filled[col].fillna(0.3)
    
    # 3. Fill success rate features with population medians
    print("3. Filling success rate NaNs...")
    success_rate_cols = [col for col in temporal_cols if 'success_rate' in col]
    for col in success_rate_cols:
        if col in df_filled.columns:
            # Use dropna() here too to avoid warnings
            non_nan_pop = df_filled[col].dropna()
            if len(non_nan_pop) > 0:
                population_median = non_nan_pop.median()
                df_filled[col] = df_filled[col].fillna(population_median)
            else:
                # Fallback defaults based on typical MMA stats
                if 'sig_strikes' in col:
                    df_filled[col] = df_filled[col].fillna(0.45)  # ~45% striking accuracy
                elif 'takedowns' in col:
                    df_filled[col] = df_filled[col].fillna(0.35)  # ~35% takedown success
                else:
                    df_filled[col] = df_filled[col].fillna(0.0)

    # 4. Fill remaining EMA and rolling features with fighter-specific medians
    print("4. Filling remaining EMA and rolling NaNs...")
    remaining_cols = [col for col in temporal_cols if col not in (trend_cols + finish_cols + dominance_cols + success_rate_cols)]

    for fighter in all_fighters:
        if pd.isna(fighter):
            continue
            
        fighter_mask = (df_filled['fighter_red'] == fighter) | (df_filled['fighter_blue'] == fighter)
        fighter_data = df_filled[fighter_mask]
        
        if len(fighter_data) <= 1:
            continue
        
        for col in remaining_cols:
            if col in fighter_data.columns:
                # Use dropna() to avoid empty slice warnings
                non_nan_values = fighter_data[col].dropna()
                
                if len(non_nan_values) > 0:
                    fighter_median = non_nan_values.median()
                    df_filled.loc[fighter_mask & df_filled[col].isna(), col] = fighter_median
                else:
                    # No valid values for this fighter, use population median
                    pop_non_nan = df_filled[col].dropna()
                    if len(pop_non_nan) > 0:
                        pop_median = pop_non_nan.median()
                        df_filled.loc[fighter_mask & df_filled[col].isna(), col] = pop_median
                    else:
                        # Entire column is NaN, use 0 as fallback
                        df_filled.loc[fighter_mask & df_filled[col].isna(), col] = 0.0
    
    # 5. Final cleanup
    print("5. Final cleanup...")
    remaining_nans = df_filled[temporal_cols].isna().sum().sum()
    if remaining_nans > 0:
        print(f"   Filling {remaining_nans} remaining NaNs with 0...")
        for col in temporal_cols:
            df_filled[col] = df_filled[col].fillna(0.0)
    
    # Validation
    final_nans = df_filled[temporal_cols].isna().sum().sum()
    print(f"\nNaN Filling Complete!")
    print(f"All {len(temporal_cols)} temporal features now NaN-free")
    print(f"{len(df_filled)} fights ready for ML training")
    
    return df_filled

# Example usage
if __name__ == "__main__":
    print("Essential UFC Feature Engineering Ready!")

Essential UFC Feature Engineering Ready!


In [13]:
# Add essential temporal features
df_temporal_features = engineer_essential_temporal_features(df_processed)

# Handle NaN values appropriately
df_temporal_features_clean = handle_temporal_nans(df_temporal_features)

Calculating essential EMA features...
Calculating essential rolling window features (5-fight window)...
Calculating essential momentum features...
Essential feature engineering complete!
Added 54 new temporal features:
  EMA: 28 features
  Rolling: 20 features
  Momentum: 6 features
Handling NaN values in 54 temporal features...
1. Filling early career NaNs...
2. Filling momentum feature NaNs...
3. Filling success rate NaNs...
4. Filling remaining EMA and rolling NaNs...
5. Final cleanup...

NaN Filling Complete!
All 54 temporal features now NaN-free
24 fights ready for ML training


Keep last row, which is the fight to predict

In [14]:
df_temporal_features_clean = df_temporal_features_clean.tail(1)

In [15]:
# Convert event_date
df_temporal_features_clean['year'] = pd.to_datetime(df_temporal_features_clean['event_date']).dt.year
df_temporal_features_clean['month'] = pd.to_datetime(df_temporal_features_clean['event_date']).dt.month  
df_temporal_features_clean['day_of_week'] = pd.to_datetime(df_temporal_features_clean['event_date']).dt.dayofweek

In [16]:
# Load encoder
with open("../models/encoder_stance.pkl", "rb") as f:
    encoder_stance = pickle.load(f)

# Encode stance_red and stance_blue separately to preserve naming
encoder_stance = OneHotEncoder(sparse_output=False, drop='first')

encoder_stance.fit(df_temporal_features_clean[['stance_red']])

# Transform red stance
stance_red_encoded = encoder_stance.transform(df_temporal_features_clean[['stance_red']])
stance_red_names = [name.replace('stance_red_', '') + '_red'
                    for name in encoder_stance.get_feature_names_out(['stance_red'])]

# Transform blue stance (rename column temporarily to match fit)
stance_blue_encoded = encoder_stance.transform(
    df_temporal_features_clean[['stance_blue']].rename(columns={'stance_blue': 'stance_red'})
)
stance_blue_names = [name.replace('stance_red_', '') + '_blue'
                    for name in encoder_stance.get_feature_names_out(['stance_red'])]

# Combine all encoded features
all_encoded = np.concatenate([stance_red_encoded, stance_blue_encoded], axis=1)
all_feature_names = stance_red_names + stance_blue_names

# Create final DataFrame
encoded_df = pd.DataFrame(all_encoded, columns=all_feature_names, index=df_temporal_features_clean.index)

# Concatenate and drop originals
df_temporal_features_clean = pd.concat([df_temporal_features_clean.drop(['stance_red', 'stance_blue'], axis=1), encoded_df], axis=1)

# Drop columns
df_temporal_features_clean.drop(columns=['event_date', 'event_name', 'round', 'time', 'weight_class', 'win_method', 'fighter_blue', 'fighter_red', 'octagon_debut_blue', 'octagon_debut_red',
                                        'knockdowns_red','knockdowns_blue','sig_attempts_red','sig_attempts_blue','sig_strikes_red','sig_strikes_blue','total_strikes_attempts_red',
                                        'total_strikes_attempts_blue','total_strikes_red','total_strikes_blue','sub_attempts_red','sub_attempts_blue','takedowns_red',
                                        'takedowns_blue','takedown_attempts_red','takedown_attempts_blue','control_time_red','control_time_blue','head_strikes_red',
                                        'head_strikes_blue','head_attempts_red','head_attempts_blue','body_strikes_red','body_strikes_blue','body_attempts_red',
                                        'body_attempts_blue','leg_strikes_red','leg_strikes_blue','leg_attempts_red','leg_attempts_blue','distance_red','distance_blue',
                                        'distance_attempts_red','distance_attempts_blue','clinch_strikes_red','clinch_strikes_blue','clinch_attempts_red','clinch_attempts_blue',
                                        'ground_strikes_red','ground_strikes_blue','ground_attempts_red','ground_attempts_blue'], inplace=True)

# Calculate differences
# Find numeric _blue columns
blue_cols = [col for col in df_temporal_features_clean.columns 
            if col.endswith('_blue') and pd.api.types.is_numeric_dtype(df_temporal_features_clean[col])]

# Create all difference columns at once
diff_data = {}
cols_to_drop = []

for blue_col in blue_cols:
    red_col = blue_col.replace('_blue', '_red')
    if red_col in df_temporal_features_clean.columns and pd.api.types.is_numeric_dtype(df_temporal_features_clean[red_col]):
        diff_col = blue_col.replace('_blue', '_diff')
        diff_data[diff_col] = df_temporal_features_clean[blue_col] - df_temporal_features_clean[red_col]
        cols_to_drop.extend([blue_col, red_col])

# Create difference DataFrame and concatenate
if diff_data:
    diff_df = pd.DataFrame(diff_data, index=df_temporal_features_clean.index)
    
    # Drop old columns and add new ones in one operation
    df_temporal_features_clean = df_temporal_features_clean.drop(cols_to_drop, axis=1)
    df_temporal_features_clean = pd.concat([df_temporal_features_clean, diff_df], axis=1)

# Defragment the DataFrame
df_temporal_features_clean = df_temporal_features_clean.copy()

In [17]:
df_temporal_features_clean

Unnamed: 0,winner,year,month,day_of_week,height_diff,weight_diff,reach_diff,leg_reach_diff,sig_strikes_landed_per_minute_diff,sig_strikes_absorbed_per_minute_diff,takedowns_avg_diff,submission_avg_diff,knockdown_avg_diff,fight_time_avg_diff,avg_rounds_diff,avg_time_diff,avg_knockdowns_diff,avg_sig_attempts_diff,avg_sig_strikes_diff,avg_total_strikes_attempts_diff,avg_total_strikes_diff,avg_sub_attempts_diff,avg_takedowns_diff,avg_takedown_attempts_diff,avg_head_strikes_diff,avg_head_attempts_diff,avg_body_strikes_diff,avg_body_attempts_diff,avg_leg_strikes_diff,avg_leg_attempts_diff,avg_distance_diff,avg_distance_attempts_diff,avg_clinch_strikes_diff,avg_clinch_attempts_diff,avg_ground_strikes_diff,avg_ground_attempts_diff,wins_before_diff,losses_before_diff,total_fights_before_diff,win_pct_before_diff,wins_last_3_diff,fights_last_3_diff,days_since_debut_diff,days_since_last_win_diff,recent_winner_diff,wins_last_365_days_diff,fights_last_365_days_diff,undefeated_last_year_diff,win_streak_diff,lose_streak_diff,on_win_streak_diff,long_win_streak_diff,sig_strikes_ema_diff,sig_strikes_success_rate_ema_diff,sig_attempts_ema_diff,knockdowns_ema_diff,knockdowns_success_rate_ema_diff,takedowns_ema_diff,takedowns_success_rate_ema_diff,takedown_attempts_ema_diff,total_strikes_ema_diff,total_strikes_success_rate_ema_diff,total_strikes_attempts_ema_diff,head_strikes_ema_diff,head_strikes_success_rate_ema_diff,head_attempts_ema_diff,sig_strikes_roll_5_diff,sig_strikes_success_rate_roll_5_diff,sig_attempts_roll_5_diff,takedowns_roll_5_diff,takedowns_success_rate_roll_5_diff,takedown_attempts_roll_5_diff,control_time_roll_5_diff,knockdowns_roll_5_diff,total_strikes_roll_5_diff,total_strikes_attempts_roll_5_diff,performance_trend_diff,finish_momentum_diff,dominance_momentum_diff
23,,2025,9,3,-1.0,0.0,-4.0,2.0,-1.0,-2.0,0.69,-0.23,-0.24,34.0,-0.034965,29.622378,-0.097902,-11.874126,-13.958042,-7.545455,-11.545455,-0.181818,0.447552,2.356643,-2.300699,2.832168,-3.384615,-3.020979,-8.272727,-11.685315,-16.909091,-16.391608,-1.468531,-1.160839,4.41958,5.678322,3,-1,2,0.105,1,0,1330,-154.0,0,1,0,1,12,-1,1,1,-6.808362,0.0,3.834912,-0.451126,0.0,0.224209,0.0,4.000431,1.61085,-0.086426,17.345572,-5.187544,0.0,6.497481,1.8,-0.116551,19.8,0.2,0.1,2.6,155.8,-0.8,14.2,36.8,0.0,0.0,0.0


Feature Selection

In [18]:
# Use features selceted from training
# Load top features
top_features_df = pd.read_csv('../data/notebooks/top_features.csv')
top_features = top_features_df['feature'].tolist()  # convert to list

# Keep only those columns in your DataFrame
X_selected = df_temporal_features_clean[top_features]

In [19]:
X_selected

Unnamed: 0,year,fight_time_avg_diff,days_since_debut_diff,win_pct_before_diff,knockdown_avg_diff,sig_strikes_landed_per_minute_diff,takedowns_avg_diff,days_since_last_win_diff,avg_ground_strikes_diff,avg_ground_attempts_diff,avg_body_strikes_diff,control_time_roll_5_diff,avg_clinch_strikes_diff,avg_sub_attempts_diff,takedowns_ema_diff,avg_body_attempts_diff,avg_clinch_attempts_diff,avg_takedowns_diff,avg_time_diff,takedowns_success_rate_roll_5_diff,weight_diff,takedown_attempts_ema_diff,avg_rounds_diff,submission_avg_diff,avg_takedown_attempts_diff,avg_leg_attempts_diff,reach_diff,avg_knockdowns_diff,losses_before_diff,head_strikes_ema_diff,avg_leg_strikes_diff,knockdowns_ema_diff,avg_sig_strikes_diff,sig_strikes_ema_diff,takedown_attempts_roll_5_diff,avg_head_strikes_diff,head_attempts_ema_diff,total_fights_before_diff,total_strikes_ema_diff,avg_distance_attempts_diff,total_strikes_attempts_ema_diff,avg_distance_diff,sig_attempts_roll_5_diff,avg_head_attempts_diff,leg_reach_diff,avg_total_strikes_diff,avg_total_strikes_attempts_diff,total_strikes_roll_5_diff,total_strikes_attempts_roll_5_diff,sig_attempts_ema_diff,takedowns_roll_5_diff,sig_strikes_roll_5_diff,avg_sig_attempts_diff,sig_strikes_absorbed_per_minute_diff,wins_before_diff,month,win_streak_diff,height_diff,knockdowns_roll_5_diff,wins_last_365_days_diff,fights_last_365_days_diff,lose_streak_diff,wins_last_3_diff
23,2025,34.0,1330,0.105,-0.24,-1.0,0.69,-154.0,4.41958,5.678322,-3.384615,155.8,-1.468531,-0.181818,0.224209,-3.020979,-1.160839,0.447552,29.622378,0.1,0.0,4.000431,-0.034965,-0.23,2.356643,-11.685315,-4.0,-0.097902,-1,-5.187544,-8.272727,-0.451126,-13.958042,-6.808362,2.6,-2.300699,6.497481,2,1.61085,-16.391608,17.345572,-16.909091,19.8,2.832168,2.0,-11.545455,-7.545455,14.2,36.8,3.834912,0.2,1.8,-11.874126,-2.0,3,9,12,-1.0,-0.8,1,0,-1,1


Scale data

In [20]:
# Load the scaler
with open("../models/scaler.pkl", "rb") as f:
    scaler_loaded = pickle.load(f)

# Transform new data
X_scaled  = scaler_loaded.transform(X_selected)
# Convert to dictionary (column name -> scaled value)
# Assuming you have a single row; if multiple rows, you can iterate
fighter_differences = dict(zip(X_selected.columns, X_scaled[0]))

In [21]:
X_scaled 

array([[ 1.87307776,  0.27226907,  1.20366984,  0.54404842, -0.50660269,
        -0.66144396,  0.50293063, -0.14173701,  0.68460385,  0.60691007,
        -0.52329637,  1.03504585, -0.25334641, -0.25690851,  0.18514249,
        -0.32342182, -0.14910756,  0.36467552,  0.45335774,  0.36513335,
         0.04786133,  1.22147199,  0.026284  , -0.25727302,  0.77982479,
        -1.47754776, -1.22113876, -0.23161861, -0.16640393, -0.23429698,
        -1.32334258, -1.07815054, -0.61788088, -0.19721249,  0.78758451,
        -0.11873723,  0.20308597,  0.57721799,  0.10045516, -0.32020932,
         0.33614695, -0.87715206,  0.41928512,  0.10105075,  0.96771853,
        -0.33617015, -0.10322097,  0.44024991,  0.62831089,  0.14230661,
         0.15843882,  0.15186139, -0.20185089, -1.51619012,  1.01991673,
         0.63628001,  5.58365062, -0.40760136, -1.90993001,  0.87845721,
         0.01585169, -0.93513526,  1.02382306]])

In [22]:
fighter_differences

{'year': 1.873077760255086,
 'fight_time_avg_diff': 0.27226907097835107,
 'days_since_debut_diff': 1.203669841061651,
 'win_pct_before_diff': 0.5440484155938219,
 'knockdown_avg_diff': -0.5066026876096668,
 'sig_strikes_landed_per_minute_diff': -0.6614439593608067,
 'takedowns_avg_diff': 0.5029306286594222,
 'days_since_last_win_diff': -0.141737008087085,
 'avg_ground_strikes_diff': 0.6846038504817763,
 'avg_ground_attempts_diff': 0.6069100687363218,
 'avg_body_strikes_diff': -0.5232963728971759,
 'control_time_roll_5_diff': 1.0350458502612077,
 'avg_clinch_strikes_diff': -0.25334640735246866,
 'avg_sub_attempts_diff': -0.2569085113276613,
 'takedowns_ema_diff': 0.18514248621754245,
 'avg_body_attempts_diff': -0.32342181750347093,
 'avg_clinch_attempts_diff': -0.14910755900758546,
 'avg_takedowns_diff': 0.364675523574307,
 'avg_time_diff': 0.4533577443592152,
 'takedowns_success_rate_roll_5_diff': 0.3651333543569673,
 'weight_diff': 0.0478613331887618,
 'takedown_attempts_ema_diff': 1.

Predict fight

In [23]:
class UniversalPredictor:
    """
    Universal predictor that can handle multiple model types including PyTorch neural networks
    """
    
    def __init__(self):
        self.supported_models = {
            'LogisticRegression': self._predict_sklearn,
            'SGDClassifier': self._predict_sklearn,
            'RandomForestClassifier': self._predict_sklearn,
            'DecisionTreeClassifier': self._predict_sklearn,
            'GradientBoostingClassifier': self._predict_sklearn,
            'KNeighborsClassifier': self._predict_sklearn,
            'GaussianNB': self._predict_sklearn,
            'XGBClassifier': self._predict_xgb,
            'AdaBoostClassifier': self._predict_sklearn,
            'SVC': self._predict_sklearn,
            'Neural Network': self._predict_pytorch
        }
    
    def load_model(self, model_path, model_type='auto'):
        """
        Load a model from file
        
        Parameters:
        model_path: str - path to the model file
        model_type: str - type of model ('auto' for auto-detection)
        """
        model_path = Path(model_path)
        
        if model_type == 'auto':
            model_type = self._detect_model_type(model_path)
        
        if model_type == 'Neural Network':
            return self._load_pytorch_model(model_path)
        else:
            return self._load_sklearn_model(model_path)
    
    def _detect_model_type(self, model_path):
        """Auto-detect model type based on file extension and content"""
        if model_path.suffix in ['.pth', '.pt']:
            return 'Neural Network'
        elif model_path.suffix in ['.pkl', '.pickle']:
            # Try to load and check the type
            try:
                with open(model_path, 'rb') as f:
                    model = pickle.load(f)
                return type(model).__name__
            except:
                return 'Unknown'
        elif model_path.suffix == '.joblib':
            try:
                model = joblib.load(model_path)
                return type(model).__name__
            except:
                return 'Unknown'
        else:
            return 'Unknown'
    
    def _load_sklearn_model(self, model_path):
        """Load sklearn/xgboost models"""
        try:
            with open(model_path, 'rb') as f:
                return pickle.load(f)
        except:
            try:
                return joblib.load(model_path)
            except:
                raise ValueError(f"Could not load model from {model_path}")
    
    def _load_pytorch_model(self, model_path, input_dim=None):
        """Load PyTorch model from state_dict"""
        try:
            # Auto-detect input dimension from fighter_differences if not provided
            if input_dim is None:
                raise ValueError("input_dim must be provided for PyTorch model")

            # Use your trained Deep class
            class Deep(nn.Module):
                def __init__(self, input_dim):
                    super().__init__()
                    self.network = nn.Sequential(
                        nn.Linear(input_dim, 128),
                        nn.BatchNorm1d(128),
                        nn.ReLU(),
                        nn.Linear(128, 64),
                        nn.BatchNorm1d(64),
                        nn.ReLU(),
                        nn.Linear(64, 64),
                        nn.ReLU(),
                        nn.Dropout(0.3),
                        nn.Linear(64, 32),
                        nn.ReLU(),
                        nn.Dropout(0.3),
                        nn.Linear(32, 1),
                    )

                def forward(self, x):
                    return self.network(x)

            model = Deep(input_dim=input_dim)
            state_dict = torch.load(model_path, map_location="cpu")
            model.load_state_dict(state_dict)
            model.eval()
            return model

        except Exception as e:
            raise ValueError(f"Could not load PyTorch model from {model_path}: {e}")
    
    def _create_default_nn(self):
        """
        Create a default neural network architecture
        Replace this with your actual model architecture
        """
        class DefaultNN(nn.Module):
            def __init__(self, input_size=10, hidden_size=64, output_size=2):
                super(DefaultNN, self).__init__()
                self.fc1 = nn.Linear(input_size, hidden_size)
                self.fc2 = nn.Linear(hidden_size, hidden_size)
                self.fc3 = nn.Linear(hidden_size, output_size)
                self.relu = nn.ReLU()
                self.dropout = nn.Dropout(0.2)
                
            def forward(self, x):
                x = self.relu(self.fc1(x))
                x = self.dropout(x)
                x = self.relu(self.fc2(x))
                x = self.dropout(x)
                x = self.fc3(x)
                return x
        
        return DefaultNN()
    
    def predict(self, model, fighter_differences, model_type='auto'):
        """
        Make prediction using any supported model
        
        Parameters:
        model: loaded model object
        fighter_differences: dict with difference values
        model_type: str - type of model ('auto' for auto-detection)
        """
        if model_type == 'auto':
            model_type = type(model).__name__
        
        # Handle PyTorch models
        if isinstance(model, nn.Module):
            model_type = 'Neural Network'
        
        if model_type in self.supported_models:
            return self.supported_models[model_type](model, fighter_differences)
        else:
            # Fallback to sklearn method
            return self._predict_sklearn(model, fighter_differences)
    
    def _predict_sklearn(self, model, fighter_differences):
        """Handle sklearn-compatible models"""
        features = list(fighter_differences.values())
        
        # SOLUTION 1: Use pandas DataFrame to maintain feature names
        feature_names = list(fighter_differences.keys())
        X = pd.DataFrame([features], columns=feature_names)
        
        # Alternative solutions (choose one):
        
        # SOLUTION 2: Convert to numpy and suppress warnings
        # X = np.array(features).reshape(1, -1)
        # with warnings.catch_warnings():
        #     warnings.simplefilter("ignore", UserWarning)
        #     prediction = model.predict(X)[0]
        #     # ... rest of prediction code with warnings suppressed
        
        # SOLUTION 3: Set feature names after creating numpy array (sklearn 1.2+)
        # X = np.array(features).reshape(1, -1)
        # if hasattr(X, 'feature_names_in_'):
        #     X.feature_names_in_ = np.array(feature_names)
        
        # Check if model expects more features
        try:
            # Try to get expected number of features
            if hasattr(model, 'n_features_in_'):
                expected_features = model.n_features_in_
            elif hasattr(model, 'coef_'):
                expected_features = model.coef_.shape[1] if len(model.coef_.shape) > 1 else len(model.coef_)
            else:
                expected_features = X.shape[1]
            
            if X.shape[1] != expected_features:
                print(f"Warning: Model expects {expected_features} features, but got {X.shape[1]}")
                print(f"Current features: {list(fighter_differences.keys())}")
                print("Please ensure your fighter_differences contains all the features the model was trained on.")
                raise ValueError(f"Feature mismatch: Model expects {expected_features} features, got {X.shape[1]}")
                
        except AttributeError:
            pass  # Some models don't have these attributes
        
        prediction = model.predict(X)[0]
        
        # Try different probability methods
        try:
            probabilities = model.predict_proba(X)[0]
            if len(probabilities) > 1:
                fighter_red_prob = probabilities[1]
                fighter_blue_prob = probabilities[0]
            else:
                fighter_red_prob = probabilities[0]
                fighter_blue_prob = 1 - probabilities[0]
            confidence = max(probabilities)
        except AttributeError:
            try:
                # Use decision_function for SGD
                decision_score = model.decision_function(X)[0]
                fighter_red_prob = 1 / (1 + np.exp(-decision_score))
                fighter_blue_prob = 1 - fighter_red_prob
                confidence = max(fighter_red_prob, fighter_blue_prob)
            except AttributeError:
                # Fallback: binary prediction only
                fighter_red_prob = float(prediction)
                fighter_blue_prob = 1.0 - float(prediction)
                confidence = 1.0
        
        return {
            'prediction': prediction,
            'fighter_red_win_prob': fighter_red_prob,
            'fighter_blue_win_prob': fighter_blue_prob,
            'confidence': confidence,
            'model_name': type(model).__name__
        }
    
    def _predict_xgb(self, model, fighter_differences):
        """Handle XGBoost models (similar to sklearn but with specific handling)"""
        return self._predict_sklearn(model, fighter_differences)
    
    def _predict_pytorch(self, model, fighter_differences):
        features = list(fighter_differences.values())
        X = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

        with torch.no_grad():
            outputs = model(X)  # shape [1,1]
            prob = torch.sigmoid(outputs).item()

            fighter_red_prob = prob
            fighter_blue_prob = 1 - prob
            prediction = int(prob > 0.5)

        confidence = max(fighter_red_prob, fighter_blue_prob)

        return {
            'prediction': prediction,
            'fighter_red_win_prob': fighter_red_prob,
            'fighter_blue_win_prob': fighter_blue_prob,
            'confidence': confidence,
            'model_name': 'Deep NN (PyTorch)'
        }

def load_and_predict_model(fighter_differences, model_path='../models/sgdclassifier.pkl', model_type='auto'):
    """
    Enhanced version of your original function that works with all model types
    
    Parameters:
    fighter_differences: dict with difference values
    model_path: str - path to model file
    model_type: str - specify model type or 'auto' for auto-detection
    """
    # First, let's check what features we have
    print(f"Features provided: {list(fighter_differences.keys())}")
    print(f"Number of features: {len(fighter_differences)}")
    
    predictor = UniversalPredictor()
    try:
        # For PyTorch models, pass input_dim
        if model_path.endswith('.pth') or model_type == 'Neural Network':
            input_dim = len(fighter_differences)
            model = predictor._load_pytorch_model(model_path, input_dim=input_dim)
        else:
            model = predictor.load_model(model_path, model_type)
        
        result = predictor.predict(model, fighter_differences, model_type)
        return result
    except ValueError as e:
        print(f"Error: {e}")
        return None

def predict_with_multiple_models(fighter_differences, model_paths):
    """
    Make predictions with multiple models and return ensemble results
    
    Parameters:
    fighter_differences: dict with difference values
    model_paths: list of tuples [(model_path, model_type), ...]
    """
    predictor = UniversalPredictor()
    results = []
    
    for model_path, model_type in model_paths:
        try:
            # For PyTorch models saved as state_dict
            if model_path.endswith('.pth') or model_type == 'Neural Network':
                input_dim = len(fighter_differences)
                model = predictor._load_pytorch_model(model_path, input_dim=input_dim)
            else:
                model = predictor.load_model(model_path, model_type)
            
            result = predictor.predict(model, fighter_differences, model_type)
            results.append(result)

            print(f"{result['model_name']} Prediction:")
            print(f"Winner: Fighter {'Red' if result['prediction'] == 1 else 'Blue'}")
            print(f"Fighter Red probability: {result['fighter_red_win_prob']:.3f}")
            print(f"Fighter Blue probability: {result['fighter_blue_win_prob']:.3f}")
            print(f"Confidence: {result['confidence']:.3f}")
            print("-" * 50)
        
        except Exception as e:
            print(f"Error with model {model_path}: {str(e)}")
    
    # Ensemble prediction
    if results:
        avg_red_prob = np.mean([r['fighter_red_win_prob'] for r in results])
        avg_blue_prob = 1 - avg_red_prob
        ensemble_prediction = int(avg_red_prob > 0.5)
        
        print("ENSEMBLE PREDICTION:")
        print(f"Winner: Fighter {'Red' if ensemble_prediction == 1 else 'Blue'}")
        print(f"Fighter Red probability: {avg_red_prob:.3f}")
        print(f"Fighter Blue probability: {avg_blue_prob:.3f}")
        print(f"Models used: {len(results)}")
    
    return results

# Usage with your existing fighter_differences:
'''
# Test the enhanced function with your existing data
result = load_and_predict_model(fighter_differences, '../models/adaboostclassifier.pkl')
print("Enhanced SGD Prediction:")
print(f"Winner: Fighter {'Red' if result['prediction'] == 1 else 'Blue'}")
print(f"Fighter Red probability: {result['fighter_red_win_prob']:.3f}")
print(f"Fighter Blue probability: {result['fighter_blue_win_prob']:.3f}")
print(f"Confidence: {result['confidence']:.3f}")
print(f"Model: {result['model_name']}")
'''
# Load Multiple Models

model_paths = [
    ('../models/adaboostclassifier.pkl', 'auto'),
    ('../models/decisiontreeclassifier.pkl', 'auto'),
    ('../models/gaussiannb.pkl', 'auto'),
    ('../models/gradientboostingclassifier.pkl', 'auto'),
    ('../models/kneighborsclassifier.pkl', 'auto'),
    ('../models/logisticregression.pkl', 'auto'),
    ('../models/PyTorch_state_dict.pth', 'Neural Network'),
    ('../models/randomforestclassifier.pkl', 'auto'),
    ('../models/sgdclassifier.pkl', 'auto'),
    ('../models/svc.pkl', 'auto'),
    ('../models/xgbclassifier.pkl', 'auto')
]

print("\n" + "="*60)
print("MULTIPLE MODEL PREDICTIONS:")
print("="*60)
results = predict_with_multiple_models(fighter_differences, model_paths)


MULTIPLE MODEL PREDICTIONS:
AdaBoostClassifier Prediction:
Winner: Fighter Red
Fighter Red probability: 0.881
Fighter Blue probability: 0.119
Confidence: 0.881
--------------------------------------------------
DecisionTreeClassifier Prediction:
Winner: Fighter Red
Fighter Red probability: 0.514
Fighter Blue probability: 0.486
Confidence: 0.514
--------------------------------------------------
GaussianNB Prediction:
Winner: Fighter Red
Fighter Red probability: 0.817
Fighter Blue probability: 0.183
Confidence: 0.817
--------------------------------------------------
GradientBoostingClassifier Prediction:
Winner: Fighter Red
Fighter Red probability: 0.605
Fighter Blue probability: 0.395
Confidence: 0.605
--------------------------------------------------
KNeighborsClassifier Prediction:
Winner: Fighter Blue
Fighter Red probability: 0.467
Fighter Blue probability: 0.533
Confidence: 0.533
--------------------------------------------------
LogisticRegression Prediction:
Winner: Fighter Bl