In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import quote

def find_best_snapshot(snapshots, fight_date):
    """Pick the most recent snapshot taken at or before the start of the fight date.

    Args:
        snapshots: Iterable of ``(timestamp, original_url)`` pairs, where
            ``timestamp`` is a ``YYYYMMDDHHMMSS`` string.
        fight_date: Fight date as a ``YYYY-MM-DD`` string. It is parsed to
            midnight of that day, so snapshots taken later on the fight day
            are excluded.

    Returns:
        ``(snapshot, snapshot_datetime)`` for the latest eligible snapshot,
        or ``(None, None)`` when no snapshot qualifies.

    Raises:
        ValueError: If ``fight_date`` or a snapshot timestamp does not match
            its expected format.
    """
    cutoff = datetime.strptime(fight_date, "%Y-%m-%d")
    chosen = None
    chosen_time = None

    for entry in snapshots:
        stamp, _ = entry
        taken_at = datetime.strptime(stamp, "%Y%m%d%H%M%S")
        if taken_at > cutoff:
            continue
        # Strict comparison keeps the earliest-listed entry when timestamps tie.
        if chosen_time is None or taken_at > chosen_time:
            chosen, chosen_time = entry, taken_at

    return chosen, chosen_time

def get_historical_rankings(fight_date, timeout=30):
    """Fetch the text of the archived UFC rankings page closest before a fight date.

    Queries the Wayback Machine CDX API for snapshots of
    ``https://www.ufc.com/rankings``, selects the latest one at or before
    ``fight_date`` via ``find_best_snapshot``, downloads it and returns its
    text content with one text node per line.

    Args:
        fight_date: Fight date as a ``YYYY-MM-DD`` string.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The page text as a string, or ``None`` when the snapshot index or the
        snapshot itself cannot be retrieved, or when no snapshot predates the
        fight date.
    """
    rankings_url = "https://www.ufc.com/rankings"
    cdx_url = "http://web.archive.org/cdx/search/cdx"
    params = {
        "url": rankings_url,
        "output": "json",
        "fl": "timestamp,original",
        "filter": "statuscode:200",
        "collapse": "digest"
    }

    # A connection failure or timeout here must not escape: every other
    # failure path of this function reports the problem and returns None.
    try:
        response = requests.get(cdx_url, params=params, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching CDX API for UFC rankings page: {e}")
        return None
    if response.status_code != 200:
        print("Error fetching CDX API for UFC rankings page.")
        return None

    try:
        snapshots = response.json()
    except ValueError as e:
        # The body was not valid JSON (e.g. an HTML error page).
        print(f"Error decoding CDX API response for UFC rankings page: {e}")
        return None
    # The first row of the JSON output is the column header, not a snapshot.
    snapshots = snapshots[1:] if len(snapshots) > 1 else []

    best_snapshot, best_time = find_best_snapshot(snapshots, fight_date)

    if not best_snapshot:
        print(f"No suitable snapshot found for UFC rankings page before {fight_date}.")
        return None

    timestamp, original_url = best_snapshot
    snapshot_url = f"https://web.archive.org/web/{timestamp}/{original_url}"
    print(f"Using historical UFC rankings page from {datetime.strptime(timestamp, '%Y%m%d%H%M%S').strftime('%Y-%m-%d')}: {snapshot_url}")

    try:
        response = requests.get(snapshot_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator='\n', strip=True)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching historical UFC rankings page: {e}")
        return None

def get_lightweight_rankings(rankings_text):
    """Extract ``(rank, fighter_name)`` pairs for the lightweight division.

    The lightweight section starts at the first line containing
    ``"LIGHTWEIGHT"`` (falling back to a case-insensitive match) and ends at
    the next line naming Featherweight, Welterweight or Women's Strawweight,
    matched case-insensitively, or at the end of the text.

    Within the section two layouts are recognised:

    * rank and name on one line, e.g. ``"1 Charles Oliveira"``;
    * rank alone on a line with the name on the following line, which is what
      newline-separated text extraction yields when rank and name live in
      separate elements.

    Names starting with "rank" or "champion" (case-insensitive) are treated
    as page chrome and skipped.

    Args:
        rankings_text: Newline-separated page text, or a falsy value.

    Returns:
        A list of ``(int rank, str name)`` tuples in page order; empty when
        the text is empty or no lightweight section is found.
    """
    if not rankings_text:
        return []

    lines = [raw.strip() for raw in rankings_text.split('\n')]

    def first_line_index(predicate, begin=0):
        # Index of the first line at or after `begin` satisfying `predicate`.
        for idx in range(begin, len(lines)):
            if predicate(lines[idx]):
                return idx
        return None

    # Prefer the exact upper-case heading, then accept any casing so that
    # "Lightweight" headings are found as well.
    start = first_line_index(lambda text: "LIGHTWEIGHT" in text)
    if start is None:
        start = first_line_index(lambda text: "lightweight" in text.lower())
    if start is None:
        return []

    next_divisions = ("featherweight", "welterweight", "women's strawweight")
    end = first_line_index(
        lambda text: any(name in text.lower() for name in next_divisions),
        start + 1,
    )
    if end is None:
        end = len(lines)

    skipped_prefixes = ('rank', 'champion')
    ranked_fighters = []
    pending_rank = None  # rank seen alone on the previous line, awaiting its name

    for line in lines[start + 1:end]:
        parts = line.split()
        if not parts:
            continue
        # isdecimal() only accepts characters that int() can parse.
        if parts[0].isdecimal():
            rank = int(parts[0])
            fighter_name = " ".join(parts[1:])
            if fighter_name:
                pending_rank = None
                if not fighter_name.lower().startswith(skipped_prefixes):
                    ranked_fighters.append((rank, fighter_name))
            else:
                pending_rank = rank
        elif pending_rank is not None:
            fighter_name = " ".join(parts)
            if not fighter_name.lower().startswith(skipped_prefixes):
                ranked_fighters.append((pending_rank, fighter_name))
            pending_rank = None

    return ranked_fighters

# Demo: fetch the rankings page archived just before an example fight date
# and print whatever lightweight entries the parser extracts from it.
fight_date_example = "2023-02-23"
rankings_text = get_historical_rankings(fight_date_example)

if not rankings_text:
    print("\nCould not retrieve historical rankings page.")
else:
    lightweight_ranking = get_lightweight_rankings(rankings_text)
    print("\n--- Lightweight Rankings (from historical page) ---")
    for rank, fighter in lightweight_ranking:
        print(f"{rank}. {fighter}")

Using historical UFC rankings page from 2023-02-22: https://web.archive.org/web/20230222023702/https://www.ufc.com/rankings

--- Lightweight Rankings (from historical page) ---


In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_ufc_athlete_page_explore(fighter_name, timeout=30):
    """Print the main text content of a fighter's UFC.com athlete page.

    The page URL is built by replacing spaces in ``fighter_name`` with
    hyphens. This is an exploration helper: it prints what it finds rather
    than returning the data.

    Args:
        fighter_name: Fighter name as it should appear in the URL slug,
            with spaces between the parts (e.g. ``"charles oliveira"``).
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        ``True`` if the page was fetched (even when the main content
        container is missing), ``False`` if the request failed.
    """
    # Construct the UFC athlete page URL
    formatted_name = fighter_name.replace(' ', '-')
    ufc_url = f"https://www.ufc.com/athlete/{formatted_name}"
    print(f"Exploring UFC page: {ufc_url}")
    try:
        response = requests.get(ufc_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Print the text content of the main sections to see available data
        main_content = soup.find('div', class_='l-main__content')
        if main_content:
            print("\n--- Main Content Text ---")
            print(main_content.get_text(separator='\n', strip=True))
        else:
            print("Could not find the main content section.")

        return True

    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching UFC athlete page: {e}")
        return False

# Example usage: explore the live athlete page for one fighter. As the last
# expression in the cell, the call's boolean return value is shown as output.
fighter_to_explore = "charles oliveira"
scrape_ufc_athlete_page_explore(fighter_to_explore)

Exploring UFC page: https://www.ufc.com/athlete/charles-oliveira

--- Main Content Text ---
#2
        Lightweight Division
#14 PFP
Active
"Do Bronxs"
Charles Oliveira
Lightweight Division
35-10-0 (W-L-D)
10
Wins by Knockout
21
Wins by Submission
Last fight
Win
Oliveira
vs
Chandler
Nov. 16, 2024
Watch Replay
Fight Card
View Fight History
UFC Spotlight
Charles Oliveira Named 2025 Recipient Of Forrest Griffin Community Award
Play
Video
Charles Oliveira Recognized With 2025 Forrest Griffin Community Award
Play
Video
One Hour Of The Greatest UFC Submissions
Charles Oliveira Open To Rematches Against Makhachev Or Holloway
Gallery
13
Charles Oliveira And Tracy Cortez Host Jiu-Jitsu Seminar In East LA
Play
Video
10 Epic UFC Debuts
Five Fighters Who’ve Thrived Moving Up A Weight Division
Play
Video
Charles Oliveira Post-Fight Press Conference | UFC 309
Play
Video
Charles Oliveira Post-Fight Interview | UFC 309
Play
Video
Charles Oliveira Octagon Interview | UFC 309
Play
Video
Post-Fight Press 

True

In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_knockdown_avg(fighter_name, timeout=30):
    """Scrape the "Knockdown Avg" stat from a fighter's live UFC.com page.

    The page URL is built by replacing spaces in ``fighter_name`` with
    hyphens. The stat is read from the first ``c-stat-compare__group`` block
    whose label text is exactly ``"Knockdown Avg"``.

    Args:
        fighter_name: Fighter name with spaces between the parts
            (e.g. ``"charles oliveira"``).
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        The stat as the string shown on the page, or ``None`` if the stat is
        not present or the request failed.
    """
    formatted_name = fighter_name.replace(' ', '-')
    ufc_url = f"https://www.ufc.com/athlete/{formatted_name}"
    print(f"Scraping UFC page for {fighter_name}: {ufc_url}")
    try:
        response = requests.get(ufc_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        knockdown_avg = None

        stat_compares = soup.find_all('div', class_='c-stat-compare__group')
        for stat_group in stat_compares:
            label_element = stat_group.find('div', class_='c-stat-compare__label')
            value_element = stat_group.find('div', class_='c-stat-compare__number')
            if label_element and value_element:
                label_text = label_element.text.strip()
                value_text = value_element.text.strip()
                if label_text == 'Knockdown Avg':
                    knockdown_avg = value_text
                    break  # Found it

        print(f"\n--- Knockdown Avg. for {fighter_name} ---")
        print(f"Knockdown Avg.: {knockdown_avg}")
        return knockdown_avg

    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching UFC athlete page: {e}")
        return None

# Example usage: scrape the knockdown average for one fighter and keep the
# returned value (a string, or None if the stat was not found or the request failed).
fighter_to_scrape = "charles oliveira"
knockdown = scrape_knockdown_avg(fighter_to_scrape)

Scraping UFC page for charles oliveira: https://www.ufc.com/athlete/charles-oliveira

--- Knockdown Avg. for charles oliveira ---
Knockdown Avg.: 0.46


In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import quote

# Hardcoded lookup from lowercase fighter name to UFCStats fighter-details ID.
_UFCSTATS_FIGHTER_IDS = {
    "alex pereira": "e5549c82bfb5582d",
    "magomed ankalaev": "d802174b0c0c1f4e",
    "charles oliveira": "07225ba28ae309b6",
    "islam makhachev": "275aca31f61ba28c",
    "cory sandhagen": "65f09bacd3957381",
    "deiveson figueiredo": "aa72b0f831d0bfe5",
    "alexander volkanovski": "e1248941344b3288",
    "diego lopes": "f166e93d04a8c274",
    "michael chandler": "4b93a88f3b1de35b",
    "paddy pimblett": "7826923b47f8d72a",
    "bryce mitchell": "d9c6f19f958643e9",
    "jean silva": "52ef95b5860fb28c",
    "leon edwards": "f1fac969a1d70b08",
    "sean brady": "45f7cb591c3ab00b"
}


def get_fighter_url(fighter_name):
    """Return the UFCStats profile URL for a fighter in the hardcoded ID table.

    The lookup ignores letter case and extra whitespace, so
    ``"Charles Oliveira"`` and ``" charles  oliveira "`` both resolve.

    Args:
        fighter_name: Fighter's full name.

    Returns:
        The ``http://www.ufcstats.com/fighter-details/<id>`` URL, or ``None``
        if the name is not a string or is not in the table.
    """
    if not isinstance(fighter_name, str):
        return None
    # Normalise case and collapse runs of whitespace before the lookup.
    normalized_name = " ".join(fighter_name.lower().split())
    fighter_id = _UFCSTATS_FIGHTER_IDS.get(normalized_name)
    if not fighter_id:
        return None
    return f"http://www.ufcstats.com/fighter-details/{fighter_id}"

def get_snapshots(fighter_url, timeout=30):
    """List Wayback Machine snapshots of a page via the CDX API.

    Only captures with HTTP status 200 are requested, and consecutive
    captures with the same content digest are collapsed.

    Args:
        fighter_url: URL of the page whose archive history is wanted.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        A list of ``[timestamp, original_url]`` rows (the header row of the
        JSON output is removed). An empty list is returned when the request
        fails, times out, answers with a non-200 status, returns a body that
        is not valid JSON, or yields no captures.
    """
    cdx_url = "http://web.archive.org/cdx/search/cdx"
    params = {
        "url": fighter_url,
        "output": "json",
        "fl": "timestamp,original",
        "filter": "statuscode:200",
        "collapse": "digest"
    }
    try:
        response = requests.get(cdx_url, params=params, timeout=timeout)
        if response.status_code != 200:
            return []
        snapshots = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # Network failures and non-JSON bodies are reported and treated like
        # "no snapshots" so callers keep their simple empty-list check.
        print(f"Error fetching CDX snapshots for {fighter_url}: {e}")
        return []

    # The first row is the column header, not a snapshot.
    return snapshots[1:] if len(snapshots) > 1 else []

def find_best_snapshot(snapshots, fight_date):
    """Return the latest archived snapshot not newer than the fight date.

    Args:
        snapshots: Iterable of ``(timestamp, original_url)`` pairs with
            ``YYYYMMDDHHMMSS`` timestamps.
        fight_date: ``YYYY-MM-DD`` string, interpreted as midnight of that day.

    Returns:
        ``(snapshot, snapshot_datetime)``, or ``(None, None)`` if every
        snapshot is newer than the fight date or the input is empty.
    """
    cutoff = datetime.strptime(fight_date, "%Y-%m-%d")

    def with_capture_time(entry):
        # Pair each snapshot with its parsed capture time.
        stamp, _ = entry
        return datetime.strptime(stamp, "%Y%m%d%H%M%S"), entry

    eligible = [pair for pair in map(with_capture_time, snapshots) if pair[0] <= cutoff]
    if not eligible:
        return None, None

    # max() yields the first maximal element, so ties keep the earliest-listed snapshot.
    best_time, best_snapshot = max(eligible, key=lambda pair: pair[0])
    return best_snapshot, best_time

def convert_height_to_cm(height_str):
    """Convert a height written like ``5' 11"`` to centimetres.

    Returns the value rounded to one decimal place, or ``None`` when the
    string does not split into exactly a feet part and an inches part.
    """
    pieces = height_str.replace('"', '').split("' ")
    if len(pieces) != 2:
        return None
    feet_text, inches_text = pieces
    total_inches = int(feet_text) * 12 + int(inches_text)
    return round(total_inches * 2.54, 1)

def convert_weight_to_kg(weight_str):
    """Convert a weight written like ``155 lbs.`` to kilograms.

    Returns the value rounded to one decimal place, or ``None`` when the
    string does not contain ``lbs.``.
    """
    if 'lbs.' not in weight_str:
        return None
    pounds = int(weight_str.replace(' lbs.', ''))
    return round(pounds * 0.453592, 1)

def convert_reach_to_cm(reach_str):
    """Convert a reach written like ``74"`` to centimetres.

    Returns the value rounded to one decimal place, or ``None`` when the
    string contains no inch mark.
    """
    if '"' not in reach_str:
        return None
    inches = int(reach_str.replace('"', ''))
    return round(inches * 2.54, 1)

def get_birth_year(dob_str):
    """Return the year from a date of birth formatted like ``Jul 07, 1987``.

    Returns ``None`` when the value is not a string or does not match that
    format.
    """
    try:
        parsed = datetime.strptime(dob_str, '%b %d, %Y')
    except (ValueError, TypeError):
        return None
    return parsed.year

def scrape_fighter_data(snapshot_url, fighter_name, timeout=30):
    """Scrape a fighter's stats from an archived UFCStats fighter page.

    Extracts the win/loss record, physical attributes (height, weight, reach,
    birth year), career averages and a form score computed from the fight
    history table. Each value is also printed as it is found.

    Args:
        snapshot_url: URL of the archived fighter-details page to download.
        fighter_name: Name used only in the printed header.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled archive connection cannot hang the caller.

    Returns:
        A dict that always holds ``wins``, ``losses`` and ``form_score`` and
        may hold ``height_cm``, ``weight_kg``, ``reach_cm``, ``birth_year``
        and the career-stat keys when present on the page. Career stats are
        kept as strings with any ``%`` removed; ``form_score`` is the string
        ``"N/A"`` when no fight table exists. Returns ``None`` if the request
        fails or any error occurs while parsing.
    """
    try:
        response = requests.get(snapshot_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        fighter_data = {}

        # Fighter Record (Wins & Losses)
        record_span = soup.find("span", class_="b-content__title-record")
        record_text = record_span.text.strip() if record_span else "N/A"
        parts = record_text.split('-')
        if len(parts) >= 2:
            fighter_data['wins'] = int(parts[0].replace('Record:', '').strip())
            fighter_data['losses'] = int(parts[1].strip())
        else:
            fighter_data['wins'] = None
            fighter_data['losses'] = None
        print(f"\n--- Stats for {fighter_name} ---")
        print(f"Record: {record_text} (Wins: {fighter_data.get('wins', 'N/A')}, Losses: {fighter_data.get('losses', 'N/A')})")

        # Physical attributes: each list item reads "Label:value".
        stat_divs = soup.find_all("li", class_="b-list__box-list-item")
        for div in stat_divs:
            text = div.get_text(strip=True).replace("\n", "").split(":")
            if len(text) == 2:
                key, value = text[0].strip(), text[1].strip()
                if key == "Height":
                    fighter_data['height_cm'] = convert_height_to_cm(value)
                    print(f"Height (cm): {fighter_data['height_cm']}")
                elif key == "Weight":
                    fighter_data['weight_kg'] = convert_weight_to_kg(value)
                    print(f"Weight (kg): {fighter_data['weight_kg']}")
                elif key == "Reach":
                    fighter_data['reach_cm'] = convert_reach_to_cm(value)
                    print(f"Reach (cm): {fighter_data['reach_cm']}")
                elif key == "DOB":
                    fighter_data['birth_year'] = get_birth_year(value)
                    print(f"Birth Year: {fighter_data['birth_year']}")

        # Career Statistics (without percentages)
        career_stats_map = {
            "SLpM": "slpm",
            "Str. Acc.": "striking_accuracy",
            "SApM": "sapm",
            "Str. Def": "striking_defense",
            "TD Avg.": "takedown_average",
            "TD Acc.": "takedown_accuracy",
            "TD Def.": "takedown_defense",
            "Sub. Avg.": "submission_average"
        }
        stat_divs_block = soup.find_all("li", class_="b-list__box-list-item b-list__box-list-item_type_block")
        for div in stat_divs_block:
            text = div.get_text(strip=True).replace("\n", "").split(":")
            if len(text) == 2:
                label, value = text[0].strip(), text[1].strip()
                if label in career_stats_map:
                    cleaned_value = value.replace('%', '').strip()
                    fighter_data[career_stats_map[label]] = cleaned_value
                    print(f"{label}: {cleaned_value}")

        # Last 5 Fights Form Score
        fight_table = soup.find('table', class_="b-fight-details__table")
        if fight_table:
            rows = fight_table.find_all('tr')[1:]  # Skip header row
            form_score = calculate_form_score(rows)
            fighter_data['form_score'] = form_score
            print(f"Form Score (last 5 fights): {form_score}")
        else:
            print("No fight history found.")
            fighter_data['form_score'] = "N/A"

        return fighter_data
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching URL: {e}")
        return None
    except Exception as e:
        # Deliberate best-effort: any parsing problem drops this fighter
        # instead of aborting the whole collection run.
        print(f"Error during scraping: {e}")
        return None

def calculate_fight_score(result, round_, time):
    """Score a single fight from its result, final round and end time.

    A loss scores 0 and a draw 0.5. Any other result is scored from the
    round: a fight ending at 5:00 of round 5 scores 1, one ending at 5:00 of
    round 3 scores 3, and everything else scores ``7 - round``. A round that
    is not all digits scores 0.

    Args:
        result: Outcome text such as ``"win"``, ``"loss"`` or ``"draw"``.
        round_: Round in which the fight ended, as a string.
        time: Clock time within that round, as a string like ``"5:00"``.
    """
    if result == "loss":
        return 0
    if result == "draw":
        return 0.5

    if not round_.isdigit():
        return 0  # Default score if round is not a digit

    round_num = int(round_)
    ended_at_full_time = time == "5:00"
    if ended_at_full_time and round_num == 5:
        return 1
    if ended_at_full_time and round_num == 3:
        return 3
    return 7 - round_num

def calculate_form_score(fight_rows):
    """Sum the fight scores of the first five decided fights in the table rows.

    Rows with fewer than nine cells, or whose first cell is not a win, loss
    or draw, are ignored and do not count toward the five.

    Args:
        fight_rows: Table row elements exposing ``find_all('td')``; cell 0
            holds the result, cell 8 the round and cell 9 (if present) the time.

    Returns:
        The summed score, or 0 when no row qualifies.
    """
    total = 0
    counted = 0
    for row in fight_rows:
        cells = row.find_all('td')
        if len(cells) < 9:
            continue
        outcome = cells[0].text.strip().lower()
        if outcome not in ["win", "loss", "draw"]:
            continue
        round_text = cells[8].text.strip()
        time_text = cells[9].text.strip() if len(cells) > 9 else "N/A"
        total += calculate_fight_score(outcome, round_text, time_text)
        counted += 1
        if counted == 5:
            break
    return total

def _extract_knockdown_avg(soup):
    """Return the text of the "Knockdown Avg" stat in a parsed athlete page, or None."""
    for stat_group in soup.find_all('div', class_='c-stat-compare__group'):
        label_element = stat_group.find('div', class_='c-stat-compare__label')
        value_element = stat_group.find('div', class_='c-stat-compare__number')
        if label_element and value_element:
            if label_element.text.strip() == 'Knockdown Avg':
                return value_element.text.strip()
    return None


def scrape_knockdown_avg(fighter_name, fight_date=None, timeout=30):
    """Scrape the "Knockdown Avg" stat from a fighter's UFC.com athlete page.

    With a ``fight_date`` the Wayback Machine snapshot closest before that
    date is used; if no such snapshot exists the live page is scraped
    instead. Without a ``fight_date`` the live page is scraped directly, so
    one-argument calls keep working.

    Args:
        fighter_name: Fighter name with spaces between the parts; spaces are
            replaced by hyphens to build the page URL.
        fight_date: Optional ``YYYY-MM-DD`` string selecting the archived
            snapshot to use.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The stat as the string shown on the page, or ``None`` if the stat is
        missing or any request fails.
    """
    formatted_name = fighter_name.replace(' ', '-')
    ufc_url = f"https://www.ufc.com/athlete/{formatted_name}"

    page_url = ufc_url   # live page unless an archived snapshot is chosen below
    using_archive = False

    if fight_date is None:
        print(f"Scraping UFC page for {fighter_name}: {ufc_url}")
    else:
        # Find the best snapshot for the UFC athlete page
        cdx_url = "http://web.archive.org/cdx/search/cdx"
        params = {
            "url": ufc_url,
            "output": "json",
            "fl": "timestamp,original",
            "filter": "statuscode:200",
            "collapse": "digest"
        }
        try:
            response = requests.get(cdx_url, params=params, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching CDX API for UFC page of {fighter_name}: {e}")
            return None
        if response.status_code != 200:
            print(f"Error fetching CDX API for UFC page of {fighter_name}")
            return None

        try:
            snapshots = response.json()
        except ValueError as e:
            # The body was not valid JSON (e.g. an HTML error page).
            print(f"Error decoding CDX API response for UFC page of {fighter_name}: {e}")
            return None
        # The first row is the column header, not a snapshot.
        snapshots = snapshots[1:] if len(snapshots) > 1 else []

        best_snapshot, _ = find_best_snapshot(snapshots, fight_date)

        if not best_snapshot:
            # Fall back to the live page; calling this function again with
            # one argument used to raise TypeError here.
            print(f"No suitable snapshot found for UFC page of {fighter_name} before {fight_date}. Using current data.")
        else:
            timestamp, original_url = best_snapshot
            page_url = f"https://web.archive.org/web/{timestamp}/{original_url}"
            using_archive = True
            print(f"Scraping historical UFC page for {fighter_name} (KD Avg) from {datetime.strptime(timestamp, '%Y%m%d%H%M%S').strftime('%Y-%m-%d')}: {page_url}")

    try:
        response = requests.get(page_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return _extract_knockdown_avg(soup)
    except requests.exceptions.RequestException as e:
        if using_archive:
            print(f"Error fetching historical UFC athlete page: {e}")
        else:
            print(f"Error fetching UFC athlete page: {e}")
        return None

def get_fighter_data_before_fight(fighter_name, fight_date):
    """Collect a fighter's pre-fight stats from archived UFCStats and UFC.com pages.

    The UFCStats data comes from the snapshot selected by
    ``find_best_snapshot``; the knockdown average is then fetched with
    ``scrape_knockdown_avg`` and stored under ``'kd_avg'``.

    Args:
        fighter_name: Fighter name used for the ID lookup and page URLs.
        fight_date: Fight date as a ``YYYY-MM-DD`` string.

    Returns:
        ``(fighter_data, best_time)``. ``fighter_data`` is the stats dict,
        a dict holding only ``'kd_avg'`` when UFCStats data is unavailable
        but the knockdown average was found, or ``None`` when neither source
        produced data. ``best_time`` is the datetime of the UFCStats snapshot
        used, or ``None``.
    """
    def load_ufcstats():
        # Returns (stats dict or None, snapshot datetime or None).
        fighter_url = get_fighter_url(fighter_name)
        if not fighter_url:
            print(f"Could not find URL for {fighter_name} on ufcstats")
            return None, None

        snapshots = get_snapshots(fighter_url)
        if not snapshots:
            print(f"No snapshots found for {fighter_name} on ufcstats")
            return None, None

        best_snapshot, snapshot_time = find_best_snapshot(snapshots, fight_date)
        if not best_snapshot:
            print(f"No suitable snapshot found for {fighter_name} on ufcstats before {fight_date}")
            return None, snapshot_time

        timestamp, original_url = best_snapshot
        snapshot_url = f"https://web.archive.org/web/{timestamp}/{original_url}"
        print(f"Using ufcstats snapshot for {fighter_name} from {datetime.strptime(timestamp, '%Y%m%d%H%M%S').strftime('%Y-%m-%d')}: {snapshot_url}")
        return scrape_fighter_data(snapshot_url, fighter_name), snapshot_time

    fighter_data_ufcstats, best_time = load_ufcstats()

    kd_avg = scrape_knockdown_avg(fighter_name, fight_date)
    if fighter_data_ufcstats:
        fighter_data_ufcstats['kd_avg'] = kd_avg
    elif kd_avg is not None:
        fighter_data_ufcstats = {'kd_avg': kd_avg}

    return fighter_data_ufcstats, best_time

# Per-fighter stats written to the CSV, in column order. Each key yields a
# "fighter1_<key>" column immediately followed by a "fighter2_<key>" column.
_FIGHTER_STAT_KEYS = (
    'wins', 'losses', 'height_cm', 'reach_cm', 'birth_year',
    'slpm', 'striking_accuracy', 'sapm', 'striking_defense',
    'takedown_average', 'takedown_accuracy', 'takedown_defense',
    'submission_average', 'kd_avg', 'form_score',
)


def _build_fight_record(fight_info, fighter1_data, fighter2_data):
    """Assemble one CSV row from the entered fight details and both fighters' stats."""
    fight_record = {
        'fight_year': fight_info['fight_year'],
        'fighter1': fight_info['fighter1'],
        'fighter2': fight_info['fighter2'],
    }
    for key in _FIGHTER_STAT_KEYS:
        fight_record[f'fighter1_{key}'] = fighter1_data.get(key)
        fight_record[f'fighter2_{key}'] = fighter2_data.get(key)
    fight_record['weight_class'] = fight_info['weight_class']
    fight_record['round_finished'] = fight_info['round_finished']
    fight_record['fight_rounds'] = fight_info['fight_rounds']
    fight_record['outcome'] = fight_info['outcome']
    return fight_record


def main():
    """Interactively collect fights and save their pre-fight stats to a CSV file.

    Prompts repeatedly for two fighter names, the fight date and the fight
    details until ``pause`` is entered as the first fighter's name. For each
    fight, both fighters' pre-fight data is fetched with
    ``get_fighter_data_before_fight``; fights where either fighter has no
    data are skipped. All collected rows are written to
    ``ufc_fight_data.csv`` when the loop ends.

    A fight whose entered date is not a real ``YYYY-MM-DD`` calendar date is
    rejected before any scraping, so a typo cannot raise from the date
    parsing further down and discard the fights already collected.
    """
    import csv
    all_fight_data = []
    while True:
        fighter1 = input("Enter Fighter 1 Name (or 'pause' to finish): ").strip().lower()
        if fighter1 == 'pause':
            break
        fighter2 = input("Enter Fighter 2 Name: ").strip().lower()
        fight_year = input("Enter Fight Year: ").strip()
        fight_month = input("Enter Fight Month (MM): ").strip()
        fight_day = input("Enter Fight Day (DD): ").strip()
        fight_date = f"{fight_year}-{fight_month.zfill(2)}-{fight_day.zfill(2)}"
        weight_class = input("Enter Weight Class (e.g. 155): ").strip()
        outcome = input("Enter Outcome (0 or 1, 0 if fighter1 lost or 1 if fighter1 won): ").strip()
        round_finished = input("Enter Round Finished (if any, else leave blank): ").strip()
        fight_rounds = input("Enter Total Fight Rounds: ").strip()

        # Validate with the same format the snapshot lookup parses.
        try:
            datetime.strptime(fight_date, "%Y-%m-%d")
        except ValueError:
            print(f"Invalid fight date '{fight_date}' (expected YYYY-MM-DD). Skipping this fight.")
            continue

        fighter1_data, _ = get_fighter_data_before_fight(fighter1, fight_date)
        fighter2_data, _ = get_fighter_data_before_fight(fighter2, fight_date)

        if fighter1_data and fighter2_data:
            fight_info = {
                'fight_year': fight_year,
                'fighter1': fighter1,
                'fighter2': fighter2,
                'weight_class': weight_class,
                'round_finished': round_finished,
                'fight_rounds': fight_rounds,
                'outcome': outcome,
            }
            all_fight_data.append(_build_fight_record(fight_info, fighter1_data, fighter2_data))
            print("\nFight data added.")
        else:
            print("Could not retrieve data for one or both fighters for that date.")

    if all_fight_data:
        csv_file = "ufc_fight_data.csv"
        csv_columns = all_fight_data[0].keys()
        try:
            with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
                writer.writeheader()
                writer.writerows(all_fight_data)
            print(f"\nData written to {csv_file}. You can now download it.")
        except IOError:
            print("I/O error")
    else:
        print("No fight data to save.")

# Start the interactive data-collection loop when this code runs as the main module.
if __name__ == "__main__":
    main()

Enter Fighter 1 Name (or 'pause' to finish): Alex pereira
Enter Fighter 2 Name: magomed ankalaev
Enter Fight Year: 2025
Enter Fight Month (MM): 03
Enter Fight Day (DD): 08
Enter Weight Class (e.g. 155): 205
Enter Outcome (0 or 1, 0 if fighter1 lost or 1 if fighter1 won): 0
Enter Round Finished (if any, else leave blank): 6
Enter Total Fight Rounds: 5
Using ufcstats snapshot for alex pereira from 2025-03-07: https://web.archive.org/web/20250307112955/http://ufcstats.com/fighter-details/e5549c82bfb5582d

--- Stats for alex pereira ---
Record: Record: 12-2-0 (Wins: 12, Losses: 2)
Height (cm): 193.0
Weight (kg): 93.0
Reach (cm): 200.7
Birth Year: 1987
SLpM: 5.46
Str. Acc.: 63
SApM: 3.44
Str. Def: 55
TD Avg.: 0.14
TD Acc.: 100
TD Def.: 70
Sub. Avg.: 0.3
Form Score (last 5 fights): 22
Scraping historical UFC page for alex pereira (KD Avg) from 2025-02-10: https://web.archive.org/web/20250210002741/https://www.ufc.com/athlete/alex-pereira
Using ufcstats snapshot for magomed ankalaev from 2024