In [None]:
import pickle
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [None]:
def ppj(data):
    """
    Pretty-print a JSON document to standard output.

    Args:
    data (dict or str): Either an already-parsed JSON object or a string of
        JSON text. Strings are decoded with json.loads before printing.

    Returns:
    None
    """
    # Only strings need decoding; any other value is serialised as given.
    parsed = json.loads(data) if isinstance(data, str) else data

    # Four-space indentation, keys emitted in sorted order.
    rendered = json.dumps(parsed, indent=4, sort_keys=True)
    print(rendered)

In [None]:
import pickle
from datetime import datetime

def save_to_pickle_with_timestamp(my_dict, filename_prefix):
    """
    Pickle an object to a file whose name ends with the current timestamp.

    The file is written to f"{filename_prefix}{YYYY-MM-DD_HH:MM}.pickle". If
    filename_prefix contains a directory component, that directory is created
    when it does not exist yet.

    Args:
        my_dict (dict): The object to save. Its len() is reported after saving.
        filename_prefix (str): Text placed directly in front of the timestamp.
            It is used verbatim: no separator is inserted between the prefix
            and the timestamp.

    Returns:
        str: The filename that was created
    """
    # Timestamp with minute resolution: two saves with the same prefix within
    # the same minute write to the same file.
    # NOTE(review): the ':' in the timestamp is kept for compatibility with
    # files already written; it is not a valid filename character on Windows.
    datetime_str = datetime.now().strftime("%Y-%m-%d_%H:%M")

    # Create filename
    filename = f"{filename_prefix}{datetime_str}.pickle"

    # Make sure the target directory exists (only when the prefix names one)
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Save to pickle file
    with open(filename, 'wb') as f:
        pickle.dump(my_dict, f)

    print(f"Bins dictionary saved to: {filename}")
    print(f"File contains {len(my_dict)} keys")

    # Hand the path back so callers can reload or log the file that was written
    return filename

In [None]:
import os
import glob

# Read all pickle files from race_finishers folder
race_finishers = {}

# Get all pickle files in the race_finishers directory.
# glob returns matches in arbitrary order, so sort them: the insertion order of
# race_finishers (and therefore the "sample" year below) is then reproducible.
pickle_files = sorted(glob.glob("../race_finishers/*.pickle"))

print(f"Found {len(pickle_files)} pickle files in race_finishers folder")

# Load each pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only point this at
# files produced by a trusted pipeline.
for file_path in pickle_files:
    # Extract year from filename (e.g., "race_finishers_for_year_1970.pickle" -> "1970")
    filename = os.path.basename(file_path)
    year = filename.replace("race_finishers_for_year_", "").replace(".pickle", "")

    try:
        with open(file_path, "rb") as f:
            race_finishers[year] = pickle.load(f)
        print(f"Loaded {year}: {len(race_finishers[year])} races")
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining years
        print(f"Error loading {year}: {e}")

print(f"\nTotal years loaded: {len(race_finishers)}")
print(f"Years: {sorted(race_finishers.keys())}")

# Show sample data structure
if race_finishers:
    sample_year = list(race_finishers.keys())[0]
    sample_races = list(race_finishers[sample_year].keys())[:3]
    print(f"\nSample from {sample_year}:")
    for race_id in sample_races:
        sample_finishers = race_finishers[sample_year][race_id]
        # A race entry can be None (later cells check for it); report it as 0
        finisher_count = len(sample_finishers) if sample_finishers is not None else 0
        print(f"  {race_id}: {finisher_count} finishers")


In [None]:
# Create race_finishers_with_birth_year dictionary:
# year -> race_id -> list of finisher records, each a copy of the source record
# with an added "birthYear" (race year minus age, or None when age is unusable).
race_finishers_with_birth_year = {}

print("Computing birth years for all finishers...")

total_finishers = 0
total_races = 0

for year in race_finishers:
    race_finishers_with_birth_year[year] = {}

    for race_id in race_finishers[year]:
        race_finishers_with_birth_year[year][race_id] = []

        # Races without a finisher list keep an empty list and are not counted
        if race_finishers[year][race_id] is None:
            print(f"No finishers for {year}/{race_id}")
            continue

        for finisher_index, finisher_data in enumerate(race_finishers[year][race_id]):
            # Create a copy of the finisher data so the loaded records stay untouched
            finisher_with_birth_year = finisher_data.copy()

            # Compute birth year: race year - age.
            # int() on the age matches the flat-list cell further down, so an age
            # stored as text ("35") yields a birth year instead of a warning.
            try:
                birth_year = int(year) - int(finisher_data["age"])
                finisher_with_birth_year["birthYear"] = birth_year
            except (KeyError, TypeError, ValueError) as e:
                # Handle cases where age might be missing or invalid
                finisher_with_birth_year["birthYear"] = None
                print(f"Warning: Could not compute birth year for {year}/{race_id}/finisher_{finisher_index}: {e}")

            race_finishers_with_birth_year[year][race_id].append(finisher_with_birth_year)
            total_finishers += 1

        total_races += 1

print(f"Completed processing:")
print(f"  Total years: {len(race_finishers_with_birth_year)}")
print(f"  Total races: {total_races}")
print(f"  Total finishers: {total_finishers}")

# Show sample data structure
if race_finishers_with_birth_year:
    sample_year = list(race_finishers_with_birth_year.keys())[0]

    # Use the first race of that year that actually has finishers; the very
    # first race may be empty, and the year itself may contain no races.
    sample_race, sample_finisher = next(
        (
            (race_id, finishers[0])
            for race_id, finishers in race_finishers_with_birth_year[sample_year].items()
            if finishers
        ),
        (None, None),
    )

    if sample_finisher is not None:
        print(f"\nSample finisher from {sample_year}/{sample_race}:")
        print(f"  Age: {sample_finisher.get('age', 'N/A')}")
        print(f"  Birth Year: {sample_finisher.get('birthYear', 'N/A')}")
        print(f"  Available fields: {list(sample_finisher.keys())}")


In [None]:
# Pretty-print everything loaded for 1970 to inspect the raw record layout.
# Raises KeyError if no "1970" entry was loaded into race_finishers.
ppj(race_finishers['1970'])

In [None]:
# Create race_finishers_with_birth_year_flat: one flat list holding a copy of
# every finisher record, annotated with birthYear, raceId and raceYear.
race_finishers_with_birth_year_flat = []

print("Computing birth years for all finishers...")

total_finishers = 0
total_races = 0
total_years = 0

total_errors = 0

for year in [str(y) for y in range(1970, 2026)]:
    print(year)

    # A year whose pickle was missing or failed to load has no entry; skip it
    # rather than aborting the whole run with a KeyError.
    if year not in race_finishers:
        print(f"No data loaded for {year}")
        continue
    total_years += 1

    for race_id in race_finishers[year]:
        if race_finishers[year][race_id] is None:
            print(f"No finishers for {year}/{race_id}")
            continue

        for finisher_index, finisher_data in enumerate(race_finishers[year][race_id]):
            # Create a copy of the finisher data so the loaded records stay untouched
            finisher_with_birth_year = finisher_data.copy()

            # Compute birth year: race year - age
            try:
                birth_year = int(year) - int(finisher_data["age"])
                finisher_with_birth_year["birthYear"] = birth_year
            except (KeyError, TypeError, ValueError) as e:
                total_errors += 1
                # Handle cases where age might be missing or invalid
                finisher_with_birth_year["birthYear"] = None
                print(f"Warning: Could not compute birth year for {year}/{race_id}/finisher_{finisher_index}: {e}")

            # Record where the row came from even when the age was unusable,
            # so every flat record can be traced back to its race.
            finisher_with_birth_year["raceId"] = race_id
            finisher_with_birth_year["raceYear"] = year

            race_finishers_with_birth_year_flat.append(finisher_with_birth_year)
            total_finishers += 1

        total_races += 1

print(f"Completed processing:")
print(f"  Total years: {total_years}")
print(f"  Total races: {total_races}")
print(f"  Total finishers: {total_finishers}")
print(f"  Total errors: {total_errors}")


In [None]:
# Pretty-print the first flat record to check the fields added above
# (birthYear, raceId, raceYear). Raises IndexError if the flat list is empty.
ppj(race_finishers_with_birth_year_flat[0])

In [None]:
# Filter out participants under 18 and group by firstName, lastName, and gender
print("Filtering participants 18+ and grouping by name and gender...")

# Convert race_finishers_with_birth_year_flat to DataFrame
df_finishers_flat = pd.DataFrame(race_finishers_with_birth_year_flat)

print(f"Original dataset: {len(df_finishers_flat)} records")

# Filter out participants under 18.
# Compare on a numeric view of the column: ages stored as text would make a
# direct `>= 18` comparison raise, and unparseable ages become NaN, which
# fails the comparison and is dropped. The 'age' column itself is not modified.
numeric_age = pd.to_numeric(df_finishers_flat['age'], errors='coerce')
df_adults = df_finishers_flat[numeric_age >= 18].copy()

print(f"After filtering age >= 18: {len(df_adults)} records")
print(f"Removed {len(df_finishers_flat) - len(df_adults)} records (participants under 18)")

# Group by firstName, lastName, and gender - keep original records.
# Iterating the groupby yields (key tuple, sub-frame) pairs whose sub-frame
# still contains the grouping columns, so each record keeps its name and
# gender fields. Rows with a missing name or gender are left out by groupby.
grouped_by_name_gender = {
    name_gender_key: group.to_dict('records')
    for name_gender_key, group in df_adults.groupby(['firstName', 'lastName', 'gender'])
}

print(f"\nGrouped into {len(grouped_by_name_gender)} unique name/gender combinations")

# Show sample data
if grouped_by_name_gender:
    sample_key = list(grouped_by_name_gender.keys())[0]
    sample_records = grouped_by_name_gender[sample_key]

    print(f"\nSample group: {sample_key}")
    print(f"  Number of records: {len(sample_records)}")
    print(f"  Sample record fields: {list(sample_records[0].keys())}")
    print(f"  First record: {sample_records[0]}")


In [None]:
# Drop every group whose first or last name is the empty string
print("Filtering out records with empty firstName or lastName...")

# Rebuild the mapping, keeping only groups where both name parts are non-empty.
# Keys are (firstName, lastName, gender) tuples.
grouped_by_name_gender_clean = {
    name_key: group_records
    for name_key, group_records in grouped_by_name_gender.items()
    if name_key[0] != "" and name_key[1] != ""
}

groups_before_name_filter = len(grouped_by_name_gender)
groups_after_name_filter = len(grouped_by_name_gender_clean)

print(f"Original groups: {groups_before_name_filter}")
print(f"After filtering empty names: {groups_after_name_filter}")
print(f"Removed {groups_before_name_filter - groups_after_name_filter} groups with empty names")

# Show the first surviving group as a sanity check
if grouped_by_name_gender_clean:
    sample_key = next(iter(grouped_by_name_gender_clean))
    sample_records = grouped_by_name_gender_clean[sample_key]

    print(f"\nSample clean group: {sample_key}")
    print(f"  Number of records: {len(sample_records)}")
    print(f"  First record: {sample_records[0]}")


In [None]:
# Drop groups that contain exactly one record
print("Filtering out groups with only a single record...")

# Rebuild the mapping without the single-record groups
grouped_by_name_gender_final = {
    name_key: group_records
    for name_key, group_records in grouped_by_name_gender_clean.items()
    if len(group_records) != 1
}

groups_before_single_filter = len(grouped_by_name_gender_clean)
groups_after_single_filter = len(grouped_by_name_gender_final)

print(f"After filtering empty names: {groups_before_single_filter}")
print(f"After filtering single records: {groups_after_single_filter}")
print(f"Removed {groups_before_single_filter - groups_after_single_filter} single-record groups")

# Show the first surviving group as a sanity check
if grouped_by_name_gender_final:
    sample_key = next(iter(grouped_by_name_gender_final))
    sample_records = grouped_by_name_gender_final[sample_key]

    print(f"\nSample final group: {sample_key}")
    print(f"  Number of records: {len(sample_records)}")
    print(f"  First record: {sample_records[0]}")
    print(f" Last record: {sample_records[-1]}")


In [None]:
# Count groups whose gender is neither M nor W
# NOTE(review): the check uses 'W', not 'F' - presumably the source data codes
# women as 'W'; confirm against the raw records.
print("Counting groups with gender other than M or W...")

standard_genders = ('M', 'W')
max_gender_examples = 5

non_standard_gender_groups = 0
non_standard_genders = set()
non_standard_gender_examples = []

# Single pass: count, collect the distinct values, and keep the first few examples
for key, records in grouped_by_name_gender_final.items():
    firstName, lastName, gender = key

    if gender in standard_genders:
        continue

    non_standard_gender_groups += 1
    non_standard_genders.add(gender)
    if len(non_standard_gender_examples) < max_gender_examples:
        non_standard_gender_examples.append((firstName, lastName, gender, len(records)))

print(f"Total groups in final dataset: {len(grouped_by_name_gender_final)}")
print(f"Groups with non-standard gender: {non_standard_gender_groups}")
print(f"Non-standard gender values found: {sorted(non_standard_genders)}")

# Show some examples of non-standard gender groups
if non_standard_gender_groups > 0:
    print(f"\nExamples of non-standard gender groups:")
    for firstName, lastName, gender, record_count in non_standard_gender_examples:
        print(f"  {firstName} {lastName} - Gender: '{gender}' - Records: {record_count}")


In [None]:
# Tally the groups whose records span more than two distinct birth years
print("Counting groups with more than 2 different birth years...")

groups_with_multiple_birth_years = 0
examples_multiple_birth_years = []

for (first, last, sex), group_records in grouped_by_name_gender_final.items():
    # Distinct birth years in this group; records with birthYear None are ignored
    distinct_years = {
        rec.get('birthYear')
        for rec in group_records
        if rec.get('birthYear') is not None
    }

    # Two or fewer distinct years: nothing to count
    if len(distinct_years) <= 2:
        continue

    groups_with_multiple_birth_years += 1

    # Only the first five matching groups are kept as examples
    if len(examples_multiple_birth_years) < 5:
        examples_multiple_birth_years.append({
            'name': f"{first} {last}",
            'gender': sex,
            'birth_years': sorted(distinct_years),
            'record_count': len(group_records)
        })

print(f"Total groups in final dataset: {len(grouped_by_name_gender_final)}")
print(f"Groups with more than 2 birth years: {groups_with_multiple_birth_years}")

# Print the collected examples, if any
if examples_multiple_birth_years:
    print(f"\nExamples of groups with multiple birth years:")
    for example in examples_multiple_birth_years:
        print(f"  {example['name']} ({example['gender']}) - Birth years: {example['birth_years']} - Records: {example['record_count']}")


In [None]:
# Count groups with more than 2 birth years that have 3+ consecutive years.
# This cell reads grouped_by_name_gender_final and
# groups_with_multiple_birth_years, so the cells that define them must have run.
print("Counting groups with 3+ consecutive birth years...")

def has_consecutive_run(birth_years, min_length=3):
    """Check if there's a run of at least min_length consecutive years.

    Args:
        birth_years: Iterable of integer years, in any order. Repeated values
            are counted once, so they cannot interrupt a run.
        min_length (int): Minimum number of consecutive years required.

    Returns:
        bool: True when the longest run of consecutive years contains at least
        min_length years. Empty input has a longest run of 0.
    """
    # De-duplicate before sorting: a repeated year must not reset the run
    distinct_years = sorted(set(birth_years))

    longest_run = 0
    current_run = 0
    previous_year = None

    for year in distinct_years:
        if previous_year is not None and year == previous_year + 1:
            current_run += 1
        else:
            # First year, or a gap: a new run starts here
            current_run = 1
        longest_run = max(longest_run, current_run)
        previous_year = year

    return longest_run >= min_length

groups_with_consecutive_years = 0
examples_consecutive_years = []

for (first, last, sex), group_records in grouped_by_name_gender_final.items():
    # Distinct birth years in this group; records with birthYear None are ignored
    distinct_years = {
        rec.get('birthYear')
        for rec in group_records
        if rec.get('birthYear') is not None
    }

    # Needs more than 2 distinct birth years AND a run of 3+ consecutive years
    qualifies = len(distinct_years) > 2 and has_consecutive_run(distinct_years, 3)
    if not qualifies:
        continue

    groups_with_consecutive_years += 1

    # Only the first five matching groups are kept as examples
    if len(examples_consecutive_years) < 5:
        examples_consecutive_years.append({
            'name': f"{first} {last}",
            'gender': sex,
            'birth_years': sorted(distinct_years),
            'record_count': len(group_records)
        })

print(f"Groups with more than 2 birth years: {groups_with_multiple_birth_years}")
print(f"Groups with 3+ consecutive birth years: {groups_with_consecutive_years}")

# Print the collected examples together with their longest consecutive run
if examples_consecutive_years:
    print(f"\nExamples of groups with 3+ consecutive birth years:")
    for example in examples_consecutive_years:
        print(f"  {example['name']} ({example['gender']}) - Birth years: {example['birth_years']} - Records: {example['record_count']}")

        # Track runs by start index and length rather than by building lists.
        # On ties the earliest run wins, because only a strictly longer run
        # replaces the current best.
        years = example['birth_years']
        best_start, best_length = 0, 1
        run_start = 0

        for idx in range(1, len(years)):
            if years[idx] != years[idx - 1] + 1:
                # Gap: a new run begins at this position
                run_start = idx
            elif idx - run_start + 1 > best_length:
                best_start, best_length = run_start, idx - run_start + 1

        max_run = years[best_start:best_start + best_length]
        print(f"    Consecutive run: {max_run} (length: {len(max_run)})")


In [None]:
# Tag every group, and every record inside it, with a name_group_id of 0
print("Adding name_group_id to all groups...")

# Keys grow from (firstName, lastName, gender) to
# (firstName, lastName, gender, 0); each record is copied with the extra
# 'name_group_id' field so the source records stay untouched.
grouped_by_name_gender_with_id = {
    (first, last, sex, 0): [{**rec, 'name_group_id': 0} for rec in group_records]
    for (first, last, sex), group_records in grouped_by_name_gender_final.items()
}

print(f"Original groups: {len(grouped_by_name_gender_final)}")
print(f"Groups with name_group_id: {len(grouped_by_name_gender_with_id)}")

# Show the first group as a sanity check
if grouped_by_name_gender_with_id:
    sample_key = next(iter(grouped_by_name_gender_with_id))
    sample_records = grouped_by_name_gender_with_id[sample_key]
    first_sample_record = sample_records[0]

    print(f"\nSample group with name_group_id: {sample_key}")
    print(f"  Number of records: {len(sample_records)}")
    print(f"  Sample record fields: {list(first_sample_record.keys())}")
    print(f"  name_group_id value: {first_sample_record.get('name_group_id', 'N/A')}")


In [None]:
# Find groups with gaps in birth years and split them.
# A "gap" is a difference of more than one year between neighbouring distinct
# birth years within the same name/gender group.
print("Finding groups with gaps in birth years and splitting them...")

def find_gaps_in_years(birth_years):
    """Partition birth years into runs separated by gaps of more than one year.

    The years are sorted first. Returns a list of lists, each holding years
    where every neighbouring pair differs by at most 1. Input with zero or one
    year comes back as a single run (so empty input yields [[]]).
    """
    ordered = sorted(birth_years)

    # Nothing to compare with fewer than two years
    if len(ordered) <= 1:
        return [ordered]

    runs = [[ordered[0]]]

    # Walk neighbouring pairs; a jump of more than one year opens a new run
    for previous, year in zip(ordered, ordered[1:]):
        if year - previous > 1:
            runs.append([year])
        else:
            runs[-1].append(year)

    return runs

def split_records_by_birth_year_gaps(records):
    """Partition records into lists, one per run of consecutive birth years.

    Records whose 'birthYear' is None are left out of the result. The runs
    come from find_gaps_in_years, so the returned lists are ordered by
    ascending birth year.
    """
    # Bucket the records by birth year
    records_by_year = {}
    for rec in records:
        year = rec.get('birthYear')
        if year is None:
            continue
        records_by_year.setdefault(year, []).append(rec)

    # One output list per run, concatenating that run's per-year buckets
    return [
        [rec for year in run for rec in records_by_year[year]]
        for run in find_gaps_in_years(list(records_by_year.keys()))
    ]

# Process all groups and split those with gaps
grouped_by_name_gender_split = {}
groups_split_count = 0
total_new_groups = 0

for key, records in grouped_by_name_gender_with_id.items():
    firstName, lastName, gender, name_group_id = key

    # Get all birth years for this group
    birth_years = {
        record.get('birthYear')
        for record in records
        if record.get('birthYear') is not None
    }

    # More than one run of consecutive birth years means at least one gap
    has_gaps = len(find_gaps_in_years(birth_years)) > 1

    if has_gaps:
        # Split the group. Records whose birthYear is None are not carried
        # over, because split_records_by_birth_year_gaps skips them.
        split_groups = split_records_by_birth_year_gaps(records)
        groups_split_count += 1

        for i, group_records in enumerate(split_groups):
            new_key = (firstName, lastName, gender, i)
            # Re-stamp copies of the records so the record-level name_group_id
            # agrees with the id in the key (it was 0 on every record before).
            grouped_by_name_gender_split[new_key] = [
                {**record, 'name_group_id': i} for record in group_records
            ]
            total_new_groups += 1
    else:
        # No gaps, keep as is
        grouped_by_name_gender_split[key] = records
        total_new_groups += 1

print(f"Original groups: {len(grouped_by_name_gender_with_id)}")
print(f"Groups with gaps that were split: {groups_split_count}")
print(f"Total groups after splitting: {total_new_groups}")

# Show examples of split groups
print(f"\nExamples of split groups:")
example_count = 0
for key, records in grouped_by_name_gender_split.items():
    # Five examples are enough; stop scanning once they have been printed
    if example_count >= 5:
        break

    firstName, lastName, gender, name_group_id = key

    # Only groups with name_group_id > 0 are known to come from a split
    if name_group_id <= 0:
        continue

    # Get birth years for this group
    birth_years = {
        record.get('birthYear')
        for record in records
        if record.get('birthYear') is not None
    }

    print(f"  {firstName} {lastName} ({gender}) - name_group_id: {name_group_id}")
    print(f"    Birth years: {sorted(birth_years)}")
    print(f"    Records: {len(records)}")
    example_count += 1


In [None]:
# Find and print all records for A King (M)
target_first_name, target_last_name, target_gender = "A", "King", "M"
target_label = f"{target_first_name} {target_last_name} ({target_gender})"

print(f"Searching for {target_label} in split groups...")


def _record_race_year(record, default):
    """Return the record's race year.

    Prefers a 'year' field when the record has one; otherwise falls back to
    'raceYear', which is the field this notebook adds when flattening records.
    """
    return record.get('year', record.get('raceYear', default))


# (label, record key) pairs printed for every record after the year line
record_fields_to_print = [
    ("Race ID", 'raceId'),
    ("Age", 'age'),
    ("Birth Year", 'birthYear'),
    ("Overall Time", 'overallTime'),
    ("Overall Place", 'overallPlace'),
    ("Gender Place", 'genderPlace'),
    ("Pace", 'pace'),
    ("Runner ID", 'runnerId'),
    ("City", 'city'),
    ("State", 'stateProvince'),
    ("Name Group ID", 'name_group_id'),
]

a_king_records = []

# Search through all split groups for an exact match on name and gender
for key, records in grouped_by_name_gender_split.items():
    firstName, lastName, gender, name_group_id = key

    if firstName == target_first_name and lastName == target_last_name and gender == target_gender:
        a_king_records.append({
            'key': key,
            'records': records
        })

if a_king_records:
    print(f"Found {len(a_king_records)} group(s) for {target_label}:")
    print("=" * 80)

    for group_idx, group_data in enumerate(a_king_records):
        key, records = group_data['key'], group_data['records']
        firstName, lastName, gender, name_group_id = key

        print(f"\nGroup {group_idx + 1}: {target_label} - name_group_id: {name_group_id}")
        print(f"Number of records: {len(records)}")

        # Get birth years for this group
        birth_years = {
            record.get('birthYear')
            for record in records
            if record.get('birthYear') is not None
        }
        print(f"Birth years: {sorted(birth_years)}")

        # Sort records by year and race ID for better readability
        sorted_records = sorted(
            records,
            key=lambda x: (_record_race_year(x, ''), x.get('raceId', ''))
        )

        print(f"\nRecords:")
        for i, record in enumerate(sorted_records, 1):
            print(f"  Record {i}:")
            print(f"    Year: {_record_race_year(record, 'N/A')}")
            for field_label, field_key in record_fields_to_print:
                print(f"    {field_label}: {record.get(field_key, 'N/A')}")
            print()

else:
    print(f"{target_label} not found in the split groups.")
    print("Searching for similar names...")

    # Search for similar names: same gender, names equal when case is ignored.
    # Comparing both parts case-insensitively also finds spellings where the
    # first AND the last name differ in case (e.g. "a king").
    similar_names = []
    for key, records in grouped_by_name_gender_split.items():
        firstName, lastName, gender, name_group_id = key

        if (gender == target_gender
                and str(firstName).lower() == target_first_name.lower()
                and str(lastName).lower() == target_last_name.lower()):
            similar_names.append({
                'key': key,
                'records': records
            })

    if similar_names:
        print(f"Found {len(similar_names)} similar name(s):")
        for group_data in similar_names:
            key, records = group_data['key'], group_data['records']
            firstName, lastName, gender, name_group_id = key
            print(f"  {firstName} {lastName} ({gender}) - name_group_id: {name_group_id} - {len(records)} records")
    else:
        print("No similar names found.")


In [None]:
# Filter out groups with only a single record from split groups
print("Filtering out single-record groups from split groups...")

# Splitting on birth-year gaps can leave groups holding one record; drop them
grouped_by_name_gender_split_final = {
    key: records
    for key, records in grouped_by_name_gender_split.items()
    if len(records) != 1
}

print(f"Split groups before filtering: {len(grouped_by_name_gender_split)}")
print(f"Split groups after filtering single records: {len(grouped_by_name_gender_split_final)}")
print(f"Removed {len(grouped_by_name_gender_split) - len(grouped_by_name_gender_split_final)} single-record groups")

# Show sample data
if grouped_by_name_gender_split_final:
    sample_key = list(grouped_by_name_gender_split_final.keys())[0]
    sample_records = grouped_by_name_gender_split_final[sample_key]
    firstName, lastName, gender, name_group_id = sample_key

    print(f"\nSample final split group: {firstName} {lastName} ({gender}) - name_group_id: {name_group_id}")
    print(f"  Number of records: {len(sample_records)}")

    # Get birth years for this group
    birth_years = {
        record.get('birthYear')
        for record in sample_records
        if record.get('birthYear') is not None
    }
    print(f"  Birth years: {sorted(birth_years)}")

    # Show years raced. Prefer a 'year' field when the record has one and fall
    # back to 'raceYear', the field this notebook adds when flattening records.
    years = sorted(set(
        record.get('year', record.get('raceYear', 'N/A'))
        for record in sample_records
    ))
    print(f"  Years raced: {years}")


In [None]:
# Drop all groups with 3 or more consecutive birth years
print("Dropping groups with 3 or more consecutive birth years...")


def _longest_consecutive_birth_year_run(birth_years):
    """Return (run, length) for the longest run of consecutive years.

    The years are de-duplicated and sorted first. On ties the earliest run is
    returned. Empty input yields ([], 0).
    """
    best_run = []
    current_run = []

    for year in sorted(set(birth_years)):
        if current_run and year == current_run[-1] + 1:
            current_run.append(year)
        else:
            # First year, or a gap: a new run starts here
            current_run = [year]

        # Strictly longer only, so the earliest of equally long runs is kept
        if len(current_run) > len(best_run):
            best_run = list(current_run)

    return best_run, len(best_run)


grouped_by_name_gender_clean_final = {}

for key, records in grouped_by_name_gender_split_final.items():
    firstName, lastName, gender, name_group_id = key

    # Get all birth years for this group
    birth_years = {
        record.get('birthYear')
        for record in records
        if record.get('birthYear') is not None
    }

    # Find longest consecutive run of birth years. The helper is defined in
    # this cell so the cell does not depend on a function that no other cell
    # of the notebook provides.
    longest_run, run_length = _longest_consecutive_birth_year_run(birth_years)

    # Skip if run is 3 or more consecutive birth years
    if run_length >= 3:
        continue

    # Keep groups with runs of 2 or fewer consecutive birth years
    grouped_by_name_gender_clean_final[key] = records

print(f"Groups before dropping consecutive birth year runs: {len(grouped_by_name_gender_split_final)}")
print(f"Groups after dropping consecutive birth year runs: {len(grouped_by_name_gender_clean_final)}")
print(f"Dropped {len(grouped_by_name_gender_split_final) - len(grouped_by_name_gender_clean_final)} groups with 3+ consecutive birth years")

# Show sample data
if grouped_by_name_gender_clean_final:
    sample_key = list(grouped_by_name_gender_clean_final.keys())[0]
    sample_records = grouped_by_name_gender_clean_final[sample_key]
    firstName, lastName, gender, name_group_id = sample_key

    print(f"\nSample clean group: {firstName} {lastName} ({gender}) - name_group_id: {name_group_id}")
    print(f"  Number of records: {len(sample_records)}")

    # Get birth years for this group
    birth_years = {
        record.get('birthYear')
        for record in sample_records
        if record.get('birthYear') is not None
    }
    print(f"  Birth years: {sorted(birth_years)}")

    # Show years raced. Prefer a 'year' field when the record has one and fall
    # back to 'raceYear', the field this notebook adds when flattening records.
    years = sorted(set(
        record.get('year', record.get('raceYear', 'N/A'))
        for record in sample_records
    ))
    print(f"  Years raced: {years}")


In [None]:
# Persist the final groups to a timestamped pickle in the working directory.
# The prefix is used verbatim, so the timestamp follows "birth_years" directly
# with no separator in between.
save_to_pickle_with_timestamp(grouped_by_name_gender_clean_final, "all_runners_grouped_by_name_gender_age_18_and_up_dropping_overlapping_groups_with_3_or_more_consecutive_birth_years")