In [130]:
import json
import numpy as np

# Q1: What's the average length of a stage, and how does it differ between the different tours over the years?

For the first question I wanted to get both the average and the median stage distance for every year of each tour.

In [131]:
def get_grand_tour_distances(grand_tour):
    """Load the stage distances of one grand tour, grouped per year.

    Reads ``data/{grand_tour}.json`` and looks up the top-level key named
    ``grand_tour``. That entry is iterated as a mapping of year -> list of
    stage records, and the ``'distance'`` value of every record is collected.

    Args:
        grand_tour: Name of the tour; used both as the file stem and as the
            top-level key inside the JSON document.

    Returns:
        A list with one dict per year, in file order:
        ``{'year': <year key>, 'distances': [<distance>, ...]}``.

    Raises:
        FileNotFoundError: If ``data/{grand_tour}.json`` does not exist.
        KeyError: If the top-level key or a ``'distance'`` entry is missing.
    """
    # JSON is UTF-8 by specification; be explicit so the read does not depend
    # on the platform's default text encoding.
    with open(f'data/{grand_tour}.json', 'r', encoding='utf-8') as file:
        tour_json = json.load(file)

    # one dict per year, holding the year and the distances of that year
    distances_per_year = [
        {'year': year, 'distances': [stage['distance'] for stage in stages]}
        for year, stages in tour_json[grand_tour].items()
    ]

    return distances_per_year

def get_mean_average_per_year(tour_dict):
    """Add the median and average stage distance to every year entry, in place.

    Args:
        tour_dict: List of per-year dicts, each holding a ``'distances'``
            sequence of numbers. The dicts are mutated.

    Returns:
        None. Every entry gains three keys:
        ``'median'``  - ``np.median`` of the distances,
        ``'mean'``    - the same median value (legacy key, see note below),
        ``'average'`` - ``np.average`` of the distances.
    """
    for year in tour_dict:
        median = np.median(year['distances'])
        year.update({
            # NOTE: despite its name, 'mean' has always stored the MEDIAN.
            # It is kept so existing readers of that key keep working;
            # new code should read 'median' instead.
            'mean': median,
            'median': median,
            'average': np.average(year['distances']),
        })

# Load the three grand tours and attach the per-year statistics to each.
tours = ['tour_de_france', 'giro_ditalia', 'vuelta_a_espagna']
france_dict, italia_dict, espagna_dict = [get_grand_tour_distances(name) for name in tours]
for tour_years in (france_dict, italia_dict, espagna_dict):
    # mutates the per-year dicts in place; nothing is returned
    get_mean_average_per_year(tour_years)

# Print the median and average stage distance of every year present in the data.
# The 'mean' key is the one get_mean_average_per_year fills with np.median,
# which is why it is printed under the "Median" label.
headings = ('Tour de France', "\nGiro d'Italia", '\nVuelta a Espagna')
for heading, tour_years in zip(headings, (france_dict, italia_dict, espagna_dict)):
    print(heading)
    for year in tour_years:
        # Double-quoted f-string: reusing the outer quote type inside the
        # replacement fields is a SyntaxError before Python 3.12.
        print(f"Year: {year['year']} - Median: {round(year['mean'], 2)} - Average: {round(year['average'], 2)}")

Tour de France
Year: 1906 - Median: 338.0 - Average: 349.46
Year: 1911 - Median: 348.0 - Average: 356.27
Year: 1919 - Median: 364.0 - Average: 370.67
Year: 1924 - Median: 360.0 - Average: 361.67
Year: 1929 - Median: 207.0 - Average: 239.0
Year: 1934 - Median: 183.0 - Average: 186.09
Year: 1939 - Median: 186.5 - Average: 182.39
Year: 1947 - Median: 230.0 - Average: 220.48
Year: 1952 - Median: 214.0 - Average: 209.0
Year: 1957 - Median: 223.0 - Average: 215.48
Year: 1962 - Median: 202.5 - Average: 191.05
Year: 1967 - Median: 209.0 - Average: 213.95
Year: 1972 - Median: 205.0 - Average: 185.53
Year: 1977 - Median: 193.0 - Average: 194.47
Year: 1982 - Median: 201.0 - Average: 180.53
Year: 1987 - Median: 196.5 - Average: 180.85
Year: 1992 - Median: 208.0 - Average: 202.28
Year: 1997 - Median: 194.0 - Average: 200.74
Year: 2002 - Median: 176.0 - Average: 161.79
Year: 2007 - Median: 190.5 - Average: 181.44
Year: 2012 - Median: 196.5 - Average: 174.15
Year: 2017 - Median: 187.0 - Average: 184.

# Q2: Which rider has won each jersey the most times in the Tour de France?

I wanted to know who won each jersey the most times in the Tour de France, so I scraped the pages and added up the winners

In [132]:
def get_yellow_data(type):
    """Find the rider(s) with the most wins in a list-of-records jersey table.

    Reads ``data/jerseys.json`` and uses the first element of the list stored
    under ``type``. That element is iterated as rider records, each with a
    ``'name'`` and a ``'jerseys'`` count (converted with ``int``).

    Args:
        type: Top-level key in the JSON file (e.g. ``'yellow'``). The name
            shadows the builtin but is kept for backward compatibility.

    Returns:
        ``{type: (max_wins, [names of all riders with max_wins])}``.
        For an empty record list the result is ``{type: (0, [])}``.
    """
    # Rider names contain non-ASCII characters, so do not rely on the
    # platform's default encoding.
    with open('data/jerseys.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
    type_data = data[type][0]

    # Convert every count once and reuse it for both the maximum and the filter.
    wins_by_rider = [(rider['name'], int(rider['jerseys'])) for rider in type_data]
    type_max = max((wins for _, wins in wins_by_rider), default=0)
    top = [name for name, wins in wins_by_rider if wins == type_max]

    return {type: (type_max, top)}

def get_jersey_data(type):
    """Find the rider(s) with the most wins in a name -> count jersey table.

    Reads ``data/jerseys.json`` and uses the first element of the list stored
    under ``type``. That element is treated as a mapping of rider name to
    win count; the counts are compared as stored (no conversion).

    Args:
        type: Top-level key in the JSON file (e.g. ``'white'``, ``'dotted'``,
            ``'green'``). The name shadows the builtin but is kept for
            backward compatibility.

    Returns:
        ``{type: (max_wins, [names of all riders with max_wins])}``.
        For an empty mapping the result is ``{type: (0, [])}``.
    """
    # Rider names contain non-ASCII characters, so do not rely on the
    # platform's default encoding.
    with open('data/jerseys.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
    type_data = data[type][0]

    # default=0 keeps an empty table from raising ValueError in max()
    type_max = max(type_data.values(), default=0)
    top = [name for name, count in type_data.items() if count == type_max]

    return {type: (type_max, top)}

# Get yellow data separately: it is read through get_yellow_data, which
# expects rider records instead of a name -> count mapping.
results = {'Yellow': get_yellow_data('yellow')['yellow']}

# Get other jersey data.
# The loop variable is deliberately NOT called `type`: at notebook scope that
# would rebind the builtin for every cell executed afterwards.
for jersey_key, label in [('white', 'White'), ('dotted', 'Polka Dot'), ('green', 'Green')]:
    results[label] = get_jersey_data(jersey_key)[jersey_key]

# One heading per jersey, followed by all riders sharing the record.
for jersey, (count, riders) in results.items():
    print(f"\n{jersey} Jersey - {count} wins:")
    print(', '.join(riders))


Yellow Jersey - 5 wins:
Eddy Merckx, Bernard Hinault, Miguel Induráin, Jacques Anquetil

White Jersey - 4 wins:
Tadej Pogačar(SLO)

Polka Dot Jersey - 7 wins:
Richard Virenque

Green Jersey - 7 wins:
Peter Sagan(SVK)


# Q3: Which of the current riders switch teams most often, and which are the most loyal?

Here I wanted to see which riders stay with a team the longest and which switch teams very often.

In [133]:
# Rider names contain non-ASCII characters, so read the file explicitly as UTF-8.
with open('data/rider_teams.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten all rider groups into one list and calculate the stats per rider
rider_stats = [
    {
        'name': rider['name'],
        'unique_teams': len(set(rider['teams'])),
        'total_entries': len(rider['teams']),
        'teams': rider['teams'],
    }
    for group in data['rider']
    for rider in group
]


def _print_ranking(title, riders):
    """Print `title` followed by a numbered line of team stats per rider."""
    print(title)
    for i, r in enumerate(riders, 1):
        # A rider without any team entry has 0 unique teams; avoid dividing by zero.
        ratio = r['total_entries'] / r['unique_teams'] if r['unique_teams'] else 0.0
        print(f"{i}. {r['name']}: {r['unique_teams']} teams ({r['total_entries']} entries, {ratio:.1f}x/team)")


# Most switches (most unique teams)
most_switches = sorted(rider_stats, key=lambda x: x['unique_teams'], reverse=True)[:10]

# Most loyal (fewest unique teams, min 5 entries)
most_loyal = sorted(
    [r for r in rider_stats if r['total_entries'] >= 5],
    key=lambda x: x['unique_teams']
)[:10]

_print_ranking("\nTOP 10 TEAM SWITCHERS:", most_switches)
_print_ranking("\nTOP 10 MOST LOYAL (min 5 entries):", most_loyal)

# Summary (guarded so an empty dataset prints zeros instead of raising)
rider_count = len(rider_stats)
avg_unique = sum(r['unique_teams'] for r in rider_stats) / rider_count if rider_count else 0.0
avg_entries = sum(r['total_entries'] for r in rider_stats) / rider_count if rider_count else 0.0
avg_ratio = avg_entries / avg_unique if avg_unique else 0.0
print(f"\nAvg: {avg_unique:.1f} teams, {avg_entries:.1f} entries, {avg_ratio:.1f}x/team")


TOP 10 TEAM SWITCHERS:
1. DEGENKOLB John: 15 teams (21 entries, 1.4x/team)
2. TEUNS Dylan: 15 teams (20 entries, 1.3x/team)
3. BETTIOL Alberto: 14 teams (19 entries, 1.4x/team)
4. KAMP Alexander: 14 teams (17 entries, 1.2x/team)
5. BARGUIL Warren: 13 teams (19 entries, 1.5x/team)
6. TRATNIK Jan: 13 teams (19 entries, 1.5x/team)
7. VALGREN Michael: 13 teams (18 entries, 1.4x/team)
8. DELAPLACE Anthony: 13 teams (19 entries, 1.5x/team)
9. POELS Wout: 12 teams (21 entries, 1.8x/team)
10. BALLERINI Davide: 12 teams (14 entries, 1.2x/team)

TOP 10 MOST LOYAL (min 5 entries):
1. RUBIO Einer: 2 teams (8 entries, 4.0x/team)
2. PEDRERO Antonio: 2 teams (11 entries, 5.5x/team)
3. ARCAS Jorge: 2 teams (13 entries, 6.5x/team)
4. WITHEN PHILIPSEN Albert: 2 teams (6 entries, 3.0x/team)
5. RODRÍGUEZ Carlos: 2 teams (10 entries, 5.0x/team)
6. TARLING Joshua: 2 teams (7 entries, 3.5x/team)
7. AUGUST Andrew: 2 teams (5 entries, 2.5x/team)
8. BELOKI Markel: 2 teams (5 entries, 2.5x/team)
9. BUITRAGO San

# Q4: How has the average age of the Tour winner evolved over the years?



In [134]:
import json
import numpy as np

# The file contains non-ASCII rider names, so read it explicitly as UTF-8.
with open('data/ages.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# NOTE(review): only the first element of data['victors'] is used, and the
# recorded output lists a single winner - confirm whether the remaining
# elements hold further winners that should be included.
victors = data['victors'][0]

# Calculate ages as a plain year difference (the birth date within the year
# is not taken into account).
all_ages = []
for victor in victors:
    # skip winners with a missing or empty birth year
    if victor.get('birthyear'):
        year = int(victor['year'])
        age = year - int(victor['birthyear'])
        # The scraped name arrives with digits fused to its front
        # ("1POGAČAR Tadej" in the recorded output) - presumably the
        # finishing position. Strip them for display.
        rider = victor['rider'].lstrip('0123456789')
        all_ages.append((year, age, rider))

# Sort by age, youngest first
sorted_ages = sorted(all_ages, key=lambda x: x[1])

# Print youngest and oldest
print("YOUNGEST WINNERS:")
for year, age, rider in sorted_ages[:5]:
    print(f"{rider}: {age} years ({year})")

print("\nOLDEST WINNERS:")
# reversed so the oldest winner is listed first
for year, age, rider in reversed(sorted_ages[-5:]):
    print(f"{rider}: {age} years ({year})")

# Overall statistics
ages_only = [age for _, age, _ in all_ages]
print("\nOVERALL STATISTICS:")
if ages_only:
    print(f"Mean age: {np.mean(ages_only):.1f} years")
    print(f"Median age: {np.median(ages_only):.1f} years")
    print(f"Range: {min(ages_only)}-{max(ages_only)} years")
else:
    # min()/max() raise on an empty list; report the situation instead
    print("No winners with a known birth year.")

YOUNGEST WINNERS:
1POGAČAR Tadej: 27 years (2025)

OLDEST WINNERS:
1POGAČAR Tadej: 27 years (2025)

OVERALL STATISTICS:
Mean age: 27.0 years
Median age: 27.0 years
Range: 27-27 years


# Q5: How have the nationalities on the podium evolved over the years?

In [135]:
import json
from collections import Counter, defaultdict

# Read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open('data/ages.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

country_data = data['country'][1]  # Skip null at index 0

# Map country codes to full names
country_names = {
    'fr': 'France', 'be': 'Belgium', 'it': 'Italy', 'es': 'Spain', 'nl': 'Netherlands',
    'gb': 'Great Britain', 'us': 'USA', 'lu': 'Luxembourg', 'ch': 'Switzerland',
    'de': 'Germany', 'ie': 'Ireland', 'au': 'Australia', 'dk': 'Denmark',
    'co': 'Colombia', 'si': 'Slovenia', 'ec': 'Ecuador', 'at': 'Austria',
    'pt': 'Portugal', 'kz': 'Kazakhstan', 'lt': 'Lithuania', 'lv': 'Latvia',
    'pl': 'Poland', 'ru': 'Russia', 'se': 'Sweden'
}

# Analyze by decade: per decade, one country name and one year are recorded
# for every podium place.
decades = defaultdict(lambda: {'countries': [], 'years': []})

for entry in country_data:
    year = int(entry['year'])
    decade = (year // 10) * 10  # e.g. 1987 -> 1980

    for country_code in entry['countries']:
        # unknown codes fall back to the upper-cased code itself
        country = country_names.get(country_code, country_code.upper())
        decades[decade]['countries'].append(country)
        decades[decade]['years'].append(year)

# Display results by decade
print("PODIUM NATIONALITIES BY DECADE:\n")
for decade in sorted(decades):
    podium_countries = decades[decade]['countries']
    counts = Counter(podium_countries)
    total_podiums = len(podium_countries)
    # 'years' holds one entry per podium place, so de-duplicate to count Tours
    tour_count = len(set(decades[decade]['years']))

    print(f"{decade}s ({tour_count} Tours):")
    for country, count in counts.most_common(5):
        pct = (count / total_podiums) * 100
        print(f"  {country}: {count} ({pct:.1f}%)")
    print()

# Era comparisons
# TODO: `eras` is defined but not used yet - the era comparison is unfinished.
eras = {
    'Early (1903-1939)': list(range(1903, 1940)),
    'Post-War (1947-1970)': list(range(1947, 1971)),
    'Modern (1971-2000)': list(range(1971, 2001)),
    'Recent (2001-2025)': list(range(2001, 2026))
}

PODIUM NATIONALITIES BY DECADE:

1900s (5 Tours):
  France: 13 (86.7%)
  Luxembourg: 2 (13.3%)

1910s (6 Tours):
  France: 12 (66.7%)
  Belgium: 5 (27.8%)
  Luxembourg: 1 (5.6%)

1920s (10 Tours):
  Belgium: 15 (50.0%)
  Italy: 6 (20.0%)
  France: 5 (16.7%)
  Luxembourg: 4 (13.3%)

1930s (10 Tours):
  France: 11 (36.7%)
  Italy: 9 (30.0%)
  Belgium: 8 (26.7%)
  Switzerland: 1 (3.3%)
  Germany: 1 (3.3%)

1940s (3 Tours):
  Italy: 4 (44.4%)
  France: 4 (44.4%)
  Belgium: 1 (11.1%)

1950s (10 Tours):
  France: 13 (43.3%)
  Belgium: 5 (16.7%)
  Switzerland: 4 (13.3%)
  Italy: 3 (10.0%)
  Spain: 2 (6.7%)

1960s (10 Tours):
  France: 12 (40.0%)
  Italy: 6 (20.0%)
  Belgium: 5 (16.7%)
  Spain: 4 (13.3%)
  Netherlands: 2 (6.7%)

1970s (10 Tours):
  Belgium: 9 (30.0%)
  France: 8 (26.7%)
  Netherlands: 6 (20.0%)
  Spain: 3 (10.0%)
  Portugal: 2 (6.7%)

1980s (10 Tours):
  France: 11 (36.7%)
  Netherlands: 6 (20.0%)
  USA: 4 (13.3%)
  Spain: 4 (13.3%)
  Ireland: 2 (6.7%)

1990s (10 Tours):
  Ita

Q1: The average stage distance went down over the years.

Q2: 
Yellow Jersey - 5 wins:
Eddy Merckx, Bernard Hinault, Miguel Induráin, Jacques Anquetil

White Jersey - 4 wins:
Tadej Pogačar(SLO)

Polka Dot Jersey - 7 wins:
Richard Virenque

Green Jersey - 7 wins:
Peter Sagan(SVK)

Q3: 
TOP 10 TEAM SWITCHERS:
1. DEGENKOLB John: 15 teams (21 entries, 1.4x/team)
2. TEUNS Dylan: 15 teams (20 entries, 1.3x/team)
3. BETTIOL Alberto: 14 teams (19 entries, 1.4x/team)
4. KAMP Alexander: 14 teams (17 entries, 1.2x/team)
5. BARGUIL Warren: 13 teams (19 entries, 1.5x/team)
6. TRATNIK Jan: 13 teams (19 entries, 1.5x/team)
7. VALGREN Michael: 13 teams (18 entries, 1.4x/team)
8. DELAPLACE Anthony: 13 teams (19 entries, 1.5x/team)
9. POELS Wout: 12 teams (21 entries, 1.8x/team)
10. BALLERINI Davide: 12 teams (14 entries, 1.2x/team)

TOP 10 MOST LOYAL (min 5 entries):
1. RUBIO Einer: 2 teams (8 entries, 4.0x/team)
2. PEDRERO Antonio: 2 teams (11 entries, 5.5x/team)
3. ARCAS Jorge: 2 teams (13 entries, 6.5x/team)
4. WITHEN PHILIPSEN Albert: 2 teams (6 entries, 3.0x/team)
5. RODRÍGUEZ Carlos: 2 teams (10 entries, 5.0x/team)
6. TARLING Joshua: 2 teams (7 entries, 3.5x/team)
7. AUGUST Andrew: 2 teams (5 entries, 2.5x/team)
8. BELOKI Markel: 2 teams (5 entries, 2.5x/team)
9. BUITRAGO Santiago: 2 teams (7 entries, 3.5x/team)
10. BOVEN Lars: 2 teams (6 entries, 3.0x/team)

Avg: 6.6 teams, 11.6 entries, 1.8x/team

Q4: I wasn't able to finish this analysis.

Q5: 
PODIUM NATIONALITIES BY DECADE:

1900s (5 Tours):
  France: 13 (86.7%)
  Luxembourg: 2 (13.3%)

1910s (6 Tours):
  France: 12 (66.7%)
  Belgium: 5 (27.8%)
  Luxembourg: 1 (5.6%)

1920s (10 Tours):
  Belgium: 15 (50.0%)
  Italy: 6 (20.0%)
  France: 5 (16.7%)
  Luxembourg: 4 (13.3%)

1930s (10 Tours):
  France: 11 (36.7%)
  Italy: 9 (30.0%)
  Belgium: 8 (26.7%)
  Switzerland: 1 (3.3%)
  Germany: 1 (3.3%)

1940s (3 Tours):
  Italy: 4 (44.4%)
  France: 4 (44.4%)
  Belgium: 1 (11.1%)

1950s (10 Tours):
  France: 13 (43.3%)
  Belgium: 5 (16.7%)
  Switzerland: 4 (13.3%)
  Italy: 3 (10.0%)
  Spain: 2 (6.7%)

1960s (10 Tours):
  France: 12 (40.0%)
  Italy: 6 (20.0%)
  Belgium: 5 (16.7%)
  Spain: 4 (13.3%)
  Netherlands: 2 (6.7%)

1970s (10 Tours):
  Belgium: 9 (30.0%)
  France: 8 (26.7%)
  Netherlands: 6 (20.0%)
  Spain: 3 (10.0%)
  Portugal: 2 (6.7%)

1980s (10 Tours):
  France: 11 (36.7%)
  Netherlands: 6 (20.0%)
  USA: 4 (13.3%)
  Spain: 4 (13.3%)
  Ireland: 2 (6.7%)

1990s (10 Tours):
  Italy: 8 (26.7%)
  Spain: 6 (20.0%)
  USA: 3 (10.0%)
  Switzerland: 3 (10.0%)
  Germany: 3 (10.0%)

2000s (10 Tours):
  Spain: 8 (26.7%)
  USA: 8 (26.7%)
  Germany: 6 (20.0%)
  Australia: 2 (6.7%)
  Italy: 2 (6.7%)

2010s (10 Tours):
  Great Britain: 9 (30.0%)
  Colombia: 5 (16.7%)
  France: 4 (13.3%)
  Spain: 3 (10.0%)
  Luxembourg: 3 (10.0%)

2020s (6 Tours):
  Slovenia: 7 (38.9%)
  Denmark: 5 (27.8%)
  Great Britain: 2 (11.1%)
  Germany: 1 (5.6%)
  Belgium: 1 (5.6%)


AI use: 
Scraping: mostly done by myself.
Analysis notebook: written partly with AI and partly by myself.