In [14]:
# Import necessary libraries
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

## 1. Data Loading

In [15]:
# Load the men's and women's world-record tables from their comma-separated files
men_records, women_records = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ("data/men_world_records.csv", "data/women_world_records.csv")
)

## 2. Data Preparation

In [16]:
# Restrict women's records to the disciplines that also appear in the men's table
shared_disciplines = men_records['DISCIPLINE'].unique()
women_records = women_records[women_records['DISCIPLINE'].isin(shared_disciplines)]

# Keep only discipline + performance, with a sex-specific performance column name
men_perf = men_records[['DISCIPLINE', 'PERF']].rename(columns={'PERF': 'PERF_MEN'})
women_perf = women_records[['DISCIPLINE', 'PERF']].rename(columns={'PERF': 'PERF_WOMEN'})

# Collapse each table to one row per discipline BEFORE merging.
# Merging first would pair every men's row with every women's row of the same
# discipline (a Cartesian product) only to throw almost all of it away again.
# groupby(...).last() keeps the last non-null PERF per discipline and sorts by
# discipline, exactly as the post-merge collapse did.
men_latest = men_perf.groupby('DISCIPLINE').last().reset_index()
women_latest = women_perf.groupby('DISCIPLINE').last().reset_index()

# One row per discipline present in both tables; validate= guards the
# one-row-per-discipline assumption the rest of the notebook relies on.
df = pd.merge(
    men_latest,
    women_latest,
    on='DISCIPLINE',
    how='inner',
    validate='one_to_one',
)

## 3. Time Conversion and Performance Gap Calculation

In [17]:
# Disciplines whose PERF value is a distance or a points score, not a clock time.
# 'Heptathlon' is included alongside 'Decathlon': both are points-scored combined
# events, and a points value would make the time parser raise ValueError.
not_time = [
    'Decathlon', 'Discus Throw', 'Hammer Throw', 'Heptathlon', 'High Jump',
    'Javelin Throw', 'Long Jump', 'One Hour', 'Pole Vault', 'Shot Put',
    'Triple Jump',
]

# 1 when the discipline is timed, 0 otherwise
df['IS_TIME'] = (~df['DISCIPLINE'].isin(not_time)).astype('int64')

def convert_to_centiseconds(time_str):
    """Convert a clock-style time string into whole centiseconds.

    The string is tried against three layouts, most specific first:
    hours:minutes:seconds, minutes:seconds, then bare seconds. A fractional
    seconds part is expected exactly when the string contains a '.'.

    Args:
        time_str: Time as text, e.g. '9.58', '1:40.91', '57:31' or '2:00:35'.

    Returns:
        The duration in hundredths of a second, as an int. Fractions finer
        than a centisecond are truncated.

    Raises:
        ValueError: If the string matches none of the layouts (raised by the
            final, bare-seconds parsing attempt).
    """
    has_fraction = '.' in time_str
    suffix = '.%f' if has_fraction else ''
    layouts = ['%H:%M:%S' + suffix, '%M:%S' + suffix, '%S' + suffix]

    parsed = None
    for layout in layouts[:-1]:
        try:
            parsed = datetime.strptime(time_str, layout)
            break
        except ValueError:
            continue
    if parsed is None:
        # The last layout is deliberately unguarded so its ValueError
        # propagates to the caller when nothing matches.
        parsed = datetime.strptime(time_str, layouts[-1])

    whole_seconds = parsed.hour * 3600 + parsed.minute * 60 + parsed.second
    centiseconds = whole_seconds * 100
    if has_fraction:
        # microsecond -> centisecond, truncating
        centiseconds += int(parsed.microsecond / 10000)
    return centiseconds

def _gap_percent(is_time, perf_men, perf_women):
    """Return the absolute men/women performance gap as a percentage.

    Timed events are parsed to centiseconds and the gap is expressed relative
    to the men's time; all other events are read as floats and the gap is
    expressed relative to the women's mark.
    """
    # NOTE(review): the two branches use different denominators (men's value
    # for times, women's value otherwise) - confirm this is intentional.
    if is_time == 1:
        men_cs = convert_to_centiseconds(perf_men)
        women_cs = convert_to_centiseconds(perf_women)
        return (abs(men_cs - women_cs) / men_cs) * 100

    men_mark = float(perf_men)
    women_mark = float(perf_women)
    return (abs(men_mark - women_mark) / women_mark) * 100


# One gap value per discipline, in the row order of df
perf_gap_percent_list = [
    _gap_percent(record.IS_TIME, record.PERF_MEN, record.PERF_WOMEN)
    for record in df.itertuples()
]

df['PERF_GAP_PERCENT'] = perf_gap_percent_list

## 4. Categorization of Disciplines

In [18]:
# Define discipline categories
Sprints = ["100 Metres", "200 Metres", "400 Metres"]
Middle_Long = ["800 Metres", "1500 Metres", "5000 Metres", "10000 Metres", "3000 Metres Steeplechase"]
Hurdles = ["400 Metres Hurdles"]
Road = ["Half Marathon", "Marathon"]
Jumps = ["High Jump", "Pole Vault", "Long Jump", "Triple Jump"]
Throws = ["Shot Put", "Discus Throw", "Hammer Throw", "Javelin Throw"]
Combined = ["Decathlon", "Heptathlon"]
Walks = ["20 Kilometres Walk", "50 Kilometres Walk"]
Relays = ["4x100 Metres Relay", "4x400 Metres Relay"]

# Category label -> member disciplines, listed in precedence order
category_members = {
    'Sprints': Sprints,
    'Middle-Long Distance': Middle_Long,
    'Hurdles': Hurdles,
    'Road': Road,
    'Jumps': Jumps,
    'Throws': Throws,
    'Combined Events': Combined,
    'Walks': Walks,
    'Relays': Relays,
}

# Invert to discipline -> label; setdefault keeps the first label seen, so a
# discipline listed in several groups resolves to the earliest one.
discipline_to_category = {}
for label, members in category_members.items():
    for name in members:
        discipline_to_category.setdefault(name, label)

# Label every row in one pass; anything not listed above becomes 'Other'.
# Assigning the whole column also creates CATEGORY when df has no rows.
df['CATEGORY'] = df['DISCIPLINE'].map(
    lambda name: discipline_to_category.get(name, 'Other')
)