In [1]:

# SEAG qualification period is 22 Oct 2024 to 5 Sept 2025

#1. Segregate into Male and Femal 
#2. For each gender perform the following: 
#a. Sort data by mapped eent, then perf scalar (higher the better)
#b. Identify tiers based on performance - Tier 1 is meets bronze medal mark for SEAG, Tier 2 is 2% and Tier 3 is 3.5%
#c. Check - if athlete met bronze or 2%/3.5% then delta_benchmark is zero or +, delta2% is + and delta 3.5% is +
#d. Top ranked athletes for each event are chosen. Max number of athletes for each event is 3, except for 100m/400m which is 6
#    This includes athletes on spex scholarship and potential
#e. The max for each tier is 2. Lower ranked athletes move down one tier.
#3. If athlete qualifies for more than one event the higher tier event is given
#4. Jump and throws junior program to be solved separately

%load_ext autoreload
%autoreload 2

In [690]:
# Import usual modules
import pandas as pd
import csv
import math
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import openpyxl
import datetime
from scipy.stats import lognorm
import re
import string
from bs4 import BeautifulSoup
import requests
import unicodedata # for removing accented characters
import datetime
import icecream as ic
import dateutil.parser as parser 
import datacompy
import pytz

from google.cloud import storage



In [679]:
# PRODUCTION ENVIRONMENT
# Extract timed event records

import pandas_gbq
from google.oauth2 import service_account

credentials = service_account.Credentials.from_service_account_file(
    '/Users/veesheenyuen/Desktop/DataScience/Keys/saa-analytics-7c8937b70609.json',
    
    
)

sql1="""
SELECT NAME, RESULT, TEAM, AGE, RANK AS COMPETITION_RANK, DIVISION, EVENT, DISTANCE, EVENT_CLASS, UNIQUE_ID, DOB, NATIONALITY, WIND, CATEGORY_EVENT, GENDER, COMPETITION, DATE, YEAR, REGION, TIMESTAMP
FROM `saa-analytics.results.PRODUCTION` 
WHERE RESULT!='NM' AND RESULT!='-' AND RESULT!='DNS' AND RESULT!='DNF' AND RESULT!='DNQ' AND RESULT!='DQ' AND RESULT IS NOT NULL

"""

competitors = pandas_gbq.read_gbq(sql1, project_id="saa-analytics", credentials=credentials)




Downloading: 100%|[32m██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m|[0m


In [680]:
competitors

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,DOB,NATIONALITY,WIND,CATEGORY_EVENT,GENDER,COMPETITION,DATE,YEAR,REGION,TIMESTAMP
0,Shanti Veronica Pereira,11.71,,,7.0,,100m,,,,20-Sep-96,SGP,-1.8,Sprint,Female,AtleticaGenève,2025-06-21 00:00:00+00:00,2025,International,2025-06-29 22:00:26.182128
1,Shanti Veronica Pereira,23.16w,,,7.0,,200m,,,,20-Sep-96,SGP,2.4,Sprint,Female,AtleticaGenève,2025-06-21 00:00:00+00:00,2025,International,2025-06-29 22:00:26.182128
2,Jun Yu Low,4.77,,,5.0,,Pole Vault,,,,21-Apr-01,SGP,,Jump,Male,Jan Dietvorst Memorial,2025-06-21 00:00:00+00:00,2025,International,2025-06-29 22:00:26.182128
3,Tung Hon Andrew Pak,2.05,,,,,High Jump,,,,10-Apr-02,SGP,,Jump,Male,National Championships,2025-06-25 00:00:00+00:00,2025,International,2025-06-29 22:00:26.182128
4,Tam Jong Hng,1.95,,,,,High Jump,,,,,SGP,,Jump,Male,National Championships,2025-06-25 00:00:00+00:00,2025,International,2025-06-29 22:00:26.182128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31280,Charlene Lee,8.90,NYGH,,15,C,Triple Jump,,,,,,,Jump,Female,National School Games,2024-04-01 00:00:00+00:00,2024,Local,
31281,"Ng Jing Wen, Ariel",10.33,RGS,,2,C,Triple Jump,,,,,,,Jump,Female,National School Games,2024-04-01 00:00:00+00:00,2024,Local,
31282,LEE GABRIEL JIN YI,14.5,SINGAPORE,21,3,,Triple jump,,,,2003,SGP,NWI,Jump,Male,Victor Saneev Memorial (Georgia),2024-10-26 00:00:00+00:00,2024,International,
31283,"SYED AHMED RIADH, SHARIFAH FALISHA",11.13,SINGAPORE SPORTS SCHOOL,18,,,Triple jump,,,,2006,SGP,NWI,Jump,Female,FTKLAA State Meet,2024-03-03 00:00:00+00:00,2024,International,


In [681]:
competitors[competitors['COMPETITION']=='Box Hill Burn']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,DOB,NATIONALITY,WIND,CATEGORY_EVENT,GENDER,COMPETITION,DATE,YEAR,REGION,TIMESTAMP
43,Jayden Tan,02:31.1,,,5,,1000m,,,,24-Jun-06,SGP,,Mid,Male,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,
44,Romaine Soh,02:56.0,,,7,,1000m,,,,26-Nov-94,SGP,,Mid,Female,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,
7462,Shaun Goh,14:57.4,,,3,,5000m,,,,12-Jan-97,SGP,,Long,Male,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,
7463,Vanessa Lee Ying Zhuang,17:06.7,,,11,,5000m,,,,23-Feb-98,SGP,,Long,Female,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,
7464,Nicole Low,17:21.3,,,13,,5000m,,,,19-Jun-98,SGP,,Long,Female,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,


In [682]:
os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/SEAG/')


competitors.to_csv('database_prod.csv', sep=',', encoding='utf-8-sig', index=False)

In [683]:
competitors

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,DOB,NATIONALITY,WIND,CATEGORY_EVENT,GENDER,COMPETITION,DATE,YEAR,REGION,TIMESTAMP
0,Shanti Veronica Pereira,11.71,,,7.0,,100m,,,,20-Sep-96,SGP,-1.8,Sprint,Female,AtleticaGenève,2025-06-21 00:00:00+00:00,2025,International,2025-06-29 22:00:26.182128
1,Shanti Veronica Pereira,23.16w,,,7.0,,200m,,,,20-Sep-96,SGP,2.4,Sprint,Female,AtleticaGenève,2025-06-21 00:00:00+00:00,2025,International,2025-06-29 22:00:26.182128
2,Jun Yu Low,4.77,,,5.0,,Pole Vault,,,,21-Apr-01,SGP,,Jump,Male,Jan Dietvorst Memorial,2025-06-21 00:00:00+00:00,2025,International,2025-06-29 22:00:26.182128
3,Tung Hon Andrew Pak,2.05,,,,,High Jump,,,,10-Apr-02,SGP,,Jump,Male,National Championships,2025-06-25 00:00:00+00:00,2025,International,2025-06-29 22:00:26.182128
4,Tam Jong Hng,1.95,,,,,High Jump,,,,,SGP,,Jump,Male,National Championships,2025-06-25 00:00:00+00:00,2025,International,2025-06-29 22:00:26.182128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31280,Charlene Lee,8.90,NYGH,,15,C,Triple Jump,,,,,,,Jump,Female,National School Games,2024-04-01 00:00:00+00:00,2024,Local,
31281,"Ng Jing Wen, Ariel",10.33,RGS,,2,C,Triple Jump,,,,,,,Jump,Female,National School Games,2024-04-01 00:00:00+00:00,2024,Local,
31282,LEE GABRIEL JIN YI,14.5,SINGAPORE,21,3,,Triple jump,,,,2003,SGP,NWI,Jump,Male,Victor Saneev Memorial (Georgia),2024-10-26 00:00:00+00:00,2024,International,
31283,"SYED AHMED RIADH, SHARIFAH FALISHA",11.13,SINGAPORE SPORTS SCHOOL,18,,,Triple jump,,,,2006,SGP,NWI,Jump,Female,FTKLAA State Meet,2024-03-03 00:00:00+00:00,2024,International,


In [684]:
competitors[competitors['COMPETITION']=='Box Hill Burn']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,DOB,NATIONALITY,WIND,CATEGORY_EVENT,GENDER,COMPETITION,DATE,YEAR,REGION,TIMESTAMP
43,Jayden Tan,02:31.1,,,5,,1000m,,,,24-Jun-06,SGP,,Mid,Male,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,
44,Romaine Soh,02:56.0,,,7,,1000m,,,,26-Nov-94,SGP,,Mid,Female,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,
7462,Shaun Goh,14:57.4,,,3,,5000m,,,,12-Jan-97,SGP,,Long,Male,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,
7463,Vanessa Lee Ying Zhuang,17:06.7,,,11,,5000m,,,,23-Feb-98,SGP,,Long,Female,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,
7464,Nicole Low,17:21.3,,,13,,5000m,,,,19-Jun-98,SGP,,Long,Female,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,


In [685]:
'''
# Create mm/dd/yy column for date of event

import datetime


#datetime.datetime.fromtimestamp(s).strftime('%M:%S.%f')

for i in range(len(competitors)):
        
    rowIndex = competitors.index[i]

    date = competitors.loc[rowIndex,'DATE']
    year = competitors.loc[rowIndex,'YEAR']    
    
    if 'to' in date or ' - ' in date:
        
        if re.search('to|\s\-\s\d\s|\s\-\d\d', date):  # e.g. 03-04
              
            pos = re.search('to|\s\-\s\d', date)
            # Splice string to day and month

            split_pos_start=pos.start()+3
            


            final_date = date[split_pos_start:] # left string post splicing

            print(i, pos, date, final_date)
            final_year = year[2:]

            event_date = final_date + '/' + final_year

            print('old code', date, final_date, event_date)

            competitors.loc[rowIndex, 'event_date'] = event_date

        elif re.search('(\-\s\d\w)|(\-\s\d\d\w)', date):  # e.g. 18 - 19 January
                        
            pos = re.search('\-', date)  # from '-' onwards only
            # Splice string to day and month

            split_pos_start=pos.start()+2
            

            final_date = date[split_pos_start:] # left string post splicing

            
            final_year = year[2:]

            event_date = final_date + ' ' + final_year


            competitors.loc[rowIndex, 'event_date'] = event_date
            
        
    elif re.search('\w\-\w', date):
        
        if competitors.loc[rowIndex, 'COMPETITION'] == "National School Games":
            
            if competitors.loc[rowIndex, 'YEAR'] == '2024':
        
                event_date = '04'+'/'+date[1:3] + '/' + year[2:]  # reverse order from dd/mm to mm/dd. 04 because event was in April 24 only
            
       #         print('NSG 2024', event_date)
        
                competitors.loc[rowIndex, 'event_date'] = event_date
            
            elif competitors.loc[rowIndex, 'YEAR'] == '2025':
                
                event_date = date + '-' +year[2:]
                
        #        print('NSG2025', event_date)
                
                competitors.loc[rowIndex, 'event_date'] = event_date
                
        elif re.search('\d\-\d',  date):        #10-13 April
            
            print('HERE', i, date)

            rpos = re.search('\-', date)
            string = date[rpos.end():]
            
            print('extracted date', string)
            
            event_date = string + ' ' + year
            
            print('event date', event_date)
            
            competitors.loc[rowIndex, 'event_date'] = event_date

            
        
        else:
            
            event_date = date + '-' + year[2:]
            
            competitors.loc[rowIndex, 'event_date'] = event_date

    else:   # NEW
            
            event_date = date + '-' + year[2:]  # NEW
            
            competitors.loc[rowIndex, 'event_date'] = event_date  #NEW

        
competitors['event_date'] = competitors['event_date'].astype(str)
competitors['event_date'] = competitors['event_date'].str.replace('\xa0', ' ', regex=True)
competitors['event_date'] = competitors['event_date'].str.replace('[\x00-\x1f\x7f-\x9f]', '', regex=True)
competitors['event_date'] = competitors['event_date'].str.replace('\r', ' ', regex=True)
competitors['event_date'] = competitors['event_date'].str.replace('\n', ' ', regex=True)
competitors['event_date'] = competitors['event_date'].str.strip()
       
'''    

'\n# Create mm/dd/yy column for date of event\n\nimport datetime\n\n\n#datetime.datetime.fromtimestamp(s).strftime(\'%M:%S.%f\')\n\nfor i in range(len(competitors)):\n        \n    rowIndex = competitors.index[i]\n\n    date = competitors.loc[rowIndex,\'DATE\']\n    year = competitors.loc[rowIndex,\'YEAR\']    \n    \n    if \'to\' in date or \' - \' in date:\n        \n        if re.search(\'to|\\s\\-\\s\\d\\s|\\s\\-\\d\\d\', date):  # e.g. 03-04\n              \n            pos = re.search(\'to|\\s\\-\\s\\d\', date)\n            # Splice string to day and month\n\n            split_pos_start=pos.start()+3\n            \n\n\n            final_date = date[split_pos_start:] # left string post splicing\n\n            print(i, pos, date, final_date)\n            final_year = year[2:]\n\n            event_date = final_date + \'/\' + final_year\n\n            print(\'old code\', date, final_date, event_date)\n\n            competitors.loc[rowIndex, \'event_date\'] = event_date\n\n        el

In [686]:
competitors[competitors['COMPETITION']=='Box Hill Burn']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,DOB,NATIONALITY,WIND,CATEGORY_EVENT,GENDER,COMPETITION,DATE,YEAR,REGION,TIMESTAMP
43,Jayden Tan,02:31.1,,,5,,1000m,,,,24-Jun-06,SGP,,Mid,Male,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,
44,Romaine Soh,02:56.0,,,7,,1000m,,,,26-Nov-94,SGP,,Mid,Female,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,
7462,Shaun Goh,14:57.4,,,3,,5000m,,,,12-Jan-97,SGP,,Long,Male,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,
7463,Vanessa Lee Ying Zhuang,17:06.7,,,11,,5000m,,,,23-Feb-98,SGP,,Long,Female,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,
7464,Nicole Low,17:21.3,,,13,,5000m,,,,19-Jun-98,SGP,,Long,Female,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,


In [687]:
'''
# Calculate number of days from today to event date

competitors['event_date_dt'] = pd.to_datetime(competitors['event_date'], format='mixed', dayfirst=False)

competitors['delta_time']= datetime.datetime.now() - competitors['event_date_dt']

competitors['delta_time_conv'] = pd.to_numeric(competitors['delta_time'].dt.days, downcast='integer')

competitors['event_month'] = competitors['event_date_dt'].dt.month
'''

"\n# Calculate number of days from today to event date\n\ncompetitors['event_date_dt'] = pd.to_datetime(competitors['event_date'], format='mixed', dayfirst=False)\n\ncompetitors['delta_time']= datetime.datetime.now() - competitors['event_date_dt']\n\ncompetitors['delta_time_conv'] = pd.to_numeric(competitors['delta_time'].dt.days, downcast='integer')\n\ncompetitors['event_month'] = competitors['event_date_dt'].dt.month\n"

In [688]:
# DATE column to contain timezone - tz aware mode

competitors['DATE'] = pd.to_datetime(competitors['DATE'], format='mixed', dayfirst=False, utc=True)


In [691]:
# datetime to contain UTC (timezone)

competitors['NOW'] = datetime.datetime.now()

timezone = pytz.timezone('UTC')

competitors['NOW'] = datetime.datetime.now().replace(tzinfo=timezone)

In [692]:
# Calculate number of days from today to event date

#competitors['DATE'] = pd.to_datetime(competitors['DATE'], format='mixed', dayfirst=False, utc=False)

competitors['delta_time'] = competitors['NOW'] - competitors['DATE']


#competitors['delta_time'] = datetime.datetime.now() - competitors['DATE']


competitors['delta_time_conv'] = pd.to_numeric(competitors['delta_time'].dt.days, downcast='integer')

competitors['event_month'] = competitors['DATE'].dt.month

# Make sure date conversion is is valid for all rows

assert not competitors['delta_time'].isna().any()

In [693]:
competitors[competitors['COMPETITION']=='Box Hill Burn']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,GENDER,COMPETITION,DATE,YEAR,REGION,TIMESTAMP,NOW,delta_time,delta_time_conv,event_month
43,Jayden Tan,02:31.1,,,5,,1000m,,,,...,Male,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,,2025-07-02 22:06:35.488019+00:00,118 days 22:06:35.488019,118,3
44,Romaine Soh,02:56.0,,,7,,1000m,,,,...,Female,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,,2025-07-02 22:06:35.488019+00:00,118 days 22:06:35.488019,118,3
7462,Shaun Goh,14:57.4,,,3,,5000m,,,,...,Male,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,,2025-07-02 22:06:35.488019+00:00,118 days 22:06:35.488019,118,3
7463,Vanessa Lee Ying Zhuang,17:06.7,,,11,,5000m,,,,...,Female,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,,2025-07-02 22:06:35.488019+00:00,118 days 22:06:35.488019,118,3
7464,Nicole Low,17:21.3,,,13,,5000m,,,,...,Female,Box Hill Burn,2025-03-06 00:00:00+00:00,2025,International,,2025-07-02 22:06:35.488019+00:00,118 days 22:06:35.488019,118,3


In [694]:
# Make sure date conversion is is valid for all rows

#assert not competitors['event_date_dt'].isna().any()

In [695]:
# These results have not had their event dates converted properly

#competitors[competitors['event_date_dt'].isna()]

In [696]:
# Choose date range for SEAG qualification window from Oct 22 to current


#competitors = competitors[(competitors['delta_time_conv']>=0) & (competitors['delta_time_conv']<=365)]

#competitors=competitors.reset_index(drop=True)

competitors['DATE']=competitors['DATE'].dt.tz_localize(None)  # switch off timezone for compatibility with np.datetime64


start = datetime.datetime(2024, 10, 22)
#start = datetime.datetime(2025, 5, 1)


end = datetime.datetime(2025, 6, 30)

start_date = np.datetime64(start)
end_date = np.datetime64(end)


mask = (competitors['DATE'] >= start_date) & (competitors['DATE'] <= end_date)
athletes_selected = competitors.loc[mask]



In [697]:
athletes_selected.to_csv('athletes_downloaded_june25_seag_tz.csv', encoding='utf-8')

In [698]:
# Select all of 2024/25 for OCTC

#athletes_selected = competitors[(competitors['YEAR']=='2024')|(competitors['YEAR']=='2025')]

#athletes_selected = competitors[(competitors['YEAR']=='2025')]

In [699]:
athletes_selected

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,GENDER,COMPETITION,DATE,YEAR,REGION,TIMESTAMP,NOW,delta_time,delta_time_conv,event_month
0,Shanti Veronica Pereira,11.71,,,7.0,,100m,,,,...,Female,AtleticaGenève,2025-06-21,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,11 days 22:06:35.488019,11,6
1,Shanti Veronica Pereira,23.16w,,,7.0,,200m,,,,...,Female,AtleticaGenève,2025-06-21,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,11 days 22:06:35.488019,11,6
2,Jun Yu Low,4.77,,,5.0,,Pole Vault,,,,...,Male,Jan Dietvorst Memorial,2025-06-21,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,11 days 22:06:35.488019,11,6
3,Tung Hon Andrew Pak,2.05,,,,,High Jump,,,,...,Male,National Championships,2025-06-25,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,7 days 22:06:35.488019,7,6
4,Tam Jong Hng,1.95,,,,,High Jump,,,,...,Male,National Championships,2025-06-25,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,7 days 22:06:35.488019,7,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31125,"Sueto, Yuito",14.76m,Kansai University of Social We,21,3,Open,Triple Jump,0,,Y04,...,Male,85th Singapore Open Track & Field,2025-04-25,2025,Local,,2025-07-02 22:06:35.488019+00:00,68 days 22:06:35.488019,68,4
31150,Gabriel Lee,15.15,,,7.0,,Triple Jump,,,,...,Male,Taiwan Athletics Open,2025-06-08,2025,International,2025-06-11 16:38:50.923703,2025-07-02 22:06:35.488019+00:00,24 days 22:06:35.488019,24,6
31151,Tia Louise Rozario,12.78,,,2.0,,Triple Jump,,,,...,Female,Taiwan Athletics Open,2025-06-08,2025,International,2025-06-11 16:38:50.923703,2025-07-02 22:06:35.488019+00:00,24 days 22:06:35.488019,24,6
31152,Tan Shou Yi Rei,13.76,,,1,,Triple Jump,,,,...,Male,WA State Championships,2025-03-09,2025,International,,2025-07-02 22:06:35.488019+00:00,115 days 22:06:35.488019,115,3


In [700]:
athletes_selected[athletes_selected['COMPETITION']=='Box Hill Burn']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,GENDER,COMPETITION,DATE,YEAR,REGION,TIMESTAMP,NOW,delta_time,delta_time_conv,event_month
43,Jayden Tan,02:31.1,,,5,,1000m,,,,...,Male,Box Hill Burn,2025-03-06,2025,International,,2025-07-02 22:06:35.488019+00:00,118 days 22:06:35.488019,118,3
44,Romaine Soh,02:56.0,,,7,,1000m,,,,...,Female,Box Hill Burn,2025-03-06,2025,International,,2025-07-02 22:06:35.488019+00:00,118 days 22:06:35.488019,118,3
7462,Shaun Goh,14:57.4,,,3,,5000m,,,,...,Male,Box Hill Burn,2025-03-06,2025,International,,2025-07-02 22:06:35.488019+00:00,118 days 22:06:35.488019,118,3
7463,Vanessa Lee Ying Zhuang,17:06.7,,,11,,5000m,,,,...,Female,Box Hill Burn,2025-03-06,2025,International,,2025-07-02 22:06:35.488019+00:00,118 days 22:06:35.488019,118,3
7464,Nicole Low,17:21.3,,,13,,5000m,,,,...,Female,Box Hill Burn,2025-03-06,2025,International,,2025-07-02 22:06:35.488019+00:00,118 days 22:06:35.488019,118,3


In [701]:
# Choose 2024/25 only

athletes = athletes_selected

In [702]:
athletes

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,GENDER,COMPETITION,DATE,YEAR,REGION,TIMESTAMP,NOW,delta_time,delta_time_conv,event_month
0,Shanti Veronica Pereira,11.71,,,7.0,,100m,,,,...,Female,AtleticaGenève,2025-06-21,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,11 days 22:06:35.488019,11,6
1,Shanti Veronica Pereira,23.16w,,,7.0,,200m,,,,...,Female,AtleticaGenève,2025-06-21,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,11 days 22:06:35.488019,11,6
2,Jun Yu Low,4.77,,,5.0,,Pole Vault,,,,...,Male,Jan Dietvorst Memorial,2025-06-21,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,11 days 22:06:35.488019,11,6
3,Tung Hon Andrew Pak,2.05,,,,,High Jump,,,,...,Male,National Championships,2025-06-25,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,7 days 22:06:35.488019,7,6
4,Tam Jong Hng,1.95,,,,,High Jump,,,,...,Male,National Championships,2025-06-25,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,7 days 22:06:35.488019,7,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31125,"Sueto, Yuito",14.76m,Kansai University of Social We,21,3,Open,Triple Jump,0,,Y04,...,Male,85th Singapore Open Track & Field,2025-04-25,2025,Local,,2025-07-02 22:06:35.488019+00:00,68 days 22:06:35.488019,68,4
31150,Gabriel Lee,15.15,,,7.0,,Triple Jump,,,,...,Male,Taiwan Athletics Open,2025-06-08,2025,International,2025-06-11 16:38:50.923703,2025-07-02 22:06:35.488019+00:00,24 days 22:06:35.488019,24,6
31151,Tia Louise Rozario,12.78,,,2.0,,Triple Jump,,,,...,Female,Taiwan Athletics Open,2025-06-08,2025,International,2025-06-11 16:38:50.923703,2025-07-02 22:06:35.488019+00:00,24 days 22:06:35.488019,24,6
31152,Tan Shou Yi Rei,13.76,,,1,,Triple Jump,,,,...,Male,WA State Championships,2025-03-09,2025,International,,2025-07-02 22:06:35.488019+00:00,115 days 22:06:35.488019,115,3


In [703]:
athletes[athletes['NAME']=='Jun Jie Calvin Quek']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,GENDER,COMPETITION,DATE,YEAR,REGION,TIMESTAMP,NOW,delta_time,delta_time_conv,event_month
549,Jun Jie Calvin Quek,10.86,,,2.0,,100m,,,,...,Male,Potch Invitational Meet,2025-04-16,2025,International,,2025-07-02 22:06:35.488019+00:00,77 days 22:06:35.488019,77,4
6306,Jun Jie Calvin Quek,51.64,,,6.0,,400m Hurdles,,,,...,Male,12th Kinami Michitaka Memorial Athletics Meet,2025-05-11,2025,International,2025-05-16 15:18:11.207945,2025-07-02 22:06:35.488019+00:00,52 days 22:06:35.488019,52,5
6311,Jun Jie Calvin Quek,52.62,,,9.0,,400m Hurdles,,,,...,Male,Sydney Track Classic,2025-03-15,2025,International,,2025-07-02 22:06:35.488019+00:00,109 days 22:06:35.488019,109,3
6312,Jun Jie Calvin Quek,50.77,,,1.0,,400m Hurdles,,,,...,Male,Potch Invitational Meet,2025-04-16,2025,International,,2025-07-02 22:06:35.488019+00:00,77 days 22:06:35.488019,77,4
6317,Jun Jie Calvin Quek,50.58,,,2.0,,400m Hurdles,,,,...,Male,26th Asian Athletics Championships,2025-05-31,2025,International,2025-06-01 16:49:06.835826,2025-07-02 22:06:35.488019+00:00,32 days 22:06:35.488019,32,5
6319,Jun Jie Calvin Quek,50.94,,,8.0,,400m Hurdles,,,,...,Male,26th Asian Athletics Championships,2025-05-31,2025,International,36:03.8,2025-07-02 22:06:35.488019+00:00,32 days 22:06:35.488019,32,5


In [704]:
# Run events

#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+200 Meter Dash.+', value='200m')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+100 Meter Dash.+', value='100m')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+400 Meter Dash.+', value='400m')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+800 Meter Run.+', value='800m')

#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+1500 Meter Run.+', value='1500m')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+3000 Meter Run.+', value='3000m')

# Hurdles events

#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+110 Meter Hurdles.+', value='110m hurdles')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+110m Hurdles.+', value='110m hurdles')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+100 Meter Hurdles.+', value='100m hurdles')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+200 Meter Hurdles.+', value='200m hurdles')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+400 Meter Hurdles.+', value='400m hurdles')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+400m Hurdles.+', value='400m hurdles')


#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+4x100 Meter Relay.+', value='4 x 100m relay')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+4x400 Meter Relay.+', value='4 x 400m relay')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+3000 meter.+', value='3000m')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+1500 Meter Race Walk.+', value='1500m race walk')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+3000m Race Walk.+', value='3000m race walk')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+5000 Meter Race Walk.+', value='5000m race walk')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+5000m Race Walk.+', value='5000m race walk')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+10000 Meter Race Walk.+', value='10000m race walk')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+5000 Meter Run.+', value='5000m run')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+10000 Meter Run.+', value='10000m run')


#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+Race Walk.+', value='race walk')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+Pole Vault.+', value='Pole vault')

#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+Shot Put.+', value='Shot put')

#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+Discus.+', value='Discus throw')

#mask = athletes['EVENT'].str.contains(r'Discus', na=True)
#athletes.loc[mask, 'EVENT'] = 'Discus throw'

#mask = athletes['EVENT'].str.contains(r'Shot', na=True)
#athletes.loc[mask, 'EVENT'] = 'Shot put'

#mask = athletes['EVENT'].str.contains(r'Javelin', na=True)
#athletes.loc[mask, 'EVENT'] = 'Javelin throw'



#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+Triple Jump.+', value='Triple jump')

#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+Javelin Throw.+', value='Javelin throw')

#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+Long Jump.+', value='Long jump')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'Long Jump', value='Long jump')


#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'High Jump', value='High jump')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+High Jump.+', value='High jump')

#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+S/C.+', value='steeplechase')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+3000 Meter Steeplechase.+', value='3000m steeplechase')
#athletes['EVENT'] = athletes['EVENT'].replace(regex=r'.+2000 Meter Steeplechase.+', value='2000m steeplechase')


#mask = athletes['EVENT'].str.contains(r'High', na=True)
#athletes.loc[mask, 'EVENT'] = 'High jump'

#mask = athletes['EVENT'].str.contains(r'110m hurdles', na=True)
#athletes.loc[mask, 'EVENT'] = '110m hurdles'

#mask = athletes['EVENT'].str.contains(r'400m hurdles', na=True)
#athletes.loc[mask, 'EVENT'] = '400m hurdles'

#mask = athletes['EVENT'].str.contains(r'200m Hurdles', na=True)
#athletes.loc[mask, 'EVENT'] = '200m hurdles'

#mask = athletes['EVENT'].str.contains(r'100m Hurdles', na=True)
#athletes.loc[mask, 'EVENT'] = '100m hurdles'

#mask = athletes['EVENT'].str.contains(r'4 X 100m relay', na=True)
#athletes.loc[mask, 'EVENT'] = '4 x 100m relay'

#mask = athletes['EVENT'].str.contains(r'4 X 400m relay', na=True)
#athletes.loc[mask, 'EVENT'] = '4 x 400m relay'

#mask = athletes['EVENT'].str.contains(r'2000 Meter Steeplechase', na=True)
#athletes.loc[mask, 'EVENT'] = '2000m steeplechase'

#mask = athletes['EVENT'].str.contains(r'Hammer Throw', na=True)
#athletes.loc[mask, 'EVENT'] = 'Hammer throw'

#mask = athletes['EVENT'].str.contains(r'3000m S/C', na=True)
#athletes.loc[mask, 'EVENT'] = '3000m steeplechase'

#mask = athletes['EVENT'].str.contains(r'2000m S/C', na=True)
#athletes.loc[mask, 'EVENT'] = '2000m steeplechase'


#mask = athletes['EVENT'].str.contains(r'4x100m Relay', na=True)
#athletes.loc[mask, 'EVENT'] = '4 x 100m relay'

#mask = athletes['EVENT'].str.contains(r'4x400m Relay', na=True)
#athletes.loc[mask, 'EVENT'] = '4 x 400m relay'


# correct javelin category
#mask = athletes['EVENT'].str.contains(r'Javelin', na=True)
#athletes.loc[mask, 'CATEGORY_EVENT'] = 'Throw'




In [705]:
# Wind mapping

#mask = athletes['EVENT'].str.contains(r'\d{1}.\d{1}', na=True, regex=True)
#athletes.loc[mask, 'WIND_MAP'] = '400m'


In [706]:
# Create temporary mapped event column

athletes['MAPPED_EVENT']=''

for col in athletes.columns:
    athletes[col] = athletes[col].astype(str)
    athletes[col] = athletes[col].str.replace('\xa0', ' ', regex=True)
    athletes[col] = athletes[col].str.replace('[\x00-\x1f\x7f-\x9f]', '', regex=True)
    athletes[col] = athletes[col].str.replace('\r', ' ', regex=True)
    athletes[col] = athletes[col].str.replace('\n', ' ', regex=True)
    athletes[col] = athletes[col].str.strip()


# Correct javelin category

mask = athletes['EVENT'].str.contains(r'Javelin', na=True)
athletes.loc[mask, 'CATEGORY_EVENT'] = 'Throw'


# Running

mask = (athletes['EVENT'].str.contains(r'Dash', na=True) & athletes['DISTANCE'].str.contains(r'100', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '100m'
mask = (athletes['EVENT'].str.contains(r'Run', na=True) & athletes['DISTANCE'].str.contains(r'100', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '100m'
mask = athletes['EVENT'].str.contains(r'100 Meter Run', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '100m'
mask = athletes['EVENT'].str.contains(r'^100m$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '100m'

mask = (athletes['EVENT'].str.contains(r'Dash', na=True) & athletes['DISTANCE'].str.contains(r'200', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '200m'
mask = (athletes['EVENT'].str.contains(r'Run', na=True) & athletes['DISTANCE'].str.contains(r'200', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '200m'
mask = athletes['EVENT'].str.contains(r'^200m$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '200m'
mask = athletes['EVENT'].str.contains(r'200\sMeter', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '200m'

mask = (athletes['EVENT'].str.contains(r'Dash', na=True) & athletes['DISTANCE'].str.contains(r'400', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '400m'
mask = athletes['EVENT'].str.contains(r'^400m$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '400m'
mask = athletes['EVENT'].str.contains(r'^400\sMeter$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '400m'
mask = (athletes['EVENT'].str.contains(r'Run', na=True) & athletes['DISTANCE'].str.contains(r'400', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '400m'


mask = (athletes['EVENT'].str.contains(r'Run', na=True) & athletes['DISTANCE'].str.contains(r'800', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '800m'
mask = athletes['EVENT'].str.contains(r'800 Meter Run', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '800m'
mask = athletes['EVENT'].str.contains(r'^800m$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '800m'
mask = (athletes['EVENT'].str.contains(r'Run', na=True) & athletes['DISTANCE'].str.contains(r'1000', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '1000m'


mask = (athletes['EVENT'].str.contains(r'Run', na=True) & athletes['DISTANCE'].str.contains(r'1500', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '1500m'
mask = athletes['EVENT'].str.contains(r'^1500m$', na=True, regex=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '1500m'
mask = (athletes['EVENT'].str.contains(r'Run', na=True) & athletes['DISTANCE'].str.contains(r'3000', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '3000m'
#mask = athletes['EVENT'].str.contains(r'3000m', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '3000m'
mask = (athletes['EVENT'].str.contains(r'Run', na=True) & athletes['DISTANCE'].str.contains(r'5000', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '5000m'
mask = athletes['EVENT'].str.contains(r'^5000m$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '5000m'
mask = (athletes['EVENT'].str.contains(r'Run', na=True) & athletes['DISTANCE'].str.contains(r'10000', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '10,000m'
mask = athletes['EVENT'].str.contains(r'^10000m$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '10,000m'
mask = athletes['EVENT'].str.contains(r'^10\,000m$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '10,000m'
mask = (athletes['EVENT'].str.contains(r'Run', na=True) & athletes['DISTANCE'].str.contains(r'Mile', na=True))
athletes.loc[mask, 'MAPPED_EVENT'] = '1 Mile'

#mask = athletes['EVENT'].str.contains(r'10\,000m', na=True, regex=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '10000m'



# Hurdles

#mask = athletes['EVENT'].str.contains(r'100\sMeter\sHurdles\s\(0\.838m\)', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '100m Hurdles'
#mask = athletes['EVENT'].str.contains(r'100m\sHurdles\s\(0\.838m\)', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '100m Hurdles'


##mask = athletes['EVENT'].str.contains(r'110m\sHurdles\s\(0\.914m\)', na=True)
##athletes.loc[mask, 'MAPPED_EVENT'] = '110m hurdles'
##mask = athletes['EVENT'].str.contains(r'110m\sHurdles\s\(1\.067m\)', na=True)
##athletes.loc[mask, 'MAPPED_EVENT'] = '110m Hurdles'


mask = (athletes['EVENT'].str.contains(r'100m Hurdles|100m hurdles', na=False) & athletes['EVENT_CLASS'].str.contains('0.84', na=False) & athletes['GENDER'].str.contains(r'Female', na=False))  # this is the correct syntax
athletes.loc[mask, 'MAPPED_EVENT'] = '100m Hurdles'
mask = (athletes['EVENT'].str.contains(r'100m Hurdles|100m hurdles', na=False) & athletes['DIVISION'].str.contains('None', na=False) & athletes['GENDER'].str.contains(r'Female', na=False) & athletes['REGION'].str.contains(r'International', na=False))  # this is the correct syntax
athletes.loc[mask, 'MAPPED_EVENT'] = '100m Hurdles'
mask = (athletes['EVENT'].str.contains(r'^Hurdles$', na=False) & athletes['DISTANCE'].str.contains(r'100', na=False) & athletes['DIVISION'].str.contains(r'OPEN|Open', na=False) & athletes['GENDER'].str.contains(r'Female', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '100m Hurdles'
mask = (athletes['EVENT'].str.contains(r'100m Hurdles|100m hurdles', na=False) & athletes['REGION'].str.contains(r'International', na=False) & athletes['GENDER'].str.contains(r'Female', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '100m Hurdles'



mask = (athletes['EVENT'].str.contains(r'^Hurdles$', na=False) & athletes['DISTANCE'].str.contains(r'110', na=False) & athletes['DIVISION'].str.contains(r'OPEN|Open', na=False) & athletes['GENDER'].str.contains(r'Male', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '110m Hurdles'
mask = (athletes['EVENT'].str.contains(r'^Hurdles$', na=False) & athletes['DISTANCE'].str.contains(r'110', na=False) & athletes['EVENT_CLASS'].str.contains(r'0.838|0.84', na=False) & athletes['GENDER'].str.contains(r'Female', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '110m Hurdles'
mask = ((athletes['EVENT'].str.contains(r'110m Hurdles|110m hurdles', na=False)) 
         & ((athletes['EVENT_CLASS'].str.contains('None', na=False))|(athletes['EVENT_CLASS']==np.nan)|(athletes['EVENT_CLASS']=='')) 
         & athletes['REGION'].str.contains(r'International', na=False) & (athletes['DIVISION'].str.contains(r'None', na=False)))  # this is the correct syntax
athletes.loc[mask, 'MAPPED_EVENT'] = '110m Hurdles'

#mask = (athletes['EVENT'].str.contains(r'110m Hurdles|110m hurdles', na=False) & athletes['REGION'].str.contains(r'International', na=False) & athletes['GENDER'].str.contains(r'Male', na=False))
#athletes.loc[mask, 'MAPPED_EVENT'] = '110m Hurdles'


# Using np.where instead

#athletes['MAPPED_EVENT'] = np.where(((athletes['EVENT']=='110m hurdles|110m Hurdles') & ((athletes['EVENT_CLASS']=='')|athletes['EVENT_CLASS']=='None') & (athletes['REGION']=='International')), '110m Hurdles', ' ')   
                                


mask = (athletes['EVENT'].str.contains(r'^Hurdles$', na=False) & athletes['DISTANCE'].str.contains(r'110', na=False) & athletes['EVENT_CLASS'].str.contains(r'1.067', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '110m Hurdles'
mask = (athletes['EVENT'].str.contains(r'^Hurdles$', na=False) & athletes['DISTANCE'].str.contains(r'110', na=False) & athletes['EVENT_CLASS'].str.contains(r'0.914', na=False) & athletes['GENDER'].str.contains(r'Female', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '110m Hurdles'
#mask = (athletes['EVENT'].str.contains(r'^Hurdles$', na=False) & athletes['DISTANCE'].str.contains(r'110', na=False) & athletes['EVENT_CLASS'].str.contains(' ', na=True))
#athletes.loc[mask, 'MAPPED_EVENT'] = '110m Hurdles'







#mask = (athletes['EVENT'].str.contains(r'^400m\sHurdles$', na=False) & athletes['EVENT_CLASS'].str.contains(r'', na=False))
#athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'
#mask = athletes['EVENT'].str.contains(r'400m\sHurdles\s\(0.840m\)', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = ' '
mask = (athletes['EVENT'].str.contains(r'^Hurdles$', na=False) & athletes['DISTANCE'].str.contains(r'400', na=False) & athletes['EVENT_CLASS'].str.contains(r'0.84|84cm', na=False) & athletes['GENDER'].str.contains(r'Female', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'


#mask = athletes['EVENT'].str.contains(r'400\sMeter\sHurdles\s\(0\.914m\)', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'
#mask = athletes['EVENT'].str.contains(r'400m\sHurdles\s\(0\.914m\)', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'
mask = (athletes['EVENT'].str.contains(r'^Hurdles$', na=False) & athletes['DISTANCE'].str.contains(r'400', na=False) & athletes['EVENT_CLASS'].str.contains(r'0.914', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'
mask = (athletes['EVENT'].str.contains(r'^Hurdles$', na=False) & athletes['DISTANCE'].str.contains(r'400', na=False) & athletes['DIVISION'].str.contains(r'Open|Invitational', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'

mask = (athletes['EVENT'].str.contains(r'400m Hurdles', na=False) & athletes['EVENT_CLASS'].str.contains(r'0.914', na=False)  & athletes['GENDER'].str.contains(r'Male', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'


#mask = athletes['EVENT'].str.contains(r'^400\sMeter\sHurdles\s\(0\.762m\)$', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'
##mask = athletes['EVENT'].str.contains(r'^400m\sHurdles\s\(0\.762m\)$', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'
mask = (athletes['EVENT'].str.contains(r'Hurdles', na=False) & athletes['DISTANCE'].str.contains(r'400', na=False) & athletes['EVENT_CLASS'].str.contains(r'0.762', na=False)& athletes['GENDER'].str.contains(r'Female', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'
mask = (athletes['EVENT'].str.contains(r'400m Hurdles', na=False) & athletes['EVENT_CLASS'].str.contains(r'0.762m', na=False) & athletes['GENDER'].str.contains(r'Female', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'
mask = (athletes['EVENT'].str.contains(r'400m Hurdles|400m hurdles', na=False) & athletes['EVENT_CLASS'].str.contains('None|0.762|0.914', na=False) & athletes['REGION'].str.contains(r'International', na=False))  # this is the correct syntax
athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'
mask = (athletes['EVENT'].str.contains(r'400m Hurdles|400m hurdles', na=False) & athletes['REGION'].str.contains(r'International', na=False))  # this is the correct syntax
athletes.loc[mask, 'MAPPED_EVENT'] = '400m Hurdles'



# Throws


#mask = ((athletes['EVENT'].str.contains(r'Javelin\sThrow\s\(600g\)', na=True, regex=True)) & (athletes['GENDER']=='Female'))
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Javelin Throw'
#mask = ((athletes['EVENT'].str.contains(r'Javelin\sThrow\s600g', na=True, regex=True)) & (athletes['GENDER']=='Female'))
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Javelin Throw'
#mask = ((athletes['EVENT'].str.contains(r'Javelin\sThrow\s600g\)', na=True, regex=True)) & (athletes['GENDER']=='Female'))
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Javelin Throw'
#mask = athletes['EVENT'].str.contains(r'Javelin\sThrow\s\(800g\)', na=True, regex=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Javelin Throw'

mask = (athletes['EVENT'].str.contains(r'Javelin Throw|Javelin throw|Javelin', na=False) & athletes['EVENT_CLASS'].str.contains(r'600g', na=False) & athletes['GENDER'].str.contains(r'Female', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = 'Javelin Throw'
mask = (athletes['EVENT'].str.contains(r'Javelin Throw|Javelin throw|Javelin', na=False) & athletes['EVENT_CLASS'].str.contains(r'800g', na=False) & athletes['GENDER'].str.contains(r'Male', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = 'Javelin Throw'
mask = (athletes['EVENT'].str.contains(r'Javelin Throw|Javelin throw', na=False) & athletes['DIVISION'].str.contains(r'OPEN|Open', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = 'Javelin Throw'

mask = (athletes['EVENT'].str.contains(r'Shot Put|Shot put', na=False, regex=True) & (athletes['GENDER']=='Female') & (athletes['EVENT_CLASS']=='4kg'))# there are some additional characters after Put
athletes.loc[mask, 'MAPPED_EVENT'] = 'Shot Put'


#mask = athletes['EVENT'].str.contains(r'Women\sShot\sPut\s4kg\sOpen', na=True, regex=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Shot Put'
#mask = athletes['EVENT'].str.contains(r'Men\sShot\sPut\s4kg\sOPEN', na=True, regex=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Shot put'


#mask = athletes['EVENT'].str.contains(r'Women\sShot\sPut\s\(4kg\)', na=True, regex=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Shot Put'
#mask = athletes['EVENT'].str.contains(r'Shot\sPut\s\(7\.26kg\)', na=True, regex=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Shot Put'
#mask = athletes['EVENT'].str.contains(r'Shot\sPut\s7\.26kg\sOpen', na=True, regex=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Shot Put'
mask = (athletes['EVENT'].str.contains(r'Shot Put|Shot put', na=False) & (athletes['GENDER']=='Male') & (athletes['EVENT_CLASS'].str.contains(r'7.26', na=False)))# there are some additional characters after Put
athletes.loc[mask, 'MAPPED_EVENT'] = 'Shot Put'
mask = (athletes['EVENT'].str.contains(r'Shot Put|Shot put', na=False) & (athletes['GENDER']=='Female') & (athletes['EVENT_CLASS'].str.contains(r'4', na=False)))# there are some additional characters after Put
athletes.loc[mask, 'MAPPED_EVENT'] = 'Shot Put'

mask = (athletes['EVENT'].str.contains(r'Shot Put|Shot put', na=False) & (athletes['DIVISION'].str.contains(r'OPEN|Open', na=False)))# there are some additional characters after Put
athletes.loc[mask, 'MAPPED_EVENT'] = 'Shot Put'

mask = (athletes['EVENT'].str.contains(r'Shot Put|Shot put', na=False) & (athletes['REGION'].str.contains(r'International', na=False)) & athletes['EVENT_CLASS'].str.contains(r'None', na=False))# there are some additional characters after Put
athletes.loc[mask, 'MAPPED_EVENT'] = 'Shot Put'



#mask = (athletes['EVENT'].str.contains(r'Hammer\sThrow\s\(4kg\)', na=True) & (athletes['GENDER']=='Female'))
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Hammer Throw'
#mask = athletes['EVENT'].str.contains(r'Hammer\sThrow\s\(7\.26kg\)', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Hammer Throw'
mask = (athletes['EVENT'].str.contains(r'Hammer Throw|Hammer throw', na=False) & athletes['EVENT_CLASS'].str.contains(r'7.26kg', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = 'Hammer Throw'
mask = (athletes['EVENT'].str.contains(r'Hammer Throw|Hammer throw', na=False) & athletes['EVENT_CLASS'].str.contains(r'4.00kg', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = 'Hammer Throw'
mask = (athletes['EVENT'].str.contains(r'Hammer Throw|Hammer throw', na=False) & (athletes['DIVISION'].str.contains(r'OPEN|Open', na=False)))# there are some additional characters after Put
athletes.loc[mask, 'MAPPED_EVENT'] = 'Hammer Throw'



#mask = ((athletes['EVENT'].str.contains(r'Discus\sThrow\s\(1kg\)', na=False)) & (athletes['GENDER']=='Female'))
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Discus Throw'

#mask = ((athletes['EVENT'].str.contains(r'Discus\s\(1\.00kg\)', na=False))  & (athletes['GENDER']=='Female'))
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Discus Throw'


#mask = athletes['EVENT'].str.contains(r'Discus\sThrow\s\(2kg\)', na=False)
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Discus Throw'
#mask = ((athletes['EVENT'].str.contains(r'Discus\sThrow\s\(1kg\)', na=False)) & (athletes['GENDER']=='Female'))
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Discus Throw'
mask = (athletes['EVENT'].str.contains(r'Discus Throw|Discus|Discus throw', na=False) & athletes['EVENT_CLASS'].str.contains(r'2kg|2.00kg', na=False) & athletes['GENDER'].str.contains(r'Male', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = 'Discus Throw'
mask = (athletes['EVENT'].str.contains(r'Discus Throw|Discus|Discus throw', na=False) & athletes['EVENT_CLASS'].str.contains(r'1kg|1.00kg', na=False) & athletes['GENDER'].str.contains(r'Female', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = 'Discus Throw'

#mask = (athletes['EVENT'].str.contains(r'Discus Throw|Discus throw', na=False) & athletes['REGION'].str.contains(r'International', na=False))
#athletes.loc[mask, 'MAPPED_EVENT'] = 'Discus Throw'
mask = (athletes['EVENT'].str.contains(r'Discus Throw|Discus throw', na=False) & athletes['DIVISION'].str.contains(r'OPEN|Open', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = 'Discus Throw'
mask = (athletes['EVENT'].str.contains(r'Discus Throw|Discus throw', na=False) & athletes['DIVISION'].str.contains(r'None', na=False) & athletes['EVENT_CLASS'].str.contains(r'None', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = 'Discus Throw'

mask = (athletes['EVENT'].str.contains(r'Discus Throw|Discus throw', na=False) & athletes['REGION'].str.contains(r'International', na=False) & athletes['EVENT_CLASS'].str.contains(r'None', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = 'Discus Throw'



# Jumps

mask = athletes['EVENT'].str.contains(r'High Jump', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'High Jump'

mask = athletes['EVENT'].str.contains(r'^Long\sJump$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Long Jump'
mask = athletes['EVENT'].str.contains(r'Long Jump Open', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Long Jump'
mask = athletes['EVENT'].str.contains(r'Long Jump Trial', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Long Jump'


mask = athletes['EVENT'].str.contains(r'Triple Jump', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Triple Jump'
mask = athletes['EVENT'].str.contains(r'Pole Vault', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Pole Vault'
mask = athletes['EVENT'].str.contains(r'High jump', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'High Jump'
mask = athletes['EVENT'].str.contains(r'Long jump', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Long Jump'
mask = athletes['EVENT'].str.contains(r'Triple jump', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Triple Jump'
mask = athletes['EVENT'].str.contains(r'^Pole\svault$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Pole Vault'

# Steeplechase

#mask = athletes['EVENT'].str.contains(r'2000m S/C', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '2000m Steeplechase'
#mask = athletes['EVENT'].str.contains(r'2000m steeplechase', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '2000m Steeplechase'
#mask = athletes['EVENT'].str.contains(r'2000 Meter Steeplechase', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '2000m Steeplechase'
mask = (athletes['EVENT'].str.contains(r'3000m Steeplechase|3000m S\/C', na=True) & athletes['REGION'].str.contains(r'International', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '3000m Steeplechase'
mask = (athletes['EVENT'].str.contains(r'Steeplechase|S\/C', na=False) & athletes['DISTANCE'].str.contains(r'3000', na=False)  & athletes['EVENT_CLASS'].str.contains(r'0.914', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '3000m Steeplechase'
mask = (athletes['EVENT'].str.contains(r'3000m Steeplechase|3000m S\/C', na=False) & athletes['EVENT_CLASS'].str.contains(r'0.914', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '3000m Steeplechase'
mask = (athletes['EVENT'].str.contains(r'Steeplechase', na=False) & athletes['DISTANCE'].str.contains(r'3000', na=False)  & athletes['DIVISION'].str.contains(r'OPEN|Open', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '3000m Steeplechase'


# Marathon

mask = athletes['EVENT'].str.contains(r'^Marathon$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Marathon'
mask = athletes['EVENT'].str.contains(r'^Half\sMarathon$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Half Marathon'
mask = athletes['EVENT'].str.contains(r'^Half\smarathon$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Half Marathon'


# Walk

#mask = athletes['EVENT'].str.contains(r'1500m Race Walk', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '1500m Race Walk'
#mask = athletes['EVENT'].str.contains(r'1500 Meter Race Walk', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '1500m Race Walk'
#mask = (athletes['EVENT'].str.contains(r'Race Walk', na=False) & athletes['DISTANCE'].str.contains(r'1500', na=False))
#athletes.loc[mask, 'MAPPED_EVENT'] = '1500m Race Walk'


#mask = athletes['EVENT'].str.contains(r'3000m Race Walk', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '3000m Race Walk'
#mask = athletes['EVENT'].str.contains(r'3000 Meter Race Walk', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '3000m Race Walk'
#mask = (athletes['EVENT'].str.contains(r'Race Walk', na=False) & athletes['DISTANCE'].str.contains(r'3000', na=False))
#athletes.loc[mask, 'MAPPED_EVENT'] = '3000m Race Walk'


#mask = athletes['EVENT'].str.contains(r'5000 Meter Race Walk', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '5000m Race Walk'
#mask = athletes['EVENT'].str.contains(r'5000m Race Walk', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '5000m Race Walk'
mask = (athletes['EVENT'].str.contains(r'Race Walk', na=False) & athletes['DISTANCE'].str.contains(r'10000', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '10000m Racewalk'


#mask = athletes['EVENT'].str.contains(r'10000 Meter Race Walk', na=True)
#athletes.loc[mask, 'MAPPED_EVENT'] = '10000m Race Walk'

# Relay

mask = athletes['EVENT'].str.contains(r'4x80m Relay', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '4 x 80m'
mask = athletes['EVENT'].str.contains(r'^4\sx\s100m$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '4 x 100m'
mask = athletes['EVENT'].str.contains(r'4x100m Relay', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '4 x 100m'
mask = athletes['EVENT'].str.contains(r'4 X 100m Relay', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '4 x 100m'
mask = (athletes['EVENT'].str.contains(r'Relay', na=False) & athletes['DISTANCE'].str.contains(r'400', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '4 x 100m'

mask = athletes['EVENT'].str.contains(r'4x400m Relay', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '4 x 400m'
mask = athletes['EVENT'].str.contains(r'4 X 400m Relay', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '4 x 400m'
mask = athletes['EVENT'].str.contains(r'4x100 Meter Relay', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '4 x 100m'
mask = (athletes['EVENT'].str.contains(r'Relay', na=False) & athletes['DISTANCE'].str.contains(r'1600', na=False))
athletes.loc[mask, 'MAPPED_EVENT'] = '4 x 400m'
mask = athletes['EVENT'].str.contains(r'^4\sx\s400m$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = '4 x 400m'

# Decathlon/Heptathlon

mask = athletes['EVENT'].str.contains(r'^Heptathlon$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Heptathlon'
mask = athletes['EVENT'].str.contains(r'^Decathlon$', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Decathlon'
mask = athletes['EVENT'].str.contains(r'Heptathlon', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Heptathlon'
mask = athletes['EVENT'].str.contains(r'Decathlon', na=True)
athletes.loc[mask, 'MAPPED_EVENT'] = 'Decathlon'



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes['MAPPED_EVENT']=''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].str.replace('\xa0', ' ', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
T

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].str.replace('[\x00-\x1f\x7f-\x9f]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].str.replace('\r', ' ', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].str.replace('\n', ' ', regex

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].str.replace('[\x00-\x1f\x7f-\x9f]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].str.replace('\r', ' ', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].str.replace('\n', ' ', regex

In [707]:
athletes[(athletes['MAPPED_EVENT']=='Decathlon')]

#athletes[(athletes['EVENT']=='110m Hurdles')].tail(50)

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,COMPETITION,DATE,YEAR,REGION,TIMESTAMP,NOW,delta_time,delta_time_conv,event_month,MAPPED_EVENT
17233,Jayden Ng,5716,,,3,,Decathlon U18,,,,...,National U20,2025-03-23,2025,International,,2025-07-02 22:06:35.488019+00:00,101 days 22:06:35.488019,101,3,Decathlon


In [708]:
athletes[athletes['MAPPED_EVENT']=='10,000m']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,COMPETITION,DATE,YEAR,REGION,TIMESTAMP,NOW,delta_time,delta_time_conv,event_month,MAPPED_EVENT
25,Shaun Goh,32:21.0,,,16,,"10,000m",,,,...,26th Asian Athletics Championships,2025-05-31,2025,International,2025-06-01 16:49:06.835826,2025-07-02 22:06:35.488019+00:00,32 days 22:06:35.488019,32,5,"10,000m"
28,SOH RUI YONG GUILLAUME,31:11.4,INDIVIDUAL,33,133,,10000m,,,,...,"Tokai University Long Distance Challenge 10,00...",2024-11-25,2024,International,,2025-07-02 22:06:35.488019+00:00,219 days 22:06:35.488019,219,11,"10,000m"
29,DANIEL LEOW SOON YE,35:10:00,INDIVIDUAL,27,7,,10000m,,,,...,Swiss City Marathon (Switzerland),2024-10-27,2024,International,,2025-07-02 22:06:35.488019+00:00,248 days 22:06:35.488019,248,10,"10,000m"
25678,"Heng Chin Kiat, Richard",34:42.3,National University Singapore,0,2,Open,Run,10000,,,...,IVP Track & Field Championships 2025,2025-01-12,2025,Local,,2025-07-02 22:06:35.488019+00:00,171 days 22:06:35.488019,171,1,"10,000m"
25679,"Tay, Zi Xiang",39:25.0,Temasek Polytechnic,0,10,Open,Run,10000,,,...,IVP Track & Field Championships 2025,2025-01-12,2025,Local,,2025-07-02 22:06:35.488019+00:00,171 days 22:06:35.488019,171,1,"10,000m"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28348,"Tan, Bernice",44:25.6,Lacticbuds,25,3,Open,Run,10000,,B075C00,...,85th Singapore Open Track & Field,2025-04-25,2025,Local,,2025-07-02 22:06:35.488019+00:00,68 days 22:06:35.488019,68,4,"10,000m"
28354,"Petingko, Nofeldi",32:03.7,Indonesia,27,1,Open,Run,10000,,N655298,...,85th Singapore Open Track & Field,2025-04-25,2025,Local,,2025-07-02 22:06:35.488019+00:00,68 days 22:06:35.488019,68,4,"10,000m"
28363,"INTHAKUMMAN, LODKEO",38:58.9,Laos,30,1,Open,Run,10000,,L95,...,85th Singapore Open Track & Field,2025-04-25,2025,Local,,2025-07-02 22:06:35.488019+00:00,68 days 22:06:35.488019,68,4,"10,000m"
28391,"Goh, Shing Ling",39:21.6,TeamFabian,26,2,Open,Run,10000,,S268G99,...,85th Singapore Open Track & Field,2025-04-25,2025,Local,,2025-07-02 22:06:35.488019+00:00,68 days 22:06:35.488019,68,4,"10,000m"


In [709]:
for col in athletes.columns:
    athletes[col] = athletes[col].astype(str)
    athletes[col] = athletes[col].str.replace('\xa0', ' ', regex=True)
    athletes[col] = athletes[col].str.replace('[\x00-\x1f\x7f-\x9f]', '', regex=True)
    athletes[col] = athletes[col].str.replace('\r', ' ', regex=True)
    athletes[col] = athletes[col].str.replace('\n', ' ', regex=True)
    athletes[col] = athletes[col].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].str.replace('\xa0', ' ', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].str.replace('[\x00-\x1f\x7f-\x9f]', '', regex=True)
A value is try

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athletes[col] = athletes[col].str.replace('\xa0', ' ', regex=True)
A value is trying to be set on a copy of a slice from 

In [710]:
os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/SEAG/')


athletes.to_csv('athletes_post_map_seag_june25_tz.csv', sep=',', encoding='utf-8-sig', index=False)


In [711]:
#credentials = service_account.Credentials.from_service_account_file(
#    '/Users/veesheenyuen/Desktop/DataScience/Keys/saa-analytics-7c8937b70609.json',
#)

#sql="""
#SELECT NAME, RESULT, RANK, EVENT, CATEGORY_EVENT, GENDER, COMPETITION, STAGE
#FROM `saa-analytics.results.saa_full`
#WHERE STAGE='Final' AND COMPETITION='SEA Game AND RANK='3'
#"""

#benchmarks = pandas_gbq.read_gbq(sql, project_id="saa-analytics", credentials=credentials)




In [712]:
import pandas_gbq
from google.oauth2 import service_account


credentials = service_account.Credentials.from_service_account_file(
    '/Users/veesheenyuen/Desktop/DataScience/Keys/saa-analytics-7c8937b70609.json',
)

sql="""
SELECT YEAR, EVENT, SUB_EVENT, GENDER, NAME, RESULT, RANK, CATEGORY_EVENT, COMPETITION, STAGE, HEAT
FROM `saa-analytics.benchmarks.saa_benchmarks_prod`
WHERE YEAR='2023' AND COMPETITION='Southeast Asian Games' AND (RANK='3' OR RANK='3.0')
"""

SEAG = pandas_gbq.read_gbq(sql, project_id="saa-analytics", credentials=credentials)



Downloading: 100%|[32m██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m|[0m


In [713]:
SEAG

Unnamed: 0,YEAR,EVENT,SUB_EVENT,GENDER,NAME,RESULT,RANK,CATEGORY_EVENT,COMPETITION,STAGE,HEAT
0,2023,4 x 100m,,Male,"{0: '\xa0', 1: ' Jonathan Nyepa, Khairul Hafiz...",39.36,3,Relay,Southeast Asian Games,,
1,2023,4 x 400m,,Male,"{0: '\xa0', 1: ' Muhammad Firdaus Bin Mohamad ...",03:08.8,3,Relay,Southeast Asian Games,,
2,2023,4 x 100m,,Female,"{0: '\xa0', 1: ' Azreen Nabila Alias, Nur Afri...",44.58,3,Relay,Southeast Asian Games,,
3,2023,4 x 400m,,Female,"{0: '\xa0', 1: ' Sukanya Janchaona, Benny Nont...",03:39.3,3,Relay,Southeast Asian Games,,
4,2023,400m,,Male,Frederick Ramirez,46.63,3,Sprint,Southeast Asian Games,,
...,...,...,...,...,...,...,...,...,...,...,...
75,2023,100m Hurdles,,Female,Natchaya Chowpakpang,14.23,3,Hurdles,Southeast Asian Games,Heats,Heat 2
76,2023,Marathon,,Male,Nguyen Thanh Hoang,2:35:49,3.0,Marathon,Southeast Asian Games,,
77,2023,Marathon,,Female,Christine Organiza Hallasgo,2:50:27,3.0,Marathon,Southeast Asian Games,,
78,2023,20km Race Walk,,Female,Kotchaphon Tangsrivong,1:57:11,3.0,20km Race Walk,Southeast Asian Games,,


In [714]:
SEAG_filtered=SEAG[SEAG['HEAT'].isnull() & SEAG['SUB_EVENT'].isnull()]

In [715]:
benchmarks=SEAG_filtered

In [716]:
benchmarks

Unnamed: 0,YEAR,EVENT,SUB_EVENT,GENDER,NAME,RESULT,RANK,CATEGORY_EVENT,COMPETITION,STAGE,HEAT
0,2023,4 x 100m,,Male,"{0: '\xa0', 1: ' Jonathan Nyepa, Khairul Hafiz...",39.36,3.0,Relay,Southeast Asian Games,,
1,2023,4 x 400m,,Male,"{0: '\xa0', 1: ' Muhammad Firdaus Bin Mohamad ...",03:08.8,3.0,Relay,Southeast Asian Games,,
2,2023,4 x 100m,,Female,"{0: '\xa0', 1: ' Azreen Nabila Alias, Nur Afri...",44.58,3.0,Relay,Southeast Asian Games,,
3,2023,4 x 400m,,Female,"{0: '\xa0', 1: ' Sukanya Janchaona, Benny Nont...",03:39.3,3.0,Relay,Southeast Asian Games,,
4,2023,400m,,Male,Frederick Ramirez,46.63,3.0,Sprint,Southeast Asian Games,,
5,2023,"10,000m",,Male,Than Htike Soe,31:25.5,3.0,Long,Southeast Asian Games,,
6,2023,3000m Steeplechase,,Male,Pandu Sukarya,08:55.0,3.0,Steeple,Southeast Asian Games,,
7,2023,110m Hurdles,,Male,John Cabang,13.85,3.0,Hurdles,Southeast Asian Games,,
8,2023,400m Hurdles,,Male,Jun Jie Calvin Quek,50.75,3.0,Hurdles,Southeast Asian Games,,
9,2023,High Jump,,Male,Farrell Glenn Felix,2.15,3.0,Jump,Southeast Asian Games,,


In [717]:
benchmarks.rename(columns = {'RESULT':'BENCHMARK'}, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benchmarks.rename(columns = {'RESULT':'BENCHMARK'}, inplace = True)


In [718]:
benchmarks.drop(['YEAR', 'HEAT', 'NAME', 'RANK', 'CATEGORY_EVENT', 'COMPETITION', 'STAGE'], axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benchmarks.drop(['YEAR', 'HEAT', 'NAME', 'RANK', 'CATEGORY_EVENT', 'COMPETITION', 'STAGE'], axis=1, inplace=True)


In [719]:
benchmarks=benchmarks.reset_index(drop=True)

In [720]:
# Converts any time format into seconds

def convert_time(i, string, metric):

    global output
    
    l=['discus', 'throw', 'jump', 'vault', 'shot']
        
    string=string.lower()
    
   # print('metric', metric)
    
    try:
        
        if 'w' in metric:  # skip marks with illegal wind speeds
            
        #    print('W', metric)
            
            output=''
            
        else:
            
    
            if any(s in string for s in l)==True:
            
                if 'm' in metric:
            
                    metric=metric.replace('m', '')
                    output=float(str(metric))
            
                elif 'GR' in metric:
            
                    metric=metric.replace('GR', '')
                    output=float(str(metric))
                
                
                else:
    
                    output=float(str(metric))
        
            elif string=='':   # no event description at all!
                
                output='' # return nothing
            
                
        
            else:
        
                searchstring = ":"
                searchstring2 = "."
                substring=str(metric)
                count = substring.count(searchstring)
                count2 = substring.count(searchstring2)
            
                if count==0:
                
                    output=float(substring)
            
            
                elif '10,000m' in string and count==2:  # fix erroneous timing format from XX:XX:XX to XX:XX.XX
                
                
                    idx = 5 # 6th character position
                    replacement = "."
                    metric = metric[:idx] + replacement + metric[idx+1:]                
                
                    m,s = metric.split(':')            

                    output = float(datetime.timedelta(minutes=int(m),seconds=float(s)).total_seconds())

                elif '5000m' in string and count==2:  # fix erroneous timing format from XX:XX:XX to XX:XX.XX
                
                
                    idx = 5 # 6th character position
                    replacement = "."
                    metric = metric[:idx] + replacement + metric[idx+1:]                
                
                    m,s = metric.split(':')            

                    output = float(datetime.timedelta(minutes=int(m),seconds=float(s)).total_seconds())

                    
                    
                elif '1500m' in string and count==2:  # fix erroneous timing format from XX:XX:XX to XX:XX.XX
                    
                    if len(substring)==7:  # format is X:XX:XX and not XX:XX:XX 
                        
                        idx = 4 # 5th character position
                        replacement = "."
                        metric = '0' + metric[:idx] + replacement + metric[idx+1:]                
                
                        m,s = metric.split(':')            

                        output = float(datetime.timedelta(minutes=int(m),seconds=float(s)).total_seconds())
                    
                        
                    else:  # format is XX:XX:XX
                        
                        idx = 5 # 5th character position
                        replacement = "."
                        metric = metric[:idx] + replacement + metric[idx+1:]                
                
                        m,s = metric.split(':')            

                        output = float(datetime.timedelta(minutes=int(m),seconds=float(s)).total_seconds())  
             
                elif (type(metric)==datetime.time or type(metric)==datetime.datetime):
                
                                                
                    time=str(metric)
                    h, m ,s = time.split(':')
                    output = float(datetime.timedelta(hours=int(h),minutes=int(m),seconds=float(s)).total_seconds())
            
                                
                elif (count==1 and count2==1):
            
                    m,s = metric.split(':')
                    output = float(datetime.timedelta(minutes=int(m),seconds=float(s)).total_seconds())
                     
                elif (count==1 and count2==2):
                
            
                    metric = metric.replace(".", ":", 1)
            
                    h,m,s = metric.split(':')            
                    output = float(datetime.timedelta(hours=int(h),minutes=int(m),seconds=float(s)).total_seconds())
                
        
                elif (count==2 and count2==0):
                
            
                    h,m,s = metric.split(':')
                    output = float(datetime.timedelta(hours=int(h),minutes=int(m),seconds=float(s)).total_seconds())
  
            

    except:
        
        pass
                
    return output

In [721]:
def process_benchmarks(df):
    
    for i in range(len(df)):

        rowIndex = df.index[i]

        input_string=df.iloc[rowIndex,0]
    
        metric=df.iloc[rowIndex,3]
    
        if metric==None:
        
            continue
        
        out = convert_time(i, input_string, metric)
        
        print(rowIndex, input_string, out)

    
        df.loc[rowIndex, 'Metric'] = out
    
    return df

In [722]:
process_benchmarks(benchmarks)

0 4 x 100m 39.36
1 4 x 400m 188.8
2 4 x 100m 44.58
3 4 x 400m 219.3
4 400m 46.63
5 10,000m 1885.5
6 3000m Steeplechase 535.0
7 110m Hurdles 13.85
8 400m Hurdles 50.75
9 High Jump 2.15
10 Shot Put 17.3
11 400m 53.84
12 1500m 266.3
13 10,000m 2131.0
14 400m Hurdles 59.09
15 Triple Jump 13.46
16 200m 21.02
17 800m 113.9
18 5000m 883.4
19 Discus Throw 50.02
20 Javelin Throw 66.2
21 Decathlon 6891.0
22 800m 129.2
23 5000m 1033.6
24 3000m Steeplechase 660.9
25 High Jump 1.73
26 Discus Throw 45.08
27 Heptathlon 5253.0
28 100m 10.45
29 100m 10.45
30 1500m 239.4
31 Pole Vault 5.2
32 Long Jump 7.62
33 Triple Jump 15.7
34 Hammer Throw 59.76
35 100m 11.75
36 200m 23.6
37 100m Hurdles 13.59
38 Pole Vault 4.0
39 Long Jump 6.02
40 Shot Put 14.44
41 Hammer Throw 49.61
42 Javelin Throw 48.31
43 Marathon 9349.0
44 Marathon 10227.0
45 20km Race Walk 7031.0
46 20km Race Walk 6579.0


Unnamed: 0,EVENT,SUB_EVENT,GENDER,BENCHMARK,Metric
0,4 x 100m,,Male,39.36,39.36
1,4 x 400m,,Male,03:08.8,188.8
2,4 x 100m,,Female,44.58,44.58
3,4 x 400m,,Female,03:39.3,219.3
4,400m,,Male,46.63,46.63
5,"10,000m",,Male,31:25.5,1885.5
6,3000m Steeplechase,,Male,08:55.0,535.0
7,110m Hurdles,,Male,13.85,13.85
8,400m Hurdles,,Male,50.75,50.75
9,High Jump,,Male,2.15,2.15


In [723]:
'''
for i in range(len(benchmarks)):
        
    rowIndex = benchmarks.index[i]

    input_string=benchmarks.iloc[rowIndex,0]
    
    metric=benchmarks.iloc[rowIndex,3]
    
    if metric==None:
        continue
        
    out = convert_time(i, input_string, metric)
    
    print(rowIndex, input_string, out)
     
    benchmarks.loc[rowIndex, 'Metric'] = out
'''

"\nfor i in range(len(benchmarks)):\n        \n    rowIndex = benchmarks.index[i]\n\n    input_string=benchmarks.iloc[rowIndex,0]\n    \n    metric=benchmarks.iloc[rowIndex,3]\n    \n    if metric==None:\n        continue\n        \n    out = convert_time(i, input_string, metric)\n    \n    print(rowIndex, input_string, out)\n     \n    benchmarks.loc[rowIndex, 'Metric'] = out\n"

In [724]:
benchmarks

Unnamed: 0,EVENT,SUB_EVENT,GENDER,BENCHMARK,Metric
0,4 x 100m,,Male,39.36,39.36
1,4 x 400m,,Male,03:08.8,188.8
2,4 x 100m,,Female,44.58,44.58
3,4 x 400m,,Female,03:39.3,219.3
4,400m,,Male,46.63,46.63
5,"10,000m",,Male,31:25.5,1885.5
6,3000m Steeplechase,,Male,08:55.0,535.0
7,110m Hurdles,,Male,13.85,13.85
8,400m Hurdles,,Male,50.75,50.75
9,High Jump,,Male,2.15,2.15


In [725]:
mask = benchmarks['EVENT'].str.contains(r'jump|throw|Pole|put|Jump|Throw|pole|Put|Decathlon|Heptathlon', na=True)

benchmarks.loc[mask, '2%']=benchmarks['Metric']*0.98
benchmarks.loc[mask, '3.5%']=benchmarks['Metric']*0.965
benchmarks.loc[mask, '5%']=benchmarks['Metric']*0.95
benchmarks.loc[mask, '10%']=benchmarks['Metric']*0.90


benchmarks.loc[~mask, '2%']=benchmarks['Metric']*1.02
benchmarks.loc[~mask, '3.5%']=benchmarks['Metric']*1.035
benchmarks.loc[~mask, '5%']=benchmarks['Metric']*1.05
benchmarks.loc[~mask, '10%']=benchmarks['Metric']*1.10


In [726]:
benchmarks['MAPPED_EVENT']=benchmarks['EVENT'].str.strip()

In [727]:
for col in benchmarks.columns:
    benchmarks[col] = benchmarks[col].astype(str)
    benchmarks[col] = benchmarks[col].str.replace('\xa0', ' ', regex=True)
    benchmarks[col] = benchmarks[col].str.replace('[\x00-\x1f\x7f-\x9f]', '', regex=True)
    benchmarks[col] = benchmarks[col].str.replace('\r', ' ', regex=True)
    benchmarks[col] = benchmarks[col].str.replace('\n', ' ', regex=True)
    benchmarks[col] = benchmarks[col].str.strip()


In [728]:
benchmarks.head(50)

Unnamed: 0,EVENT,SUB_EVENT,GENDER,BENCHMARK,Metric,2%,3.5%,5%,10%,MAPPED_EVENT
0,4 x 100m,,Male,39.36,39.36,40.1472,40.73759999999999,41.328,43.296,4 x 100m
1,4 x 400m,,Male,03:08.8,188.8,192.576,195.408,198.24,207.68000000000004,4 x 400m
2,4 x 100m,,Female,44.58,44.58,45.4716,46.1403,46.809,49.038,4 x 100m
3,4 x 400m,,Female,03:39.3,219.3,223.686,226.9755,230.265,241.23,4 x 400m
4,400m,,Male,46.63,46.63,47.5626,48.26205,48.96150000000001,51.293000000000006,400m
5,"10,000m",,Male,31:25.5,1885.5,1923.21,1951.4925,1979.775,2074.05,"10,000m"
6,3000m Steeplechase,,Male,08:55.0,535.0,545.7,553.7249999999999,561.75,588.5,3000m Steeplechase
7,110m Hurdles,,Male,13.85,13.85,14.127,14.334749999999998,14.5425,15.235,110m Hurdles
8,400m Hurdles,,Male,50.75,50.75,51.765,52.52625,53.2875,55.825,400m Hurdles
9,High Jump,,Male,2.15,2.15,2.107,2.07475,2.0425,1.935,High Jump


In [729]:
athletes

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,COMPETITION,DATE,YEAR,REGION,TIMESTAMP,NOW,delta_time,delta_time_conv,event_month,MAPPED_EVENT
0,Shanti Veronica Pereira,11.71,,,7.0,,100m,,,,...,AtleticaGenève,2025-06-21,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,11 days 22:06:35.488019,11,6,100m
1,Shanti Veronica Pereira,23.16w,,,7.0,,200m,,,,...,AtleticaGenève,2025-06-21,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,11 days 22:06:35.488019,11,6,200m
2,Jun Yu Low,4.77,,,5.0,,Pole Vault,,,,...,Jan Dietvorst Memorial,2025-06-21,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,11 days 22:06:35.488019,11,6,Pole Vault
3,Tung Hon Andrew Pak,2.05,,,,,High Jump,,,,...,National Championships,2025-06-25,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,7 days 22:06:35.488019,7,6,High Jump
4,Tam Jong Hng,1.95,,,,,High Jump,,,,...,National Championships,2025-06-25,2025,International,2025-06-29 22:00:26.182128,2025-07-02 22:06:35.488019+00:00,7 days 22:06:35.488019,7,6,High Jump
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31125,"Sueto, Yuito",14.76m,Kansai University of Social We,21,3,Open,Triple Jump,0,,Y04,...,85th Singapore Open Track & Field,2025-04-25,2025,Local,,2025-07-02 22:06:35.488019+00:00,68 days 22:06:35.488019,68,4,Triple Jump
31150,Gabriel Lee,15.15,,,7.0,,Triple Jump,,,,...,Taiwan Athletics Open,2025-06-08,2025,International,2025-06-11 16:38:50.923703,2025-07-02 22:06:35.488019+00:00,24 days 22:06:35.488019,24,6,Triple Jump
31151,Tia Louise Rozario,12.78,,,2.0,,Triple Jump,,,,...,Taiwan Athletics Open,2025-06-08,2025,International,2025-06-11 16:38:50.923703,2025-07-02 22:06:35.488019+00:00,24 days 22:06:35.488019,24,6,Triple Jump
31152,Tan Shou Yi Rei,13.76,,,1,,Triple Jump,,,,...,WA State Championships,2025-03-09,2025,International,,2025-07-02 22:06:35.488019+00:00,115 days 22:06:35.488019,115,3,Triple Jump


In [730]:
athletes[athletes['NAME']=='Heng, Richard']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,COMPETITION,DATE,YEAR,REGION,TIMESTAMP,NOW,delta_time,delta_time_conv,event_month,MAPPED_EVENT


In [731]:
# Merge benchmarks onto athletes on MAPPED_EVENT and GENDER

df = pd.merge(
    left=athletes, 
    right=benchmarks,
    how='left',
    left_on=['MAPPED_EVENT', 'GENDER'],
    right_on=['MAPPED_EVENT', 'GENDER'],
)

In [732]:
df

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,event_month,MAPPED_EVENT,EVENT_y,SUB_EVENT,BENCHMARK,Metric,2%,3.5%,5%,10%
0,Shanti Veronica Pereira,11.71,,,7.0,,100m,,,,...,6,100m,100m,,11.75,11.75,11.985,12.161249999999999,12.3375,12.925
1,Shanti Veronica Pereira,23.16w,,,7.0,,200m,,,,...,6,200m,200m,,23.6,23.6,24.072000000000003,24.426,24.78,25.960000000000004
2,Jun Yu Low,4.77,,,5.0,,Pole Vault,,,,...,6,Pole Vault,Pole Vault,,5.2,5.2,5.096,5.018,4.9399999999999995,4.680000000000001
3,Tung Hon Andrew Pak,2.05,,,,,High Jump,,,,...,6,High Jump,High Jump,,2.15,2.15,2.1069999999999998,2.07475,2.0425,1.935
4,Tam Jong Hng,1.95,,,,,High Jump,,,,...,6,High Jump,High Jump,,2.15,2.15,2.1069999999999998,2.07475,2.0425,1.935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15846,"Sueto, Yuito",14.76m,Kansai University of Social We,21,3,Open,Triple Jump,0,,Y04,...,4,Triple Jump,Triple Jump,,15.7,15.7,15.386,15.1505,14.915,14.129999999999999
15847,Gabriel Lee,15.15,,,7.0,,Triple Jump,,,,...,6,Triple Jump,Triple Jump,,15.7,15.7,15.386,15.1505,14.915,14.129999999999999
15848,Tia Louise Rozario,12.78,,,2.0,,Triple Jump,,,,...,6,Triple Jump,Triple Jump,,13.46,13.46,13.190800000000001,12.988900000000001,12.787,12.114
15849,Tan Shou Yi Rei,13.76,,,1,,Triple Jump,,,,...,3,Triple Jump,Triple Jump,,15.7,15.7,15.386,15.1505,14.915,14.129999999999999


In [733]:
df[df['NAME']=='Caleb Hia']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,event_month,MAPPED_EVENT,EVENT_y,SUB_EVENT,BENCHMARK,Metric,2%,3.5%,5%,10%


In [734]:
df[df['MAPPED_EVENT']=='Decathlon']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,event_month,MAPPED_EVENT,EVENT_y,SUB_EVENT,BENCHMARK,Metric,2%,3.5%,5%,10%
9254,Jayden Ng,5716,,,3,,Decathlon U18,,,,...,3,Decathlon,Decathlon,,6891,6891.0,6753.18,6649.815,6546.45,6201.900000000001


In [735]:
# replace '-' with NaN

df['RESULT'] = df['RESULT'].replace(regex=r'–', value=np.NaN)
#df['SEED'] = df['SEED'].replace(regex=r'–', value=np.NaN)


In [736]:
df

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,event_month,MAPPED_EVENT,EVENT_y,SUB_EVENT,BENCHMARK,Metric,2%,3.5%,5%,10%
0,Shanti Veronica Pereira,11.71,,,7.0,,100m,,,,...,6,100m,100m,,11.75,11.75,11.985,12.161249999999999,12.3375,12.925
1,Shanti Veronica Pereira,23.16w,,,7.0,,200m,,,,...,6,200m,200m,,23.6,23.6,24.072000000000003,24.426,24.78,25.960000000000004
2,Jun Yu Low,4.77,,,5.0,,Pole Vault,,,,...,6,Pole Vault,Pole Vault,,5.2,5.2,5.096,5.018,4.9399999999999995,4.680000000000001
3,Tung Hon Andrew Pak,2.05,,,,,High Jump,,,,...,6,High Jump,High Jump,,2.15,2.15,2.1069999999999998,2.07475,2.0425,1.935
4,Tam Jong Hng,1.95,,,,,High Jump,,,,...,6,High Jump,High Jump,,2.15,2.15,2.1069999999999998,2.07475,2.0425,1.935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15846,"Sueto, Yuito",14.76m,Kansai University of Social We,21,3,Open,Triple Jump,0,,Y04,...,4,Triple Jump,Triple Jump,,15.7,15.7,15.386,15.1505,14.915,14.129999999999999
15847,Gabriel Lee,15.15,,,7.0,,Triple Jump,,,,...,6,Triple Jump,Triple Jump,,15.7,15.7,15.386,15.1505,14.915,14.129999999999999
15848,Tia Louise Rozario,12.78,,,2.0,,Triple Jump,,,,...,6,Triple Jump,Triple Jump,,13.46,13.46,13.190800000000001,12.988900000000001,12.787,12.114
15849,Tan Shou Yi Rei,13.76,,,1,,Triple Jump,,,,...,3,Triple Jump,Triple Jump,,15.7,15.7,15.386,15.1505,14.915,14.129999999999999


In [737]:
df[df['NAME']=='Caleb Hia']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,event_month,MAPPED_EVENT,EVENT_y,SUB_EVENT,BENCHMARK,Metric,2%,3.5%,5%,10%


In [738]:
os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/SEAG/')


df.to_csv('seag_postmap_june25_tz.csv', sep=',', encoding='utf-8-sig', index=False)


In [739]:
# Convert results and seed into seconds format

df.reset_index(drop=True, inplace=True)

for col in df.columns:
    
    df[col] = df[col].astype(str)
    df[col] = df[col].str.replace('\xa0', ' ', regex=True)
    df[col] = df[col].str.replace('[\x00-\x1f\x7f-\x9f]', '', regex=True)
    df[col] = df[col].str.replace('\r', ' ', regex=True)
    df[col] = df[col].str.replace('\n', ' ', regex=True)
    df[col] = df[col].str.strip()

for i in range(len(df)):
    
    result_out=''
    
        
    rowIndex = df.index[i]

    event=df.loc[rowIndex,'MAPPED_EVENT']    # event description
    
    result=df.loc[rowIndex,'RESULT'] # result
    
    if result=='—' or result=='DQ' or result=='SCR' or result=='FS' or result=='DNQ' or result=='DNS' or result=='NH' or result=='NM' or result=='FOUL' or result=='DNF' or result=='SR' :
        continue
    
    result_out = convert_time(i, event, result)
         
    df.loc[rowIndex, 'RESULT_CONV'] = result_out



  df.loc[rowIndex, 'RESULT_CONV'] = result_out


In [740]:
df

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,MAPPED_EVENT,EVENT_y,SUB_EVENT,BENCHMARK,Metric,2%,3.5%,5%,10%,RESULT_CONV
0,Shanti Veronica Pereira,11.71,,,7.0,,100m,,,,...,100m,100m,,11.75,11.75,11.985,12.161249999999999,12.3375,12.925,11.71
1,Shanti Veronica Pereira,23.16w,,,7.0,,200m,,,,...,200m,200m,,23.6,23.6,24.072000000000003,24.426,24.78,25.960000000000004,
2,Jun Yu Low,4.77,,,5.0,,Pole Vault,,,,...,Pole Vault,Pole Vault,,5.2,5.2,5.096,5.018,4.9399999999999995,4.680000000000001,4.77
3,Tung Hon Andrew Pak,2.05,,,,,High Jump,,,,...,High Jump,High Jump,,2.15,2.15,2.1069999999999998,2.07475,2.0425,1.935,2.05
4,Tam Jong Hng,1.95,,,,,High Jump,,,,...,High Jump,High Jump,,2.15,2.15,2.1069999999999998,2.07475,2.0425,1.935,1.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15846,"Sueto, Yuito",14.76m,Kansai University of Social We,21,3,Open,Triple Jump,0,,Y04,...,Triple Jump,Triple Jump,,15.7,15.7,15.386,15.1505,14.915,14.129999999999999,14.76
15847,Gabriel Lee,15.15,,,7.0,,Triple Jump,,,,...,Triple Jump,Triple Jump,,15.7,15.7,15.386,15.1505,14.915,14.129999999999999,15.15
15848,Tia Louise Rozario,12.78,,,2.0,,Triple Jump,,,,...,Triple Jump,Triple Jump,,13.46,13.46,13.190800000000001,12.988900000000001,12.787,12.114,12.78
15849,Tan Shou Yi Rei,13.76,,,1,,Triple Jump,,,,...,Triple Jump,Triple Jump,,15.7,15.7,15.386,15.1505,14.915,14.129999999999999,13.76


In [741]:
# Choose SEED if better than RESULT

#condition1=df['SEED_CONV']>df['RESULT_CONV']
#condition2=((df['CATEGORY_EVENT']=='Jump')|(df['CATEGORY_EVENT']=='Throw'))
#condition3=df['SEED_CONV']<df['RESULT_CONV']
#condition4=~((df['CATEGORY_EVENT']=='Jump')|(df['CATEGORY_EVENT']=='Throw'))


#df['RESULT_BEST']=df['SEED_CONV'].where((condition1 & condition2)|(condition3 & condition4), df['RESULT_CONV'].values)

df['RESULT_BEST'] = df['RESULT_CONV']

In [742]:
df

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,EVENT_y,SUB_EVENT,BENCHMARK,Metric,2%,3.5%,5%,10%,RESULT_CONV,RESULT_BEST
0,Shanti Veronica Pereira,11.71,,,7.0,,100m,,,,...,100m,,11.75,11.75,11.985,12.161249999999999,12.3375,12.925,11.71,11.71
1,Shanti Veronica Pereira,23.16w,,,7.0,,200m,,,,...,200m,,23.6,23.6,24.072000000000003,24.426,24.78,25.960000000000004,,
2,Jun Yu Low,4.77,,,5.0,,Pole Vault,,,,...,Pole Vault,,5.2,5.2,5.096,5.018,4.9399999999999995,4.680000000000001,4.77,4.77
3,Tung Hon Andrew Pak,2.05,,,,,High Jump,,,,...,High Jump,,2.15,2.15,2.1069999999999998,2.07475,2.0425,1.935,2.05,2.05
4,Tam Jong Hng,1.95,,,,,High Jump,,,,...,High Jump,,2.15,2.15,2.1069999999999998,2.07475,2.0425,1.935,1.95,1.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15846,"Sueto, Yuito",14.76m,Kansai University of Social We,21,3,Open,Triple Jump,0,,Y04,...,Triple Jump,,15.7,15.7,15.386,15.1505,14.915,14.129999999999999,14.76,14.76
15847,Gabriel Lee,15.15,,,7.0,,Triple Jump,,,,...,Triple Jump,,15.7,15.7,15.386,15.1505,14.915,14.129999999999999,15.15,15.15
15848,Tia Louise Rozario,12.78,,,2.0,,Triple Jump,,,,...,Triple Jump,,13.46,13.46,13.190800000000001,12.988900000000001,12.787,12.114,12.78,12.78
15849,Tan Shou Yi Rei,13.76,,,1,,Triple Jump,,,,...,Triple Jump,,15.7,15.7,15.386,15.1505,14.915,14.129999999999999,13.76,13.76


In [743]:
df[df['NAME']=='Caleb Hia']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,EVENT_y,SUB_EVENT,BENCHMARK,Metric,2%,3.5%,5%,10%,RESULT_CONV,RESULT_BEST


In [744]:
# Change to numeric

df[['2%', '3.5%', '5%', '10%', 'RESULT_BEST', 'Metric']] = df[['2%', '3.5%', '5%', '10%', 'RESULT_BEST', 'Metric']].apply(pd.to_numeric, errors='coerce')

In [745]:
mask = df['CATEGORY_EVENT'].str.contains(r'Jump|Throw|jump|throw|Decathlon|Heptathlon|decathlon|heptathlon', na=True)

df.loc[mask, 'Delta2'] = df['RESULT_BEST']-df['2%']
df.loc[mask, 'Delta3.5'] = df['RESULT_BEST']-df['3.5%']
df.loc[mask, 'Delta5'] = df['RESULT_BEST']-df['5%']
df.loc[mask, 'Delta10'] = df['RESULT_BEST']-df['10%']
df.loc[mask, 'Delta_Benchmark'] = df['RESULT_BEST']-df['Metric']

df.loc[~mask, 'Delta2'] =  df['2%'] - df['RESULT_BEST']
df.loc[~mask, 'Delta3.5'] = df['3.5%'] - df['RESULT_BEST']
df.loc[~mask, 'Delta5'] = df['5%'] - df['RESULT_BEST']
df.loc[~mask, 'Delta10'] = df['10%'] - df['RESULT_BEST']
df.loc[~mask, 'Delta_Benchmark'] = df['Metric'] - df['RESULT_BEST']

df=df.loc[df['COMPETITION']!='Southeast Asian Games'] # Do not include results from SEAG in dataset

In [746]:
# Performance metric to filter out athletes

df['PERF_SCALAR']=df['Delta5']/df['Metric']*100

In [747]:
os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/SEAG/')


df.to_csv('seag_postmap_benchmarked_june25_tz.csv', sep=',', encoding='utf-8-sig', index=False)


In [748]:
df[df['MAPPED_EVENT']=='10,000m']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,5%,10%,RESULT_CONV,RESULT_BEST,Delta2,Delta3.5,Delta5,Delta10,Delta_Benchmark,PERF_SCALAR
23,Shaun Goh,32:21.0,,,16,,"10,000m",,,,...,1979.775,2074.05,1941.0,1941.0,-17.79,10.4925,38.775,133.05,-55.5,2.056484
24,SOH RUI YONG GUILLAUME,31:11.4,INDIVIDUAL,33,133,,10000m,,,,...,1979.775,2074.05,1871.4,1871.4,51.81,80.0925,108.375,202.65,14.1,5.747812
25,DANIEL LEOW SOON YE,35:10:00,INDIVIDUAL,27,7,,10000m,,,,...,1979.775,2074.05,2110.0,2110.0,-186.79,-158.5075,-130.225,-35.95,-224.5,-6.906656
13311,"Heng Chin Kiat, Richard",34:42.3,National University Singapore,0,2,Open,Run,10000,,,...,1979.775,2074.05,2082.3,2082.3,-159.09,-130.8075,-102.525,-8.25,-196.8,-5.437550
13312,"Tay, Zi Xiang",39:25.0,Temasek Polytechnic,0,10,Open,Run,10000,,,...,1979.775,2074.05,2365.0,2365.0,-441.79,-413.5075,-385.225,-290.95,-479.5,-20.430920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14814,"Tan, Bernice",44:25.6,Lacticbuds,25,3,Open,Run,10000,,B075C00,...,2237.550,2344.10,2665.6,2665.6,-491.98,-460.0150,-428.050,-321.50,-534.6,-20.086814
14820,"Petingko, Nofeldi",32:03.7,Indonesia,27,1,Open,Run,10000,,N655298,...,1979.775,2074.05,1923.7,1923.7,-0.49,27.7925,56.075,150.35,-38.2,2.974012
14829,"INTHAKUMMAN, LODKEO",38:58.9,Laos,30,1,Open,Run,10000,,L95,...,2237.550,2344.10,2338.9,2338.9,-165.28,-133.3150,-101.350,5.20,-207.9,-4.755983
14857,"Goh, Shing Ling",39:21.6,TeamFabian,26,2,Open,Run,10000,,S268G99,...,2237.550,2344.10,2361.6,2361.6,-187.98,-156.0150,-124.050,-17.50,-230.6,-5.821211


In [749]:
df

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,5%,10%,RESULT_CONV,RESULT_BEST,Delta2,Delta3.5,Delta5,Delta10,Delta_Benchmark,PERF_SCALAR
0,Shanti Veronica Pereira,11.71,,,7.0,,100m,,,,...,12.3375,12.925,11.71,11.71,0.2750,0.45125,0.6275,1.215,0.04,5.340426
1,Shanti Veronica Pereira,23.16w,,,7.0,,200m,,,,...,24.7800,25.960,,,,,,,,
2,Jun Yu Low,4.77,,,5.0,,Pole Vault,,,,...,4.9400,4.680,4.77,4.77,-0.3260,-0.24800,-0.1700,0.090,-0.43,-3.269231
3,Tung Hon Andrew Pak,2.05,,,,,High Jump,,,,...,2.0425,1.935,2.05,2.05,-0.0570,-0.02475,0.0075,0.115,-0.10,0.348837
4,Tam Jong Hng,1.95,,,,,High Jump,,,,...,2.0425,1.935,1.95,1.95,-0.1570,-0.12475,-0.0925,0.015,-0.20,-4.302326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15846,"Sueto, Yuito",14.76m,Kansai University of Social We,21,3,Open,Triple Jump,0,,Y04,...,14.9150,14.130,14.76,14.76,-0.6260,-0.39050,-0.1550,0.630,-0.94,-0.987261
15847,Gabriel Lee,15.15,,,7.0,,Triple Jump,,,,...,14.9150,14.130,15.15,15.15,-0.2360,-0.00050,0.2350,1.020,-0.55,1.496815
15848,Tia Louise Rozario,12.78,,,2.0,,Triple Jump,,,,...,12.7870,12.114,12.78,12.78,-0.4108,-0.20890,-0.0070,0.666,-0.68,-0.052006
15849,Tan Shou Yi Rei,13.76,,,1,,Triple Jump,,,,...,14.9150,14.130,13.76,13.76,-1.6260,-1.39050,-1.1550,-0.370,-1.94,-7.356688


In [750]:

# Read a variation name list and corrections from CSVs
'''
os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/OCTC/')

names = pd.read_csv("name_variations.csv")

for index, row in names.iterrows():
        
    print(names.VARIATION, names.NAME)
    df['NAME'] = df['NAME'].replace(regex=rf"{row['VARIATION']}", value=f"{row['NAME']}")
'''

'\nos.chdir(\'/Users/veesheenyuen/Desktop/DataScience/SAA/OCTC/\')\n\nnames = pd.read_csv("name_variations.csv")\n\nfor index, row in names.iterrows():\n        \n    print(names.VARIATION, names.NAME)\n    df[\'NAME\'] = df[\'NAME\'].replace(regex=rf"{row[\'VARIATION\']}", value=f"{row[\'NAME\']}")\n'

In [751]:
# Read name variations from GCS name lists bucket (Still in beta)


df['NAME'] = df['NAME'].str.replace('\xa0', '', regex=True)
df['NAME'] = df['NAME'].str.replace('[\x00-\x1f\x7f-\x9f]', '', regex=True)
df['NAME'] = df['NAME'].str.replace('\r', '', regex=True)
df['NAME'] = df['NAME'].str.replace('\n', '', regex=True)
df['NAME'] = df['NAME'].str.strip()

df['NAME'] = df['NAME'].str.casefold()  # everything lower case (NEW)


# Read csv from GCS bucket

file_path = "gs://name_variations/name_variations.csv"
names = pd.read_csv(file_path,
                 sep=",",
                 storage_options={"token": '/Users/veesheenyuen/Desktop/DataScience/Keys/saa-analytics-7c8937b70609.json'})

# Iterate over dataframe and replace names

names['VARIATION'] = names['VARIATION'].str.replace('\xa0', '', regex=True)
names['VARIATION'] = names['VARIATION'].str.replace('[\x00-\x1f\x7f-\x9f]', '', regex=True)
names['VARIATION'] = names['VARIATION'].str.replace('\r', '', regex=True)
names['VARIATION'] = names['VARIATION'].str.replace('\n', '', regex=True)
names['VARIATION'] = names['VARIATION'].str.strip()


names['VARIATION'] = names['VARIATION'].str.casefold() # convert to lower case (NEW)
names['NAME'] = names['NAME'].str.casefold() # convert to lower case (NEW)


for index, row in names.iterrows():
        
    df['NAME'] = df['NAME'].replace(regex=rf"{row['VARIATION']}", value=f"{row['NAME']}")

    
    
df['NAME'] = df['NAME'].str.title()  # capitalize first letter (NEW)


In [752]:
# Exclude foreigners from MALAYSIA, THAILAND etc.

#df_select = df[(df['TEAM']!='Malaysia') & (df['TEAM']!='THAILAND') & (df['TEAM']!='China') & (df['TEAM']!='South Korea') & (df['TEAM']!='Laos') & (df['TEAM']!='Philippines') & (df['TEAM']!='Piboonbumpen Thailand') & (df['TEAM']!='Chinese Taipei') & (df['TEAM']!='Gurkha Contingent') & (df['TEAM']!='Australia') & (df['TEAM']!='Piboonbumpen Thailand') & (df['TEAM']!='Hong Kong') & (df['TEAM']!='PERAK')] 

df_select = df[(df['TEAM']!='Malaysia')&(df['TEAM']!='THAILAND')&(df['TEAM']!='China')&(df['TEAM']!='Thailand') 
                       &(df['TEAM']!='South Korea')&(df['TEAM']!='Laos')&(df['TEAM']!='Myanmar') 
                       &(df['TEAM']!='Philippines')&(df['TEAM']!='Piboonbumpen Thailand') 
                       &(df['TEAM']!='Chinese Taipei')&(df['TEAM']!='Gurkha Contingent') 
                       &(df['TEAM']!='Australia')&(df['TEAM']!='Piboonbumpen Thailand') 
                       &(df['TEAM']!='Hong Kong')&(df['TEAM']!='PERAK')&(df['TEAM']!='Sri Lanka') 
                       &(df['TEAM']!='Indonesia')&(df['TEAM']!='THAILAND')&(df['TEAM']!='MALAYSIA') 
                       &(df['TEAM']!='PHILIPPINES') & (df['TEAM']!='SOUTH KOREA')&(df['TEAM']!='Waseda') 
                       &(df['TEAM']!='LAOS')&(df['TEAM']!='CHINESE TAIPEI')&(df['TEAM']!='Vietnam')
                       &(df['TEAM']!='INDIA')&(df['TEAM']!='Hong Kong, China')&(df['TEAM']!='AIC JAPAN')
                       &(df['NATIONALITY']!='GBR')&(df['NATIONALITY']!='JPN')&(df['NATIONALITY']!='SRI')&(df['NATIONALITY']!='SAM')
                       &(df['NATIONALITY']!='THA')&(df['NATIONALITY']!='IND')] 

In [753]:
df_select

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,5%,10%,RESULT_CONV,RESULT_BEST,Delta2,Delta3.5,Delta5,Delta10,Delta_Benchmark,PERF_SCALAR
0,"Pereira, Veronica Shanti",11.71,,,7.0,,100m,,,,...,12.3375,12.925,11.71,11.71,0.2750,0.45125,0.6275,1.215,0.04,5.340426
1,"Pereira, Veronica Shanti",23.16w,,,7.0,,200m,,,,...,24.7800,25.960,,,,,,,,
2,"Low, Jun Yu",4.77,,,5.0,,Pole Vault,,,,...,4.9400,4.680,4.77,4.77,-0.3260,-0.24800,-0.1700,0.090,-0.43,-3.269231
3,"Pak, Andrew",2.05,,,,,High Jump,,,,...,2.0425,1.935,2.05,2.05,-0.0570,-0.02475,0.0075,0.115,-0.10,0.348837
4,"Tam Jong-Hng, Joash",1.95,,,,,High Jump,,,,...,2.0425,1.935,1.95,1.95,-0.1570,-0.12475,-0.0925,0.015,-0.20,-4.302326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15845,Lee Gabriel Jin Yi,15.20m,Singapore,22,2,Open,Triple Jump,0,,G897C03,...,14.9150,14.130,15.2,15.20,-0.1860,0.04950,0.2850,1.070,-0.50,1.815287
15847,Lee Gabriel Jin Yi,15.15,,,7.0,,Triple Jump,,,,...,14.9150,14.130,15.15,15.15,-0.2360,-0.00050,0.2350,1.020,-0.55,1.496815
15848,"Rozario, Tia Louise",12.78,,,2.0,,Triple Jump,,,,...,12.7870,12.114,12.78,12.78,-0.4108,-0.20890,-0.0070,0.666,-0.68,-0.052006
15849,Tan Shou Yi Rei,13.76,,,1,,Triple Jump,,,,...,14.9150,14.130,13.76,13.76,-1.6260,-1.39050,-1.1550,-0.370,-1.94,-7.356688


In [754]:
df_select[df_select['NAME']=='LEE, VANESSA']

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,5%,10%,RESULT_CONV,RESULT_BEST,Delta2,Delta3.5,Delta5,Delta10,Delta_Benchmark,PERF_SCALAR


In [755]:
'''
# Read list of foreigners

foreigners = pd.read_csv('/Users/veesheenyuen/Desktop/DataScience/SAA/MM/List of Foreigners.csv', encoding='latin-1')
'''


"\n# Read list of foreigners\n\nforeigners = pd.read_csv('/Users/veesheenyuen/Desktop/DataScience/SAA/MM/List of Foreigners.csv', encoding='latin-1')\n"

In [756]:
# Read list of foreigners from GCS bucket

file_path = "gs://name_lists/List of Foreigners.csv"
foreigners = pd.read_csv(file_path,
                 sep=",",
                 encoding="unicode escape",
                 storage_options={"token": '/Users/veesheenyuen/Desktop/DataScience/Keys/saa-analytics-7c8937b70609.json'})


In [757]:
foreigners

Unnamed: 0,LAST_NAME,FIRST_NAME
0,Aaryan,Greuter Christoph
1,Akahodani,Takayuki
2,Apondar,Audric
3,Brooks,Ruby
4,Brouwer,Cees
...,...,...
235,Kashama,Biwesa Daniel
236,ISMAIL,MUHAMMAD ZULFIQAR
237,Jayaganeson,Kirtisha
238,LIN,Yu Sian


In [758]:
foreigners['V1'] = foreigners['LAST_NAME']+' '+foreigners['FIRST_NAME']
foreigners['V2'] = foreigners['FIRST_NAME']+' '+foreigners['LAST_NAME']
foreigners['V3'] = foreigners['LAST_NAME']+', '+foreigners['FIRST_NAME']
foreigners['V4'] = foreigners['FIRST_NAME']+' '+foreigners['LAST_NAME']

for1 = foreigners['V1'].dropna().tolist()
for2 = foreigners['V2'].dropna().tolist()
for3 = foreigners['V3'].dropna().tolist()
for4 = foreigners['V4'].dropna().tolist()

foreign_list = for1+for2+for3+for4 

foreign_list_casefold=[s.casefold() for s in foreign_list]

exclusions = foreign_list_casefold

no_foreigners_list = df_select.loc[~df['NAME'].str.casefold().isin(exclusions)]  # ~ means NOT IN. DROP spex carded athletes

In [759]:
no_foreigners_list

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,UNIQUE_ID,...,5%,10%,RESULT_CONV,RESULT_BEST,Delta2,Delta3.5,Delta5,Delta10,Delta_Benchmark,PERF_SCALAR
0,"Pereira, Veronica Shanti",11.71,,,7.0,,100m,,,,...,12.3375,12.925,11.71,11.71,0.2750,0.45125,0.6275,1.215,0.04,5.340426
1,"Pereira, Veronica Shanti",23.16w,,,7.0,,200m,,,,...,24.7800,25.960,,,,,,,,
2,"Low, Jun Yu",4.77,,,5.0,,Pole Vault,,,,...,4.9400,4.680,4.77,4.77,-0.3260,-0.24800,-0.1700,0.090,-0.43,-3.269231
3,"Pak, Andrew",2.05,,,,,High Jump,,,,...,2.0425,1.935,2.05,2.05,-0.0570,-0.02475,0.0075,0.115,-0.10,0.348837
4,"Tam Jong-Hng, Joash",1.95,,,,,High Jump,,,,...,2.0425,1.935,1.95,1.95,-0.1570,-0.12475,-0.0925,0.015,-0.20,-4.302326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15845,Lee Gabriel Jin Yi,15.20m,Singapore,22,2,Open,Triple Jump,0,,G897C03,...,14.9150,14.130,15.2,15.20,-0.1860,0.04950,0.2850,1.070,-0.50,1.815287
15847,Lee Gabriel Jin Yi,15.15,,,7.0,,Triple Jump,,,,...,14.9150,14.130,15.15,15.15,-0.2360,-0.00050,0.2350,1.020,-0.55,1.496815
15848,"Rozario, Tia Louise",12.78,,,2.0,,Triple Jump,,,,...,12.7870,12.114,12.78,12.78,-0.4108,-0.20890,-0.0070,0.666,-0.68,-0.052006
15849,Tan Shou Yi Rei,13.76,,,1,,Triple Jump,,,,...,14.9150,14.130,13.76,13.76,-1.6260,-1.39050,-1.1550,-0.370,-1.94,-7.356688


In [760]:
# Choose the best performing event for each athlete

#top_performers_clean = excluded_list.sort_values(['NAME','PERF_SCALAR'],ascending=False).groupby('NAME').head(1) # Choose top performing event per NAME


In [761]:
# Choose the best result for each event participated by every athlete

top_performers_clean = no_foreigners_list.sort_values(['MAPPED_EVENT', 'NAME','PERF_SCALAR'],ascending=False).groupby(['MAPPED_EVENT', 'NAME']).head(1)


In [762]:
top_performers_clean.reset_index(inplace=True)


In [763]:
os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/SEAG/')


top_performers_clean.to_csv('seag_top_performers_prod_june25_tz.csv', encoding='utf-8')

In [764]:
top_performers_clean

Unnamed: 0,index,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,...,5%,10%,RESULT_CONV,RESULT_BEST,Delta2,Delta3.5,Delta5,Delta10,Delta_Benchmark,PERF_SCALAR
0,15542,Zhou Xuanyu,9.12,DHS,,10.0,C,Triple Jump,,,...,12.787,12.114,9.12,9.12,-4.0708,-3.8689,-3.667,-2.994,-4.34,-27.243685
1,15468,"Zhong, Chuhan",12.26,,,,,Triple Jump,,,...,12.787,12.114,12.26,12.26,-0.9308,-0.7289,-0.527,0.146,-1.20,-3.915305
2,15624,"Zheng, Justin De",10.47m,National Junior College,14,12,U15,Triple Jump,0,,...,14.915,14.130,10.47,10.47,-4.9160,-4.6805,-4.445,-3.660,-5.23,-28.312102
3,15526,"Zhao, Daniel",11.68,HCI,,3.0,C,Triple Jump,,,...,14.915,14.130,11.68,11.68,-3.7060,-3.4705,-3.235,-2.450,-4.02,-20.605096
4,15554,"Zhang, Xinyu",8.45,SNG,,16.0,C,Triple Jump,,,...,12.787,12.114,8.45,8.45,-4.7408,-4.5389,-4.337,-3.664,-5.01,-32.221397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9772,5276,"., Shaik Isa",15.49,Club ZOOM,8,32,U9,Dash,80,,...,,,,,,,,,,
9773,5336,"., Nur Elena",16.42,UNA,6,1,U7,Dash,80,,...,,,,,,,,,,
9774,12474,"., Dharkshitha",08:35.6,Cedar Girls Secondary School,15,1,U18,Race Walk,1500,,...,,,,,,,,,,
9775,12131,"., Cayden",06:05.1,North Vista,14,54,Open,Mile Run,1,,...,,,,,,,,,,


In [765]:
'''
# Join 2024 best results for each event for each athlete

os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/OCTC/2023/')


df_yoy = pd.read_csv("Best_results_2023.csv")

df_yoy['NAME'] = df_yoy['NAME'].str.replace('\xa0', '', regex=True)
df_yoy['NAME'] = df_yoy['NAME'].str.replace('[\x00-\x1f\x7f-\x9f]', '', regex=True)
df_yoy['NAME'] = df_yoy['NAME'].str.replace('\r', '', regex=True)
df_yoy['NAME'] = df_yoy['NAME'].str.replace('\n', '', regex=True)
df_yoy['NAME'] = df_yoy['NAME'].str.strip()
'''

'\n# Join 2024 best results for each event for each athlete\n\nos.chdir(\'/Users/veesheenyuen/Desktop/DataScience/SAA/OCTC/2023/\')\n\n\ndf_yoy = pd.read_csv("Best_results_2023.csv")\n\ndf_yoy[\'NAME\'] = df_yoy[\'NAME\'].str.replace(\'\xa0\', \'\', regex=True)\ndf_yoy[\'NAME\'] = df_yoy[\'NAME\'].str.replace(\'[\x00-\x1f\x7f-\x9f]\', \'\', regex=True)\ndf_yoy[\'NAME\'] = df_yoy[\'NAME\'].str.replace(\'\r\', \'\', regex=True)\ndf_yoy[\'NAME\'] = df_yoy[\'NAME\'].str.replace(\'\n\', \'\', regex=True)\ndf_yoy[\'NAME\'] = df_yoy[\'NAME\'].str.strip()\n'

In [766]:
#df_yoy

In [767]:
'''

# Merge benchmarks onto athletes on MAPPED_EVENT and GENDER

yoy_performance = pd.merge(
    left=df_yoy, 
    right=top_performers_clean,
    how='left',
    left_on=['EVENT', 'GENDER', 'NAME'],
    right_on=['MAPPED_EVENT', 'GENDER', 'NAME'],
)

'''

"\n\n# Merge benchmarks onto athletes on MAPPED_EVENT and GENDER\n\nyoy_performance = pd.merge(\n    left=df_yoy, \n    right=top_performers_clean,\n    how='left',\n    left_on=['EVENT', 'GENDER', 'NAME'],\n    right_on=['MAPPED_EVENT', 'GENDER', 'NAME'],\n)\n\n"

In [768]:
#yoy_performance

In [769]:
'''

os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/OCTC/')


yoy_performance.to_csv('yoy_performance_prod.csv', index=False, encoding='utf-8')

'''

"\n\nos.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/OCTC/')\n\n\nyoy_performance.to_csv('yoy_performance_prod.csv', index=False, encoding='utf-8')\n\n"

In [770]:
# Choose best performance for each event

#tiered_performers = top_performers_clean.sort_values(['GENDER', 'MAPPED_EVENT', 'PERF_SCALAR'],ascending=False).groupby(['MAPPED_EVENT', 'NAME']).head(1)

tiered_performers = top_performers_clean


In [771]:
tiered_performers

Unnamed: 0,index,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,...,5%,10%,RESULT_CONV,RESULT_BEST,Delta2,Delta3.5,Delta5,Delta10,Delta_Benchmark,PERF_SCALAR
0,15542,Zhou Xuanyu,9.12,DHS,,10.0,C,Triple Jump,,,...,12.787,12.114,9.12,9.12,-4.0708,-3.8689,-3.667,-2.994,-4.34,-27.243685
1,15468,"Zhong, Chuhan",12.26,,,,,Triple Jump,,,...,12.787,12.114,12.26,12.26,-0.9308,-0.7289,-0.527,0.146,-1.20,-3.915305
2,15624,"Zheng, Justin De",10.47m,National Junior College,14,12,U15,Triple Jump,0,,...,14.915,14.130,10.47,10.47,-4.9160,-4.6805,-4.445,-3.660,-5.23,-28.312102
3,15526,"Zhao, Daniel",11.68,HCI,,3.0,C,Triple Jump,,,...,14.915,14.130,11.68,11.68,-3.7060,-3.4705,-3.235,-2.450,-4.02,-20.605096
4,15554,"Zhang, Xinyu",8.45,SNG,,16.0,C,Triple Jump,,,...,12.787,12.114,8.45,8.45,-4.7408,-4.5389,-4.337,-3.664,-5.01,-32.221397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9772,5276,"., Shaik Isa",15.49,Club ZOOM,8,32,U9,Dash,80,,...,,,,,,,,,,
9773,5336,"., Nur Elena",16.42,UNA,6,1,U7,Dash,80,,...,,,,,,,,,,
9774,12474,"., Dharkshitha",08:35.6,Cedar Girls Secondary School,15,1,U18,Race Walk,1500,,...,,,,,,,,,,
9775,12131,"., Cayden",06:05.1,North Vista,14,54,Open,Mile Run,1,,...,,,,,,,,,,


In [772]:
# Identify Tier 1/2/3 performers

#top_performers_clean['TIER'] = np.where((top_performers_clean['Delta_Benchmark']>=0), 'Tier 1',    
#                                np.where(((top_performers_clean['Delta_Benchmark']<0) & (top_performers_clean['Delta2']>=0)), 'Tier2',
#                                np.where(((top_performers_clean['Delta2']<0) & (top_performers_clean['Delta3.5']>=0)), 'Tier3', ' ')))


tiered_performers['TIER'] = np.where((tiered_performers['Delta_Benchmark']>=0), 'Tier 1',    
                                np.where(((tiered_performers['Delta_Benchmark']<0) & (tiered_performers['Delta2']>=0)), 'Tier 2',
                                np.where(((tiered_performers['Delta2']<0) & (tiered_performers['Delta3.5']>=0)), 'Tier 3',
                                np.where(((tiered_performers['Delta3.5']<0) & (tiered_performers['Delta5']>=0)), 'Tier 4',
                                np.where(((tiered_performers['Delta5']<0) & (tiered_performers['Delta10']>=0)), 'Tier 5', ' ')))))



In [773]:
tiered_performers[tiered_performers['MAPPED_EVENT']=='5000m']

Unnamed: 0,index,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,...,10%,RESULT_CONV,RESULT_BEST,Delta2,Delta3.5,Delta5,Delta10,Delta_Benchmark,PERF_SCALAR,TIER
2508,14580,"Zong, Darrell",16:30.6,Singapore University of Techno,25,4,Open,Run,5000,,...,971.74,990.6,990.60,-89.532,-76.281,-63.03,-18.86,-107.20,-7.134933,
2509,14103,"Zhi Yan, Lim",20:44.5,Oldham Athletics,22,27,Open,Run,5000,,...,971.74,1244.5,1244.50,-343.432,-330.181,-316.93,-272.76,-361.10,-35.876160,
2510,13612,"Yip, Wan Hoi",19:36.5,EROVRA CLUB,24,28,Open,Run,5000,,...,971.74,1176.5,1176.50,-275.432,-262.181,-248.93,-204.76,-293.10,-28.178628,
2511,13788,"Yip, Tony",19:50.5,Singapore Institute of Managem,24,20,Open,Run,5000,,...,971.74,1190.5,1190.50,-289.432,-276.181,-262.93,-218.76,-307.10,-29.763414,
2512,14674,"Yeong, Jia Wen Jonathan",16:59.4,Hwa Chong Alumni Association,22,6,Open,Run,5000,,...,971.74,1019.4,1019.40,-118.332,-105.081,-91.83,-47.66,-136.00,-10.395065,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2650,4049,Ayden Tan Chee Yew,17:47.35,VJC,,7.0,A,5000m,,,...,971.74,1067.35,1067.35,-166.282,-153.031,-139.78,-95.61,-183.95,-15.822957,
2651,13336,"Aravinth, Adarsh",16:07.4,National University Singapore,0,1,Open,Run,5000,,...,971.74,967.4,967.40,-66.332,-53.081,-39.83,4.34,-84.00,-4.508716,Tier 5
2652,13599,"Ang, Tze See",23:42.3,Ngee Ann Polytechnic,17,18,Open,Run,5000,,...,1136.96,1422.3,1422.30,-368.028,-352.524,-337.02,-285.34,-388.70,-32.606424,
2653,14690,"Ahmed, Nawaz",17:19.5,Lacticbuds,22,7,Open,Run,5000,,...,971.74,1039.5,1039.50,-138.432,-125.181,-111.93,-67.76,-156.10,-12.670365,


In [774]:
# Drop rows without a SEAG benchmark

final_df = tiered_performers[tiered_performers['BENCHMARK'].notna()]


In [775]:
# Process dates to extract age

# Map NSG divisions into age

mask = (final_df['DIVISION'].str.contains(r'A', na=False))
final_df.loc[mask, 'AGE'] = '18.5'

mask = (final_df['DIVISION'].str.contains(r'B', na=False))
final_df.loc[mask, 'AGE'] = '16'

mask = (final_df['DIVISION'].str.contains(r'C', na=False))
final_df.loc[mask, 'AGE'] = '13.5'

mask = (final_df['DIVISION'].str.contains(r'O', na=False))
final_df.loc[mask, 'AGE'] = '12'



In [776]:
def length(string):

    B = ''
    year = ''

    try:

        length = len(string)

        if length == 2:

            string = '19' + string

        elif length == 1:

            string = ''

        else:

            pass

        if string is not None or len(string) != 1:

            B = parser.parse(string, dayfirst=True)
                        
    except:

        pass

    return B


final_df['DOB_new'] = final_df['DOB'].apply(length)



#B = parser.parse("10-09-2021", dayfirst = True)

In [777]:
final_df['DOB_new'] = pd.to_datetime(final_df['DOB_new'], errors='coerce')

final_df['year_extract']=final_df['DOB_new'].dt.strftime('%Y')

final_df['year_extract'] = pd.to_numeric(final_df['year_extract'])

final_df['age_extract'] = 2025 - final_df['year_extract']


In [778]:
def age(number):  # correct negative age numbers
    
    if number<0:
        
        number+=100
        
    return number


final_df['age_extract']=final_df['age_extract'].apply(age)


In [779]:
# If NSG event then choose AGE otherwise choose age_extract

condition1 = final_df['COMPETITION']=='National School Games'
#condition2=((df['CATEGORY_EVENT']=='Jump')|(df['CATEGORY_EVENT']=='Throw'))
#condition3=df['SEED_CONV']<df['RESULT_CONV']
#condition4=~((df['CATEGORY_EVENT']=='Jump')|(df['CATEGORY_EVENT']=='Throw'))


final_df['age_extract'] = final_df['AGE'].where((condition1), final_df['age_extract'].values)


In [780]:
# Change to numeric

final_df[['age_extract']] = final_df[['age_extract']].apply(pd.to_numeric)

In [781]:
final_df[final_df['NAME']=='Marican, Shohib']

Unnamed: 0,index,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,...,Delta2,Delta3.5,Delta5,Delta10,Delta_Benchmark,PERF_SCALAR,TIER,DOB_new,year_extract,age_extract
2576,13588,"Marican, Shohib",16:52.2,Macritchie Runners 25,12,8,Open,Run,5000,,...,-111.132,-97.881,-84.63,-40.46,-128.8,-9.580032,,1997-05-19,1997.0,28.0
5588,14818,"Marican, Shohib",04:15.5,Macritchie Runners 25,12,12,Open,Run,1500,,...,-11.312,-7.721,-4.13,7.84,-16.1,-1.725146,Tier 5,1997-05-19,1997.0,28.0


In [782]:
os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/SEAG/')


final_df.to_csv('seag_tiered_performers_June30_tz.csv', encoding='utf-8')

In [108]:
# Rank everyone for published ranking lists

published_ranking = final_df.sort_values(['MAPPED_EVENT','GENDER','PERF_SCALAR'], ascending=[False, False, False])
published_ranking['Rank'] = published_ranking.groupby(['GENDER', 'MAPPED_EVENT']).cumcount() + 1

published_ranking.to_csv('published_ranking_prod.csv', encoding='utf-8')

In [109]:
# Rank everyone for octc selection

all_ranking = final_df.sort_values(['MAPPED_EVENT','GENDER','PERF_SCALAR'], ascending=[False, False, False])
all_ranking['Rank'] = all_ranking.groupby(['GENDER', 'MAPPED_EVENT', 'TIER']).cumcount() + 1


In [565]:
# Convert time format for marathon and 5000m into mm:ss.00
# Choose the correct column indices or you will get erratic timings

import datetime

#s=247.779

#datetime.datetime.fromtimestamp(s).strftime('%M:%S.%f')

all_ranking=all_ranking.reset_index(drop=True)


#all_ranking[['2%', '3.5%', '5%']] = df[['2%', '3.5%', '5%']].apply(pd.to_numeric)


#all_ranking['2%'] = all_ranking['2%'].astype("string")
#all_ranking['3.5%'] = all_ranking['3.5%'].astype("string")
#all_ranking['5%'] = all_ranking['5%'].astype("string")


for i in range(len(all_ranking)):
        
    rowIndex = all_ranking.index[i]

    event=all_ranking.iloc[rowIndex,21]
        
    
    time_base2=all_ranking.iloc[rowIndex,25]
    time_base3=all_ranking.iloc[rowIndex,26]
    time_base5=all_ranking.iloc[rowIndex,27]
    
        
    if metric==None:
        continue
        
    if event=='800m' or event=='10,000m' or event=='5000m' or event=='3000m Steeplechase' or event=='1500m':
        
      #  print(i, event, time_base2, time_base3, time_base5)

        
        

            
        
        date_preconvert2 = datetime.datetime.utcfromtimestamp(time_base2)
        date_preconvert3 = datetime.datetime.utcfromtimestamp(time_base3)
        date_preconvert5 = datetime.datetime.utcfromtimestamp(time_base5)
        
    #    print(date_preconvert2, date_preconvert3, date_preconvert5)
            
        
        output2 = datetime.datetime.strftime(date_preconvert2, "%M:%S.%f")
        output3 = datetime.datetime.strftime(date_preconvert3, "%M:%S.%f")
        output5 = datetime.datetime.strftime(date_preconvert5, "%M:%S.%f")
            
     #   print(event, output2, output3, output5)

                    
       #     top_performers_clean.loc[rowIndex, '2%_timing'] = output2
       #     top_performers_clean.loc[rowIndex, '3.5%_timing'] = output3
       #     top_performers_clean.loc[rowIndex, '5%_timing'] = output5
            
   
        all_ranking.at[rowIndex, '2%'] = output2 # copy over time format
        all_ranking.at[rowIndex, '3.5%'] = output3
        all_ranking.at[rowIndex, '5%'] = output5


            


        
    elif event=='Marathon':
        
      #  print(time_base2, time_base3, time_base5)

        
        try:
            

        
            date_preconvert2 = datetime.datetime.utcfromtimestamp(time_base2)
            date_preconvert3 = datetime.datetime.utcfromtimestamp(time_base3)
            date_preconvert5 = datetime.datetime.utcfromtimestamp(time_base5)

            
            
            output2 = datetime.datetime.strftime(date_preconvert2, "%H:%M:%S")
            output3 = datetime.datetime.strftime(date_preconvert3, "%H:%M:%S")
            output5 = datetime.datetime.strftime(date_preconvert5, "%H:%M:%S")

            
        
        #    top_performers_clean.loc[rowIndex, '2%_timing'] = output2
        #    top_performers_clean.loc[rowIndex, '3.5%_timing'] = output3
        #    top_performers_clean.loc[rowIndex, '5%_timing'] = output5
            
            all_ranking.at[rowIndex, '2%'] = output2 # copy over time format
            all_ranking.at[rowIndex, '3.5%'] = output3
            all_ranking.at[rowIndex, '5%'] = output5

            
         #   print('output', output2, output3, output5)


        
        except:
            
            pass
                        
             


In [566]:
all_ranking

Unnamed: 0,index,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,...,Delta2,Delta3.5,Delta5,Delta_Benchmark,PERF_SCALAR,TIER,DOB_new,year_extract,age_extract,Rank
0,32796,LEE GABRIEL JIN YI,15.67m,NUS,12,1,Open,Triple Jump,0,,...,0.284,0.5195,0.755,-0.03,4.808917,Tier 2,2003-02-23,2003.0,22.0,1
1,32799,"MEDINA, ANDREW GEORGE",15.43,,,,,Triple Jump,,,...,0.044,0.2795,0.515,-0.27,3.280255,Tier 2,NaT,,,2
2,33352,"Kapil, Arnav",15.03m,WINGS ATHLETICS CLUB,12,1,Open,Triple Jump,0,,...,-0.356,-0.1205,0.115,-0.67,0.732484,Tier 4,2003-08-25,2003.0,22.0,1
3,32842,Tan Shou Yi Rei (Chen Shouyi),14.99m,Raffles Institution JC,17,1,U20,Triple Jump,0,,...,-0.396,-0.1605,0.075,-0.71,0.477707,Tier 4,2008-12-05,2008.0,17.0,2
4,33306,"Sueto, Yuito",14.76m,Kansai University of Social We,12,3,Open,Triple Jump,0,,...,-0.626,-0.3905,-0.155,-0.94,-0.987261,,2004-01-06,2004.0,21.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18390,16467,"., Rasyiqah",31.51,Seng Kang Primary School,10,30,U11,Dash,150,,...,,,,,,,2014-01-20,2014.0,11.0,1728
18391,11025,"., Nur Elena",16.42,UNA,6,1,U7,Dash,80,,...,,,,,,,2018-12-04,2018.0,7.0,1729
18392,16747,"., Nur Amelia",16.64,Seng Kang Primary School,10,63,U11,Dash,80,,...,,,,,,,2014-03-12,2014.0,11.0,1730
18393,26372,"., Dharkshitha",12:46.5,Cedar Girls Secondary School,14,5,U15,Race Walk,1500,,...,,,,,,,2010-04-06,2010.0,15.0,1731


## Apply Rule E - Two Athletes per Tier, 3rd placing and below move down one tier

In [567]:
# Apply Rule E, two per tier only, therefore move 3rd and 4th placings down one tier

#all_ranking['TIER_ADJ'] = np.where(
#                                ((all_ranking['TIER']=='Tier 1') & (all_ranking['Rank']==3)), 'Tier 2',    
#                                np.where(
#                                ((all_ranking['TIER']=='Tier 1') & (all_ranking['Rank']==4)), 'Tier2',
#                                np.where(
#                                ((all_ranking['TIER']=='Tier2') & (all_ranking['Rank']==3)), 'Tier3', 
#                                np.where(
#                                ((all_ranking['TIER']=='Tier2') & (all_ranking['Rank']==4)), 'Tier3', ' ')
#                                
#                                )))


#all_ranking['TIER_ADJ'] = np.where(
#                                ((all_ranking['TIER']=='Tier 1') & (all_ranking['Rank']==3)), 'Tier 2',    
#                                np.where(
#                                ((all_ranking['TIER']=='Tier 1') & (all_ranking['Rank']==4)), 'Tier 2',
#                                np.where(
#                                ((all_ranking['TIER']=='Tier 2') & (all_ranking['Rank']==3)), 'Tier 3', 
#                                np.where(
#                                ((all_ranking['TIER']=='Tier 2') & (all_ranking['Rank']==4)), 'Tier 3', 
#                                np.where(
#                                ((all_ranking['TIER']=='Tier 2') & (all_ranking['Rank']==5)), 'Tier 3', 
#                                np.where(  
#                                ((all_ranking['TIER']=='Tier 2') & (all_ranking['Rank']==6)), 'Tier 3', 
#                                np.where(    
#                                ((all_ranking['TIER']=='Tier 3') & (all_ranking['Rank']==3)), 'Tier 4', 
#                                np.where(                             
#                                ((all_ranking['TIER']=='Tier 3') & (all_ranking['Rank']==4)), 'Tier 4', 
#                                np.where(                             
#                                ((all_ranking['TIER']=='Tier 3') & (all_ranking['Rank']==5)), 'Tier 4', 
#                                np.where(                             
#                                ((all_ranking['TIER']=='Tier 3') & (all_ranking['Rank']==6)), 'Tier 4', 
#                                np.where(                             
#                                
#                                ((all_ranking['TIER']=='Tier 3') & (all_ranking['Rank']==4)), 'Tier 4', all_ranking['TIER'])
#                                
#                               ))))))))))

all_ranking['TIER_ADJ'] = np.where(
                                ((all_ranking['TIER']=='Tier 1') & (all_ranking['Rank']==3)), 'Tier 2',    
                                np.where(
                                ((all_ranking['TIER']=='Tier 1') & (all_ranking['Rank']>=4)), 'Tier 2',
                                np.where(
                                ((all_ranking['TIER']=='Tier 2') & (all_ranking['Rank']==3)), 'Tier 3', 
                                np.where(
                                ((all_ranking['TIER']=='Tier 2') & (all_ranking['Rank']>=4)), 'Tier 3', 
                                np.where(                             
                                ((all_ranking['TIER']=='Tier 3') & (all_ranking['Rank']==3)), 'Tier 4', 
                                np.where(                             
                                ((all_ranking['TIER']=='Tier 3') & (all_ranking['Rank']>=4)), 'Tier 4', all_ranking['TIER']) 
                
                                )))))



In [568]:
all_ranking.to_csv('octc_all_ranking.csv', encoding='utf-8')

In [569]:
# Re-rank based on MAPPED_EVENT, GENDER, TIER_ADJ & PERF_SCALAR

rerank = all_ranking.sort_values(['MAPPED_EVENT','GENDER','TIER_ADJ', 'PERF_SCALAR'], ascending=[False, False, False, False])
rerank['Rank_ADJ'] = rerank.groupby(['MAPPED_EVENT', 'GENDER', 'TIER_ADJ']).cumcount() + 1


In [570]:
rerank_filtered=rerank[(rerank['TIER_ADJ']!=' ') & (rerank['TIER_ADJ']!='Tier 4')]

In [571]:
rerank_filtered

Unnamed: 0,index,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,...,Delta5,Delta_Benchmark,PERF_SCALAR,TIER,DOB_new,year_extract,age_extract,Rank,TIER_ADJ,Rank_ADJ
0,32796,LEE GABRIEL JIN YI,15.67m,NUS,12,1,Open,Triple Jump,0,,...,0.7550,-0.03,4.808917,Tier 2,2003-02-23,2003.0,22.0,1,Tier 2,1
1,32799,"MEDINA, ANDREW GEORGE",15.43,,,,,Triple Jump,,,...,0.5150,-0.27,3.280255,Tier 2,NaT,,,2,Tier 2,2
230,32761,"ROZARIO, TIA LOUISE",13.01,,,1,,Triple Jump,,,...,0.2230,-0.45,1.656761,Tier 3,2000-10-14,2000.0,25.0,1,Tier 3,1
559,25935,"LOW, JUN YU",5.34,,,1,,Pole Vault,,,...,0.4000,0.14,7.692308,Tier 1,2001-04-21,2001.0,24.0,1,Tier 1,1
751,25763,"CHAN, JEREMY WEI LUN",2:40:21,INDIVIDUAL,35,1064,,Marathon,,,...,195.4500,-272.00,2.090598,Tier 3,1989-05-05,1989.0,36.0,1,Tier 3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12682,17333,"PEREIRA, VERONICA SHANTI",11.45,Singapore,12,1,Open,Dash,100,,...,0.8875,0.30,7.553191,Tier 1,1996-09-20,1996.0,29.0,1,Tier 1,1
12683,17438,"TAN, ELIZABETH-ANN",11.73,Singapore,12,3,Open,Dash,100,,...,0.6075,0.02,5.170213,Tier 1,2003-09-23,2003.0,22.0,2,Tier 1,2
14149,5,SOH RUI YONG GUILLAUME,31:11.4,INDIVIDUAL,33,133,,10000m,,,...,108.3750,14.10,5.747812,Tier 1,1991-05-05,1991.0,34.0,1,Tier 1,1
14198,2,Chui Ling Goh,36:29.7,,,8,,"10,000m",,,...,47.8500,-58.70,2.245425,Tier 3,1992-11-27,1992.0,33.0,1,Tier 3,1


In [572]:
# Check this output to ensure every new record upon upload has an event mapping and perf_scalar calc

rerank_filtered.to_csv('octc_rerank_filtered.csv', encoding='utf-8')

In [782]:
tuple = list(zip(rerank_filtered['NAME'], rerank_filtered['MAPPED_EVENT']))

In [783]:
tuple

[('LEE GABRIEL JIN YI', 'Triple Jump'),
 ('MEDINA, ANDREW GEORGE', 'Triple Jump'),
 ('ROZARIO, TIA LOUISE', 'Triple Jump'),
 ('LOW, JUN YU', 'Pole Vault'),
 ('CHAN, JEREMY WEI LUN', 'Marathon'),
 ('DANIEL LEOW SOON YE', 'Marathon'),
 ('He Yong', 'Marathon'),
 ('Yaohan Melvin Wong', 'Marathon'),
 ('TAN, AARON JUSTIN WEN JIE', 'Marathon'),
 ('Wei Xiang Gordon Lim', 'Marathon'),
 ('SOH RUI YONG GUILLAUME', 'Marathon'),
 ('HIA, CALEB', 'Marathon'),
 ('GOH SHING LING', 'Marathon'),
 ('MEDINA, ANDREW', 'Long Jump'),
 ('EMERY, CONRAD', 'Long Jump'),
 ('ROZARIO, TIA LOUISE', 'Long Jump'),
 ('CHLOE CHEE EN-YA', 'Long Jump'),
 ('Kampton Kam', 'High Jump'),
 ('GOH, AMELIA', 'High Jump'),
 ('MICHELLE SNG', 'High Jump'),
 ('YEE, CHUN WAI, ERIC', 'Discus Throw'),
 ('Lee, Tzu Yun', 'Discus Throw'),
 ('Shawn, Chia', '800m'),
 ('Fiore, Oliver', '800m'),
 ('nan', '800m'),
 ('ZUBIN PERCY MUNCHERJI', '800m'),
 ('JIE CONG JAYDEN, TAN', '800m'),
 ('LIM, OLIVER', '800m'),
 ('THANA RAJAN, THIRUBEN  S/O', '800

In [784]:
os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/OCTC/')

mar25 = pd.read_csv("rerank_filtered_mar25.csv")

In [785]:
mar25

Unnamed: 0,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,DISTANCE,EVENT_CLASS,UNIQUE_ID,DOB,...,5%,RESULT_CONV,RESULT_BEST,Delta2,Delta3.5,Delta5,Delta_Benchmark,PERF_SCALAR,TIER_ADJ,Rank_ADJ
0,LEE GABRIEL JIN YI,15.67m,NUS,12.0,1.0,Open,0.0,,L435B06,23/2/03,...,14.915,15.67,15.67,0.2840,0.51950,0.7550,-0.03,4.808917,Tier 2,1
1,"MEDINA, ANDREW GEORGE",15.43,,,,,,,,2,...,14.915,15.43,15.43,0.0440,0.27950,0.5150,-0.27,3.280255,Tier 2,2
2,"ROZARIO, TIA LOUISE",13.01,,,1.0,,,,,14-Oct-00,...,12.787,13.01,13.01,-0.1808,0.02110,0.2230,-0.45,1.656761,Tier 3,1
3,Anson Loh Ding Rong,17.2,,,1.0,,,5kg,,8,...,16.435,17.20,17.20,0.2460,0.50550,0.7650,-0.10,4.421965,Tier 2,1
4,"LOW, JUN YU",5.34,,,1.0,,,,,21-Apr-01,...,4.94,5.34,5.34,0.2440,0.32200,0.4000,0.14,7.692308,Tier 1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,"TAN, ELIZABETH-ANN",11.99,SINGAPORE,12.0,1.0,Open,100.0,,,23/9/03,...,12.3375,11.99,11.99,-0.0050,0.17125,0.3475,-0.24,2.957447,Tier 3,1
65,"JAIGANTH, LAAVINIA",12.15,Ngee Ann Polytechnic,12.0,1.0,Open,100.0,,L435B06,22/1/06,...,12.3375,12.15,12.15,-0.1650,0.01125,0.1875,-0.40,1.595745,Tier 3,2
66,"PEREIRA, VERONICA SHANTI",11.47,,,1.0,,,,,20-Sep-96,...,12.3375,11.47,11.47,0.5150,0.69125,0.8675,0.28,7.382979,Tier 1,1
67,SOH RUI YONG GUILLAUME,31:11.4,INDIVIDUAL,33.0,133.0,,,,,1991,...,32:59.8,1871.40,1871.40,51.8100,80.09250,108.3750,14.10,5.747812,Tier 1,1


In [3244]:
os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/OCTC/')

dec24 = pd.read_csv("octc_rule_2d_cut_dec24.csv")
mar25 = pd.read_csv("octc_rule_2d_cut_mar25.csv")

In [3245]:
compare = datacompy.Compare(
    dec24,
    mar25,
    join_columns=['NAME', 'MAPPED_EVENT'],  #You can also specify a list of columns
    rel_tol=0, #Optional, defaults to 0
    )

In [3246]:
compare.matches(ignore_extra_columns=False)

print(compare.report())


DataComPy Comparison
--------------------

DataFrame Summary
-----------------

  DataFrame  Columns  Rows
0       df1        5    50
1       df2        5    17

Column Summary
--------------

Number of columns in common: 5
Number of columns in df1 but not in df2: 0 []
Number of columns in df2 but not in df1: 0 []

Row Summary
-----------

Matched on: name, mapped_event
Any duplicates on match values: No
Absolute Tolerance: 0
Relative Tolerance: 0
Number of rows in common: 9
Number of rows in df1 but not in df2: 41
Number of rows in df2 but not in df1: 8

Number of rows with some compared columns unequal: 7
Number of rows with all compared columns equal: 2

Column Comparison
-----------------

Number of columns compared with some values unequal: 3
Number of columns compared with all values equal: 2
Total number of values which compare unequal: 14

Columns with Unequal Values or Types
------------------------------------

        Column df1 dtype df2 dtype  # Unequal  Max Diff  # Null D

In [2814]:
print(compare.df2_unq_rows)

                     name  mapped_event  result_best tier_adj  final_rank
2        CHLOE CHEE EN-YA     Long Jump         5.90   Tier 2         2.0
3           Chua, Garrett  110m Hurdles        14.25   Tier 3         3.0
16              Jayden Ng     Decathlon      5716.00   Tier 1         1.0
31            Low, Nicole         5000m      1041.30   Tier 2         2.0
51        Tak Yeung Leung         5000m       903.90   Tier 3         2.0
53          Tate Tan Fung          100m        10.50   Tier 2         2.0
54  Thiruben, Thana Rajan          800m       113.70   Tier 2         2.0
55             YAN, ETHAN         1500m       243.90   Tier 3         3.0


## Apply Rule 3 - If athlete qualifies for more than one event, the higher tier event is selected

In [2141]:
octc_rule_3 = rerank_filtered.sort_values(['NAME','PERF_SCALAR'],ascending=False).groupby('NAME').head(1)

In [2142]:
octc_rule_3

Unnamed: 0,index,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,...,Delta5,Delta_Benchmark,PERF_SCALAR,TIER,DOB_new,year_extract,age_extract,Rank,TIER_ADJ,Rank_ADJ
5570,25335,"{'545': 'Zulkhair, Muhd', '546': 'TAN, DARYL',...",41.06,Team Cicada Trackers,12,1,Open,Relay,400,,...,0.2680,-1.70,0.680894,Tier 4,NaT,,,2,Tier 4,2
5263,25248,"{'442': 'Chauhan, Aarya', '455': 'RYAN, PRAHAR...",03:18.1,Oldham Athletics,12,1,Open,Relay,1600,,...,0.1400,-9.30,0.074153,Tier 4,NaT,,,1,Tier 4,1
5689,25077,"{'183': 'LEE, MARK REN', '159': 'RYAN, PRAHARS...",40.88,SINGAPORE,12,1,Open,Relay,400,,...,0.4480,-1.52,1.138211,Tier 4,NaT,,,1,Tier 4,1
5712,5158,"{'135': 'Koe, Ian', '136': 'GAN, IAN', '139': ...",41.27,Trackstar Athletics,12,1,Open,Relay,400,,...,0.0580,-1.91,0.147358,Tier 4,2015-04-19,2015.0,10.0,4,Tier 4,4
5717,4689,"{'1335': 'TAN, DARYL HONG AN', '1336': 'Teo, Y...",41.1,Singapore,12,1,Open,Relay,400,Trial,...,0.2280,-1.74,0.579268,Tier 4,2010-03-21,2010.0,15.0,3,Tier 4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3562,1157,"Aswin, Kumar",01:54.9,Oldham Athletics,19,1,U20,Run,800,,...,4.6950,-1.00,4.122037,Tier 2,2006-04-13,2006.0,19.0,2,Tier 2,3
10072,23506,"Ashok, Tharun",10.81,Ngee Ann Polytechnic,12,4,Open,Dash,100,,...,0.1625,-0.36,1.555024,Tier 3,2005-07-02,2005.0,20.0,10,Tier 4,8
498,7059,Anson Loh Ding Rong,17.2,,,1,,Shot Put,,5kg,...,0.7650,-0.10,4.421965,Tier 2,NaT,,,1,Tier 2,1
8319,27741,"Adarsh, Aravinth",04:07.1,National University Singapore,12,2,Open,Run,1500,,...,4.2700,-7.70,1.783626,Tier 3,2001-10-13,2001.0,24.0,4,Tier 4,2


In [2143]:
octc_rule_3.to_csv('octc_rule_3.csv', encoding='utf-8')

## Apply Rule 2d - max 3 per event unless 100m/400m then it is max 6

In [2796]:
#octc_rule_2d = octc_rule_3[(   
#    ((octc_rule_3['MAPPED_EVENT']=='400m')|(octc_rule_3['MAPPED_EVENT']=='100m'))&((octc_rule_3['TIER_ADJ']=='Tier 1')|(octc_rule_3['TIER_ADJ']=='Tier 2')|(octc_rule_3['TIER_ADJ']=='Tier 3'))&((octc_rule_3['Rank_ADJ']==1)|(octc_rule_3['Rank_ADJ']==2))
#    |
#    (
#        ~((octc_rule_3['MAPPED_EVENT']=='400m')|(octc_rule_3['MAPPED_EVENT']=='100m'))&((octc_rule_3['TIER_ADJ']=='Tier 1')|(octc_rule_3['TIER_ADJ']=='Tier 2')|(octc_rule_3['TIER_ADJ']=='Tier 3'))&((octc_rule_3['Rank_ADJ']==1)|(octc_rule_3['Rank_ADJ']==2))
#    )
#    )]

octc_rule_2d = rerank_filtered[(   
    ((rerank_filtered['MAPPED_EVENT']=='400m')|(rerank_filtered['MAPPED_EVENT']=='100m'))&((rerank_filtered['TIER_ADJ']=='Tier 1')|(rerank_filtered['TIER_ADJ']=='Tier 2')|(rerank_filtered['TIER_ADJ']=='Tier 3'))&((rerank_filtered['Rank_ADJ']==1)|(rerank_filtered['Rank_ADJ']==2))
    |
    (
        ~((rerank_filtered['MAPPED_EVENT']=='400m')|(rerank_filtered['MAPPED_EVENT']=='100m'))&((rerank_filtered['TIER_ADJ']=='Tier 1')|(rerank_filtered['TIER_ADJ']=='Tier 2')|(rerank_filtered['TIER_ADJ']=='Tier 3'))&((rerank_filtered['Rank_ADJ']==1)|(rerank_filtered['Rank_ADJ']==2))
    )
    )]



In [2797]:
octc_rule_2d.to_csv('octc_rule_2d_prelim.csv', encoding='utf-8')

In [2798]:
# Rank again so we can choose top 6 for 100m/400m and top 3 for all other events

octc_rule_2d = octc_rule_2d.sort_values(['MAPPED_EVENT','GENDER','PERF_SCALAR'], ascending=[False, False, False])
octc_rule_2d['Final_Rank'] = octc_rule_2d.groupby(['MAPPED_EVENT', 'GENDER']).cumcount() + 1


In [2799]:
octc_rule_2d

Unnamed: 0,index,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,...,Delta_Benchmark,PERF_SCALAR,TIER,DOB_new,year_extract,age_extract,Rank,TIER_ADJ,Rank_ADJ,Final_Rank
0,23997,LEE GABRIEL JIN YI,15.67m,NUS,12,1,Open,Triple Jump,0,,...,-0.03,4.808917,Tier 2,2003-02-23,2003.0,22.0,1,Tier 2,1,1
1,6986,"MEDINA, ANDREW GEORGE",15.43,,,,,Triple Jump,,,...,-0.27,3.280255,Tier 2,NaT,,,2,Tier 2,2,2
203,6945,"ROZARIO, TIA LOUISE",13.01,,,1,,Triple Jump,,,...,-0.45,1.656761,Tier 3,2000-10-14,2000.0,25.0,1,Tier 3,1,1
404,7059,Anson Loh Ding Rong,17.2,,,1,,Shot Put,,5kg,...,-0.10,4.421965,Tier 2,NaT,,,1,Tier 2,1,1
502,7035,"LOW, JUN YU",5.34,,,1,,Pole Vault,,,...,0.14,7.692308,Tier 1,2001-04-21,2001.0,24.0,1,Tier 1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9102,6924,"PEREIRA, VERONICA SHANTI",11.47,,,1,,100m,,,...,0.28,7.382979,Tier 1,1996-09-20,1996.0,29.0,1,Tier 1,1,1
9103,24052,"TAN, ELIZABETH-ANN",11.99,SINGAPORE,12,1,Open,Dash,100,,...,-0.24,2.957447,Tier 3,2003-09-23,2003.0,22.0,1,Tier 3,1,2
9104,22622,"JAIGANTH, LAAVINIA",12.15,Ngee Ann Polytechnic,12,1,Open,Dash,100,,...,-0.40,1.595745,Tier 3,2006-01-22,2006.0,19.0,2,Tier 3,2,3
9759,6000,SOH RUI YONG GUILLAUME,31:11.4,INDIVIDUAL,33,133,,10000m,,,...,14.10,5.747812,Tier 1,1991-04-10,1991.0,34.0,1,Tier 1,1,1


In [2800]:
# Choose 3/6 for each event

octc_rule_2d = octc_rule_2d[(((octc_rule_2d['MAPPED_EVENT']=='400m')|(octc_rule_2d['MAPPED_EVENT']=='100m'))&(octc_rule_2d['Final_Rank']<7))|(~((octc_rule_2d['MAPPED_EVENT']=='400m')|(octc_rule_2d['MAPPED_EVENT']=='100m'))&(octc_rule_2d['Final_Rank']<4))]


In [2801]:
octc_rule_2d.to_csv('octc_rule_2d_final.csv', encoding='utf-8')

# Top 8 Performers

In [820]:
# Filter top 8 performances for each event and gender

top_eight_performers = tiered_performers.sort_values(by=['MAPPED_EVENT', 'GENDER', 'PERF_SCALAR'],ascending=[False, False, False]).groupby(['GENDER', 'MAPPED_EVENT']).head(8).reset_index(drop=True)

top_eight_performers

Unnamed: 0,index,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,...,3.5%,5%,RESULT_CONV,RESULT_BEST,Delta2,Delta3.5,Delta5,Delta_Benchmark,PERF_SCALAR,TIER
0,23028,LEE GABRIEL JIN YI,15.67m,NUS,21,1,Open,Triple Jump,0,,...,15.1505,14.915,15.67,15.67,0.284,0.5195,0.755,-0.03,4.808917,Tier2
1,5676,"MEDINA, ANDREW GEORGE",15.43,,,,,Triple Jump,,,...,15.1505,14.915,15.43,15.43,0.044,0.2795,0.515,-0.27,3.280255,Tier2
2,23153,"Kapil, Arnav",15.03m,WINGS ATHLETICS CLUB,21,1,Open,Triple Jump,0,,...,15.1505,14.915,15.03,15.03,-0.356,-0.1205,0.115,-0.67,0.732484,Tier4
3,912,"Kumar, Viresh",14.36m,Victoria Junior College,18,1,U20,Triple Jump,0,,...,15.1505,14.915,14.36,14.36,-1.026,-0.7905,-0.555,-1.34,-3.535032,
4,5838,Tan Shou Yi Rei (Chen Shouyi),14.12,,,1,,Triple Jump,,,...,15.1505,14.915,14.12,14.12,-1.266,-1.0305,-0.795,-1.58,-5.063694,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,3678,"{'90': 'CHEW, CAERISSE KAI YI', '1438': 'TNG, ...",12:22.0,Dunman High School,10,2,B Division,Relay,3200,,...,,,742.0,742.00,,,,,,
451,3639,"{'879': 'WONG YU XIM, ANDREA', '884': 'YONG, S...",01:54.3,Nanyang Girls' High School,10,2,C Division,Relay,800,,...,,,114.3,114.30,,,,,,
452,3654,"{'870': 'PUAR MIN, QUINN', '872': 'CHONG KATE ...",01:53.3,Nanyang Girls' High School,10,1,B Division,Relay,800,,...,,,113.3,113.30,,,,,,
453,3491,"{'813': 'LEA, PANG SUI ON', '814': 'CHARYNE, C...",37.74,CHIJ Primary (Toa Payoh),10,19,Junior 2,Relay,200,,...,,,37.74,37.74,,,,,,


In [831]:
top_eight_performers.to_csv('top_8_prod.csv', encoding='utf-8')

# Apply OCTC Rules (OLD)

## OCTC Rule 4 - if athlete qualifies in 2 events, choose the better performing one

In [822]:
# Change to numeric

final_df[['2%', '3.5%', '5%', 'RESULT_CONV']] = final_df[['2%', '3.5%', '5%', 'RESULT_CONV']].apply(pd.to_numeric)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[['2%', '3.5%', '5%', 'RESULT_CONV']] = final_df[['2%', '3.5%', '5%', 'RESULT_CONV']].apply(pd.to_numeric)


In [823]:
# Select qualifiers based on 5% tolerance

#octc_rule3 = octc_rule4.loc[(((octc_rule4['CATEGORY_EVENT']=='Mid')|(octc_rule4['CATEGORY_EVENT']=='Sprint')|(octc_rule4['CATEGORY_EVENT']=='Long')|(octc_rule4['CATEGORY_EVENT']=='Hurdles')|(octc_rule4['CATEGORY_EVENT']=='Walk')|(octc_rule4['CATEGORY_EVENT']=='Relay')|(octc_rule4['CATEGORY_EVENT']=='Marathon')|(octc_rule4['CATEGORY_EVENT']=='Steeple')|(octc_rule4['CATEGORY_EVENT']=='Pentathlon')|(octc_rule4['CATEGORY_EVENT']=='Heptathlon')|(octc_rule4['CATEGORY_EVENT']=='Triathlon'))&(octc_rule4['RESULT_CONV'] <= octc_rule4['5%']) & (octc_rule4['age_extract']<40) & ((octc_rule4['MAPPED_EVENT']!='Marathon')|(octc_rule4['age_extract']<60) & (octc_rule4['MAPPED_EVENT']=='Marathon')))|(((octc_rule4['CATEGORY_EVENT']=='Jump')|(octc_rule4['CATEGORY_EVENT']=='Throw'))&(octc_rule4['RESULT_CONV'] >= octc_rule4['5%']) & (octc_rule4['age_extract']<40) & ((octc_rule4['MAPPED_EVENT']!='Marathon')|(octc_rule4['age_extract']<60) & (octc_rule4['MAPPED_EVENT']=='Marathon')))]

octc_rule4 = final_df.loc[
    (
    ((final_df['CATEGORY_EVENT']=='Mid')|(final_df['CATEGORY_EVENT']=='Sprint')|(final_df['CATEGORY_EVENT']=='Long')|(final_df['CATEGORY_EVENT']=='Hurdles')|(final_df['CATEGORY_EVENT']=='Walk')|(final_df['CATEGORY_EVENT']=='Relay')|(final_df['CATEGORY_EVENT']=='Marathon')|(final_df['CATEGORY_EVENT']=='Steeple')|(final_df['CATEGORY_EVENT']=='Pentathlon')|(final_df['CATEGORY_EVENT']=='Heptathlon')|(final_df['CATEGORY_EVENT']=='Triathlon'))
    
    & (final_df['RESULT_CONV'] <= final_df['5%']) 
    
#    & (octc_rule4['age_extract']<40) 
    
#    & ((octc_rule4['MAPPED_EVENT']!='Marathon')|(octc_rule4['age_extract']<60) & (octc_rule4['MAPPED_EVENT']=='Marathon'))
    )  
    |(
        ((final_df['CATEGORY_EVENT']=='Jump')|(final_df['CATEGORY_EVENT']=='Throw'))
        
    & (final_df['RESULT_CONV'] >= final_df['5%']) 
#    & (octc_rule4['age_extract']<40) 
#    & ((octc_rule4['MAPPED_EVENT']!='Marathon')|(octc_rule4['age_extract']<60) & (octc_rule4['MAPPED_EVENT']=='Marathon'))
    )
    ]

In [824]:
octc_rule4.to_csv('octc_rule4_prod.csv', encoding='utf-8')

In [825]:
#top_performers_final = top_performers_sorted.sort_values(by=['MAPPED_EVENT', 'GENDER', 'PERF_SCALAR'], ascending=[False, False, False], na_position='last')
octc_rule4_final = octc_rule4.sort_values(['NAME','PERF_SCALAR'],ascending=False).groupby('NAME').head(1)

In [826]:
# Rank list of athletes by group after sorting

#octc_rule3['Rank'] = (octc_rule3.sort_values(by=['MAPPED_EVENT', 'GENDER', 'PERF_SCALAR'], ascending=[False, False, True])['MAPPED_EVENT', 'GENDER']
#                .rank(method='first', ascending=False)
#             )


#octc_rule3 = octc_rule3.sort_values(['MAPPED_EVENT','GENDER','PERF_SCALAR'])
#octc_rule3['Rank'] = octc_rule3.groupby(['MAPPED_EVENT', 'GENDER']).cumcount() + 1

octc_rule4_final = octc_rule4_final.sort_values(['MAPPED_EVENT','GENDER','PERF_SCALAR'], ascending=[False, False, False])
octc_rule4_final['Rank'] = octc_rule4_final.groupby(['MAPPED_EVENT', 'GENDER']).cumcount() + 1



In [827]:
octc_rule4_final

Unnamed: 0,index,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,...,Delta2,Delta3.5,Delta5,Delta_Benchmark,PERF_SCALAR,TIER,DOB_new,year_extract,age_extract,Rank
234,23028,LEE GABRIEL JIN YI,15.67m,NUS,12,1,Open,Triple Jump,0,,...,0.284,0.51950,0.7550,-0.03,4.808917,Tier2,2003-02-23,2003.0,22.0,1
173,5676,"MEDINA, ANDREW GEORGE",15.43,,,,,Triple Jump,,,...,0.044,0.27950,0.5150,-0.27,3.280255,Tier2,NaT,,,2
248,23153,"Kapil, Arnav",15.03m,WINGS ATHLETICS CLUB,12,1,Open,Triple Jump,0,,...,-0.356,-0.12050,0.1150,-0.67,0.732484,Tier4,2003-08-25,2003.0,22.0,3
539,5725,"LOW, JUN YU",5.34,,,1,,Pole Vault,,,...,0.244,0.32200,0.4000,0.14,7.692308,Tier 1,2001-04-21,2001.0,24.0,1
625,5655,SOH RUI YONG GUILLAUME,2:24:07,,,41,,Marathon,,,...,888.980,1029.21500,1169.4500,702.00,12.508824,Tier 1,1991-08-06,1991.0,34.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10270,1695,"Ng, Bryan",10.95,Wings Athletics Club,12,5,Open,Dash,100,,...,-0.291,-0.13425,0.0225,-0.50,0.215311,Tier4,2000-10-11,2000.0,25.0,22
10652,177,"Lee Shyen, Joshua",10.96,TeamFabian,17,1,U18,Dash,100,,...,-0.301,-0.14425,0.0125,-0.51,0.119617,Tier4,2008-09-12,2008.0,17.0,23
10162,5614,"PEREIRA, VERONICA SHANTI",11.47,,,1,,100m,,,...,0.515,0.69125,0.8675,0.28,7.382979,Tier 1,1996-09-20,1996.0,29.0,1
9881,23083,"TAN, ELIZABETH-ANN",11.99,SINGAPORE,12,1,Open,Dash,100,,...,-0.005,0.17125,0.3475,-0.24,2.957447,Tier3,2003-09-23,2003.0,22.0,2


In [828]:
octc_rule4_final.to_csv('octc_rule4_final.csv', encoding='utf-8')

## OCTC Rule 3 - Where top athlete is >30 yrs old (except marathon), to include next athlete as well (below 30)

In [257]:
# Apply Rule 3 & is within 5% performance band

octc_rule3 = octc_rule4_final.loc[(((octc_rule4['CATEGORY_EVENT']=='Mid')|(octc_rule4['CATEGORY_EVENT']=='Sprint')|(octc_rule4['CATEGORY_EVENT']=='Long')|(octc_rule4['CATEGORY_EVENT']=='Hurdles')|(octc_rule4['CATEGORY_EVENT']=='Walk')|(octc_rule4['CATEGORY_EVENT']=='Relay')|(octc_rule4['CATEGORY_EVENT']=='Marathon')|(octc_rule4['CATEGORY_EVENT']=='Steeple')|(octc_rule4['CATEGORY_EVENT']=='Pentathlon')|(octc_rule4['CATEGORY_EVENT']=='Heptathlon')|(octc_rule4['CATEGORY_EVENT']=='Triathlon'))&(octc_rule4['RESULT_CONV'] <= octc_rule4['5%']) & (octc_rule4['age_extract']<40) & ((octc_rule4['MAPPED_EVENT']!='Marathon')|(octc_rule4['age_extract']<60) & (octc_rule4['MAPPED_EVENT']=='Marathon')))|(((octc_rule4['CATEGORY_EVENT']=='Jump')|(octc_rule4['CATEGORY_EVENT']=='Throw'))&(octc_rule4['RESULT_CONV'] >= octc_rule4['5%']) & (octc_rule4['age_extract']<40) & ((octc_rule4['MAPPED_EVENT']!='Marathon')|(octc_rule4['age_extract']<60) & (octc_rule4['MAPPED_EVENT']=='Marathon')))]

In [258]:
octc_rule3

Unnamed: 0,index,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,...,RESULT_BEST,Delta2,Delta3.5,Delta5,Delta_Benchmark,PERF_SCALAR,TIER,DOB_new,year_extract,age_extract
2570,1607,,01:56.9,,,2,,800m,,,...,116.90,-0.7220,0.98650,2.6950,-3.00,2.366111,Tier3,2006-06-24,2006.0,19.0
2580,1606,ZUBIN PERCY MUNCHERJI,01:53.7,,,2,,800m,,,...,113.70,2.4780,4.18650,5.8950,0.20,5.175593,Tier 1,1996-06-23,1996.0,29.0
2433,20042,"YEE, CHUN WAI, ERIC",49.51m,Hwa Chong Alumni Association,12,1,Open,Discus Throw,0,(2kg),...,49.51,0.4904,1.24070,1.9910,-0.51,3.980408,Tier2,1999-03-01,1999.0,26.0
540,1600,Wei Xiang Gordon Lim,2:36:19,,,566,,Marathon,,,...,9379.00,156.9800,297.21500,437.4500,-30.00,4.679110,Tier2,1993-02-07,1993.0,32.0
8604,22740,"Toh Jun Xi, Tedd",10.93,National University Singapore,12,4,Open,Dash,100,,...,10.93,-0.2710,-0.11425,0.0425,-0.48,0.406699,Tier4,2002-03-18,2002.0,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10273,2816,Brayden Chan Wei Jie,00:10.86,RI,18.5,3,A,100m,,,...,10.86,-0.2010,-0.04425,0.1125,-0.41,1.076555,Tier4,NaT,,18.5
3345,17663,"Aswin, Kumar",01:56.0,Temasek Polytechnic,12,4,Open,Run,800,,...,116.00,0.1780,1.88650,3.5950,-2.10,3.156277,Tier2,2006-04-13,2006.0,19.0
10347,18575,"Ashok, Tharun",10.81,Ngee Ann Polytechnic,12,4,Open,Dash,100,,...,10.81,-0.1510,0.00575,0.1625,-0.36,1.555024,Tier3,2005-07-02,2005.0,20.0
8286,22810,"Adarsh, Aravinth",04:07.1,National University Singapore,12,2,Open,Run,1500,,...,247.10,-2.9120,0.67900,4.2700,-7.70,1.783626,Tier3,2001-10-13,2001.0,24.0


In [259]:
# Rank list of athletes by group after sorting

#octc_rule3['Rank'] = (octc_rule3.sort_values(by=['MAPPED_EVENT', 'GENDER', 'PERF_SCALAR'], ascending=[False, False, True])['MAPPED_EVENT', 'GENDER']
#                .rank(method='first', ascending=False)
#             )


#octc_rule3 = octc_rule3.sort_values(['MAPPED_EVENT','GENDER','PERF_SCALAR'])
#octc_rule3['Rank'] = octc_rule3.groupby(['MAPPED_EVENT', 'GENDER']).cumcount() + 1

octc_rule3 = octc_rule3.sort_values(['MAPPED_EVENT','GENDER','PERF_SCALAR'], ascending=[False, False, False])
octc_rule3['Rank'] = octc_rule3.groupby(['MAPPED_EVENT', 'GENDER']).cumcount() + 1



In [260]:
octc_rule3

Unnamed: 0,index,NAME,RESULT,TEAM,AGE,COMPETITION_RANK,DIVISION,EVENT_x,DISTANCE,EVENT_CLASS,...,Delta2,Delta3.5,Delta5,Delta_Benchmark,PERF_SCALAR,TIER,DOB_new,year_extract,age_extract,Rank
193,19066,LEE GABRIEL JIN YI,15.67m,NUS,12,1,Open,Triple Jump,0,,...,0.284,0.51950,0.7550,-0.03,4.808917,Tier2,2003-02-23,2003.0,22.0,1
206,19191,"Kapil, Arnav",15.03m,WINGS ATHLETICS CLUB,12,1,Open,Triple Jump,0,,...,-0.356,-0.12050,0.1150,-0.67,0.732484,Tier4,2003-08-25,2003.0,22.0,2
476,1763,"LOW, JUN YU",5.34,,,1,,Pole Vault,,,...,0.244,0.32200,0.4000,0.14,7.692308,Tier 1,2001-04-21,2001.0,24.0,1
549,1693,SOH RUI YONG GUILLAUME,2:24:07,,,41,,Marathon,,,...,888.980,1029.21500,1169.4500,702.00,12.508824,Tier 1,1991-08-06,1991.0,34.0,1
545,772,"TAN, AARON JUSTIN WEN JIE",2:35:26,INDIVIDUAL,31,,,Marathon,,,...,209.980,350.21500,490.4500,23.00,5.246016,Tier 1,1993-04-01,1993.0,32.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9353,21106,MARK LEE REN,10.92,INDIVIDUAL,21,,,100m,,,...,-0.261,-0.10425,0.0525,-0.47,0.502392,Tier4,2004-04-01,2004.0,21.0,17
8604,22740,"Toh Jun Xi, Tedd",10.93,National University Singapore,12,4,Open,Dash,100,,...,-0.271,-0.11425,0.0425,-0.48,0.406699,Tier4,2002-03-18,2002.0,23.0,18
9091,1652,"PEREIRA, VERONICA SHANTI",11.47,,,1,,100m,,,...,0.515,0.69125,0.8675,0.28,7.382979,Tier 1,1996-09-20,1996.0,29.0,1
8826,19121,"TAN, ELIZABETH-ANN",11.99,SINGAPORE,12,1,Open,Dash,100,,...,-0.005,0.17125,0.3475,-0.24,2.957447,Tier3,2003-09-23,2003.0,22.0,2


In [261]:
os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/OCTC/')

octc_rule3.to_csv('octc_rule3.csv', encoding='utf-8')

## OCTC Rule 6 - max 6 for 100m/400m and max 3 for all other events

In [829]:
octc_rule6 = octc_rule4_final[(((octc_rule4_final['MAPPED_EVENT']=='400m')|(octc_rule4_final['MAPPED_EVENT']=='100m'))&(octc_rule4_final['Rank']<7))|(~((octc_rule4_final['MAPPED_EVENT']=='400m')|(octc_rule4_final['MAPPED_EVENT']=='100m'))&(octc_rule4_final['Rank']<4))]


In [830]:
octc_rule6.to_csv('octc_rule6.csv', sep=',', encoding='utf-8-sig', index=False)


In [364]:
#rslt_df['Rank'] = (rslt_df.sort_values(by=['EVENT', 'GENDER', 'Delta35'], ascending=[False, False, True])['Delta35']
#                .rank(method='first', ascending=False)
#             )


# Download list of foreigners and spex carded athletes

In [365]:
#spex_list=pd.read_csv('/Users/veesheenyuen/Desktop/DataScience/SAA/SPEX_CARDED_LIST.csv', encoding='latin-1')


In [1622]:
foreigners = pd.read_csv('/Users/veesheenyuen/Desktop/DataScience/SAA/MM/List of Foreigners.csv', encoding='latin-1')


In [1623]:
#spex_list

In [432]:
foreigners

Unnamed: 0,LAST_NAME,FIRST_NAME,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,Aaryan,Greuter Christoph,,,
1,Akahodani,Takayuki,,,
2,Apondar,Audric,,,
3,Brooks,Ruby,,,
4,Brouwer,Cees,,,
...,...,...,...,...,...
219,CHO,CHIA-HSUAN,,,
220,NGUYEN,HOAI VAN,,,
221,NGUYEN,QUOC THINH,,,
222,PHAM,QUYNH GIANG,,,


In [433]:
foreigners['V1'] = foreigners['LAST_NAME']+' '+foreigners['FIRST_NAME']
foreigners['V2'] = foreigners['FIRST_NAME']+' '+foreigners['LAST_NAME']
foreigners['V3'] = foreigners['LAST_NAME']+', '+foreigners['FIRST_NAME']
foreigners['V4'] = foreigners['FIRST_NAME']+' '+foreigners['LAST_NAME']

In [434]:
foreigners

Unnamed: 0,LAST_NAME,FIRST_NAME,Unnamed: 2,Unnamed: 3,Unnamed: 4,V1,V2,V3,V4
0,Aaryan,Greuter Christoph,,,,Aaryan Greuter Christoph,Greuter Christoph Aaryan,"Aaryan, Greuter Christoph",Greuter Christoph Aaryan
1,Akahodani,Takayuki,,,,Akahodani Takayuki,Takayuki Akahodani,"Akahodani, Takayuki",Takayuki Akahodani
2,Apondar,Audric,,,,Apondar Audric,Audric Apondar,"Apondar, Audric",Audric Apondar
3,Brooks,Ruby,,,,Brooks Ruby,Ruby Brooks,"Brooks , Ruby",Ruby Brooks
4,Brouwer,Cees,,,,Brouwer Cees,Cees Brouwer,"Brouwer, Cees",Cees Brouwer
...,...,...,...,...,...,...,...,...,...
219,CHO,CHIA-HSUAN,,,,CHO CHIA-HSUAN,CHIA-HSUAN CHO,"CHO, CHIA-HSUAN",CHIA-HSUAN CHO
220,NGUYEN,HOAI VAN,,,,NGUYEN HOAI VAN,HOAI VAN NGUYEN,"NGUYEN , HOAI VAN",HOAI VAN NGUYEN
221,NGUYEN,QUOC THINH,,,,NGUYEN QUOC THINH,QUOC THINH NGUYEN,"NGUYEN , QUOC THINH",QUOC THINH NGUYEN
222,PHAM,QUYNH GIANG,,,,PHAM QUYNH GIANG,QUYNH GIANG PHAM,"PHAM , QUYNH GIANG",QUYNH GIANG PHAM


In [371]:
#spex1 = spex_list['V1'].dropna().tolist()
#spex2 = spex_list['V2'].dropna().tolist()
#spex3 = spex_list['V3'].dropna().tolist()
#spex4 = spex_list['V4'].dropna().tolist()
#spex5 = spex_list['V5'].dropna().tolist()


In [435]:
for1 = foreigners['V1'].dropna().tolist()
for2 = foreigners['V2'].dropna().tolist()
for3 = foreigners['V3'].dropna().tolist()
for4 = foreigners['V4'].dropna().tolist()


In [373]:
#spex_athletes=spex1+spex2+spex3+spex4+spex5

In [436]:
foreign_list = for1+for2+for3+for4 

In [437]:
#spex_athletes

In [438]:
foreign_list

['Aaryan Greuter Christoph',
 'Akahodani Takayuki',
 'Apondar Audric',
 'Brooks  Ruby',
 'Brouwer Cees',
 'CARLESATER ARMAAN',
 'Chary Mathangi ',
 'Cinthikael Angel',
 'Comia Tsang Hannah',
 'Comia Tsang Mhandy',
 'Donnelly Jordan ',
 'Dorai Trinity Sharilyn',
 'Freeman Alexander',
 'Gaume Constance',
 'Gregorio John Alexander',
 'HAGIWARA RYOTA ',
 'Indarto Gerard',
 'Joy Matthew',
 'JULURI ADITYA',
 'Kiet Tran',
 'Koduru Kalyani',
 'Kuchenbuch Natalya',
 'KUIJPERS KUIJPERS AYKO ELLIN',
 'Kumar Joseph Suraj',
 'KUPPUSAMY PRAKSHEETA',
 'Kusumo Naia',
 'Kuwalekar  Spruha ',
 'Lahey Kezia Stephanie ',
 'Liz Jerry',
 'Marican Raees Marican Bin Ayoob Marican',
 'MARIE BRUNO DE FERRIERES DE SAUVEBOEUF LOUIS ',
 'Maslov Timofei',
 'Masrezwan Maeva',
 'MENDOZA NICCOLO',
 'Mennella Leonardo',
 'Meyers Nikita Mae Jiny-Yu',
 'Mishra Sunay',
 'Mo Thant Cin',
 'Moon Daniel Youngjo',
 'OCAMPO ANIAG DWAYNE MICHAEL OCAMPO ANIAG',
 'Ortega Rafael Pedro',
 'Owen Elia',
 'owl emma',
 'Ozbudak Deniz',
 

In [439]:
#spex_athletes_casefold=[s.casefold() for s in spex_athletes]

In [440]:
foreign_list_casefold=[s.casefold() for s in foreign_list]

In [442]:
#spex_athletes_casefold

In [443]:
foreign_list_casefold

['aaryan greuter christoph',
 'akahodani takayuki',
 'apondar audric',
 'brooks  ruby',
 'brouwer cees',
 'carlesater armaan',
 'chary mathangi ',
 'cinthikael angel',
 'comia tsang hannah',
 'comia tsang mhandy',
 'donnelly jordan ',
 'dorai trinity sharilyn',
 'freeman alexander',
 'gaume constance',
 'gregorio john alexander',
 'hagiwara ryota ',
 'indarto gerard',
 'joy matthew',
 'juluri aditya',
 'kiet tran',
 'koduru kalyani',
 'kuchenbuch natalya',
 'kuijpers kuijpers ayko ellin',
 'kumar joseph suraj',
 'kuppusamy praksheeta',
 'kusumo naia',
 'kuwalekar  spruha ',
 'lahey kezia stephanie ',
 'liz jerry',
 'marican raees marican bin ayoob marican',
 'marie bruno de ferrieres de sauveboeuf louis ',
 'maslov timofei',
 'masrezwan maeva',
 'mendoza niccolo',
 'mennella leonardo',
 'meyers nikita mae jiny-yu',
 'mishra sunay',
 'mo thant cin',
 'moon daniel youngjo',
 'ocampo aniag dwayne michael ocampo aniag',
 'ortega rafael pedro',
 'owen elia',
 'owl emma',
 'ozbudak deniz',
 

In [444]:
# Exclusion list is foreigners + spex carded 

#exclusions = foreign_list_casefold + spex_athletes_casefold

exclusions = foreign_list_casefold

In [445]:
# Apply Rule 4 above...if an athlete is in more than one event, choose the best performing one

top_performers=rslt_df.sort_values(['NAME','PERF_SCALAR'],ascending=False).groupby('NAME').head(1) # Choose top performing event per NAME


In [446]:
top_performers

Unnamed: 0,index_x,NAME,RESULT,AGE,COMPETITION_RANK,EVENT_x,DOB,COUNTRY,CATEGORY_EVENT,GENDER,...,EVENT_y,Metric,2pc,35pc,5pc,RESULT_CONV,Delta2,Delta35,Delta5,PERF_SCALAR
8353,11670,{},12.99,8.0,2,Boys 4x100 Meter Relay 15-16,2015-01-01 00:00:00.000,,Relay,Male,...,4 x 100m relay,39.36,40.1472,40.73760,41.3280,12.99,27.1572,27.74760,28.3380,71.996951
12480,17949,"{9: 'Ng, Caitlin Shan Wen', 438: 'Seow, Kyra',...",22.17,0.0,1,Girls 13-14 4x100 Meter Relay U15,,,Relay,Female,...,4 x 100m relay,44.58,45.4716,46.14030,46.8090,22.17,23.3016,23.97030,24.6390,55.269179
9267,13141,"{995: 'Lee, Asher', 991: 'Yeo, Chee Hean Phili...",29.89,0.0,2,Boys 9-12 4x400 Meter Relay U13,,,Relay,Male,...,4 x 400m relay,188.82,192.5964,195.42870,198.2610,29.89,162.7064,165.53870,168.3710,89.170109
9588,13486,"{994: 'De Ming, Gilbert Lee', 988: 'Lee, Jerem...",38.89,0.0,14,Boys 10-12 4x100 Meter Relay U13,,,Relay,Male,...,4 x 100m relay,39.36,40.1472,40.73760,41.3280,38.89,1.2572,1.84760,2.4380,6.194106
12424,17876,"{991: 'Yeo, Chee Hean Philip', 992: 'Yeo, Chee...",12.92,0.0,16,Boys 10-12 4x100 Meter Relay U13,,,Relay,Male,...,4 x 100m relay,39.36,40.1472,40.73760,41.3280,12.92,27.2272,27.81760,28.4080,72.174797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10975,15886,"., Khairulnazim",14.72,17.0,4,Men 400 Meter Dash Open,2006-01-27,,Sprint,Male,...,400m,46.63,47.5626,48.26205,48.9615,14.72,32.8426,33.54205,34.2415,73.432340
9918,13873,"., Kaarthika",12.53,27.0,9,Women 400 Meter Dash Women Junior,1996-06-13,,Sprint,Female,...,400m,53.84,54.9168,55.72440,56.5320,12.53,42.3868,43.19440,44.0020,81.727340
10701,15018,"., Jayashree",1:14.05,15.0,36,Girls Long Jump B Div,2008-01-29 00:00:00.000,,Jump,Female,...,Long jump,6.02,5.8996,5.80930,5.7190,10.97,5.0704,5.16070,5.2510,87.225914
12077,17484,"., Cheung Zheng",47.61,14.0,20,Boys 11-14 400 Meter Dash U15,2009-02-19,,Sprint,Male,...,400m,46.63,47.5626,48.26205,48.9615,47.61,-0.0474,0.65205,1.3515,2.898349


In [447]:
top_performers.to_csv('top_checkpoint.csv', sep=',', encoding='utf-8-sig', index=False)

In [448]:
excluded_list = top_performers.loc[~rslt_df['NAME'].str.casefold().isin(exclusions)]  # ~ means NOT IN. DROP spex carded athletes

In [449]:
#spexed_list=top_performers

In [450]:
excluded_list.sort_values(['MAPPED_EVENT', 'GENDER', 'PERF_SCALAR'], ascending=[True, True, False], inplace=True)
excluded_list['overall_rank'] = 1
excluded_list['overall_rank'] = excluded_list.groupby(['MAPPED_EVENT', 'GENDER'])['overall_rank'].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  excluded_list.sort_values(['MAPPED_EVENT', 'GENDER', 'PERF_SCALAR'], ascending=[True, True, False], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  excluded_list['overall_rank'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  excluded_list['overall_rank'] = excluded_list.groupby(['MAPPED_EVENT', 'GENDER'])['overall_rank'].cumsum()


In [451]:
#spexed_list=spexed_list[(((spexed_list['EVENT']=='400m')&(spexed_list['overall_rank']<7)))|(((spexed_list['EVENT']=='100m')&(spexed_list['overall_rank']<7)))]

In [452]:
#Apply OCTC selection rule: max 6 for 100m/400m and max 3 for all other events

excluded_list = excluded_list[(((excluded_list['MAPPED_EVENT']=='400m')|(excluded_list['MAPPED_EVENT']=='100m'))&(excluded_list['overall_rank']<7))|(~((excluded_list['MAPPED_EVENT']=='400m')|(excluded_list['MAPPED_EVENT']=='100m'))&(excluded_list['overall_rank']<4))]


In [453]:
excluded_list

Unnamed: 0,index_x,NAME,RESULT,AGE,COMPETITION_RANK,EVENT_x,DOB,COUNTRY,CATEGORY_EVENT,GENDER,...,Metric,2pc,35pc,5pc,RESULT_CONV,Delta2,Delta35,Delta5,PERF_SCALAR,overall_rank
8135,11374,"Tan, Bernice",2:16.97,23.0,3,Women 10000 Meter Run Open,2000-11-16,,Long,Female,...,2131.03,2173.6506,2205.61605,2237.5815,136.97,2036.6806,2068.64605,2100.6115,98.572592,1
8137,11380,"NicAmhlaoibh, Maire",2:16.97,35.0,2,Women 10000 Meter Run Open,1988-02-14,,Long,Female,...,2131.03,2173.6506,2205.61605,2237.5815,136.97,2036.6806,2068.64605,2100.6115,98.572592,2
8134,11372,"Ng, Yew Cheo",2:16.97,37.0,4,Women 10000 Meter Run Open,1986-11-30,,Long,Female,...,2131.03,2173.6506,2205.61605,2237.5815,136.97,2036.6806,2068.64605,2100.6115,98.572592,3
9739,13637,"PHAGAMI, KHUM BAHADUR",20.34m,39.0,2,Men 10000 Meter Run Men Senior A,1984-08-09,,Long,Male,...,1885.55,1923.2610,1951.54425,1979.8275,144.29,1778.9710,1807.25425,1835.5375,97.347591,1
9734,13632,"Mohd, Jamian",35.31m,38.0,6,Men 10000 Meter Run Men Senior A,1985-04-29,,Long,Male,...,1885.55,1923.2610,1951.54425,1979.8275,144.29,1778.9710,1807.25425,1835.5375,97.347591,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3249,3544,SOLANA LEANN REINKIRSTEN CANDA,00:22.7,0.0,9.0,Triple Jump,,,Jump,Female,...,13.46,13.1908,12.98890,12.7870,1150.60,1137.4092,1137.61110,1137.8130,8453.291233,2
3260,3555,SHERYL TOH PEIXUAN,00:23.4,0.0,6.0,Triple Jump,,,Jump,Female,...,13.46,13.1908,12.98890,12.7870,1150.60,1137.4092,1137.61110,1137.8130,8453.291233,3
3257,3552,RAO TIANYU,00:28.7,0.0,12.0,Triple Jump,,,Jump,Male,...,15.70,15.3860,15.15050,14.9150,1150.60,1135.2140,1135.44950,1135.6850,7233.662420,1
3246,3541,NG JUN JIE,00:15.7,0.0,11.0,Triple Jump,,,Jump,Male,...,15.70,15.3860,15.15050,14.9150,1150.60,1135.2140,1135.44950,1135.6850,7233.662420,2


In [454]:
excluded_list.to_csv('octc_all_events.csv', sep=',', encoding='utf-8-sig', index=False)


In [None]:
# 1. If top athlete>30 (and the only one >30), there are already 6 in list, and next one is already <30 then it means do nothing? 
# 2. If top athlete>30 and is the sole pick.  Next athlete is far beyond 5% band.  Do we add?
# 3. If top athlete>30 and there are less than 6 within 5% band of SEAG benchmark. Do we add that one more who is beyond 5%?
# 4. At least one per gender.  The only pick is far beyond 5%.  Do we add?

In [None]:
# Convert time format for marathon and 5000m into mm:ss.00
import datetime

#s=247.779

#datetime.datetime.fromtimestamp(s).strftime('%M:%S.%f')

for i in range(len(top_performers_clean)):
        
    rowIndex = top_performers_clean.index[i]

    event=top_performers_clean.iloc[rowIndex,19]
        
    
    time_base2=top_performers_clean.iloc[rowIndex,24]
    time_base3=top_performers_clean.iloc[rowIndex,25]
    time_base5=top_performers_clean.iloc[rowIndex,26]
    
        
    if metric==None:
        continue
        
    if event=='800m' or event=='10000m' or event=='5000m' or event=='3000m steeplechase' or event=='1500m':
        
        print(time_base2, time_base3, time_base5)

        
        
        try:
            
        
            date_preconvert2 = datetime.datetime.utcfromtimestamp(time_base2)
            date_preconvert3 = datetime.datetime.utcfromtimestamp(time_base3)
            date_preconvert5 = datetime.datetime.utcfromtimestamp(time_base5)

            output2 = datetime.datetime.strftime(date_preconvert2, "%M:%S.%f")
            output3 = datetime.datetime.strftime(date_preconvert3, "%M:%S.%f")
            output5 = datetime.datetime.strftime(date_preconvert5, "%M:%S.%f")
            
       #     print(output2, output3, output5)

                    
       #     top_performers_clean.loc[rowIndex, '2%_timing'] = output2
       #     top_performers_clean.loc[rowIndex, '3.5%_timing'] = output3
       #     top_performers_clean.loc[rowIndex, '5%_timing'] = output5
            
   
            top_performers_clean.at[rowIndex, '2%'] = output2 # copy over time format
            top_performers_clean.at[rowIndex, '3.5%'] = output3
            top_performers_clean.at[rowIndex, '5%'] = output5


            
        except:
            
            pass
        

        
    elif event=='Marathon':
        
        print(time_base2, time_base3, time_base5)

        
        try:
            

        
            date_preconvert2 = datetime.datetime.utcfromtimestamp(time_base2)
            date_preconvert3 = datetime.datetime.utcfromtimestamp(time_base3)
            date_preconvert5 = datetime.datetime.utcfromtimestamp(time_base5)

            
            
            output2 = datetime.datetime.strftime(date_preconvert2, "%H:%M:%S")
            output3 = datetime.datetime.strftime(date_preconvert3, "%H:%M:%S")
            output5 = datetime.datetime.strftime(date_preconvert5, "%H:%M:%S")

            
        
        #    top_performers_clean.loc[rowIndex, '2%_timing'] = output2
        #    top_performers_clean.loc[rowIndex, '3.5%_timing'] = output3
        #    top_performers_clean.loc[rowIndex, '5%_timing'] = output5
            
            top_performers_clean.at[rowIndex, '2%'] = output2 # copy over time format
            top_performers_clean.at[rowIndex, '3.5%'] = output3
            top_performers_clean.at[rowIndex, '5%'] = output5

            
         #   print('output', output2, output3, output5)


        
        except:
            
            pass
                        
             


# Marinda's request for 6 and 8 place finish analysis

In [4062]:
import pandas_gbq
from google.oauth2 import service_account


credentials = service_account.Credentials.from_service_account_file(
    '/Users/veesheenyuen/Desktop/DataScience/Keys/saa-analytics-7c8937b70609.json',
)

sql="""
SELECT NAME, RESULT, RANK, EVENT, CATEGORY_EVENT, GENDER, COMPETITION, STAGE
FROM `saa-analytics.results.saa_full`
WHERE STAGE='Final' AND COMPETITION='SEA Games' AND (RANK='3' OR RANK='6' OR RANK='8')
"""

SEAG = pandas_gbq.read_gbq(sql, project_id="saa-analytics", credentials=credentials)



Downloading: 100%|[32m███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m|[0m


In [4064]:
SEAG.tail(60)

Unnamed: 0,NAME,RESULT,RANK,EVENT,CATEGORY_EVENT,GENDER,COMPETITION,STAGE
53,Farell Glen Felix Jerus,2.15 m,3,High jump,Jump,Male,SEA Games,Final
54,Sun Soklim,NM,6,High jump,Jump,Female,SEA Games,Final
55,Kobsit Sittichai,2.07 m,6,High jump,Jump,Male,SEA Games,Final
56,Suwandi Wijaya,7.47,6,Long jump,Jump,Male,SEA Games,Final
57,Pok Pisey,4.76,8,Long jump,Jump,Female,SEA Games,Final
58,Sapwaturrahman Sapwaturrahman,7.62,3,Long jump,Jump,Male,SEA Games,Final
59,Bùi Thị Loan,6.02,3,Long jump,Jump,Female,SEA Games,Final
60,Nurul Ashikin Abas,5.67,6,Long jump,Jump,Female,SEA Games,Final
61,John Marvin Aragon Rafols,7.19,8,Long jump,Jump,Male,SEA Games,Final
62,Sunisa Khotseemueang,,3,Heptathlon,Heptathlon,Female,SEA Games,Final


In [247]:
for i in range(len(SEAG)):
        
    rowIndex = SEAG.index[i]

    input_string=SEAG.iloc[rowIndex,3]  # event is column index 3
    
    metric=SEAG.iloc[rowIndex,1] # result is column index 1
    
    if (metric==None or metric=='NH' or metric=='NM'):  # skip those non-numeric values
        continue
        
   # elif metric=='NH':
    #    continue
        
    print(i, input_string, metric)
        
    out = convert_time(i, input_string, metric)
     
    SEAG.loc[rowIndex, 'Metric'] = out

0 1500m     03:59.40
1 1500m     04:04.3
2 1500m     04:05.4
3 Decathlon 6891
4 100m 10.78
5 100m 11.75
6 100m 11.96
7 100m 10.443
8 200m 24.09
9 200m 23.6
11 200m 21.02
12 200m 21.58
13 400m 46.63
14 400m 48.26
15 400m 57.54
16 400m 1:01.20
17 400m 53.84
18 800m 1:57.98
19 800m 1:53.86
20 800m 2:15.77
21 800m 2:18.25
22 800m 2:09.15
23 800m 1:55.39
24 1500m 4:26.33
25 1500m 4:38.64
26 1500m 5:00.090
27 5000m 17:13.63
28 5000m 18:41.76
29 5000m 14:43.45
30 5000m 15:01.79
31 5000m 18:02.52
32 5000m 15:09.65
33 10000m 33:00.23
34 10000m 32:36.32
35 10000m 38:11.04
36 10000m 37:29.58
37 10000m 35:31.03
38 10000m 31:25.55
39 Marathon 2:35:49
40 Marathon 3:17:36
41 Marathon 3:46:44
42 Marathon 2:52:41
43 Marathon 2:50:27
44 Marathon 2:41:36
45 Shot put 11.69
46 Shot put 12.12
47 Shot put 14.44
48 Shot put 14.65
49 Shot put 17.3
50 Shot put 16.06
52 High jump 2.07 m
53 High jump 2.15 m
54 High jump 1.73
55 High jump 1.95 m
56 Long jump 7.47
57 Long jump 5.67
58 Long jump 6.02
59 Long jump 4.

In [248]:
SEAG

Unnamed: 0,NAME,RESULT,RANK,EVENT,CATEGORY_EVENT,GENDER,COMPETITION,STAGE,Metric
0,Wahyudi Putra,03:59.40,3,1500m,Mid,Male,SEA Games,Final,239.40
1,Edwin GIRON,04:04.3,6,1500m,Mid,Male,SEA Games,Final,244.30
2,Van Dung GIANG,04:05.4,8,1500m,Mid,Male,SEA Games,Final,245.40
3,Aries TOLEDO,6891,3,Decathlon,Decathlon,Male,SEA Games,Final,6891.00
4,Joshua Hanwei Chua,10.78,6,100m,Sprint,Male,SEA Games,Final,10.78
...,...,...,...,...,...,...,...,...,...
107,"Sukanya Janchaona, Benny Nontanam, Sasipim S...",3:39.29,3,4 x 400m relay,Relay,Female,SEA Games,Final,219.29
108,Nguyễn Thị Huong,11:00.85,3,3000m steeplechase,Steeple,Female,SEA Games,Final,660.85
109,Pandu Sukarya,8:55.05,3,3000m steeplechase,Steeple,Male,SEA Games,Final,535.05
110,Ri Udom,10:36.06,8,3000m steeplechase,Steeple,Male,SEA Games,Final,636.06


In [249]:
SEAG.to_csv('check_variation.csv', encoding='utf-8')

In [250]:
comps = SEAG[SEAG['RANK']=='3']

In [251]:
comps

Unnamed: 0,NAME,RESULT,RANK,EVENT,CATEGORY_EVENT,GENDER,COMPETITION,STAGE,Metric
0,Wahyudi Putra,03:59.40,3,1500m,Mid,Male,SEA Games,Final,239.4
3,Aries TOLEDO,6891,3,Decathlon,Decathlon,Male,SEA Games,Final,6891.0
5,Trần Thị Nhi Yến,11.75,3,100m,Sprint,Female,SEA Games,Final,11.75
7,Muhammad Haiqal Hanafi,10.443,3,100m,Sprint,Male,SEA Games,Final,10.443
9,Zaidatul Husniah Zulkifli,23.6,3,200m,Sprint,Female,SEA Games,Final,23.6
11,Lalu Muhammad Zohri,21.02,3,200m,Sprint,Male,SEA Games,Final,21.02
13,Frederick Ramirez,46.63,3,400m,Sprint,Male,SEA Games,Final,46.63
17,Nguyễn Thị Hằng,53.84,3,400m,Sprint,Female,SEA Games,Final,53.84
19,Wan Muhammad Fazri Wan Zahari,1:53.86,3,800m,Mid,Male,SEA Games,Final,113.86
22,Goh Chui Ling,2:09.15,3,800m,Mid,Female,SEA Games,Final,129.15


In [252]:
new_comps = comps[['EVENT', 'GENDER', 'Metric']]

In [253]:
#comps=SEAG.groupby(['EVENT', 'GENDER'])['Metric'].min()


In [254]:
df2 = SEAG.merge(new_comps, on=['EVENT', 'GENDER'], how='left')


In [255]:
df2

Unnamed: 0,NAME,RESULT,RANK,EVENT,CATEGORY_EVENT,GENDER,COMPETITION,STAGE,Metric_x,Metric_y
0,Wahyudi Putra,03:59.40,3,1500m,Mid,Male,SEA Games,Final,239.40,239.400
1,Edwin GIRON,04:04.3,6,1500m,Mid,Male,SEA Games,Final,244.30,239.400
2,Van Dung GIANG,04:05.4,8,1500m,Mid,Male,SEA Games,Final,245.40,239.400
3,Aries TOLEDO,6891,3,Decathlon,Decathlon,Male,SEA Games,Final,6891.00,6891.000
4,Joshua Hanwei Chua,10.78,6,100m,Sprint,Male,SEA Games,Final,10.78,10.443
...,...,...,...,...,...,...,...,...,...,...
107,"Sukanya Janchaona, Benny Nontanam, Sasipim S...",3:39.29,3,4 x 400m relay,Relay,Female,SEA Games,Final,219.29,219.290
108,Nguyễn Thị Huong,11:00.85,3,3000m steeplechase,Steeple,Female,SEA Games,Final,660.85,660.850
109,Pandu Sukarya,8:55.05,3,3000m steeplechase,Steeple,Male,SEA Games,Final,535.05,535.050
110,Ri Udom,10:36.06,8,3000m steeplechase,Steeple,Male,SEA Games,Final,636.06,535.050


In [256]:
df2.to_csv('check_metric.csv', encoding='utf-8')

In [257]:
df2['% VARIATION']=(df2['Metric_x']/df2['Metric_y']*100-100)

In [258]:
df2

Unnamed: 0,NAME,RESULT,RANK,EVENT,CATEGORY_EVENT,GENDER,COMPETITION,STAGE,Metric_x,Metric_y,% VARIATION
0,Wahyudi Putra,03:59.40,3,1500m,Mid,Male,SEA Games,Final,239.40,239.400,0.000000
1,Edwin GIRON,04:04.3,6,1500m,Mid,Male,SEA Games,Final,244.30,239.400,2.046784
2,Van Dung GIANG,04:05.4,8,1500m,Mid,Male,SEA Games,Final,245.40,239.400,2.506266
3,Aries TOLEDO,6891,3,Decathlon,Decathlon,Male,SEA Games,Final,6891.00,6891.000,0.000000
4,Joshua Hanwei Chua,10.78,6,100m,Sprint,Male,SEA Games,Final,10.78,10.443,3.227042
...,...,...,...,...,...,...,...,...,...,...,...
107,"Sukanya Janchaona, Benny Nontanam, Sasipim S...",3:39.29,3,4 x 400m relay,Relay,Female,SEA Games,Final,219.29,219.290,0.000000
108,Nguyễn Thị Huong,11:00.85,3,3000m steeplechase,Steeple,Female,SEA Games,Final,660.85,660.850,0.000000
109,Pandu Sukarya,8:55.05,3,3000m steeplechase,Steeple,Male,SEA Games,Final,535.05,535.050,0.000000
110,Ri Udom,10:36.06,8,3000m steeplechase,Steeple,Male,SEA Games,Final,636.06,535.050,18.878609


In [259]:
df2.to_csv('SEAG_variation.csv', sep=',', encoding='utf-8-sig', index=False)


# Convert timing format from seconds to HMSf

In [4803]:
datetime.datetime.fromtimestamp(120).strftime('%M:%S.%f')
        

'32:00.000000'

In [4808]:
def convert(seconds):
    seconds = seconds % (24 * 3600)
    hour = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
     
    return "%d:%02d:%02d" % (hour, minutes, seconds)

In [5295]:
import datetime
def format_result(result):
    date = datetime.datetime.utcfromtimestamp(result)
    output = datetime.datetime.strftime(date, "%H:%M:%S.%f")
    return output
print(format_result(113.86))

00:01:53.860000


# Clean manual load data

In [4]:
os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/Tilastopaja/SEA Games/')


df_SHA = pd.read_csv("SEA_manual_additions.csv", encoding='latin-1')

In [5]:
df_SHA

Unnamed: 0,FIRST_NAME,LAST_NAME,OTHER_NAME,NAME,RANK,TAG_ID,TEAM,SEED,RESULT,QUALIFICATION,...,SOURCE,REMARKS,TIMESTAMP,VENUE,SUB_EVENT,SESSION,EVENT_CLASS,DISTANCE,HOST_CITY,RX_TIME
0,,,,Agus Prayogo,1.0,,,,2:32:59,,...,https://en.wikipedia.org/wiki/Athletics_at_the...,,,,,,,,Phnom Phen,
1,,,,ÊArlan Estobo Arbois,2.0,,,,2:33:27,,...,https://en.wikipedia.org/wiki/Athletics_at_the...,,,,,,,,Phnom Phen,
2,,,,Nguyen Thanh Hoang,3.0,,,,2:35:49,,...,https://en.wikipedia.org/wiki/Athletics_at_the...,,,,,,,,Phnom Phen,
3,,,,Tan Huong Leong,4.0,,,,2:40:26,,...,https://en.wikipedia.org/wiki/Athletics_at_the...,,,,,,,,Phnom Phen,
4,,,,Vanh Pheara,5.0,,,,2:41:26,,...,https://en.wikipedia.org/wiki/Athletics_at_the...,,,,,,,,Phnom Phen,
5,,,,Quoc Luong Trinh,6.0,,,,2:41:36,,...,https://en.wikipedia.org/wiki/Athletics_at_the...,,,,,,,,Phnom Phen,
6,,,,Sanchai Namkhet,7.0,,,,2:43:47,,...,https://en.wikipedia.org/wiki/Athletics_at_the...,,,,,,,,Phnom Phen,
7,,,,Joanito Fernandes,8.0,,,,2:52:41,,...,https://en.wikipedia.org/wiki/Athletics_at_the...,,,,,,,,Phnom Phen,
8,,,,Tony Ah-Thit Payne,,,,,DNF,,...,https://en.wikipedia.org/wiki/Athletics_at_the...,,,,,,,,Phnom Phen,
9,,,,Yang Piseth,,,,,DNF,,...,https://en.wikipedia.org/wiki/Athletics_at_the...,,,,,,,,Phnom Phen,


In [6]:
df_SHA['NAME'] = df_SHA['NAME'].str.replace('\xa0', '', regex=True)
df_SHA['NAME'] = df_SHA['NAME'].str.replace('[\x00-\x1f\x7f-\x9f]', '', regex=True)
df_SHA['NAME'] = df_SHA['NAME'].str.replace('\r', '', regex=True)
df_SHA['NAME'] = df_SHA['NAME'].str.replace('\n', '', regex=True)
df_SHA['NAME'] = df_SHA['NAME'].str.strip()


In [7]:
df_SHA.to_csv('SEA_manual_additions.csv', index=False, encoding='utf-8')

# Read a csv of name list variations

In [2566]:
os.chdir('/Users/veesheenyuen/Desktop/DataScience/SAA/OCTC/')

names = pd.read_csv("name_variations.csv")

In [2567]:
names

Unnamed: 0,VARIATION,NAME
0,^Harry\sIrfan\sCurra$,HARRY IRFAN CURRAN
1,"Curran, Harry Irfann",HARRY IRFAN CURRAN
2,"^Curran,\sHarry$",HARRY IRFAN CURRAN
3,Harry Irfan Curran Q,HARRY IRFAN CURRAN
4,Wei Jun Huang,HUANG WEI JUN
...,...,...
182,Chua Hsin-Wen Clara,"Chua Hsin-Wen, Clara"
183,Chua Hsin-Wen Clara Q,"Chua Hsin-Wen, Clara"
184,"HSIN-WEN CLARA, Chua","Chua Hsin-Wen, Clara"
185,CHUA HSIN-WEN CLARA,"Chua Hsin-Wen, Clara"


In [None]:
lines = [line.strip() for line in open('file.csv')]
for x in lines:
    match=re.search(r'^_.*_$',x)
    if match: print x

In [None]:
df['NAME'] = df['NAME'].replace(regex=r'^Harry\sIrfan\sCurra$', value='HARRY IRFAN CURRAN')


In [2568]:
for index, row in names.iterrows():
    print(row['VARIATION'])
    

^Harry\sIrfan\sCurra$
Curran, Harry Irfann
^Curran,\sHarry$
Harry Irfan Curran Q
Wei Jun Huang
HUANG, WEI JUN
., Hariharan
S/O KrishnS/O Krishna, Hariharan
KRISHNAN, HARIHARAN S/O
Andrew George Medina
Medina, Andrew
Medina, Andrew George
George Medina, Andrew
George George Medina, Andrew
Gabriel Lee
Lee, Gabriel
Jun Yu Low
Low Jun Yu
Low, Jun Yu
Caleb Hia
Melvin, Wong
MELVIN, WONG
Conrad Kangli Emery
Conrad, Kangli Emery
Kangli Emery Conrad
Conrad Kangli Emery Q
Mun Jern Wei
Mun, Jern Wei Ivan
Mun, Ivan
Wong Yijie Lucas
Wong, Yijie Lucas
Tung Hon Andrew Pak
Pak, Andrew
Brandon Heng Fu Hong
Heng, Brandon
Brandon Heng Fu Hong
HENG, BRANDON
HENG, FU HONG, BRANDON
HENG, FU HONG BRANDON
Mohamed, Haja Fayiz
Osman, Amir Rusyaidi
nan
Goh, Shaun
Shaun Goh
Ethan Yan
TAN, AARON JUSTIN
Tan, Aaron Justin
Rui Yong Soh
YEE, ERIC CHUN WAI
Yee, Eric
YEE CHUN WAI, ERIC
Eric Yee Chun Wai
Yee Chun Wai, Eric
Yee, Chun Wai, Eric
Yee Chun Wai, Eric
YEE, ERIC CHUN WAI
Lim, Oliver
Jie Cong Jayden, Tan
TAN , JI