<a href="https://www.kaggle.com/code/tgomesjuliana/crossfit-competitions-feature-engineering?scriptVersionId=137077885" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [1]:
import pandas as pd
import numpy as np

# Definitions

In [2]:
# Seasons to process: 2021 through 2023 inclusive.
years = list(range(2021, 2024))

# Value of the 'competition' column that marks the final stage.
# The identifier's spelling is kept as-is because other cells reference it.
final_competiton = "games"

# Stages whose overall rank and score get attached to each finalist.
qualifying_competitions = [
    "open",
    "quarterfinals",
    "semifinals",
]

# Feature Engineering

## Athletes Information

In [3]:
# Read the consolidated athlete information table from the Kaggle input
# directory; the bare name on the last line renders it as the cell output.
information_data = pd.read_csv(
    "../input/crossfit-competitions/consolidated_athletes_information.csv"
)
information_data

Unnamed: 0,competitorId,competitorName,firstName,lastName,gender,countryOfOriginName,regionName,affiliateId,affiliateName,age,height,weight,overallRank,overallScore,year,competition,height_cm,weight_kg
0,469656.0,Jeffrey Adler,Jeffrey,Adler,M,Canada,North America,18059.0,CrossFit Wonderland,27.0,69 in,197 lb,1.0,101.0,2021,open,176.0,90.0
1,34796.0,Scott Panchik,Scott,Panchik,M,United States,North America,7991.0,CrossFit Mentality,33.0,69 in,187 lb,2.0,141.0,2021,open,176.0,85.0
2,105875.0,Travis Mead,Travis,Mead,M,United States,North America,9155.0,Iron Valley CrossFit,34.0,73 in,205 lb,3.0,165.0,2021,open,186.0,93.0
3,310970.0,Saxon Panchik,Saxon,Panchik,M,United States,North America,22505.0,CrossFit Cliffside,25.0,69 in,180 lb,4.0,217.0,2021,open,176.0,82.0
4,11435.0,Richard Froning Jr.,Richard,Froning Jr.,M,United States,North America,3220.0,CrossFit Mayhem,33.0,69 in,194 lb,5.0,254.0,2021,open,176.0,88.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399337,872275.0,Caroline Stanley,Caroline,Stanley,F,United States,North America,3617.0,College Hill CrossFit,23.0,63 in,145 lb,,0.0,2023,games,161.0,66.0
399338,174547.0,Amanda Barnhart,Amanda,Barnhart,F,United States,North America,14583.0,CrossFit High Gear,31.0,67 in,160 lb,,0.0,2023,games,171.0,73.0
399339,121033.0,Sydney Wells,Sydney,Wells,F,United States,North America,19593.0,CrossFit East Nashville,28.0,66 in,151 lb,,0.0,2023,games,168.0,69.0
399340,503582.0,Alexis Raptis,Alexis,Raptis,F,United States,North America,25335.0,TTT CrossFit,24.0,65 in,155 lb,,0.0,2023,games,166.0,71.0


In [4]:
# Build one row per finalist and season, enriched with that athlete's overall
# rank and score in every qualifying stage of the same season.
#
# Inputs (defined in earlier cells): information_data, years,
# final_competiton, qualifying_competitions.
# Output: merged_df, with the extra columns '<stage>_rank' and '<stage>_score'.

# Per-season results are collected here and concatenated once at the end,
# instead of growing a DataFrame inside the loop.
yearly_frames = []

for year in years:
    # Restrict to the current season once; every filter below works on this slice
    season_data = information_data[information_data['year'] == year]

    # Rows of the final competition for the current season
    filtered_df = season_data[season_data['competition'] == final_competiton]

    # Unique competitorIds of the athletes that reached the final competition
    unique_ids = filtered_df['competitorId'].unique()

    # Start from the finalists' rows and attach one pair of columns per stage
    merged_df_current_year = filtered_df.copy()

    for competition in qualifying_competitions:
        # Results of the finalists in this qualifying stage (current season only)
        result_df = season_data[
            season_data['competitorId'].isin(unique_ids)
            & (season_data['competition'] == competition)
        ]

        # Keep the id plus rank/score, renamed after the stage they come from
        selected_columns = result_df[['competitorId', 'overallRank', 'overallScore']].rename(
            columns={
                'overallRank': f'{competition}_rank',
                'overallScore': f'{competition}_score',
            }
        )

        # Left merge on 'competitorId' only: both sides are already limited to
        # the current season, and finalists without a result in this stage
        # keep their row with NaN in the new columns.
        merged_df_current_year = merged_df_current_year.merge(
            selected_columns, on='competitorId', how='left'
        )

    # An athlete with several rows in a stage produces several merged rows.
    # Keep the row with the lowest 'semifinals_rank' per athlete; the stable
    # sort makes ties resolve to the earliest row, so reruns give the same result.
    # Then order the season by 'gender' and 'overallRank'.
    merged_df_current_year = (
        merged_df_current_year
        .sort_values('semifinals_rank', kind='stable')
        .drop_duplicates('competitorId', keep='first')
        .sort_values(by=['gender', 'overallRank'])
    )

    yearly_frames.append(merged_df_current_year)

# Single concatenation of all seasons; fall back to an empty frame when
# `years` is empty, because pd.concat raises on an empty list.
merged_df = pd.concat(yearly_frames, ignore_index=True) if yearly_frames else pd.DataFrame()

# Output the merged features
merged_df

Unnamed: 0,competitorId,competitorName,firstName,lastName,gender,countryOfOriginName,regionName,affiliateId,affiliateName,age,...,year,competition,height_cm,weight_kg,open_rank,open_score,quarterfinals_rank,quarterfinals_score,semifinals_rank,semifinals_score
0,163097.0,Tia-Clair Toomey,Tia-Clair,Toomey,F,Australia,Oceania,19593.0,CrossFit East Nashville,27.0,...,2021,games,163.0,58.0,1.0,26.0,1.0,100.0,1.0,588.0
1,591912.0,Laura Horváth,Laura,Horváth,F,Hungary,Europe,22540.0,CrossFit Glasshouse,24.0,...,2021,games,170.0,70.0,52.0,845.0,3.0,185.0,2.0,544.0
2,18588.0,Annie Thorisdottir,Annie,Thorisdottir,F,Iceland,Europe,2025.0,CrossFit Reykjavík,31.0,...,2021,games,170.0,69.0,118.0,1501.0,39.0,766.0,3.0,528.0
3,120480.0,Kristin Holte,Kristin,Holte,F,Norway,Europe,372.0,CrossFit Oslo,35.0,...,2021,games,162.0,59.0,17.0,414.0,10.0,241.0,1.0,572.0
4,671666.0,Haley Adams,Haley,Adams,F,United States,North America,3220.0,CrossFit Mayhem,20.0,...,2021,games,171.0,64.0,10.0,307.0,56.0,963.0,2.0,528.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,665223.0,Michal Wesolowski,Michal,Wesolowski,M,Poland,Europe,29893.0,CrossFit Strong House,30.0,...,2023,games,184.0,98.0,,,,,10.0,426.0
236,867741.0,Jack Farlow,Jack,Farlow,M,Canada,North America,21726.0,CrossFit PSC,21.0,...,2023,games,186.0,96.0,30.0,1301.0,17.0,415.0,10.0,477.0
237,900251.0,James Sprague,James,Sprague,M,United States,North America,1893.0,CrossFit Naples,21.0,...,2023,games,188.0,96.0,168.0,3299.0,31.0,514.0,11.0,460.0
238,1657300.0,Fabian Beneito,Fabian,Beneito,M,Spain,Europe,17272.0,CrossFit Zarautz,27.0,...,2023,games,178.0,90.0,70.0,2149.0,7.0,230.0,11.0,420.0


## Athletes Scores

In [5]:
# Read the consolidated per-workout scores table from the Kaggle input
# directory; the bare name on the last line renders it as the cell output.
scores_data = pd.read_csv(
    "../input/crossfit-competitions/consolidated_athletes_scores.csv"
)
scores_data

Unnamed: 0,workout,rank,score,valid,scoreDisplay,competitorId,year,competition,score_type,total_workouts
0,1.0,20.0,6050185.0,1.0,11:55,469656.0,2021,open,time,4
1,2.0,8.0,2250646.0,1.0,9:14,469656.0,2021,open,time,4
2,3.0,27.0,1800405.0,1.0,8:15,469656.0,2021,open,time,4
3,4.0,46.0,317180405.0,1.0,317 lbs,469656.0,2021,open,weight,4
4,1.0,33.0,6050155.0,1.0,12:25,34796.0,2021,open,time,4
...,...,...,...,...,...,...,...,...,...,...
2714561,6.0,29.0,3.0,1.0,CAP +29,2094617.0,2023,semifinals,time,7
2714562,7.0,29.0,3.0,1.0,CAP +68,2094617.0,2023,semifinals,time,7
2714563,1.0,24.0,18.0,1.0,CAP +85,1313021.0,2023,semifinals,time,7
2714564,2.0,23.0,21.0,1.0,0,1313021.0,2023,semifinals,repetition,7
