# Imports

In [1]:
import pandas as pd
import requests

# Functions

In [2]:
def scraping_data_open_and_games(year, competition, scores=False, session=None, timeout=30):
    """Download a CrossFit leaderboard and flatten it into a DataFrame.

    Every page of the ``competition``/``year`` leaderboard is requested for
    the ``division`` query values 1 and 2 (the two values this notebook
    treats as genders), and the JSON rows are collected into one table.

    Parameters
    ----------
    year : str or int
        Season segment of the URL, e.g. ``'2021'``.
    competition : str
        Competition segment of the URL, e.g. ``'open'`` or ``'games'``.
    scores : bool, default False
        ``False``: one record per leaderboard row, made of the row's
        ``entrant`` dict plus ``overallRank`` and ``overallScore``.
        ``True``: one record per entry of the row's ``scores`` list, with
        the entrant's ``competitorId`` added.
    session : object, optional
        Anything exposing ``get(url, timeout=...)`` like ``requests.Session``.
        When omitted the module-level ``requests`` API is used.
    timeout : float, default 30
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        All collected records; empty when nothing could be downloaded.

    Notes
    -----
    This is a best-effort scraper: a failure is printed and the affected
    division or page is skipped instead of aborting the whole run.
    """
    base_url = "https://c3po.crossfit.com/api/leaderboards/v2/competitions"
    http = requests if session is None else session
    data_list = []

    def fetch_page(gender, page):
        """Return the decoded JSON payload of one leaderboard page."""
        url = f"{base_url}/{competition}/{year}/leaderboards?division={gender}&page={page}"
        reply = http.get(url, timeout=timeout)
        # Surface HTTP errors as such instead of as an opaque JSON/Key error.
        reply.raise_for_status()
        return reply.json()

    for gender in (1, 2):
        try:
            # Page 1 carries the pagination info and is also real data,
            # so it is kept and reused below instead of being requested twice.
            first_page = fetch_page(gender, 1)
            total_pages = first_page['pagination']['totalPages']
            page_numbers = range(1, total_pages + 1)
        except Exception as e:
            print(f"Error occurred while fetching total_pages for gender={gender}: {e}")
            continue

        for page in page_numbers:
            try:
                payload = first_page if page == 1 else fetch_page(gender, page)
                for row in payload['leaderboardRows']:
                    if not scores:
                        row_data = row['entrant'].copy()
                        row_data['overallRank'] = row['overallRank']
                        row_data['overallScore'] = row['overallScore']
                        data_list.append(row_data)
                    else:
                        # Walk the row's own scores: rows may hold fewer
                        # entries than the longest row on the page, and
                        # indexing by the page-wide maximum would raise
                        # IndexError and discard the rest of the page.
                        for score in row['scores']:
                            score_data = score.copy()
                            score_data['competitorId'] = row['entrant']['competitorId']
                            data_list.append(score_data)
            except Exception as e:
                print(f"Error occurred while fetching data for gender={gender}, page={page}: {e}")

    return pd.DataFrame(data_list)

# Data Scraping

### 2021

##### Open

###### Athletes information

In [8]:
# Query 1 - 2021 Open: entrant details with overall rank and overall score
df_2021_open_not_scores = scraping_data_open_and_games(
    year='2021',
    competition='open',
)

df_2021_open_not_scores

In [2]:
# Query 1 - 2021 Open: entrant details with overall rank and overall score
# (scores=False is the default). The hand-written pagination loop that used
# to be kept here as commented-out code was an inline copy of
# scraping_data_open_and_games(), so only the call remains.
df_2021_open_not_scores = scraping_data_open_and_games(year='2021', competition='open')

df_2021_open_not_scores

Unnamed: 0,competitorId,competitorName,firstName,lastName,status,postCompStatus,gender,profilePicS3key,countryOfOriginCode,countryOfOriginName,...,regionName,divisionId,affiliateId,affiliateName,age,height,weight,teamCaptain,overallRank,overallScore
0,469656,Jeffrey Adler,Jeffrey,Adler,ACT,,M,e480e-P469656_1-184.jpg,CA,Canada,...,North America,1,18059,CrossFit Wonderland,27,69 in,197 lb,0,1,101
1,34796,Scott Panchik,Scott,Panchik,ACT,,M,e23e0-P34796_8-184.jpg,US,United States,...,North America,1,7991,CrossFit Mentality,33,69 in,187 lb,1,2,141
2,105875,Travis Mead,Travis,Mead,ACT,,M,7ebb9-P105875_6-184.jpg,US,United States,...,North America,1,9155,Iron Valley CrossFit,34,73 in,205 lb,0,3,165
3,310970,Saxon Panchik,Saxon,Panchik,ACT,,M,00087-P310970_11-184.jpg,US,United States,...,North America,1,22505,CrossFit Cliffside,25,69 in,180 lb,0,4,217
4,11435,Richard Froning Jr.,Richard,Froning Jr.,ACT,,M,e61ee-P11435_12-184.jpg,US,United States,...,North America,1,3220,CrossFit Mayhem,33,69 in,194 lb,1,5,254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246100,170162,Sarah Lucas,Sarah,Lucas,ACT,,F,73b48-P170162_1-184.jpg,US,United States,...,North America,19,,,38,66 in,142 lb,0,108565,374151
246101,1166348,Karen Thomson,Karen,Thomson,ACT,,F,women-square.jpg,US,United States,...,North America,13,18077,CrossFit SFP,40,70 in,145 lb,0,108565,374151
246102,27733,Natascha Heller,Natascha,Heller,ACT,,F,e57e0-P27733_2-184.jpg,US,United States,...,North America,6,12853,UnScared CrossFit,50,,,0,108565,374151
246103,1953600,Lindsey Miller,Lindsey,Miller,ACT,,F,women-square.jpg,US,United States,...,North America,2,,,27,,,0,108565,374151


###### Athletes scores

In [3]:
# Query 2 - 2021 Open: one record per athlete score, tagged with competitorId.
# The hand-written pagination loop that used to be kept here as
# commented-out code was an inline copy of the scores=True path of
# scraping_data_open_and_games(), so only the call remains.
df_2021_open_scores = scraping_data_open_and_games(year='2021', competition='open', scores=True)

df_2021_open_scores

##### Quarterfinals

###### Athletes information

In [None]:
# Query 3 - 2021 Quarterfinals: entrant details with overall rank and score.
# The quarterfinals board lives at
#   .../competitions/quarterfinalsindividual/2021/leaderboards?division=..&page=..
# which is exactly the URL scraping_data_open_and_games() builds for
# competition='quarterfinalsindividual', so the shared function replaces
# the copy-pasted pagination loop.
df_2021_quarterfinals_not_scores = scraping_data_open_and_games(
    year='2021',
    competition='quarterfinalsindividual',
)

# Keep the generic name this cell has always defined, in case a later cell reads it.
df = df_2021_quarterfinals_not_scores

df

###### Athletes scores

In [None]:
# Query 4 - 2021 Quarterfinals: one record per athlete score, tagged with
# competitorId.
# The quarterfinals board lives at
#   .../competitions/quarterfinalsindividual/2021/leaderboards?division=..&page=..
# which is exactly the URL scraping_data_open_and_games() builds for
# competition='quarterfinalsindividual', so the shared function (scores=True)
# replaces the copy-pasted pagination loop.
df_2021_quarterfinals_scores = scraping_data_open_and_games(
    year='2021',
    competition='quarterfinalsindividual',
    scores=True,
)

# Keep the generic name this cell has always defined, in case a later cell reads it.
df = df_2021_quarterfinals_scores

df

##### Semifinals

###### Athletes information

In [None]:
# 2021 Semifinals: entrant details with overall rank and overall score.
# This board needs an extra `semifinal` query parameter, so it is scraped
# inline rather than through scraping_data_open_and_games().
base_url = "https://c3po.crossfit.com/api/leaderboards/v2/competitions/semifinals/2021/leaderboards"

data_list = []
# Values sent as the `semifinal` query parameter
# (presumably the API's ids of the 2021 semifinal events - TODO confirm).
semifinals = [176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 187]
genders = [1, 2]
# Seconds to wait per request; without a timeout a stalled connection
# would block the cell forever.
request_timeout = 30

for semifinal in semifinals:
    for gender in genders:
        url = f"{base_url}?semifinal={semifinal}&division={gender}&page=1"
        try:
            # Page 1 provides the page count and is also data, so it is
            # kept and reused below instead of being downloaded twice.
            http_response = requests.get(url, timeout=request_timeout)
            http_response.raise_for_status()
            first_page = http_response.json()
            total_pages = first_page['pagination']['totalPages']
            page_numbers = range(1, total_pages + 1)
        except Exception as e:
            # Best effort: report the failure and move on to the next board
            print(f"Error occurred while fetching total_pages for semifinal={semifinal}, gender={gender}: {e}")
            continue

        for page in page_numbers:
            try:
                if page == 1:
                    response = first_page
                else:
                    url = f"{base_url}?semifinal={semifinal}&division={gender}&page={page}"
                    http_response = requests.get(url, timeout=request_timeout)
                    http_response.raise_for_status()
                    response = http_response.json()
                for row in response['leaderboardRows']:
                    # Entrant fields plus the two overall columns of the row
                    row_data = row['entrant'].copy()
                    row_data['overallRank'] = row['overallRank']
                    row_data['overallScore'] = row['overallScore']
                    data_list.append(row_data)
            except Exception as e:
                # Best effort: report the failure and continue with the next page
                print(f"Error occurred while fetching data for semifinal={semifinal}, gender={gender}, page={page}: {e}")

# Create a DataFrame from the collected data_list
df = pd.DataFrame(data_list)

df

###### Athletes scores

In [None]:
# 2021 Semifinals: one record per athlete score, tagged with competitorId.
# This board needs an extra `semifinal` query parameter, so it is scraped
# inline rather than through scraping_data_open_and_games().
base_url = "https://c3po.crossfit.com/api/leaderboards/v2/competitions/semifinals/2021/leaderboards"

data_list = []
# Values sent as the `semifinal` query parameter
# (presumably the API's ids of the 2021 semifinal events - TODO confirm).
semifinals = [176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 187]
genders = [1, 2]
# Seconds to wait per request; without a timeout a stalled connection
# would block the cell forever.
request_timeout = 30

for semifinal in semifinals:
    for gender in genders:
        url = f"{base_url}?semifinal={semifinal}&division={gender}&page=1"
        try:
            # Page 1 provides the page count and is also data, so it is
            # kept and reused below instead of being downloaded twice.
            http_response = requests.get(url, timeout=request_timeout)
            http_response.raise_for_status()
            first_page = http_response.json()
            total_pages = first_page['pagination']['totalPages']
            page_numbers = range(1, total_pages + 1)
        except Exception as e:
            # Best effort: report the failure and move on to the next board
            print(f"Error occurred while fetching total_pages for semifinal={semifinal}, gender={gender}: {e}")
            continue

        for page in page_numbers:
            try:
                if page == 1:
                    response = first_page
                else:
                    url = f"{base_url}?semifinal={semifinal}&division={gender}&page={page}"
                    http_response = requests.get(url, timeout=request_timeout)
                    http_response.raise_for_status()
                    response = http_response.json()
                for row in response['leaderboardRows']:
                    # Walk the row's own scores: a row may hold fewer entries
                    # than the longest row on the page, and indexing by the
                    # page-wide maximum would raise IndexError and discard
                    # the rest of the page.
                    for score in row['scores']:
                        row_data = score.copy()
                        row_data['competitorId'] = row['entrant']['competitorId']
                        data_list.append(row_data)
            except Exception as e:
                # Best effort: report the failure and continue with the next page
                print(f"Error occurred while fetching data for semifinal={semifinal}, gender={gender}, page={page}: {e}")

# Create a DataFrame from the collected data_list
df = pd.DataFrame(data_list)

df

##### Games

###### Athletes information

In [5]:
# Query 7 - 2021 Games: entrant details with overall rank and overall score
df_2021_games_not_scores = scraping_data_open_and_games(
    year='2021',
    competition='games',
)

df_2021_games_not_scores

Unnamed: 0,competitorId,competitorName,firstName,lastName,status,postCompStatus,gender,profilePicS3key,countryOfOriginCode,countryOfOriginName,...,regionName,divisionId,affiliateId,affiliateName,age,height,weight,teamCaptain,overallRank,overallScore
0,811708,Justin Medeiros,Justin,Medeiros,ACT,,M,672d4-P811708_4-184.jpg,US,United States,...,North America,1,1792,CrossFit Fort Vancouver,22,69 in,195 lb,0,1,1234
1,158264,Patrick Vellner,Patrick,Vellner,ACT,,M,d471c-P158264_7-184.jpg,CA,Canada,...,North America,1,1918,CrossFit Nanaimo,31,71 in,195 lb,0,2,1152
2,107101,Brent Fikowski,Brent,Fikowski,ACT,,M,93ab7-P107101_10-184.jpg,CA,Canada,...,North America,1,,,30,74 in,220 lb,0,3,1028
3,81616,Björgvin Karl Guðmundsson,Björgvin Karl,Guðmundsson,ACT,,M,4c5dc-P81616_4-184.jpg,IS,Iceland,...,Europe,1,4860,CrossFit Hengill,28,178 cm,190 lb,0,4,1004
4,310970,Saxon Panchik,Saxon,Panchik,ACT,,M,00087-P310970_11-184.jpg,US,United States,...,North America,1,22505,CrossFit Cliffside,25,69 in,180 lb,0,5,996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,2942,Kara Saunders,Kara,Saunders,WD,,F,5ca2a-P2942_14-184.jpg,AU,Australia,...,Oceania,2,9043,CrossFit Carv,31,162 cm,162 lb,0,36,93
76,305891,Kari Pearce,Kari,Pearce,WD,,F,17c8c-P305891_3-184.jpg,US,United States,...,North America,2,7589,CrossFit Culmination,32,63 in,139 lb,0,37,0
77,455677,Larissa Cunha,Larissa,Cunha,ACT,,F,d65e1-P455677_3-184.jpg,BR,Brazil,...,South America,2,10585,Cavaleiros CrossFit,30,153 cm,62 kg,0,0,0
78,592472,Bethany Shadburne,Bethany,Shadburne,ACT,,F,69340-P592472_6-184.jpg,US,United States,...,North America,2,7589,CrossFit Culmination,27,64 in,144 lb,0,0,0


###### Athletes scores

In [7]:
# Query 8 - 2021 Games: one record per athlete score, tagged with competitorId
df_2021_games_scores = scraping_data_open_and_games(
    year='2021',
    competition='games',
    scores=True,
)

df_2021_games_scores

Unnamed: 0,ordinal,rank,score,valid,scoreDisplay,scoreIdentifier,mobileScoreDisplay,scaled,video,heat,lane,breakdown,competitorId
0,1,5,88,1,1:10:54.31,7c1a5d8597b52e58b888,,0,0,,,88 pts,811708
1,2,15,58,1,09:13.54,17abd90e392bf5f82867,,0,0,,,58 pts,811708
2,3,11,70,1,01:18.21,e36bf0a693c297204fc2,,0,0,,,70 pts,811708
3,4,2,97,1,14:50.28,2909e36dc32303d62419,,0,0,,,97 pts,811708
4,5,4,91,1,12:05.77,42fa841b175f953cd3fe,,0,0,,,91 pts,811708
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,11,,-1,,,,,0,0,,,,1019212
1196,12,,-1,,,,,0,0,,,,1019212
1197,13,,-1,,,,,0,0,,,,1019212
1198,14,,-1,,,,,0,0,,,,1019212
