In [52]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
# perform GET request to the page
r = requests.get("https://www.tfrrs.org/all_performances/IN_college_m_Indiana_IN.html?list_hnd=4153&amp;season_hnd=608")

# use .content method to extract html content and parse it
soup = BeautifulSoup(r.content, "html.parser")

# return the first htmlElement with the myTable id, in this case the 100m table
hundred_table = soup.find(id="myTable")

# td element are the data entries in the table
entries = hundred_table.find_all('td')

print(len(entries))
print()
print(entries[1])

154

<td class="tablesaw-priority-1">
<a data-turbo="false" data-turbo-frame="_top" href="https://www.tfrrs.org/athletes/7989193/Indiana_IN/Antonio_Laidler">Laidler, Antonio</a>
</td>


In [3]:
items = []

for i in range(len(entries)):
    # If i is 0 or divisible by 7 we want to skip it. This is due to the 0th column being blank in the table
    if not i % 7: 
        continue
    # using the .text method to get the innerHTML value and the .strip() because some text has new line charecters attached to the data
    items.append(entries[i].text.strip()) 

print(items[0:5])

['Laidler, Antonio', 'JR-3', '10.19', 'NCAA East Preliminary Round', 'May 25, 2023']


In [47]:
# create empty dataframe with column names
df = pd.DataFrame(columns=["Athlete", "Year", "Time", "Meet", "Meet Date", "Wind"])

# each grouping of 6 items is a row in the dataframe
for i in range(5, len(items), 6):
    tempDf = pd.DataFrame([{"Athlete": items[i - 5], "Year": items[i - 4], "Time": items[i - 3],
                            "Meet": items[i - 2], "Meet Date": items[i - 1], "Wind": items[i]}])
    # update the current dataframe with the new row
    df = pd.concat([df, tempDf])

df.head()

Unnamed: 0,Athlete,Year,Time,Meet,Meet Date,Wind
0,"Laidler, Antonio",JR-3,10.19,NCAA East Preliminary Round,"May 25, 2023",1.9
0,"Laidler, Antonio",JR-3,10.2,Jim Freeman Louisville Invitational,"Apr 14, 2023",1.6
0,"Grant, Christopher",JR-3,10.28,Jim Freeman Louisville Invitational,"Apr 14, 2023",1.6
0,"Laidler, Antonio",JR-3,10.3,Big Ten Outdoor Championships,"May 12, 2023",0.0
0,"Grant, Christopher",JR-3,10.33,2023 Billy Hayes Invitational,"May 5, 2023",0.5


In [50]:
# apply name modifications
df["Athlete"] = df["Athlete"].apply(lambda x: " ".join(x.replace(" ", "").split(",")[::-1]))

# apply year modifications
df["Year"] = df["Year"].apply(lambda x: x.split("-")[-1])

#apply meet date modifications, assuming "epoch date" is seconds since Unix epoch
df["Meet Date"] = pd.to_datetime(df["Meet Date"], format="%b %d, %Y").astype(np.int64) / int(1e9)

ValueError: time data "1684972800" doesn't match format "%b %d, %Y", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [51]:
df

Unnamed: 0,Athlete,Year,Time,Meet,Meet Date,Wind
0,AntonioLaidler,3,10.19,NCAA East Preliminary Round,1684973000.0,1.9
0,AntonioLaidler,3,10.2,Jim Freeman Louisville Invitational,1681430000.0,1.6
0,ChristopherGrant,3,10.28,Jim Freeman Louisville Invitational,1681430000.0,1.6
0,AntonioLaidler,3,10.3,Big Ten Outdoor Championships,1683850000.0,0.0
0,ChristopherGrant,3,10.33,2023 Billy Hayes Invitational,1683245000.0,0.5
0,ChristopherGrant,3,10.38,Pepsi Florida Relays,1680221000.0,1.1
0,AntonioLaidler,3,10.38,Big Ten Outdoor Championships,1683850000.0,-0.4
0,ChristopherGrant,3,10.41,2023 Billy Hayes Invitational,1683245000.0,1.9
0,AntonioLaidler,3,10.41,Pepsi Florida Relays,1680221000.0,1.5
0,ChristopherGrant,3,10.48,Big Ten Outdoor Championships,1683850000.0,0.1
