In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from urllib.request import urlopen
from bs4 import BeautifulSoup
from pylab import rcParams

In [None]:
# Source page: https://www.hubertiming.com/results/2017GPTR10K
# Download the raw HTML. The response is read inside a ``with`` block so the
# connection is closed promptly, and ``html`` holds plain bytes rather than a
# one-shot stream, so the parsing cell can be re-run without downloading again.
url = 'https://www.hubertiming.com/results/2017GPTR10K'
with urlopen(url) as response:
    html = response.read()

In [None]:
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(html, features='html.parser')

In [None]:
# Get the individual results table (the element with id="individualResults").
individual_result = soup.find(id='individualResults')
if individual_result is None:
    # soup.find returns None when nothing matches; fail here with a clear
    # message instead of an AttributeError in a later cell.
    raise ValueError("No element with id='individualResults' was found in the page")
print(individual_result)

In [None]:
# Grab the table's <thead> element, which holds the column headers.
headers = individual_result.find('thead')
print(headers)

In [None]:
# Extract the text of every <th> cell in the table header.
# The cells are looked up from ``individual_result`` rather than from the
# previous value of ``headers`` so this cell can be re-run safely (``headers``
# is rebound to a plain list of strings below).
header_cells = individual_result.thead.find_all('th')
headers = [cell.text for cell in header_cells]
print('headers', headers)

In [None]:
# Collect every <tr> element found inside the results table.
rows = individual_result.find_all('tr')
row_count = len(rows)
print('There are a total of {} rows in the table'.format(row_count))

In [None]:
# Build one list of cell texts per table row (rows without <td> cells yield
# an empty list).
table_data = [[cell.text for cell in tr.find_all('td')] for tr in rows]

# Preview the raw scrape as a DataFrame.
table_df = pd.DataFrame(table_data)
table_df.head(10)

In [None]:
# Drop rows that produced no <td> cells (presumably just the header row,
# whose cells are <th> elements). Filtering on emptiness instead of slicing
# off index 0 keeps this cell safe to re-run: a second run no longer discards
# the first runner.
table_data = [row for row in table_data if row]

# The first two columns (Place and Bib) are scraped as text; store them as ints.
for row in table_data:
    row[0] = int(row[0])
    row[1] = int(row[1])

In [None]:
# Helper for expressing the scraped time values as minutes (floats).
def datetimeToMinutes(date_time):
    """Return the time-of-day part of ``date_time`` as minutes.

    Only the ``hour``, ``minute`` and ``second`` attributes are read.
    Seconds contribute a fractional part, so the result is a float.
    """
    whole_minutes = 60 * date_time.hour + date_time.minute
    return whole_minutes + date_time.second / 60

def _clock_text_to_minutes(value):
    """Convert an ``H:MM:SS`` or ``MM:SS`` string to minutes (float).

    Accepting either layout means a sub-hour value written as ``MM:SS`` in a
    column that usually holds ``H:MM:SS`` (or the reverse) is still converted
    instead of raising from a fixed ``strptime`` format. Surrounding
    whitespace is ignored.

    Values that are not strings are returned unchanged, so re-running this
    cell on already-converted rows is a no-op rather than a ``TypeError``.

    Raises:
        ValueError: if the text does not have two or three ``:``-separated
            integer fields.
    """
    if not isinstance(value, str):
        return value
    fields = value.strip().split(':')
    if len(fields) not in (2, 3):
        raise ValueError('unrecognised time value: {!r}'.format(value))
    # Left-pad with a zero hour when only minutes and seconds are present.
    hours, minutes, seconds = [0] * (3 - len(fields)) + [int(f) for f in fields]
    return hours * 60 + minutes + seconds / 60


# Positions of the columns holding clock-style text that is turned into
# minutes (chip time and chip pace among them - TODO confirm against `headers`).
TIME_COLUMNS = (6, 7, 11, 12)

for row in table_data:
    for col in TIME_COLUMNS:
        row[col] = _clock_text_to_minutes(row[col])

In [None]:
# Rebuild the DataFrame from the cleaned rows and preview the first ten.
table_df = pd.DataFrame(data=table_data)
table_df.head(n=10)

In [None]:
# Label the DataFrame columns with the header texts scraped earlier.
table_df.columns = pd.Index(headers)
table_df.head(n=10)

In [None]:
# Summary statistics restricted to the numeric columns.
numeric_kinds = [np.number]
table_df.describe(include=numeric_kinds)

In [None]:
# Use wide figures from here on.
plt.rcParams['figure.figsize'] = (15, 5)

# Box plot of chip times across all runners.
chip_time_ax = table_df.boxplot(column='Chip Time')
chip_time_ax.grid(True, axis='y')
chip_time_ax.set_ylabel('Chip Time')
plt.xticks([1], ['Runners'])

In [None]:
# Box plot of chip pace across all runners.
chip_pace_ax = table_df.boxplot(column='Chip Pace')
chip_pace_ax.grid(True, axis='y')
chip_pace_ax.set_ylabel('Chip Pace')
plt.xticks([1], ['Runners'])

In [None]:
# Histogram (25 bins, black bar edges) with a KDE overlay of all chip times.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
x = table_df['Chip Time']
ax = sns.distplot(
    x,
    bins=25,
    hist=True,
    kde=True,
    rug=False,
    color='b',
    hist_kws={'edgecolor': 'black'},
)
plt.show()

In [None]:
# Chip times split by the Gender column: 'F' rows and 'M' rows.
is_female = table_df['Gender'] == 'F'
is_male = table_df['Gender'] == 'M'
f_fuko = table_df.loc[is_female, 'Chip Time']
m_fuko = table_df.loc[is_male, 'Chip Time']

# Female: histogram plus KDE; male: KDE curve only, drawn on the same axes.
sns.distplot(
    f_fuko, label='Female',
    hist=True, kde=True, rug=False,
    hist_kws={'edgecolor': 'black'},
)
sns.distplot(
    m_fuko, label='Male',
    hist=False, kde=True, rug=False,
    hist_kws={'edgecolor': 'black'},
)
plt.legend()

In [None]:
# Descriptive statistics per Gender group (the group key becomes the index).
gender_groups = table_df.groupby('Gender')
g_stats = gender_groups.describe()
print(g_stats)