In [1]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd


column_headings = ["HelmName", "Class", "PY", "SailNo", "Fleet", "Rank", "Elapsed", "Corrected", "Points", "Reg No.", "Reg Date"]
alternative_headings = ["HelmName", "Class", "PY", "SailNo", "Fleet", "Rank", "Place", "Points", "Reg No.", "Reg Date"]

def scrape_race(url):
    """Scrape all the results for a single race."""
    print(url)
    with urllib.request.urlopen(url) as response:
       html = response.read()
    
    soup = BeautifulSoup(html)
    
    tables = soup.findAll('table')
    return tables[:4]


def get_dataframe_from(table):
    """Turn a table into a data frame."""
    table_rows = table.find_all('tr')
    l = []
    for tr in table_rows[1:]:
        td = tr.find_all('td')
        row = [tr.text for tr in td]
        l.append(row)
        
    # Different column headers are used if there were no results recorded in a fleet.
    if l and len(l[0]) == 11:        
        df = pd.DataFrame(l, columns=column_headings)
    else:
        df = pd.DataFrame(l, columns=alternative_headings)
        
    return df

In [2]:
base_url = 'https://www.warsashsc.org.uk/results/18'
series = ['wa', 'wb', 'wc', 'fa', 'fb', 'fc']

all_races_frames = []

for s in series:
    for i in range(1, 9):
        try:
            race = scrape_race(base_url + s + str(i) + '.htm')
        except:
            print('No racing')
        
        for table in race:
            df = get_dataframe_from(table)
            df['Day'] = s[0]
            df['Series'] = s[1]
            df['Race'] = i
            
            all_races_frames.append(df)
            
all_races = pd.concat(all_races_frames, sort=True)

https://www.warsashsc.org.uk/results/18wa1.htm
https://www.warsashsc.org.uk/results/18wa2.htm
https://www.warsashsc.org.uk/results/18wa3.htm
https://www.warsashsc.org.uk/results/18wa4.htm
https://www.warsashsc.org.uk/results/18wa5.htm
https://www.warsashsc.org.uk/results/18wa6.htm
https://www.warsashsc.org.uk/results/18wa7.htm
https://www.warsashsc.org.uk/results/18wa8.htm
https://www.warsashsc.org.uk/results/18wb1.htm
https://www.warsashsc.org.uk/results/18wb2.htm
https://www.warsashsc.org.uk/results/18wb3.htm
https://www.warsashsc.org.uk/results/18wb4.htm
https://www.warsashsc.org.uk/results/18wb5.htm
https://www.warsashsc.org.uk/results/18wb6.htm
https://www.warsashsc.org.uk/results/18wb7.htm
https://www.warsashsc.org.uk/results/18wb8.htm
https://www.warsashsc.org.uk/results/18wc1.htm
https://www.warsashsc.org.uk/results/18wc2.htm
https://www.warsashsc.org.uk/results/18wc3.htm
https://www.warsashsc.org.uk/results/18wc4.htm
https://www.warsashsc.org.uk/results/18wc5.htm
No racing
htt

In [3]:
len(all_races.index)

1468

In [4]:
(all_races['Elapsed'] == 'OCS').sum()

22

In [5]:
(all_races['Elapsed'] == 'Duty').sum()

95

In [6]:
(all_races['Elapsed'] == 'DNF').sum()

154

In [11]:
excluding_duties = all_races[~all_races['Elapsed'].isin(['Duty'])]
len(excluding_duties.index)

only_finishes = all_races[~all_races['Elapsed'].isin(['Duty', 'OCS', 'DNE', 'DNF', 'DSQ'])]
len(only_finishes.index)

1188

In [None]:
print(all_races)

In [13]:
only_finishes['Class'].nunique()

34

In [15]:
excluding_duties['Class'].value_counts()

RS 400           186
BLAZE            103
RS AERO 7        102
TOPPER            97
FINN              95
WAYFARER          95
LASER 4.7         90
BUZZ              66
LASER RADIAL      60
LASER PICO        43
RS 200            42
LASER 2000        40
HADRON H2         35
WANDERER          29
STREAKER          28
D ONE             27
LASER             25
NAT 12            23
MIRROR            20
RS FEVA XL        20
GRADUATE          18
CONTENDER         17
420               17
RS 300            16
D ZERO            15
KESTREL           12
ALBACORE          11
LASER STRATOS     10
D ZERO BLUE        8
LASER VAGO XD      6
TASAR              5
HOBIE 405          3
RS AERO 5          3
OSPREY             3
LARK               1
RS 100 8.4         1
Hobie 405          1
Name: Class, dtype: int64