In [None]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd


column_headings = ["HelmName", "Class", "PY", "SailNo", "Fleet", "Rank", "Elapsed", "Corrected", "Points", "Reg No.", "Reg Date"]
alternative_headings = ["HelmName", "Class", "PY", "SailNo", "Fleet", "Rank", "Place", "Points", "Reg No.", "Reg Date"]

def scrape_race(url):
    """Scrape all the results for a single race."""
    print(url)
    with urllib.request.urlopen(url) as response:
       html = response.read()
    
    soup = BeautifulSoup(html)
    
    tables = soup.findAll('table')
    return tables[:4]


def get_dataframe_from(table):
    """Turn a table into a data frame."""
    table_rows = table.find_all('tr')
    l = []
    for tr in table_rows[1:]:
        td = tr.find_all('td')
        row = [tr.text for tr in td]
        l.append(row)
        
    # Different column headers are used if there were no results recorded in a fleet.
    if l and len(l[0]) == 11:        
        df = pd.DataFrame(l, columns=column_headings)
    else:
        df = pd.DataFrame(l, columns=alternative_headings)
        
    return df

In [4]:
base_url = 'https://www.warsashsc.org.uk/results/18'
series = ['wa', 'wb', 'wc', 'fa', 'fb', 'fc']

all_races_frames = []

for s in series:
    for i in range(1, 9):
        try:
            race = scrape_race(base_url + s + str(i) + '.htm')
        except:
            print('No racing')
        
        for table in race:
            df = get_dataframe_from(table)
            df['Day'] = 'Wednesday'
            df['Series'] = s[1]
            df['Race'] = i
            
            all_races_frames.append(df)
            
all_races = pd.concat(all_races_frames)

https://www.warsashsc.org.uk/results/18wa1.htm


https://www.warsashsc.org.uk/results/18wa2.htm
https://www.warsashsc.org.uk/results/18wa3.htm




https://www.warsashsc.org.uk/results/18wa4.htm
https://www.warsashsc.org.uk/results/18wa5.htm


https://www.warsashsc.org.uk/results/18wa6.htm
https://www.warsashsc.org.uk/results/18wa7.htm


https://www.warsashsc.org.uk/results/18wa8.htm
https://www.warsashsc.org.uk/results/18wb1.htm


https://www.warsashsc.org.uk/results/18wb2.htm
https://www.warsashsc.org.uk/results/18wb3.htm


https://www.warsashsc.org.uk/results/18wb4.htm


https://www.warsashsc.org.uk/results/18wb5.htm
https://www.warsashsc.org.uk/results/18wb6.htm


https://www.warsashsc.org.uk/results/18wb7.htm
https://www.warsashsc.org.uk/results/18wb8.htm




https://www.warsashsc.org.uk/results/18wc1.htm
https://www.warsashsc.org.uk/results/18wc2.htm


https://www.warsashsc.org.uk/results/18wc3.htm


https://www.warsashsc.org.uk/results/18wc4.htm


https://www.warsashsc.org.uk/results/18wc5.htm


No racing
https://www.warsashsc.org.uk/results/18wc6.htm


https://www.warsashsc.org.uk/results/18wc7.htm


No racing
https://www.warsashsc.org.uk/results/18wc8.htm


No racing
https://www.warsashsc.org.uk/results/18fa1.htm
https://www.warsashsc.org.uk/results/18fa2.htm


https://www.warsashsc.org.uk/results/18fa3.htm


https://www.warsashsc.org.uk/results/18fa4.htm
https://www.warsashsc.org.uk/results/18fa5.htm


https://www.warsashsc.org.uk/results/18fa6.htm
https://www.warsashsc.org.uk/results/18fa7.htm


https://www.warsashsc.org.uk/results/18fa8.htm
https://www.warsashsc.org.uk/results/18fb1.htm


https://www.warsashsc.org.uk/results/18fb2.htm


https://www.warsashsc.org.uk/results/18fb3.htm


https://www.warsashsc.org.uk/results/18fb4.htm
https://www.warsashsc.org.uk/results/18fb5.htm


https://www.warsashsc.org.uk/results/18fb6.htm
https://www.warsashsc.org.uk/results/18fb7.htm


https://www.warsashsc.org.uk/results/18fb8.htm


https://www.warsashsc.org.uk/results/18fc1.htm


https://www.warsashsc.org.uk/results/18fc2.htm
https://www.warsashsc.org.uk/results/18fc3.htm


https://www.warsashsc.org.uk/results/18fc4.htm
https://www.warsashsc.org.uk/results/18fc5.htm


https://www.warsashsc.org.uk/results/18fc6.htm


https://www.warsashsc.org.uk/results/18fc7.htm


https://www.warsashsc.org.uk/results/18fc8.htm


No racing


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [8]:
(all_races['Elapsed'] == 'OCS').sum()

22

In [9]:
(all_races['Elapsed'] == 'Duty').sum()

95

In [10]:
(all_races['Elapsed'] == 'DNF').sum()

154