In [None]:
import pandas as pd
import requests
import io


In [None]:
# Result page URLs for EMass 2017-11-11
# http://www.coolrunning.com/results/17/ma.shtml#11
#
# Every entry is a tuple: (URL, results format, division)
#
# Results format codes:
#   0 = default column order: Place, number, name, ...
#   1 = alternate column order: Place, name, ...

urls_boys = [
    ('http://www.coolrunning.com/results/17/ma/Nov11_MIAAEa_set4.shtml', 0, 1),
    ('http://www.coolrunning.com/results/17/ma/Nov11_MIAAEa_set5.shtml', 0, 2),
    ('http://www.coolrunning.com/results/17/ma/Nov11_MIAAEa_set6.shtml', 0, 3),
    ('http://www.coolrunning.com/results/17/ma/Nov11_MIAAEa_1_set4.shtml', 0, 4),
    ('http://www.coolrunning.com/results/17/ma/Nov11_MIAAEa_1_set5.shtml', 0, 5),
    ('http://www.coolrunning.com/results/17/ma/Nov11_MIAAEa_1_set6.shtml', 1, 6),
]

urls_girls = [
    ('http://www.coolrunning.com/results/17/ma/Nov11_MIAAEa_set1.shtml', 0, 1),
    ('http://www.coolrunning.com/results/17/ma/Nov11_MIAAEa_set2.shtml', 0, 2),
    ('http://www.coolrunning.com/results/17/ma/Nov11_MIAAEa_set3.shtml', 0, 3),
    ('http://www.coolrunning.com/results/17/ma/Nov11_MIAAEa_1_set1.shtml', 0, 4),
    ('http://www.coolrunning.com/results/17/ma/Nov11_MIAAEa_1_set2.shtml', 0, 5),
    ('http://www.coolrunning.com/results/17/ma/Nov11_MIAAEa_1_set3.shtml', 0, 6),
]

In [None]:
# Sanity check: echo every boys entry as "URL format division", one per line
for entry in urls_boys:
    print(*entry)

In [None]:
def get_fixed_width_table(webpage, start_string, end_string, first_col_width=3):
    '''
    Pull the data rows of a fixed-width text table out of a block of text.

    webpage          multi-line string to scan (e.g. the text of a fetched page)
    start_string     marker: rows are collected only from lines that come after
                     the first line containing this string
    end_string       marker: scanning stops at the first line, after the start
                     marker, that contains this string
    first_col_width  a line is kept only when its first `first_col_width`
                     characters, stripped of whitespace, are all digits

    Returns the kept lines as one string, each line terminated by a newline.
    The result is an empty string when the start marker never appears or no
    line qualifies.
    '''

    kept_rows = []
    inside_table = False

    for row in webpage.splitlines():

        if not inside_table:
            # Still in the preamble: the marker line itself is never a data row
            if start_string in row:
                inside_table = True
            continue

        if end_string in row:
            # Everything from the end marker onwards is ignored
            break

        # Keep only rows whose leading column is numeric; this drops
        # headers, separators and blank lines inside the table
        if row[:first_col_width].strip().isdigit():
            kept_rows.append(row + '\n')

    return ''.join(kept_rows)

In [None]:
# Build the master DataFrame for boys: fetch each division's results page,
# parse its fixed-width table, tag the rows with the division, and stack
# all divisions into one frame.

df_list = []

for u, layout, div in urls_boys:
    # Bound the wait on a hung connection and stop on an HTTP error, so an
    # error page is never silently parsed into an empty results table.
    results_page = requests.get(u, timeout=30)
    results_page.raise_for_status()
    results_table = get_fixed_width_table(results_page.text, '====', 'Team Scores')

    # read_fwf expects a file-like object, so wrap the table text in StringIO
    io_results_table = io.StringIO(results_table)

    if layout == 1:
        # Alternate layout (Place, name, ...). The zero-width (0, 0) specs are
        # placeholders for 'Number' and 'Pace' so that every division's frame
        # has the same columns.
        fields = [(0, 3), (0, 0), (4, 30), (30, 32), (33, 54), (0, 0), (55, 65), (66, 69)]
    else:
        # Default layout (Place, number, name, ...)
        fields = [(0, 3), (4, 9), (10, 30), (30, 32), (33, 56), (57, 65), (66, 75), (76, 80)]

    df_results = pd.read_fwf(
        io_results_table,
        colspecs=fields,
        names=['Place', 'Number', 'Name', 'Grade', 'School', 'Pace', 'TimeString', 'Score'],
    )
    df_results['Division'] = div
    df_list.append(df_results)

# ignore_index=True renumbers the rows 0..N-1; without it each division's
# frame restarts at 0 and the combined frame carries repeated row labels.
df_results_boys = pd.concat(df_list, ignore_index=True)


In [None]:
# Build the master DataFrame for girls: fetch each division's results page,
# parse its fixed-width table, tag the rows with the division, and stack
# all divisions into one frame.

df_list = []

for u, layout, div in urls_girls:
    # Bound the wait on a hung connection and stop on an HTTP error, so an
    # error page is never silently parsed into an empty results table.
    results_page = requests.get(u, timeout=30)
    results_page.raise_for_status()
    results_table = get_fixed_width_table(results_page.text, '====', 'Team Scores')

    # read_fwf expects a file-like object, so wrap the table text in StringIO
    io_results_table = io.StringIO(results_table)

    if layout == 1:
        # Alternate layout (Place, name, ...). The zero-width (0, 0) specs are
        # placeholders for 'Number' and 'Pace' so that every division's frame
        # has the same columns.
        fields = [(0, 3), (0, 0), (4, 30), (30, 32), (33, 54), (0, 0), (55, 65), (66, 69)]
    else:
        # Default layout (Place, number, name, ...)
        fields = [(0, 3), (4, 9), (10, 30), (30, 32), (33, 56), (57, 65), (66, 75), (76, 80)]

    df_results = pd.read_fwf(
        io_results_table,
        colspecs=fields,
        names=['Place', 'Number', 'Name', 'Grade', 'School', 'Pace', 'TimeString', 'Score'],
    )
    df_results['Division'] = div
    df_list.append(df_results)

    # Preview the first rows of each division as it is parsed
    print(df_results.head(10))

# ignore_index=True renumbers the rows 0..N-1; without it each division's
# frame restarts at 0 and the combined frame carries repeated row labels.
df_results_girls = pd.concat(df_list, ignore_index=True)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df_results_boys.info()

In [None]:
# Parse the '%M:%S.%f' finishing-time strings into datetime.time values
# (new 'Time' column) and order each frame by that column, ascending.

df_results_boys = df_results_boys.assign(
    Time=pd.to_datetime(df_results_boys['TimeString'], format='%M:%S.%f').dt.time
).sort_values('Time')

df_results_girls = df_results_girls.assign(
    Time=pd.to_datetime(df_results_girls['TimeString'], format='%M:%S.%f').dt.time
).sort_values('Time')


In [None]:
# Preview the first ten rows of the time-sorted boys frame
df_results_boys.iloc[:10]

In [None]:
# Number the runners, relying on each frame already being sorted by time:
#   tmpCnt       - constant 1, a helper column for the running totals
#   RunnerNumber - running count within the runner's school (1 = first finisher)
#   OverallPlace - running count over the whole frame (1 = first row)
for frame in (df_results_boys, df_results_girls):
    frame["tmpCnt"] = 1
    frame["RunnerNumber"] = frame.groupby('School')["tmpCnt"].cumsum()
    frame["OverallPlace"] = frame["tmpCnt"].cumsum()


In [None]:
# Preview the first twenty rows, now including RunnerNumber and OverallPlace
df_results_boys.iloc[:20]

In [None]:
# Team scores (boys): total the overall places of each school's first five
# finishers and list the schools by ascending total.
# NOTE(review): a school with fewer than five finishers is summed over fewer
# runners, so its total is not comparable with complete teams - confirm
# whether such schools should be excluded.

(
    df_results_boys
    .loc[df_results_boys["RunnerNumber"] <= 5]
    .groupby('School')[['OverallPlace']]
    .sum()
    .sort_values('OverallPlace')
)


In [None]:
# Team scores (girls): total the overall places of each school's first five
# finishers and list the schools by ascending total.
# NOTE(review): a school with fewer than five finishers is summed over fewer
# runners, so its total is not comparable with complete teams - confirm
# whether such schools should be excluded.

(
    df_results_girls
    .loc[df_results_girls["RunnerNumber"] <= 5]
    .groupby('School')[['OverallPlace']]
    .sum()
    .sort_values('OverallPlace')
)

In [None]:
# Show each school's sixth finisher (the first runner outside the five summed above)
df_results_boys.loc[df_results_boys["RunnerNumber"].eq(6)]