In [None]:
# To initialize the players collection, go to
# https://www.ustanorcal.com/ntrpSearch.asp and, for every area, gender,
# and level, copy the HTML table from 'View Page Source' into a file
# named "../html/<area>_<gender>_<level>.txt".
import json, os, bs4, requests, time, glob
from datetime import datetime
from bson import json_util

In [None]:
# Scrape settings used by the cells below.
# Matches dated before this day are not collected.
cutoff_date = datetime(year=2013, month=1, day=1)
# Site root; relative links found in scraped pages are appended to it.
base_url = 'https://www.ustanorcal.com'
# Template for a player's match-history page; fill in the id with .format().
player_matches_url = base_url + '/' + 'PlayerMatches.asp?id={}'

In [None]:
def get_player_id(name):
    """Return the ``_id`` of the player whose name matches ``name``.

    The name is normalized exactly like the stored player records: the text
    before the first comma is the last name, and the first word after the
    last comma is the first name (middle names or initials are dropped).

    Args:
        name: Display name in "Last, First" form, e.g. ``'Smith, John'``.

    Returns:
        The ``_id`` of the first entry in the module-level ``players`` list
        whose ``last_name`` and ``first_name`` both match.

    Raises:
        ValueError: If ``name`` has no first name or no player matches.
    """
    parts = name.split(',')
    last_name = parts[0].strip()
    # Player records keep only the first word of the first name, so compare
    # against that; otherwise "Smith, John A" could never match "John".
    first_name_words = parts[-1].split()
    if not first_name_words:
        raise ValueError('{} not found!'.format(name))
    first_name = first_name_words[0]
    for player in players:
        if player['last_name'] == last_name and player['first_name'] == first_name:
            return player['_id']
    raise ValueError('{} not found!'.format(name))

In [None]:
# Build the players collection from the saved search-result pages and write
# it to ../data/players.json. File names encode <area>_<gender>_<level>.
players, player_ids = [], []
seen_player_ids = set()  # O(1) duplicate check; player_ids keeps first-seen order
# Sorted so that a player listed in several files always gets the area and
# gender of the same file, regardless of directory listing order.
for filename in sorted(glob.iglob('../html/*.txt')):
    fn = os.path.basename(filename)
    area, gender, level = os.path.splitext(fn)[0].split('_')
    print(area, gender, level)
    with open(filename, 'r') as f:
        soup = bs4.BeautifulSoup(f, 'html.parser')
    for row in soup.table.children:
        # Skip text nodes between rows and rows that have no data cell.
        if not isinstance(row, bs4.element.Tag) or row.td is None:
            continue
        first_cell_text = row.td.text
        # Skip empty rows, the header row, and summary rows.
        if not first_cell_text or first_cell_text == 'Player' or 'Total' in first_cell_text:
            continue
        columns = [col for col in row.children if isinstance(col, bs4.element.Tag)]
        # The player id is the value after the last '=' in the name link.
        _id = int(columns[0].a['href'].split('=')[-1])
        if _id in seen_player_ids:
            continue
        seen_player_ids.add(_id)
        player_ids.append(_id)
        try:
            rating_text = columns[4].text
            d = dict(
                _id = _id, city = columns[1].text,
                # First three characters are the numeric level; the rest is the type.
                rating_level = float(rating_text[:3]),
                rating_type = rating_text[3:],
                age_group = columns[-1].text, gender = gender, area = area
            )
            name = columns[0].a.text.split(',')
            # Keep only the first word of the first name.
            d['first_name'] = name[-1].strip().split()[0]
            d['last_name'] = name[0].strip()
            players.append(d)
        except Exception:
            # Show the offending row before propagating the parse error.
            print(row)
            raise

os.makedirs('../data', exist_ok=True)
with open('../data/players.json', 'w') as players_file:
    json.dump(players, players_file)
print(len(players))
# Guarded so that an empty ../html directory does not end in an IndexError.
if players:
    print(players[0])

In [None]:
# Scrape the league/season pages of the selected player(s), collect each
# individual line result into one record per team match, and write the
# records to ../data/matches.json.

# Debug filter: only this player's pages are fetched. Set to None to scrape
# every entry in `players`.
only_player_id = 169636
request_timeout = 30  # seconds; without it a stalled connection blocks forever

matches, individual_match_ids = {}, []
seen_individual_match_ids = set()  # O(1) duplicate check alongside the list
for player_idx, player in enumerate(players):
    if only_player_id is not None and player['_id'] != only_player_id:
        continue
    response = requests.get(player_matches_url.format(player['_id']),
                            timeout=request_timeout)
    matches_html = bs4.BeautifulSoup(response.content, 'html.parser')
    for league_idx, row_league in enumerate(matches_html.find(id='leagues_section').children):
        # Only rows whose first cell contains a link lead to a season page.
        if not (isinstance(row_league, bs4.element.Tag)
                and row_league.td and row_league.td.a):
            continue
        print(player_idx, player['last_name'], league_idx)
        url = '/'.join([base_url, row_league.td.a['href']])
        r = requests.get(url, timeout=request_timeout).content
        season_matches_html = bs4.BeautifulSoup(r, 'html.parser')
        for row in season_matches_html.find("table", class_="table well").children:
            if not (isinstance(row, bs4.element.Tag) and row.td):
                continue
            columns = [col for col in row.children if isinstance(col, bs4.element.Tag)]
            # Match id is the value of the first query parameter of the date link.
            match_id = int(columns[0].a['href'].split('?')[-1].split('&')[0].split('=')[1])
            # Column 6 holds two words: the line number and the match type.
            # The type must be 'singles' or 'doubles' to index the record below.
            line, match_type = columns[6].text.lower().split()
            individual_match_id = '_'.join([str(match_id), line + match_type[0]])
            # The same line shows up on every participant's page; keep it once.
            if individual_match_id in seen_individual_match_ids:
                continue
            seen_individual_match_ids.add(individual_match_id)
            individual_match_ids.append(individual_match_id)
            line_idx = int(line) - 1
            if match_id not in matches:
                match_date = datetime.strptime(columns[0].a.text, '%m/%d/%Y')
                if match_date < cutoff_date:
                    continue
                matches[match_id] = dict(
                    date = match_date,
                    # Last word of the league link text minus its final character.
                    league = columns[1].a.text.split()[-1][:-1],
                    singles = [None, None], doubles = [None, None, None]
                )
            home_players = [p.text for p in columns[2].find_all('a')]
            visiting_players = [p.text for p in columns[4].find_all('a')]
            winners = [p.text for p in columns[7].find_all('a')]
            # The losing side is whichever side is not listed as the winners.
            # (The original compared against an undefined name `winner`.)
            losers = home_players if visiting_players == winners else visiting_players
            print(match_id)
            matches[match_id][match_type][line_idx] = dict(
                score = columns[5].text.strip().split(','), # TODO save numerically?
                winners = [get_player_id(name) for name in winners],
                losers = [get_player_id(name) for name in losers]
            )
        time.sleep(.5)  # pause between season-page requests

# Turn the {match_id: record} mapping into a list of records carrying their
# own '_id'; `matches` is emptied as the records are moved over.
matches_coll = []
for match_id in list(matches.keys()):
    match = matches.pop(match_id)
    match['_id'] = match_id
    matches_coll.append(match)

os.makedirs('../data', exist_ok=True)
with open('../data/matches.json', 'w') as matches_file:
    # json_util.default is passed as the fallback encoder; each record's
    # 'date' is a datetime, which json cannot serialize on its own.
    json.dump(matches_coll, matches_file, default=json_util.default)
print(len(matches_coll))
if matches_coll:
    print(matches_coll[0])