In [None]:
import pymongo, itertools, collections
from decimal import Decimal
import networkx as nx
import matplotlib.pyplot as plt
from circos import CircosPlot
from hiveplot import HivePlot
%matplotlib inline

In [None]:
# Connect to the local MongoDB server and pick the two collections of the 'usta' database.
client = pymongo.MongoClient('mongodb://localhost:27017')
players = client['usta'].players
matches = client['usta'].matches
# Show how many documents each collection holds.
# NOTE(review): Collection.count() no longer exists in PyMongo 4 (count_documents({})
# is the replacement) -- confirm which PyMongo version this notebook targets.
print(players.count(), matches.count())

In [None]:
def get_total_games(score):
    """Return the total number of games played over all sets.

    `score` is an iterable of sets; the first two entries of each set are
    converted to int and added up (e.g. ``['76', '64']`` gives 23).
    """
    total = 0
    for set_score in score:
        total += int(set_score[0]) + int(set_score[1])
    return total

def pretty_score(score):
    """Format a score for display, e.g. ``['76', '64']`` becomes ``'7-6/6-4'``."""
    formatted_sets = ('-'.join(set_score) for set_score in score)
    return '/'.join(formatted_sets)

def is_bagel(score):
    """Return True when the second entry of every set is zero (no game won on that side)."""
    # Convert every set up front (no short-circuit) so malformed entries still raise.
    second_side_games = [int(set_score[1]) for set_score in score]
    return not any(second_side_games)

# Sanity checks for the score helpers on a typical two-set result.
score = ['76', '64']
assert get_total_games(score) == 23
assert pretty_score(score) == '7-6/6-4'
assert not is_bagel(score)
assert is_bagel(['60', '30'])

In [None]:
def _player_node_attrs(player):
    """Return the graph node attributes (name, rating, level) for a player document."""
    return dict(
        name=' '.join([player['first_name'], player['last_name']]),
        rating=player['rating_level'] - 0.25, # start in the middle
        level='{0:.1f}'.format(player['rating_level'])
    )


def _find_individual_match(match, player_id, sd_keys, wl_keys):
    """Return (index into wl_keys, individual match) for the first entry involving player_id, else None."""
    for sd in sd_keys:
        for individual_match in match[sd]:
            for iwl, wl in enumerate(wl_keys):
                side = individual_match[wl]
                # a side is either a single player id or a list of ids
                if side == player_id or (isinstance(side, list) and player_id in side):
                    return iwl, individual_match
    return None


def get_player_graph(player, prev_player_id=None, graph=None):
    """Print a player's matches and optionally record them in a graph.

    For every document in the global `matches` collection in which the player
    appears as singles winner or loser, the first individual match involving
    the player is printed. If `graph` is given, the opponent is added as a node
    and the match as an edge pointing from winner to loser (with `score` and
    `date` attributes). When called for the root player (`prev_player_id` is
    None) the function recurses once into each opponent.

    Matches are skipped when fewer than 6 games were played, when the score is
    a bagel, when the opponent id is None, when the opponent is
    `prev_player_id`, or when the opponent cannot be found in the global
    `players` collection.

    Args:
        player: player document providing '_id', 'first_name', 'last_name'
            and 'rating_level'.
        prev_player_id: id of the player whose matches led here, or None for
            the root player. Matches against this player are not repeated and
            no further recursion happens when it is set.
        graph: object offering `add_node` / `add_edge` (the notebook passes an
            `nx.DiGraph`), or None to only print.
    """
    is_root = prev_player_id is None
    indent = '' if is_root else '\t'
    player_id = int(player['_id'])
    player_full_name = ' '.join([player['first_name'], player['last_name']])
    print('{}{}: {} ({})'.format(
        indent, player_id, player_full_name, player['rating_level']
    ))
    if is_root and graph is not None:
        # nested calls get their node added by the caller
        graph.add_node(player_id, **_player_node_attrs(player))
    # sd_keys, wl_keys = ['singles', 'doubles'], ['winner', 'loser']
    sd_keys, wl_keys = ['singles'], ['winner', 'loser']
    query = {'$or': [
        {'.'.join(key): player_id} for key in itertools.product(sd_keys, wl_keys)
    ]}
    for match in matches.find(query):
        found = _find_individual_match(match, player_id, sd_keys, wl_keys)
        if found is None:
            continue
        iwl, individual_match = found
        score = individual_match['score']
        opponent_id = individual_match[wl_keys[1 - iwl]] # the other side of the match
        if (
            get_total_games(score) < 6 or # less than 1 set played
            is_bagel(score) or # skip bagels
            opponent_id is None or # default
            opponent_id == prev_player_id # already added
        ):
            continue
        opponent = players.find_one({'_id': str(opponent_id)})
        if opponent is None:
            # find_one returns None for unknown ids; skip instead of crashing below
            print('{}{}: opponent {} not found, match skipped'.format(
                indent, match['_id'], opponent_id
            ))
            continue
        win_or_loss = wl_keys[iwl][0].upper()
        opponent_full_name = ' '.join([opponent['first_name'], opponent['last_name']])
        print(
            '{}{}:'.format(indent, match['_id']), win_or_loss,
            pretty_score(score), '\tvs', opponent_full_name, '({})'.format(opponent_id)
        )
        if is_root: # only go one level down
            get_player_graph(opponent, prev_player_id=player_id, graph=graph)
        if graph is not None:
            graph.add_node(opponent_id, **_player_node_attrs(opponent))
            kwargs = dict(score=score, date=match['date'])
            # edges always point from the winner to the loser
            if win_or_loss == 'W':
                graph.add_edge(player_id, opponent_id, **kwargs)
            else:
                graph.add_edge(opponent_id, player_id, **kwargs)

In [None]:
# Build the directed match graph around one root player.
G = nx.DiGraph()
player = players.find_one({'last_name': 'Huck', 'first_name': 'Patrick'})
if player is None:
    # find_one returns None when no document matches; fail here with a clear
    # message instead of a TypeError inside get_player_graph.
    raise LookupError('root player not found in the players collection')
get_player_graph(player, graph=G)
# print(G.nodes(data=True))
# print(G.edges(data=True))

In [None]:
def crd(score):
    """
    CRD - Computer Rated Differential (numerical value for the difference in score)

    One way to assign a value to a specific score, is to count the number of service breaks
    and scale it to a value appropriate for NTRP ratings. For instance, at-level/true 4.5
    players should populate the core of the 4.5 interval. Defining the core of a 0.5-wide
    interval as its inner 90%, yields the range 4.05 - 4.45. An average upper 4.5 player
    would then correspond to a 4.35 rating and a lower 4.5 player to 4.15.

    A good scale for the CRD reflects the fact that an upper 4.5 player routinely beats a
    lower 4.5 player. A sensible choice for a routine but competitive win is a score of
    6-3/6-3 or 6-3/6-2 [see below].

    The number of service breaks in these cases should hence be equivalent to the difference
    of 0.2 between an upper and a lower 4.5 player. A 6-3/6-3 win entails 3 service breaks
    whereas a 6-3/6-2 win could be counted as 3.5 service breaks. Assigning a scaling factor
    of 0.06 for each additional service break is thus a good choice and results in CRDs of
    0.18 and 0.21, respectively.

    In the CRD, the number of service breaks in the third set also counts half since the
    opponents are basically even but the loser of the third set doesn't get a chance to even
    out in a fourth.

    A match is considered competitive if the loser plays one competitive set (>= 3 games) or
    scores at least 4 games in total. If this isn't the case, the outcome of the match would
    almost always be a non-competitive score regardless of how often they play each other.
    Since neither player probably plays their best due to the non-competitiveness of the match,
    the score likely is inaccurate.

    In the worst case of a 6-0/6-0 score, it's tough to assess performance of either player
    at all without a game on the scoreboard for the loser. The loser might have been close
    to scoring a game in every game of the match or in none at all.

    It hence makes sense to treat 6-0/6-0 scores as outliers in the calculation of ratings
    and discard them. But we can use this score to estimate the score inaccuracy of a
    non-competitive match. In such a match, the winner scored 6 service breaks, and an
    additional game for the loser would correspond to half a service break difference in
    the CRD, i.e. 0.03. Say the loser got close to winning one of the winner's 6 service
    games. The change in CRD value equivalent to this performance would be 0.03/6 = 0.005.
    For non-competitive matches, the CRD is hence adjusted downward by 0.005 to not affect
    both players' ratings as much as two competitive sets [which would have been more fun
    for both].

    See http://web.archive.org/web/20051211104109/http://www.wetennis.com/rate.htm

    Args:
        score: list of sets, each a two-character string of winner and loser games,
            e.g. ['76', '64'].

    Returns:
        The differential as a Decimal.

    Raises:
        ValueError: if the score is exactly 6-0/6-0.
    """
    if score == ['60', '60']:
        raise ValueError('{} should be ignored!'.format(pretty_score(score)))
    # Build the factor from a string: Decimal(0.06) would carry the binary float's
    # representation error, so results would never equal exact values like Decimal('0.18').
    scf = Decimal('0.06') # scaling factor = CRD equivalent for one service break
    differential = Decimal(0) # computer rated differential (CRD)
    competitive_set = False # one competitive set played (> 2 games)
    glt = 0 # number of games scored by loser
    for i, s in enumerate(score):
        gw, gl = map(int, s) # games winner and loser
        nb = Decimal(gw - gl) / 2 # number of breaks
        differential += (nb / 2 if i == 2 else nb) * scf # third set counts half
        glt += gl
        if i < 2 and gl > 2: # skip third set
            competitive_set = True
    if competitive_set or glt > 3:
        return differential
    # non-competitive match: adjust downward by scf/12 = 0.005
    return differential - scf / 12

# competitive 3-set matches
assert crd(['75', '57', '10']) == Decimal('0.015')
assert crd(['64', '46', '64']) == Decimal('0.03')
assert crd(['75', '57', '75']) == Decimal('0.03')
assert crd(['75', '57', '63']) == Decimal('0.045')
assert crd(['75', '57', '61']) == Decimal('0.075')
# competitive 2-set matches
# one competitive set (>= 3 games) or
# total >= 4 games
assert crd(['76', '76']) == Decimal('0.06')
assert crd(['76', '75']) == Decimal('0.09')
assert crd(['76', '64']) == Decimal('0.09')
assert crd(['75', '64']) == Decimal('0.12')
assert crd(['64', '64']) == Decimal('0.12')
assert crd(['64', '63']) == Decimal('0.15')
assert crd(['63', '63']) == Decimal('0.18')
assert crd(['75', '62']) == Decimal('0.18')
assert crd(['62', '63']) == Decimal('0.21')
assert crd(['62', '62']) == Decimal('0.24')
assert crd(['63', '61']) == Decimal('0.24')
assert crd(['61', '63']) == Decimal('0.24')
assert crd(['63', '60']) == Decimal('0.27')
assert crd(['60', '63']) == Decimal('0.27')
# non-competitive matches
assert crd(['62', '61']) == Decimal('0.265')
assert crd(['62', '60']) == Decimal('0.295')
assert crd(['61', '61']) == Decimal('0.295')
assert crd(['61', '60']) == Decimal('0.325')

# crd(['60', '60'])

In [None]:
# Quick check: a two-character set score splits into one integer per character.
w, l = (int(games) for games in '63')
print(w, l)

In [None]:
# List the recorded matches in chronological order together with both players' ratings.
# NOTE(review): G.node was removed in networkx 2.4 (G.nodes[...] replaces it) --
# confirm which networkx version this notebook targets.
edges = sorted(G.edges(data=True), key=lambda edge: edge[2]['date'])
for winner_id, loser_id, d in edges:
    winner_rating, loser_rating = (
        G.node[node_id]['rating'] for node_id in (winner_id, loser_id)
    )
    print(winner_rating, loser_rating, d['score'])

In [None]:
# nx.draw(G)
fig = plt.figure(figsize=(6,6))
ax = fig.add_subplot(111)

nodes = sorted(G.nodes())
edges = G.edges()
# one color per rating level
node_cmap = {'4.0': 'blue', '4.5': 'red', '5.0': 'green'}
# Build the colors from the sorted `nodes` list (not from G.nodes(), whose order can
# differ) so that color i belongs to node i.
# NOTE(review): presumably CircosPlot pairs nodecolor entries with nodes by position -- confirm.
nodecolors = [node_cmap[G.node[n]['level']] for n in nodes]

c = CircosPlot(nodes, edges, radius=10, ax=ax, fig=fig, nodecolor=nodecolors)
c.draw()

In [None]:
# Group the node ids by their 'level' attribute; each level becomes one entry of the hive plot.
nodes = {
    level: [n for n, d in G.nodes(data=True) if d['level'] == level]
    for level in ['4.0', '4.5', '5.0']
}
# All edges go into a single group drawn in black.
edges = {'group1': G.edges(data=True)}
edge_cmap = {'group1': 'black'}
h = HivePlot(nodes, edges, node_cmap, edge_cmap)
h.draw()

In [None]:
# sorted([(n, G.neighbors(n)) for n in G.nodes()], key=lambda x: len(x[1]), reverse=True)
# print(nx.degree_centrality(G))
# print(nx.has_path(G, 400, 1))
# Histogram of the degree centrality of every node in the match graph.
fig = plt.figure(0)
degree_centralities = [value for value in nx.degree_centrality(G).values()]
plt.hist(degree_centralities)
plt.title('Degree Centralities')