Chapter 9. PageRank with Map and Reduce in PySpark
====
### Mastering Large Datasets with Python by JT Wolohan 



### Elo ratings in Spark

In [3]:
import json
import re

from pyspark import SparkConf, SparkContext

In [4]:
def round5(x):
    """Truncate ``x`` toward zero to a multiple of 5.

    Args:
        x: A number (int or float).

    Returns:
        int: The multiple of 5 obtained by dropping the fractional part of
        ``x / 5`` (so 1613 -> 1610 and -7 -> -5).
    """
    whole_fives = int(x / 5)  # int() discards the fraction, i.e. rounds toward zero
    return whole_fives * 5

In [5]:
def clean_match(match):
    """Turn one comma-separated match line into a small record.

    Args:
        match (str): A line of text whose fields are separated by commas.
            NOTE(review): assumes field 10 is the winner, field 20 the loser
            and field 2 the surface -- confirm against the data files.

    Returns:
        dict: ``{"winner": ..., "loser": ..., "surface": ...}`` built from
        fields 10, 20 and 2 of the line.

    Raises:
        IndexError: If the line has fewer than 21 comma-separated fields.
    """
    fields = match.split(",")
    return dict(winner=fields[10], loser=fields[20], surface=fields[2])

In [6]:
def elo_acc(acc, nxt):
    """Fold one match into the running table of Elo ratings.

    Args:
        acc (dict): Maps player -> current rating. Updated in place.
        nxt (dict): A match record with ``"winner"`` and ``"loser"`` keys.

    Returns:
        dict: The same ``acc`` object, with the winner's and loser's ratings
        replaced by their post-match values (each passed through ``round5``).
    """
    winner, loser = nxt["winner"], nxt["loser"]

    # Players not seen before start from a rating of 1600.
    w_elo = acc.get(winner, 1600)
    l_elo = acc.get(loser, 1600)

    # Each player's share of Qt is their expected score for this match.
    Qw = 10 ** (w_elo / 400)
    Ql = 10 ** (l_elo / 400)
    Qt = Qw + Ql
    expected_w = Qw / Qt
    expected_l = Ql / Qt

    # K-factor of 25: the winner gains for scoring 1 against expected_w,
    # the loser drops for scoring 0 against expected_l.
    acc[winner] = round5(w_elo + 25 * (1 - expected_w))
    acc[loser] = round5(l_elo - 25 * expected_l)
    return acc

In [7]:
def elo_comb(a, b):
    """Merge two rating tables into the first one.

    Args:
        a (dict): Rating table that receives the merge (modified in place).
        b (dict): Rating table whose entries are copied into ``a``.

    Returns:
        dict: The same ``a`` object. Where a key appears in both tables, the
        value from ``b`` replaces the one in ``a``.
    """
    for player, rating in b.items():
        a[player] = rating
    return a

In [12]:
# Reuse the running SparkContext if this cell is executed again; calling the
# SparkContext constructor while a context is already active raises an error.
sc = SparkContext.getOrCreate(SparkConf().setAppName("TennisRatings"))
text_files = sc.textFile("/path/to/my/data/wta_matches*")
# Parse every line into a match record, then fold all matches into a single
# {player: rating} dict.
xs = text_files.map(clean_match).aggregate({}, elo_acc, elo_comb)

# Show the 20 highest-rated players, best first.
top_rated = sorted(xs.items(), key=lambda item: item[1], reverse=True)[:20]
for player, rating in top_rated:
    print("{:<30}{}".format(player, rating))

Martina Hingis                1865
Venus Williams                1830
Monica Seles                  1765
Serena Williams               1755
Lindsay Davenport             1745
Maria Sharapova               1720
Petra Russegger               1710
Akiko Morigami                1690
Garbine Muguruza              1685
Victoria Azarenka             1665
Nour Abbes                    1660
Timea Bacsinszky              1660
Belinda Bencic                1655
Amelie Mauresmo               1655
Mary Pierce                   1655
Jennifer Saret                1650
Angelique Kerber              1650
Bermet Duvanaeva              1650
Svetlana Komleva              1650
Cecilia Costa Melgar          1650


### PageRank in Spark

In [25]:
from functools import partial
from math import ceil, log2

from pyspark import SparkContext

In [16]:
def ceil5(x):
    """Round ``x`` up to a multiple of 5.

    Args:
        x: A number (int or float).

    Returns:
        int: The smallest multiple of 5 that is not below ``x``
        (values already on a multiple of 5 are returned unchanged).
    """
    fives_needed = ceil(x / 5)
    return fives_needed * 5

In [17]:
def get_winner_loser(match):
    """Pull the two players out of one comma-separated match line.

    Args:
        match (str): A line of text whose fields are separated by commas.
            NOTE(review): assumes field 10 is the winner and field 20 the
            loser -- confirm against the data files.

    Returns:
        tuple: ``(loser, winner)`` -- the loser comes first, i.e. fields 20
        and 10 of the line, in that order.

    Raises:
        IndexError: If the line has fewer than 21 comma-separated fields.
    """
    fields = match.split(",")
    loser = fields[20]
    winner = fields[10]
    # Loser first: the pair is ordered (loser, winner), not (winner, loser).
    return loser, winner

In [18]:
def initialize_for_voting(losses):
    """Build the starting record for one player.

    Args:
        losses: A sized collection (anything ``len()`` accepts) of the
            opponents this player lost to.

    Returns:
        dict: ``{"losses": losses, "n_losses": len(losses), "rating": 100}``.
        The ``losses`` object is stored as-is, not copied.
    """
    record = dict(losses=losses)
    record["n_losses"] = len(losses)
    record["rating"] = 100  # every player starts with 100 points
    return record

In [19]:
def empty_ratings(d):
    """Reset a player record's rating to zero.

    Args:
        d (dict): A player record. Modified in place.

    Returns:
        dict: The same ``d`` object with ``d["rating"]`` set to 0.
    """
    d.update(rating=0)
    return d

In [20]:
def allocate_points(acc, nxt):
    """Hand one player's rating out to the opponents that player lost to.

    Args:
        acc (dict): Maps player -> record dict. Updated in place.
        nxt (tuple): A ``(player, record)`` pair; ``record`` must have the
            keys ``"rating"``, ``"n_losses"`` and ``"losses"``.

    Returns:
        dict: The same ``acc`` object. Every entry of ``record["losses"]``
        has had ``rating / (n_losses + 0.01)`` added to its rating; opponents
        missing from ``acc`` are first created with no losses.
    """
    _, record = nxt
    # The 0.01 keeps the divisor non-zero when n_losses is 0.
    share = record["rating"] / (record["n_losses"] + 0.01)
    for opponent in record["losses"]:
        entry = acc.setdefault(opponent, {"losses": [], "n_losses": 0})
        entry["rating"] = entry.get("rating", 0) + share
    return acc

In [21]:
def combine_scores(a, b):
    """Merge two player tables, summing ratings for shared players.

    Args:
        a (dict): Maps player -> record dict. Receives the merge in place.
        b (dict): Maps player -> record dict to fold into ``a``.

    Returns:
        dict: The same ``a`` object. For a player in both tables the ratings
        are added together; if that lookup raises ``KeyError`` (for example
        the player is missing from ``a``), the record from ``b`` is stored
        under that player instead.
    """
    for player, incoming in b.items():
        try:
            a[player]["rating"] += incoming["rating"]
        except KeyError:
            # Nothing to add to (or no rating on one side): take b's record.
            a[player] = incoming
    return a

To run the cell below, you may need to un-comment the line that creates the Spark context. If you already ran the Elo rating example above, leave it commented out: a Spark context named `sc` already exists and can be reused.

In [26]:
# This cell uses a SparkContext bound to the name `sc`; the line below is left
# commented out for the case where `sc` was already created earlier.
# sc = SparkContext(appName="TennisRatings")
match_data = sc.textFile("path/to/tennis/files")
# Key every match by its loser, gather the winners each loser lost to, and
# wrap each group in a starting record via initialize_for_voting.
xs = match_data.map(get_winner_loser).groupByKey().mapValues(initialize_for_voting)

# Eight rounds of rating redistribution.
for i in range(8):
    if i > 0:
        # From the second round on, the previous round's result dict (zs)
        # is turned back into an RDD and becomes this round's input.
        xs = sc.parallelize(zs.items())
    # Starting accumulator: every current record, with its rating reset to 0.
    acc = dict(xs.mapValues(empty_ratings).collect())
    # Each record hands its rating out to the players it lost to
    # (allocate_points); partial results are merged with combine_scores.
    zs = xs.aggregate(acc, allocate_points, combine_scores)

# Print the 20 highest-rated players: name, log2(rating + 1) rounded to one
# decimal place, and the rating rounded up to a multiple of 5.
ratings = [(k, v["rating"]) for k, v in zs.items()]
for player, rating in sorted(ratings, key=lambda x: x[1], reverse=True)[:20]:
    print("{:<30}{}\t{}".format(player, round(log2(rating + 1), 1), ceil5(rating)))

Serena Williams               12.4	5475
Venus Williams                12.0	4230
Kim Clijsters                 11.9	3870
Maria Sharapova               11.9	3785
Justine Henin                 11.8	3660
Elena Dementieva              11.6	3130
Amelie Mauresmo               11.6	3115
Svetlana Kuznetsova           11.6	3060
Jelena Jankovic               11.6	3055
Lindsay Davenport             11.6	3055
Victoria Azarenka             11.3	2485
Ana Ivanovic                  11.2	2405
Daniela Hantuchova            11.2	2385
Nadia Petrova                 11.2	2360
Caroline Wozniacki            11.2	2350
Agnieszka Radwanska           11.2	2335
Vera Zvonareva                11.2	2320
Patty Schnyder                11.1	2220
Samantha Stosur               11.1	2215
Francesca Schiavone           11.0	2100


[Ready for more? Go to chapter 10!](./Ch03_notebook.ipynb)