In [1]:
import numpy as np
from pyspark import SparkConf, SparkContext

In [2]:
# Reuse the active SparkContext when this cell is re-run: constructing a second
# SparkContext in the same kernel fails with "Cannot run multiple SparkContexts
# at once". If a context already exists, getOrCreate returns it unchanged and
# the master/appName below are not re-applied.
sc = SparkContext.getOrCreate(
    conf=SparkConf().setMaster("local[*]").setAppName("SparkApp")
)

In [3]:
# Read the orders file as an RDD with one string element per line.
# The path is relative, so it resolves against the kernel's working directory.
rdd = sc.textFile("../data/customer-orders.csv")

In [4]:
# Peek at the first five raw lines to check the record layout.
rdd.take(5)

['44,8602,37.19',
 '35,5368,65.89',
 '2,3391,40.64',
 '47,6694,14.98',
 '29,680,13.08']

In [5]:
# Two-step parse: split each line on commas, then keep the first field as an
# int and the third field as a float.
res_rdd = (
    rdd
    .map(lambda line: line.split(','))
    .map(lambda cols: (int(cols[0]), float(cols[2])))
)
res_rdd.take(5)

[(44, 37.19), (35, 65.89), (2, 40.64), (47, 14.98), (29, 13.08)]

In [6]:
def extract_customer_price_pairs(line):
    """Turn one comma-separated record into a ``(customer, price)`` tuple.

    Args:
        line: A text record with at least three comma-separated fields,
            e.g. ``'44,8602,37.19'``.

    Returns:
        A 2-tuple ``(int(first_field), float(third_field))``. The second
        field and any fields after the third are ignored.

    Raises:
        ValueError: If the first field is not an integer literal or the
            third field is not a float literal.
        IndexError: If the record has fewer than three fields.
    """
    columns = line.split(',')
    # Convert the id before touching the third column so that a malformed id
    # is reported ahead of a missing price field.
    customer = int(columns[0])
    price = float(columns[2])
    return customer, price

In [7]:
# Apply the named parser to every line, producing (customer, price) tuples.
map_rdd = rdd.map(extract_customer_price_pairs)

In [8]:
# Spot-check the first five parsed pairs.
map_rdd.take(5)

[(44, 37.19), (35, 65.89), (2, 40.64), (47, 14.98), (29, 13.08)]

In [9]:
# Collapse all pairs that share a customer key into one pair whose value is the
# sum of that customer's prices.
total_by_customer = map_rdd.reduceByKey(
    lambda subtotal, price: subtotal + price
)

In [10]:
# Inspect five per-customer totals; the sums are unrounded floats at this stage.
total_by_customer.take(5)

[(44, 4756.890000000001),
 (2, 5994.59),
 (70, 5368.249999999999),
 (14, 4735.030000000001),
 (42, 5696.840000000002)]

In [11]:
# Round each customer's total to two decimal places to drop float-summation
# noise (e.g. 4756.890000000001 -> 4756.89). mapValues transforms only the
# value of each (key, value) pair, so the key is passed through untouched and
# the RDD's existing partitioner is retained.
price_rounded_rdd = total_by_customer.mapValues(lambda total: round(total, 2))

In [12]:
# Order the rounded totals by customer key, smallest key first.
customer_sorted_rdd = price_rounded_rdd.sortByKey(ascending=True)

In [13]:
# Show the ten lowest customer keys with their rounded totals.
customer_sorted_rdd.take(10)

[(0, 5524.95),
 (1, 4958.6),
 (2, 5994.59),
 (3, 4659.63),
 (4, 4815.05),
 (5, 4561.07),
 (6, 5397.88),
 (7, 4755.07),
 (8, 5517.24),
 (9, 5322.65)]

In [14]:
# Re-rank by total (the second tuple element), largest first, then show the
# ten biggest spenders.
highest_price_sorted_rdd = customer_sorted_rdd.sortBy(
    lambda pair: pair[1],
    ascending=False,
)
highest_price_sorted_rdd.take(10)

[(68, 6375.45),
 (73, 6206.2),
 (39, 6193.11),
 (54, 6065.39),
 (71, 5995.66),
 (2, 5994.59),
 (97, 5977.19),
 (46, 5963.11),
 (42, 5696.84),
 (59, 5642.89)]