In [1]:
#Get the SparkContext: reuse the one that already exists, otherwise create it
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
#Leave the context as the last expression so the notebook displays it
sc

In [5]:
#Function to extract only the fields we need from each line of the data
def parseLines(file):
    """Turn one comma-separated text line into [user id, amount].

    file: a single line of text (despite the name), e.g. '44,8602,37.19'.
    Returns a two-element list: the first field as int and the third
    field as float. The second field is ignored.
    """
    fields = file.split(",")
    user_id = int(fields[0])
    amount = float(fields[2])
    return [user_id, amount]

In [6]:
#Read the orders file as an RDD with one text line per element
file = sc.textFile("../datasets/customer-orders.csv")
#Show the first 10 raw lines to check the format
file.take(10)

['44,8602,37.19',
 '35,5368,65.89',
 '2,3391,40.64',
 '47,6694,14.98',
 '29,680,13.08',
 '91,8900,24.59',
 '70,3959,68.68',
 '85,1733,28.53',
 '53,9900,83.55',
 '14,1505,4.32']

In [7]:
#Apply parseLines to every line to keep only the user id and the amount
rdd = file.map(parseLines)
#Show the first 10 parsed [user id, amount] records
rdd.take(10)

[[44, 37.19],
 [35, 65.89],
 [2, 40.64],
 [47, 14.98],
 [29, 13.08],
 [91, 24.59],
 [70, 68.68],
 [85, 28.53],
 [53, 83.55],
 [14, 4.32]]

In [9]:
#Add up all the amounts that share the same user id
reduced = rdd.reduceByKey(lambda total, amount: total + amount)
reduced.take(10)

[(44, 4756.890000000001),
 (2, 5994.59),
 (70, 5368.249999999999),
 (14, 4735.030000000001),
 (42, 5696.840000000002),
 (50, 4517.2699999999995),
 (20, 4836.860000000001),
 (48, 4384.33),
 (4, 4815.050000000001),
 (36, 4278.049999999999)]

In [10]:
#Swap each record to [amount, user id] so the amount becomes the key to sort by
flipped = reduced.map(lambda pair: [pair[1], pair[0]])
flipped.take(10)

[[4756.890000000001, 44],
 [5994.59, 2],
 [5368.249999999999, 70],
 [4735.030000000001, 14],
 [5696.840000000002, 42],
 [4517.2699999999995, 50],
 [4836.860000000001, 20],
 [4384.33, 48],
 [4815.050000000001, 4],
 [4278.049999999999, 36]]

In [11]:
#Sort the records by key, which is the total amount after the flip
sortByAmount = flipped.sortByKey()
#Show the 10 records with the smallest totals
sortByAmount.take(10)

[(3309.3799999999997, 45),
 (3790.5699999999997, 79),
 (3924.2300000000005, 96),
 (4042.65, 23),
 (4172.29, 99),
 (4178.5, 75),
 (4278.049999999999, 36),
 (4297.259999999999, 98),
 (4316.299999999998, 47),
 (4327.73, 77)]

In [17]:
#Collect the sorted records and print one formatted line per user
results = sortByAmount.collect()
for amount, user in results:
    print("User: {} . Amount: {:.02f}".format(user, amount))

User: 45 . Amount: 3309.38
User: 79 . Amount: 3790.57
User: 96 . Amount: 3924.23
User: 23 . Amount: 4042.65
User: 99 . Amount: 4172.29
User: 75 . Amount: 4178.50
User: 36 . Amount: 4278.05
User: 98 . Amount: 4297.26
User: 47 . Amount: 4316.30
User: 77 . Amount: 4327.73
User: 13 . Amount: 4367.62
User: 48 . Amount: 4384.33
User: 49 . Amount: 4394.60
User: 94 . Amount: 4475.57
User: 67 . Amount: 4505.79
User: 50 . Amount: 4517.27
User: 78 . Amount: 4524.51
User: 5 . Amount: 4561.07
User: 57 . Amount: 4628.40
User: 83 . Amount: 4635.80
User: 91 . Amount: 4642.26
User: 74 . Amount: 4647.13
User: 84 . Amount: 4652.94
User: 3 . Amount: 4659.63
User: 12 . Amount: 4664.59
User: 66 . Amount: 4681.92
User: 56 . Amount: 4701.02
User: 21 . Amount: 4707.41
User: 80 . Amount: 4727.86
User: 14 . Amount: 4735.03
User: 37 . Amount: 4735.20
User: 7 . Amount: 4755.07
User: 44 . Amount: 4756.89
User: 31 . Amount: 4765.05
User: 82 . Amount: 4812.49
User: 4 . Amount: 4815.05
User: 10 . Amount: 4819.70
User:

In [19]:
def totalAmountByCostumer(rdd):
    """Print the total amount per user, from the smallest total to the largest.

    rdd: RDD of raw text lines; each line is parsed with parseLines.
    Prints one "User: ... . Amount: ..." line per user and returns None.
    """
    #Parse every line into [user id, amount] and add up the amounts per user
    pairs = rdd.map(parseLines)
    totals = pairs.reduceByKey(lambda total, amount: total + amount)
    #Put the amount first so that sortByKey orders the records by it
    byAmount = totals.map(lambda pair: [pair[1], pair[0]]).sortByKey()
    for amount, user in byAmount.collect():
        print("User: {} . Amount: {:.02f}".format(user, amount))

In [20]:
#Run the whole pipeline on the raw lines loaded earlier into `file`
totalAmountByCostumer(file)

User: 45 . Amount: 3309.38
User: 79 . Amount: 3790.57
User: 96 . Amount: 3924.23
User: 23 . Amount: 4042.65
User: 99 . Amount: 4172.29
User: 75 . Amount: 4178.50
User: 36 . Amount: 4278.05
User: 98 . Amount: 4297.26
User: 47 . Amount: 4316.30
User: 77 . Amount: 4327.73
User: 13 . Amount: 4367.62
User: 48 . Amount: 4384.33
User: 49 . Amount: 4394.60
User: 94 . Amount: 4475.57
User: 67 . Amount: 4505.79
User: 50 . Amount: 4517.27
User: 78 . Amount: 4524.51
User: 5 . Amount: 4561.07
User: 57 . Amount: 4628.40
User: 83 . Amount: 4635.80
User: 91 . Amount: 4642.26
User: 74 . Amount: 4647.13
User: 84 . Amount: 4652.94
User: 3 . Amount: 4659.63
User: 12 . Amount: 4664.59
User: 66 . Amount: 4681.92
User: 56 . Amount: 4701.02
User: 21 . Amount: 4707.41
User: 80 . Amount: 4727.86
User: 14 . Amount: 4735.03
User: 37 . Amount: 4735.20
User: 7 . Amount: 4755.07
User: 44 . Amount: 4756.89
User: 31 . Amount: 4765.05
User: 82 . Amount: 4812.49
User: 4 . Amount: 4815.05
User: 10 . Amount: 4819.70
User: