In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("SparkExercisesApp").master("spark://master:7077").getOrCreate()

In [3]:
sc = spark.sparkContext
sc.setLogLevel("FATAL")

### Exercise 1

How many distinct products have been sold in each store? 

In [4]:
df = spark.read.format("csv").option("header", "true").load("../data/sales.csv")
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- time_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- promotion_id: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- store_sales: string (nullable = true)
 |-- store_cost: string (nullable = true)
 |-- unit_sales: string (nullable = true)



In [5]:
df.select("store_id", "product_id").distinct().show()

+--------+----------+
|store_id|product_id|
+--------+----------+
|       2|        22|
|       2|       138|
|       2|      1245|
|       2|       756|
|       3|       502|
|       3|       310|
|       3|      1009|
|       3|       315|
|       3|        42|
|       3|      1486|
|       3|        49|
|       3|       587|
|       3|       539|
|       6|      1238|
|       6|      1069|
|       6|      1401|
|       6|       152|
|       6|       162|
|       7|       273|
|       7|       752|
+--------+----------+
only showing top 20 rows



### Exercise 2

In [6]:
df = spark.read.format("csv").option("header", "true").load("../data/online-retail-dataset.csv")
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



**Exercise 2.1**

What is the average quantity bought by the customer 14769?

In [7]:
from pyspark.sql.functions import col, avg

In [8]:
df.where(df.CustomerID == "14769").agg(avg(col("Quantity"))).collect()[0][0]

6.769652650822669

**Exercise 2.2**

What is the most occurring word in the items bought by the customers from France?

In [9]:
from pyspark.sql.functions import explode, split, concat, lit, desc

In [10]:
words = df.where(df.Country == "France").select("Description").select(explode(split(col("Description"), " ")).alias("word"))

words_count = words.groupby(words.word).count().orderBy(col("count").desc())

words_count.collect()[1][0]

'RED'

### Exercise 3

In [11]:
data_rdd = spark.sparkContext.textFile("../data/employee_info.csv")

**Exercise 3.1**

How many police officers are there at the POLICE department?

In [12]:
def filter_fun(x):
    return True if "POLICE OFFICER" in x else False

In [13]:
data_rdd.filter(filter_fun).count()

10639

**Exercise 3.2**

Who is the lieutenant with the highest salary at the FIRE department?

In [14]:
import re

from operator import add

In [15]:
def filter_fun(x):
    return True if "FIRE" in x and re.search(r"\b,LIEUTENANT,\b", x) is not None else False

def map_fun(x):
    values = x.split(",")
    
    return (float(values[-2]), values[1] + " " + values[0])

In [16]:
fire_lieutenant_rdd = data_rdd.filter(filter_fun)

fire_lieutenant_rdd.map(map_fun).sortByKey(ascending=False).take(1) 

[(114846.0, 'TIM P EDWARDS')]

In [17]:
# Stop the spark context
spark.stop()