# Module 02 - 02 Create charts using Spark

Set up the variables for later reference. Make sure the name of your data lake is correct.

In [41]:
from pyspark.sql import types

# Data lake name that is interpolated into the abfss:// URL below.
# Change this value so it matches your own environment.
datalake = 'datalake3mstaeetovkk4'

# Explicit schema applied to the CSV files; every column is declared nullable.
# Column names are kept exactly as originally defined (including "Adress"),
# since other cells may reference them by name.
customSchema = (
    types.StructType()
    .add("SalesOrderNumber", types.StringType(), True)
    .add("SalesTerritoryKey", types.IntegerType(), True)
    .add("OrderDate", types.DateType(), True)
    .add("Customer", types.StringType(), True)
    .add("Email", types.StringType(), True)
    .add("Adress", types.StringType(), True)
    .add("Quantity", types.IntegerType(), True)
    .add("UnitPrice", types.DoubleType(), True)
    .add("ShippingCost", types.DoubleType(), True)
)

# Read every *.csv file under Allfiles/01/data with the schema above, then
# register the result as the temporary view "sales" for use from Spark SQL.
df = (
    spark.read
    .schema(customSchema)
    .csv(f'abfss://landing@{datalake}.dfs.core.windows.net/Allfiles/01/data/*.csv')
)
df.createOrReplaceTempView("sales")

Execute the following Spark SQL query and explore the **Chart** feature.

In [42]:
%%sql

-- Per order date: sum of (Quantity * UnitPrice + ShippingCost) over all rows,
-- returning the first 50 rows in ascending OrderDate order.
SELECT OrderDate,
       SUM(Quantity * UnitPrice + ShippingCost) AS TotalSales
  FROM Sales
 GROUP BY OrderDate
 ORDER BY OrderDate
 LIMIT 50

Let's create a chart using Matplotlib.

In [43]:
from matplotlib import pyplot as plt

# Count the orders per sales territory in Spark, then bring the aggregated
# result back as a pandas DataFrame so matplotlib can plot it.
# `spark.sql` is used instead of the legacy `sqlContext.sql` so the query runs
# through the same SparkSession object the notebook already uses to read data.
totalOrdersByTerritory = spark.sql("""
    SELECT SalesTerritoryKey, COUNT(*) AS TotalOrders
    FROM Sales
    GROUP BY SalesTerritoryKey
    ORDER BY SalesTerritoryKey
""").toPandas()

# Create a single figure with explicit axes. No plt.clf() beforehand: with no
# figure open, clf() would itself create an extra, empty figure.
figure, ax = plt.subplots(figsize=(12, 8))

# Bar plot of the order count for each sales territory
ax.bar(
    x=totalOrdersByTerritory['SalesTerritoryKey'],
    height=totalOrdersByTerritory['TotalOrders'],
    color='magenta',
)

# Customize the chart
ax.set_title('Order count by sales territory')
ax.set_xlabel('Sales territory')
ax.set_ylabel('Total number of orders')
ax.grid(color='#95a5a6', linestyle='--')

# Render the figure
plt.show()