# Requirement
- Get total quantity and total sales by week, by store, by department, by category, etc.

# Loading big data
- The data is not available in the Git repo because it is too large
- Download it before the class

In [1]:
import os
import pandas as pd

# Load the PRODUCTS parquet data from the local data folder (it must be downloaded beforehand).
df_products = pd.read_parquet(path='./data/PRODUCTS')

In [None]:
# Print a summary of the products frame (columns, dtypes, non-null counts, memory usage).
df_products.info()

In [None]:
# Load the STORES parquet data from the local data folder.
df_stores = pd.read_parquet(path='./data/STORES')

In [4]:
# Load the TRANSACTIONS parquet data from the local data folder.
df_transactions = pd.read_parquet(path='./data/TRANSACTIONS')

In [5]:
# Parse the TransactionDate column into pandas datetime values, replacing the column.
df_transactions["TransactionDate"] = pd.to_datetime(arg=df_transactions["TransactionDate"])

# The code below takes more than 5 minutes on a 16 GB, 4-core Mac

In [None]:
%%time
df_merge_tran_product = pd.merge(df_transactions, df_products, how='left', left_on=['UPC'], right_on = ['UPC'])
del df_products

In [None]:
# Render floats with a single decimal place in displayed frames.
pd.set_option('display.float_format', '{:.1f}'.format)

# Total quantity and total sales per category and week.
# The value columns are selected with a list (double brackets): indexing a GroupBy with a
# bare tuple of names, as in ['Qty','SoldRate'], was deprecated and raises in pandas 2.x.
(
    df_merge_tran_product
    .groupby(['CATEGORY', 'week_number_of_year'])[['Qty', 'SoldRate']]
    .sum()
    .rename(columns={'Qty': 'Total Qty', 'SoldRate': 'Total Sales'})
)

# Doing the same with Spark

In [None]:
# The Python packaging for Spark is not intended to replace all of the other use cases. 
# This Python packaged version of Spark is suitable for interacting with an existing cluster (be it Spark standalone, YARN, or Mesos) - but does not contain the tools required to setup your own standalone Spark cluster. 
# You can download the full version of Spark from the Apache Spark downloads page.
! pip install pyspark==2.4.4

In [None]:
# Set the variables below in your shell profile before starting Jupyter.
# The paths are examples from one machine - adjust them to your own Spark and Python locations.
# export SPARK_HOME=/Users/pmacharl/spark-2.4.4-bin-hadoop2.7
# export PATH=$PATH:$SPARK_HOME/bin
# export PYSPARK_SUBMIT_ARGS="pyspark-shell"
# export PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3
# export PYSPARK_PYTHON=/usr/local/bin/python3

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# SparkConf reference:
# https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkConf
config = SparkConf().setAll([
    ("spark.driver.memory", "2g"),
    ("spark.executor.memory", "1g"),
    # You are likely running in local mode, so size the shuffle partitions for it.
    # The default is 200, but this machine has few executors, so 5 is enough.
    ("spark.sql.shuffle.partitions", "5"),
])

# Cluster mode
# https://spark.apache.org/docs/latest/submitting-applications.html
# config.setMaster("spark://192.168.0.7:7077") # If spark is started in local cluster mode

# Keep the conf object as the cell's displayed result.
config

<pyspark.conf.SparkConf at 0x109108950>

In [2]:
# Create the Spark session from `config`, or reuse one that already exists.
# NOTE(review): .master("local") is applied after the conf, so it presumably overrides any
# master set on `config` (e.g. via setMaster) - confirm before switching to cluster mode.
spark = (
    SparkSession.builder
    .config(conf=config)
    .master("local")
    .appName("MyApp")
    .getOrCreate()
)
spark

# Using DataFrames

In [None]:
# Read the PRODUCTS data with Spark's default data source and preview the first 20 rows.
df_products = spark.read.load(path="./data/PRODUCTS")
df_products.show(n=20)

In [4]:
# Expose the products DataFrame to Spark SQL under the view name "products".
df_products.createOrReplaceTempView(name="products")

In [None]:
# Read the TRANSACTIONS data with Spark's default data source and preview the first 20 rows.
df_transactions = spark.read.load(path="./data/TRANSACTIONS")
df_transactions.show(n=20)

In [6]:
# Expose the transactions DataFrame to Spark SQL under the view name "transactions".
df_transactions.createOrReplaceTempView(name="transactions")

In [26]:
# Inner join on UPC: only transactions with a matching product row are kept.
df_join_product_transactions = df_transactions.join(
    other=df_products,
    on=df_transactions["UPC"] == df_products["UPC"],
    how="inner",
)

In [None]:
%%time
df_join_product_transactions.groupby("DEPARTMENT").agg({"SoldRate": "sum", "Qty": "count"}).show()

# Using Spark SQL
- [Spark SQL JOINS](https://jaceklaskowski.gitbooks.io/mastering-spark-sql/spark-sql-joins.html)

In [18]:
# Total quantity and total sales per department, expressed in Spark SQL over the temp views.
# TotalQty uses sum(t.Qty) rather than count(t.Qty) so it is the quantity sold,
# not the number of transaction rows.
df_join_tran_products = spark.sql(
    "select p.DEPARTMENT as Department, sum(t.Qty) as TotalQty, sum(t.SoldRate) as TotalSales "
    "from transactions t join products p on t.UPC=p.UPC "
    "GROUP BY p.DEPARTMENT"
)

In [None]:
%%time
from pyspark.sql.types import *
df_join_tran_products.withColumn('TotalSales', df_join_tran_products.TotalSales.cast(DecimalType(18, 2))).show()

In [None]:
# Shut down the Spark session once the analysis is finished.
spark.stop()

# Simulating more data 
- Use DBeaver (a version earlier than 6.2, because the "Generate mock data" feature became Enterprise-only from 6.2)
- The "Generate mock data" feature is intuitive to use (play with it for a while and it gets easier)
- Load the tables in dependency order so that PK-FK validations are taken care of automatically. For example, generating store records with address_ids that don't exist in the address table doesn't make sense, so select the FK option for the address_id field