## Load supermarket data into HDFS and a Hive table for later use

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark import SparkConf

In [2]:
# File name of the supermarket CSV; the Spark read step prepends '/tmp/' to it to form the input path.
SRC_FILE = 'supermarket_data.csv'

In [3]:
!hdfs dfs -put "/data/supermarket_data.csv" "/tmp"
!hdfs dfs -chmod 755 /tmp/supermarket_data.csv
!hdfs dfs -ls /tmp

Found 3 items
drwxrwxrwt   - hdfs hadoop          0 2020-06-28 08:01 /tmp/hadoop-yarn
drwx-wx-wx   - hive hadoop          0 2020-06-28 08:02 /tmp/hive
-rwxr-xr-x   1 root hadoop   48260395 2020-06-28 08:09 /tmp/supermarket_data.csv


In [4]:
# Obtain the SparkSession for this notebook (named "SuperMarket ETL") with
# Hive support enabled; getOrCreate() hands back an already-running session
# if there is one instead of building a second.
spark = (
    SparkSession.builder
    .appName("SuperMarket ETL")
    .enableHiveSupport()
    .getOrCreate()
)

# Keep a handle on the session's SparkContext.
sc = spark.sparkContext

In [7]:
# Explicit schema for the supermarket CSV, one .add() per column in file order.
# Arguments are (column name, Spark type, nullable); every column is declared
# with nullable=False.
SuperMarket_Schema = (
    StructType()
    .add('SHOP_DATE', StringType(), False)
    .add('SHOP_HOUR', IntegerType(), False)
    .add('BASKET_ID', LongType(), False)
    .add('CUST_CODE', StringType(), False)
    .add('STORE_CODE', StringType(), False)
    .add('PROD_CODE', StringType(), False)
    .add('QUANTITY', IntegerType(), False)
    .add('SPEND', DoubleType(), False)
)

In [8]:
# Read the CSV at '/tmp/' + SRC_FILE using the explicit schema above.
# Fields are comma-delimited and the first line is treated as a header row.
df = (
    spark.read
    .schema(SuperMarket_Schema)
    .options(delimiter=",", header="true")
    .csv('/tmp/' + SRC_FILE)
)

In [9]:
# Create Temporary View
# Expose the DataFrame to Spark SQL under the name `view_supermarket`
# (replaces any view of that name from an earlier run).
df.createOrReplaceTempView("view_supermarket")

# Create Table
# Drop any copy left by a previous run first: CREATE TABLE ... AS SELECT fails
# when the table already exists, which would make this cell impossible to
# re-run. The table is rebuilt entirely from the view below.
spark.sql("DROP TABLE IF EXISTS default.supermarket")

# Materialise the view as a Parquet-backed table `default.supermarket`.
spark.sql("""
CREATE TABLE default.supermarket STORED AS PARQUET AS
SELECT *
FROM view_supermarket
""")

DataFrame[]

In [10]:
# Sanity check: print five rows from the `supermarket` table.
preview_query = """
SELECT *
FROM supermarket
LIMIT 5
"""
spark.sql(preview_query).show()

+---------+---------+---------------+--------------+----------+----------+--------+-----+
|SHOP_DATE|SHOP_HOUR|      BASKET_ID|     CUST_CODE|STORE_CODE| PROD_CODE|QUANTITY|SPEND|
+---------+---------+---------------+--------------+----------+----------+--------+-----+
| 20070930|       14|994107700805249|CUST0000967892|STORE00003|PRD0904745|       1| 0.96|
| 20080703|       21|994111700279338|CUST0000170099|STORE00004|PRD0903050|       1| 1.12|
| 20080316|       12|994110100519787|CUST0000531407|STORE00003|PRD0902671|       3| 4.65|
| 20070406|       10|994105200274911|CUST0000170099|STORE00004|PRD0900670|       1| 1.22|
| 20070327|       11|994105100237909|CUST0000111155|STORE00003|PRD0904870|       1| 1.22|
+---------+---------+---------------+--------------+----------+----------+--------+-----+

