## Requirement

```
1. Ensure the Spark cluster is used
2. Read the e-commerce data from Hadoop (HDFS) as CSV
3. Create a database "ecommercedb" in Hive / SQL / PySpark if it does not exist
4. Save the e-commerce data as a Spark table [Parquet]
5. Compare the size difference between the CSV and Parquet files in the HDFS browser http://192.168.93.128:50070
6. Use the Hive metastore URL

```

In [1]:
# Initialise findspark before any `pyspark` import in the cells below.
# NOTE(review): findspark is expected to locate the local Spark installation and
# make `pyspark` importable — confirm against the findspark documentation.
import findspark
findspark.init()

In [None]:


# """
# Since Spark 2.x, Spark unified Spark APIs, DF, Datasets, & SQL.
# SparkSession uses SparkContext internally.
# """

# from pyspark.conf import SparkConf

# config = SparkConf()
# config.setMaster("spark://192.168.11.77:7077").setAppName("E-COMMERCE:CLUSTER")


In [2]:
# Setup hint kept from the original cell:  winutils.exe  chmod 777  C:\spark-temp

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, HiveContext

# Build the Spark configuration step by step.
# Alternative kept for reference (standalone cluster instead of local mode):
#   conf.setMaster("spark://192.168.11.71:7077").setAppName("HiveApp")
conf = SparkConf()
conf.setMaster("local[1]")
conf.setAppName("ecommerce->HIVE")
conf.setAll([
    # Resource sizing.
    ("spark.executor.memory", "4G"),
    ("spark.executor.cores", 2),
    ("spark.cores.max", 2),
    ("spark.driver.memory", "4G"),
    # Thrift endpoint of the Hive metastore.
    ("hive.metastore.uris", "thrift://192.168.93.128:9083"),
    # Spark warehouse directory on HDFS.
    # Alternatives kept for reference (all disabled in the original):
    #   ("spark.sql.warehouse.dir", "/home/ubuntu/spark-warehose")
    #   ("hive.metastore.warehouse.dir", "hdfs://192.168.93.128:9000/user/hive/warehouse")
    #   ("spark.local.dir", "c:/spark-temp")
    ("spark.sql.warehouse.dir", "hdfs://192.168.93.128:9000/user/hive/warehouse"),
])

# Create (or reuse) the session with Hive support, driven by the configuration above.
ss = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

In [3]:
from pyspark.sql import SparkSession

# Re-acquire the session from the same `conf`.
# enableHiveSupport() is repeated here to stay consistent with the cell that first
# builds the session: if this cell ever ends up creating the session itself, it must
# still be backed by the Hive metastore, otherwise the database/table cells below
# would not go through Hive.
ss = (
    SparkSession.builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Bare expression: shows the session object as this cell's output.
ss

<br><br>

## Read e-commerce data

In [4]:

"""
Read CSV from HDFS
"""

import datetime as dt
from pyspark.sql.types import StructType, IntegerType, DoubleType, StringType, DateType
from pyspark.sql.functions import col, asc, desc, count, sum, avg, to_date, to_timestamp

# Explicit schema for the CSV: (column name, Spark type) in file order; every column nullable.
_ECOMM_FIELDS = (
    ("InvoiceNo", StringType()),
    ("StockCode", StringType()),
    ("Description", StringType()),
    ("Quantity", IntegerType()),
    ("InvoiceDate", DateType()),
    ("UnitPrice", DoubleType()),
    ("CustomerId", StringType()),
    ("Country", StringType()),
)

schema_ecomm = StructType()
for _field_name, _field_type in _ECOMM_FIELDS:
    schema_ecomm.add(_field_name, _field_type, True)

# Read the CSV from HDFS with a header row, the explicit schema and the date pattern,
# then keep only the columns needed downstream.
# (Equivalent to dropping the unneeded ones with .drop('column_name', 'column_name').)
df_ecomm_full = ss.read.csv(
    "hdfs://192.168.93.128:9000/input/e-commerce/data.csv",
    schema=schema_ecomm,
    header=True,
    dateFormat="MM/dd/yyyy HH:mm",
).select("Country", "CustomerId", "Quantity", "UnitPrice")

In [None]:
# Print the first two rows of the projected DataFrame as a quick sanity check.
df_ecomm_full.show(2)

In [6]:
# Create the target database only when it is missing (IF NOT EXISTS keeps re-runs safe).
# NOTE(review): the requirement text names the database "ecommercedb", while this cell
# creates "ecommercedb_pg" — confirm which name is intended.
ss.sql("CREATE DATABASE IF NOT EXISTS ecommercedb_pg")

DataFrame[]

In [7]:
# Switch the session's current database to ecommercedb_pg.
ss.sql("USE ecommercedb_pg")

DataFrame[]

In [8]:
# List the databases known to the session to confirm the one created above is present.
ss.sql("SHOW DATABASES").show()

+--------------+
|  databaseName|
+--------------+
|       default|
|ecommercedb_pg|
+--------------+



<br><br>

## Write to HDFS

In [9]:
# (
#     df_ecomm_full
#     .coalesce(1)
#     .write.mode('overwrite')
#     .option("header", True)
#     .csv("hdfs://192.168.93.128:9000/output/e-commerce/ecommerce")
# )
# Persist the projected data as a table.
# - format("parquet") is stated explicitly, so the required Parquet output does not
#   depend on the session's default data source.
# - The table name is database-qualified, so the write lands in ecommercedb_pg even if
#   the `USE ecommercedb_pg` cell was skipped or the current database was changed.
# - coalesce(1) reduces the data to a single partition before writing;
#   mode("overwrite") replaces any existing table so the cell can be re-run.
(
    df_ecomm_full
    .coalesce(1)
    .write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("ecommercedb_pg.ecommerce")
)

In [10]:
# Query the saved table by its fully qualified name and print two rows to confirm the write.
ss.sql("SELECT * FROM ecommercedb_pg.ecommerce").show(2)

+--------------+----------+--------+---------+
|       Country|CustomerId|Quantity|UnitPrice|
+--------------+----------+--------+---------+
|United Kingdom|     17850|       6|     2.55|
|United Kingdom|     17850|       6|     3.39|
+--------------+----------+--------+---------+
only showing top 2 rows



In [None]:
# Column names of the projected DataFrame.
df_ecomm_full.columns

In [None]:
%%timeit
df_ecomm_full.groupby("Country").sum("Quantity")

In [None]:
%%timeit
ss.sql("""
SELECT country, SUM(quantity)
  FROM ecommercedb.ecommerce
 GROUP BY 1
""")

# Devanshu's code

In [None]:
# Alternative approach: read the CSV, stage it as Parquet, then copy the Parquet data to HDFS.
#
# Uses the notebook's own session (`ss`) and schema (`schema_ecomm`); the names `spark`
# and `schema` are not defined in the cells above and raised NameError on a fresh run.
# NOTE(review): the input path here (/ecommerce/data.csv) differs from the one used in
# the "Read e-commerce data" cell (/input/e-commerce/data.csv) — confirm which is correct.
dataSet = (
    ss.read.format("csv")
    .option("header", True)
    .schema(schema_ecomm)
    .option("dateFormat", "MM/dd/yyyy HH:mm")
    .load("hdfs://192.168.93.128:9000/ecommerce/data.csv")
)

ss.sql("CREATE DATABASE IF NOT EXISTS ecommercedb")

# Stage as Parquet. mode("overwrite") lets the cell be re-run: the default save mode
# raises an error when the target path already exists.
dataSet.write.mode("overwrite").parquet("sparkTable.parquet")

# Read the staged Parquet back and write it to its final HDFS location.
parquetFile = ss.read.parquet("sparkTable.parquet")

(
    parquetFile.write
    .mode("overwrite")
    .parquet("hdfs://192.168.93.128:9000/ecommerce/sparkTable.parquet")
)


---
# DELETE BELOW

<br><br>

### How many partitions did I coalesce?

In [None]:
# Project the four analysis columns into a separately named DataFrame.
df_ecomm = df_ecomm_full.select("Country", "CustomerId", "Quantity", "UnitPrice")

In [None]:
"""
EXPLAIN PLAN
"""

# Print the extended query plan for df_ecomm.
df_ecomm.explain(extended=True)