In [0]:
# Pegando a tabela em csv
file_location = "/FileStore/tables/Products.csv"
file_type = 'csv'
infer_schema = "false"
first_row_is_header = "true"
delimiter = ','

products = spark.read.format(file_type)\
.option("inferSchema", infer_schema)\
.option("header", first_row_is_header)\
.option("sep", delimiter)\
.load(file_location)

display(products)

ProductID,ProductName,SupplierID,CategoryID,QuantityPerUnit,UnitPrice,UnitsInStock,UnitsOnOrder,ReorderLevel,Discontinued
1,Chai,1,1,10 boxes x 20 bags,18.0,39,0,10,0
2,Chang,1,1,24 - 12 oz bottles,19.0,17,40,25,0
3,Aniseed Syrup,1,2,12 - 550 ml bottles,10.0,13,70,25,0
4,Chef Anton's Cajun Seasoning,2,2,48 - 6 oz jars,22.0,53,0,0,0
5,Chef Anton's Gumbo Mix,2,2,36 boxes,21.35,0,0,0,1
6,Grandma's Boysenberry Spread,3,2,12 - 8 oz jars,25.0,120,0,25,0
7,Uncle Bob's Organic Dried Pears,3,7,12 - 1 lb pkgs.,30.0,15,0,10,0
8,Northwoods Cranberry Sauce,3,2,12 - 12 oz jars,40.0,6,0,0,0
9,Mishi Kobe Niku,4,6,18 - 500 g pkgs.,97.0,29,0,0,1
10,Ikura,4,8,12 - 200 ml jars,31.0,31,0,0,0


In [0]:
#Transformando csv em parquet
products.write.parquet("/FileStore/tables/northwind/Products.parquet")

In [0]:
products_parquet = spark.read.parquet("/FileStore/tables/northwind/Products.parquet")
products_parquet.show()

+---------+--------------------+----------+----------+--------------------+---------+------------+------------+------------+------------+
|ProductID|         ProductName|SupplierID|CategoryID|     QuantityPerUnit|UnitPrice|UnitsInStock|UnitsOnOrder|ReorderLevel|Discontinued|
+---------+--------------------+----------+----------+--------------------+---------+------------+------------+------------+------------+
|        1|                Chai|         1|         1|  10 boxes x 20 bags|  18.0000|          39|           0|          10|           0|
|        2|               Chang|         1|         1|  24 - 12 oz bottles|  19.0000|          17|          40|          25|           0|
|        3|       Aniseed Syrup|         1|         2| 12 - 550 ml bottles|  10.0000|          13|          70|          25|           0|
|        4|Chef Anton's Caju...|         2|         2|      48 - 6 oz jars|  22.0000|          53|           0|           0|           0|
|        5|Chef Anton's Gumb...|  

In [0]:
#Transformando em view para usar o sql
products_parquet.createOrReplaceTempView("tabela_products_view")

In [0]:
#Filtrando dados da tabeça de produtos
%sql
SELECT ProductID, ProductName, UnitsOnOrder FROM tabela_products_view

ProductID,ProductName,UnitsOnOrder
1,Chai,0
2,Chang,40
3,Aniseed Syrup,70
4,Chef Anton's Cajun Seasoning,0
5,Chef Anton's Gumbo Mix,0
6,Grandma's Boysenberry Spread,0
7,Uncle Bob's Organic Dried Pears,0
8,Northwoods Cranberry Sauce,0
9,Mishi Kobe Niku,0
10,Ikura,0


In [0]:
#Pegando a tabela de order
file_location = "/FileStore/tables/Order_Details_.csv"
file_type = 'csv'
infer_schema = "false"
first_row_is_header = "true"
delimiter = ','

order_details_csv = spark.read.format(file_type)\
.option("inferSchema", infer_schema)\
.option("header", first_row_is_header)\
.option("sep", delimiter)\
.load(file_location)

display(order_details_csv)

OrderID,ProductID,UnitPrice,Quantity,Discount
10248,11,14.0,12,0.0
10248,42,9.8,10,0.0
10248,72,34.8,5,0.0
10249,14,18.6,9,0.0
10249,51,42.4,40,0.0
10250,41,7.7,10,0.0
10250,51,42.4,35,0.15
10250,65,16.8,15,0.15
10251,22,16.8,6,0.05
10251,57,15.6,15,0.05


In [0]:
#Transformandp csv em parquet e tranformando em view
order_details_csv.write.parquet("/FileStore/tables/northwind/Order_Details.parquet")

spark.read.parquet("/FileStore/tables/northwind/Order_Details.parquet").createOrReplaceTempView("order_view")

In [0]:

%sql
/*Pegando produtos por quantidade de vendas */
SELECT ProductID, count(OrderID) AS total_order FROM order_view
GROUP BY ProductID
ORDER BY total_order asc

ProductID,total_order
9,5
37,6
48,6
15,6
66,8
27,9
5,10
67,10
50,10
3,12


In [0]:
%sql
/* pegando os 3 menos vendidos com os nomes */
SELECT count(OrderID) AS total_orders, vo.ProductID, pv.ProductName
FROM order_view AS vo
INNER JOIN tabela_products_view AS pv
ON vo.ProductID = pv.ProductID
GROUP BY vo.ProductID, pv.ProductName
ORDER BY total_orders asc
LIMIT 3;

total_orders,ProductID,ProductName
5,9,Mishi Kobe Niku
6,15,Genen Shouyu
6,48,Chocolade


In [0]:
%sql
/* pegando os 10 mais vendidos com os nomes */
SELECT count(OrderID) AS total_orders, vo.ProductID, pv.ProductName
FROM order_view AS vo
INNER JOIN tabela_products_view AS pv
ON vo.ProductID = pv.ProductID
GROUP BY vo.ProductID, pv.ProductName
ORDER BY total_orders desc
LIMIT 10;

total_orders,ProductID,ProductName
54,59,Raclette Courdavault
51,60,Camembert Pierrot
51,24,Guaran� Fant�stica
51,31,Gorgonzola Telino
50,56,Gnocchi di nonna Alice
48,62,Tarte au sucre
47,41,Jack's New England Clam Chowder
46,75,Rh�nbr�u Klosterbier
44,2,Chang
43,16,Pavlova
