In [0]:
# dbutils.fs.ls('FileStore/tables/northwind')
dbutils.fs.ls('FileStore/tables/')

Out[7]: [FileInfo(path='dbfs:/FileStore/tables/species.csv', name='species.csv', size=1605),
 FileInfo(path='dbfs:/FileStore/tables/surveys.csv', name='surveys.csv', size=1021588),
 FileInfo(path='dbfs:/FileStore/tables/ufoSightings.csv', name='ufoSightings.csv', size=13685183),
 FileInfo(path='dbfs:/FileStore/tables/vehicles.csv', name='vehicles.csv', size=8732953),
 FileInfo(path='dbfs:/FileStore/tables/vehicles.parquet', name='vehicles.parquet', size=4975246)]

In [0]:
file_links = {
  "categories": "s3://2u-data-curriculum-team/dataviz-classroom/v1.1/22-big-data/categories.csv",
  "customers": "s3://2u-data-curriculum-team/dataviz-classroom/v1.1/22-big-data/customers.csv",
  "employee_territories": "s3://2u-data-curriculum-team/dataviz-classroom/v1.1/22-big-data/employee-territories.csv",
  "employees": "s3://2u-data-curriculum-team/dataviz-classroom/v1.1/22-big-data/employees.csv",
  "order_details": "s3://2u-data-curriculum-team/dataviz-classroom/v1.1/22-big-data/order-details.csv",
  "orders": "s3://2u-data-curriculum-team/dataviz-classroom/v1.1/22-big-data/orders.csv",
  "products": "s3://2u-data-curriculum-team/dataviz-classroom/v1.1/22-big-data/products.csv",
  "regions": "s3://2u-data-curriculum-team/dataviz-classroom/v1.1/22-big-data/regions.csv",
  "shippers": "s3://2u-data-curriculum-team/dataviz-classroom/v1.1/22-big-data/shippers.csv",
  "suppliers": "s3://2u-data-curriculum-team/dataviz-classroom/v1.1/22-big-data/suppliers.csv",
  "territories": "s3://2u-data-curriculum-team/dataviz-classroom/v1.1/22-big-data/territories.csv"
}

In [0]:
for table_name, table_path in file_links.items():
  df_name = table_name + "_df"
  df_name = spark.read.csv(table_path, inferSchema=True, header=True)
  df_name.createOrReplaceTempView(table_name)
  

In [0]:
spark.catalog.listTables()

Out[5]: [Table(name='categories', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='customers', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='employee_territories', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='employees', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='order_details', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='orders', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='products', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='regions', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='shippers', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='suppliers', database=None, description=None, tableType='TEMPORARY', is

In [0]:
%sql
-- A list of all countries that orders have been shipped to
SELECT DISTINCT ShipCountry
FROM orders;

ShipCountry
Sweden
Germany
France
Argentina
Belgium
Finland
Italy
Norway
Spain
Denmark


In [0]:
%sql
SELECT COUNT(*)
FROM orders;

count(1)
830


In [0]:
%sql
SELECT ShipCountry, COUNT(*) As NumOrders
FROM orders
GROUP BY ShipCountry
ORDER BY NumOrders DESC;

ShipCountry,NumOrders
Germany,122
USA,122
Brazil,83
France,77
UK,56
Venezuela,46
Austria,40
Sweden,37
Canada,30
Italy,28


In [0]:
%sql
-- For each product, list its product ID, product name, and the company name
SELECT ProductID, ProductName, CompanyName
FROM products
JOIN suppliers
ON products.SupplierID = suppliers.SupplierID;

ProductID,ProductName,CompanyName
1,Chai,Exotic Liquids
2,Chang,Exotic Liquids
3,Aniseed Syrup,Exotic Liquids
4,Chef Anton's Cajun Seasoning,New Orleans Cajun Delights
5,Chef Anton's Gumbo Mix,New Orleans Cajun Delights
6,Grandma's Boysenberry Spread,Grandma Kelly's Homestead
7,Uncle Bob's Organic Dried Pears,Grandma Kelly's Homestead
8,Northwoods Cranberry Sauce,Grandma Kelly's Homestead
9,Mishi Kobe Niku,Tokyo Traders
10,Ikura,Tokyo Traders


In [0]:
%sql
--How many products exist in each category?
SELECT CategoryName, COUNT(*) AS ProductCount
FROM categories
JOIN products
ON categories.CategoryID = products.CategoryID
GROUP BY CategoryName
ORDER BY ProductCount DESC;

CategoryName,ProductCount
Confections,13
Condiments,12
Seafood,12
Beverages,12
Dairy Products,10
Grains/Cereals,7
Meat/Poultry,6
Produce,5


In [0]:
%sql
-- Which 5 countries have the most customers?
SELECT Country, COUNT(*) AS NumberOfCustomers
FROM customers
GROUP BY Country
ORDER BY NumberOfCustomers DESC
LIMIT 5;

Country,NumberOfCustomers
USA,13
France,11
Germany,11
Brazil,9
UK,7


In [0]:
%sql
--Which 5 countries have the heaviest shipments, on average?
SELECT ShipCountry, AVG(Freight) AS AverageFreight
FROM orders
GROUP BY ShipCountry
ORDER BY AverageFreight DESC
LIMIT 5;

ShipCountry,AverageFreight
Austria,184.7875
Ireland,145.01263157894738
USA,112.87942622950818
Germany,92.48590163934426
Sweden,87.50270270270272


In [0]:
%sql
-- Which 10 companies have placed the most orders?
-- List the company contacts as well
SELECT CompanyName, ContactName, COUNT(OrderID) as NumberOfOrders
FROM customers c
JOIN orders o
ON c.CustomerID = o.CustomerID
GROUP BY CompanyName, ContactName
ORDER BY NumberOfOrders DESC
LIMIT 10;

CompanyName,ContactName,NumberOfOrders
Save-a-lot Markets,Jose Pavarotti,31
Ernst Handel,Roland Mendel,30
QUICK-Stop,Horst Kloss,28
Hungry Owl All-Night Grocers,Patricia McKenna,19
Folk och fä HB,Maria Larsson,19
HILARION-Abastos,Carlos Hernández,18
Berglunds snabbköp,Christina Berglund,18
Rattlesnake Canyon Grocery,Paula Wilson,18
Bon app',Laurence Lebihan,17
Frankenversand,Peter Franken,15


In [0]:
%sql
-- Which customers are the biggest spenders in terms of total amount spent on orders?
SELECT customers.CompanyName, SUM(Quantity * UnitPrice) AS TotalSpending
FROM customers
JOIN orders
ON customers.CustomerID = orders.CustomerID
JOIN order_details
ON orders.OrderID = order_details.OrderID
GROUP BY customers.CustomerID, customers.CompanyName
ORDER BY TotalSpending DESC;

CompanyName,TotalSpending
QUICK-Stop,117483.39
Save-a-lot Markets,115673.39
Ernst Handel,113236.68
Hungry Owl All-Night Grocers,57317.39000000001
Rattlesnake Canyon Grocery,52245.9
Hanari Carnes,34101.15
Folk och fä HB,32555.55
Mère Paillarde,32203.9
Königlich Essen,31745.75
Queen Cozinha,30226.099999999995


In [0]:
%sql
-- Which customers are the biggest spenders in terms of total amount spent on orders?
-- List only companies with a total spending of greater than 20,000.
SELECT customers.CompanyName, SUM(Quantity * UnitPrice) AS TotalSpending
FROM customers
JOIN orders
ON customers.CustomerID = orders.CustomerID
JOIN order_details
ON orders.OrderID = order_details.OrderID
GROUP BY customers.CustomerID, customers.CompanyName
HAVING TotalSpending > 20000
ORDER BY TotalSpending DESC;

CompanyName,TotalSpending
QUICK-Stop,117483.39
Save-a-lot Markets,115673.39
Ernst Handel,113236.68
Hungry Owl All-Night Grocers,57317.39000000001
Rattlesnake Canyon Grocery,52245.9
Hanari Carnes,34101.15
Folk och fä HB,32555.55
Mère Paillarde,32203.9
Königlich Essen,31745.75
Queen Cozinha,30226.099999999995


In [0]:
%sql
-- Which employees have sold the most orders?
SELECT LastName, FirstName, COUNT(OrderID) AS NumOrders
FROM employees e
JOIN orders o
ON e.EmployeeID = o.EmployeeID
GROUP BY LastName, FirstName
ORDER BY NumOrders DESC;

LastName,FirstName,NumOrders
Peacock,Margaret,156
Leverling,Janet,127
Davolio,Nancy,123
Callahan,Laura,104
Fuller,Andrew,96
King,Robert,72
Suyama,Michael,67
Dodsworth,Anne,43
Buchanan,Steven,42


In [0]:
%sql
-- List customer IDs of customers who have never placed an order.
SELECT c.CustomerID, o.CustomerID
FROM customers c
LEFT JOIN orders o
ON c.CustomerID = o.CustomerID
WHERE o.CustomerID IS NULL;

CustomerID,CustomerID.1
FISSA,
PARIS,


In [0]:
%sql
-- Show the top 15 order IDs
SELECT o.OrderID, COUNT(*) AS TotalOrders
FROM orders o
JOIN order_details od
ON o.OrderID = od.OrderID
GROUP BY o.OrderID
ORDER BY TotalOrders DESC
LIMIT 15;

OrderID,TotalOrders
11077,25
10979,6
10657,6
10847,6
10514,5
10558,5
10294,5
10845,5
10458,5
10553,5


In [0]:
%sql
-- List the most heavily discounted products in descending order
SELECT p.ProductName, od.Discount
FROM products p
JOIN order_details od
ON p.ProductID = od.ProductID
ORDER BY od.Discount DESC;

ProductName,Discount
Flotemysost,0.25
Singaporean Hokkien Fried Mee,0.25
Gorgonzola Telino,0.25
Tarte au sucre,0.25
Rhönbräu Klosterbier,0.25
Mishi Kobe Niku,0.25
Scottish Longbreads,0.25
Schoggi Schokolade,0.25
Côte de Blaye,0.25
Queso Cabrales,0.25


In [0]:
spark.catalog.listTables() 

Out[8]: [Table(name='categories', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='customers', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='employee_territories', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='employees', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='order_details', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='orders', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='products', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='regions', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='shippers', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='suppliers', database=None, description=None, tableType='TEMPORARY', is