### Módulo Big Data - Santander Coder's

Notebook utilizado para desenvolvimento do projeto de conclusão do módulo Big Data.

Repositório disponível em: https://github.com/wesleyssantos27/Building-DW

In [None]:
# Ingest every Northwind CSV: read it as-is, persist a Parquet copy in the
# "trusted" zone, and expose that copy to the SQL cells as "<name>_view".
files_name = ['Categories','CustomerCustomerDemo','CustomerDemographics','Customers','Employees','EmployeeTerritories','Orders','Order_Details_','Products','Region','Shippers','Suppliers','Territories']

# Reader settings are the same for every file, so they are defined once
# instead of being re-assigned on each loop iteration.
file_type = 'csv'
infer_schema = "false"  # no schema inference: every column is read as a string
first_row_is_header = "true"
delimiter = ','

for file in files_name:
    file_location = "/FileStore/tables/"+file+".csv"
    trusted_location = "/FileStore/tables/northwind/trusted/"+file+".parquet"

    current_table = (
        spark.read.format(file_type)
        .option("inferSchema", infer_schema)
        .option("header", first_row_is_header)
        .option("sep", delimiter)
        .option("multiLine", "true")
        .load(file_location)
    )

    # The default save mode raises if the target path already exists, which
    # made this cell fail on a second run; "overwrite" keeps it re-runnable.
    current_table.write.mode("overwrite").parquet(trusted_location)

    # The view is built on the Parquet copy, not on the CSV DataFrame.
    spark.read.parquet(trusted_location).createOrReplaceTempView(file+"_view")

#### Relational model architecture
<img src="Images/Relational_Schema.png" width="800" /> ![](files/tables/Relational_Schema.png)

#### Dimensional model architecture
<img src="Images/Dimensional_Schema.png" width="800" /> ![](files/tables/Dimensional_Schema.png)

#### Building the dimensional model

In [None]:
%sql
-- Date dimension: one row per distinct order date found in Orders_view,
-- with the calendar parts pre-computed.
CREATE TABLE IF NOT EXISTS Date_Dim (
  date_id INT NOT NULL,  -- surrogate key: position of the date in ascending order
  Date DATE,
  Quarter INT,
  Day INT,
  Month INT,
  Year INT
)
USING DELTA
LOCATION '/FileStore/tables/northwind/refined/date_dim';

-- INSERT OVERWRITE (rather than INSERT INTO) so that re-running the cell
-- rebuilds the dimension instead of appending a second copy of every date.
INSERT OVERWRITE Date_Dim (date_id, Date, Quarter, Day, Month, Year)
SELECT
  ROW_NUMBER() OVER (ORDER BY order_date) AS date_id,
  order_date,
  EXTRACT(QUARTER FROM order_date),
  EXTRACT(DAY FROM order_date),
  EXTRACT(MONTH FROM order_date),
  EXTRACT(YEAR FROM order_date)
FROM (
  -- Orders_view has one row per order, so the same date shows up many times;
  -- deduplicate first so each calendar date gets exactly one date_id.
  SELECT DISTINCT CAST(OrderDate AS DATE) AS order_date
  FROM Orders_view
  WHERE OrderDate IS NOT NULL
) AS distinct_order_dates;

num_affected_rows,num_inserted_rows
830,830


In [None]:
%sql
-- Employee dimension: identity and location attributes copied from Employees_view.
CREATE TABLE IF NOT EXISTS employee_dim (
  EmployeeID INT         NOT NULL,
  LastName   VARCHAR(20) NOT NULL,
  FirstName  VARCHAR(10) NOT NULL,
  Region     VARCHAR(15),
  City       VARCHAR(15),
  Country    VARCHAR(15)
)
USING DELTA
LOCATION '/FileStore/tables/northwind/refined/employee_dim';

-- Straight column-for-column copy; no filtering or deduplication is applied.
INSERT INTO employee_dim (EmployeeID, LastName, FirstName, Region, City, Country)
SELECT
  e.EmployeeID,
  e.LastName,
  e.FirstName,
  e.Region,
  e.City,
  e.Country
FROM Employees_view AS e;



num_affected_rows,num_inserted_rows
9,9


In [None]:
%sql
-- Product dimension: descriptive product attributes copied from Products_view.
CREATE TABLE IF NOT EXISTS product_dim (
  ProductID       INT         NOT NULL,
  ProductName     VARCHAR(40) NOT NULL,
  CategoryID      INT,
  QuantityPerUnit VARCHAR(20)
)
USING DELTA
LOCATION '/FileStore/tables/northwind/refined/product_dim';

-- DISTINCT collapses rows that are identical across all four selected columns.
INSERT INTO product_dim (ProductID, ProductName, CategoryID, QuantityPerUnit)
SELECT DISTINCT
  p.ProductID,
  p.ProductName,
  p.CategoryID,
  p.QuantityPerUnit
FROM Products_view AS p;



num_affected_rows,num_inserted_rows
77,77


In [None]:
%sql
-- Customer dimension: contact and location attributes copied from Customers_view.
CREATE TABLE IF NOT EXISTS customer_dim (
  CustomerID  CHAR(5)     NOT NULL,
  ContactName VARCHAR(30),
  CompanyName VARCHAR(40) NOT NULL,
  Region      VARCHAR(15),
  City        VARCHAR(15),
  Country     VARCHAR(15)
)
USING DELTA
LOCATION '/FileStore/tables/northwind/refined/customer_dim';

-- DISTINCT collapses rows that are identical across all six selected columns.
INSERT INTO customer_dim (CustomerID, ContactName, CompanyName, Region, City, Country)
SELECT DISTINCT
  c.CustomerID,
  c.ContactName,
  c.CompanyName,
  c.Region,
  c.City,
  c.Country
FROM Customers_view AS c;



num_affected_rows,num_inserted_rows
91,91


In [None]:
%sql
-- Order fact: order-detail lines joined to their order header, their product
-- and the date dimension.
CREATE TABLE IF NOT EXISTS order_fact 
(
  OrderID INT NOT NULL,
  OrderDate DATE,
  ProductID INT NOT NULL, 
  CustomerID CHAR(5),
  EmployeeID INT, 
  RequiredDate DATE, 
  ShippedDate DATE,
  ShipVia INT,
  Date DATE,               -- date_dim.Date matched on the order date
  Quantity SMALLINT NOT NULL,
  Discount FLOAT NOT NULL,
  UnitPrice FLOAT NOT NULL,
  Freight FLOAT,
  UnitsInStock SMALLINT,
  UnitsInOrder SMALLINT,   -- loaded from Products_view.UnitsOnOrder
  ReorderLevel SMALLINT
)
USING DELTA
LOCATION '/FileStore/tables/northwind/refined/order_fact';

INSERT INTO order_fact (OrderID, OrderDate, ProductID, CustomerID, EmployeeID, RequiredDate, ShippedDate, ShipVia, Date, Quantity, Discount, UnitPrice, Freight, UnitsInStock, UnitsInOrder, ReorderLevel)
SELECT DISTINCT
  o.OrderID, 
  o.OrderDate, 
  p.ProductID, 
  o.CustomerID,
  o.EmployeeID,
  o.RequiredDate, 
  o.ShippedDate,
  o.ShipVia,
  d.Date,
  od.Quantity,
  od.Discount,
  od.UnitPrice,
  o.Freight,
  p.UnitsInStock,
  p.UnitsOnOrder,
  p.ReorderLevel
FROM Orders_view AS o
INNER JOIN Order_Details__view AS od
  ON od.OrderID = o.OrderID
INNER JOIN Products_view AS p
  ON od.ProductID = p.ProductID
-- Join against distinct dates only: if date_dim holds the same date more than
-- once, joining it directly repeats every order line once per duplicate and
-- relies on SELECT DISTINCT to collapse the copies again.
INNER JOIN (
  SELECT DISTINCT dd.Date
  FROM date_dim AS dd
) AS d
  ON d.Date = o.OrderDate;


num_affected_rows,num_inserted_rows
2155,2155
