In [0]:
sc

In [0]:
spark

In [0]:
dbutils.fs.ls("/Volumes/azuredatabricks_1405569260508774/default/sales/")

[FileInfo(path='dbfs:/Volumes/azuredatabricks_1405569260508774/default/sales/sales_data_sample.csv', name='sales_data_sample.csv', size=527958, modificationTime=1727673529000)]

In [0]:
df = spark.read.format("csv").option("header","true").option("inferschema","true").load("/Volumes/azuredatabricks_1405569260508774/default/sales/sales_data_sample.csv")

In [0]:
df.show(5)

+-----------+---------------+---------+---------------+-------+---------------+-------+------+--------+-------+-----------+----+-----------+--------------------+----------------+--------------------+------------+-------------+-----+----------+-------+---------+---------------+----------------+--------+
|ORDERNUMBER|QUANTITYORDERED|PRICEEACH|ORDERLINENUMBER|  SALES|      ORDERDATE| STATUS|QTR_ID|MONTH_ID|YEAR_ID|PRODUCTLINE|MSRP|PRODUCTCODE|        CUSTOMERNAME|           PHONE|        ADDRESSLINE1|ADDRESSLINE2|         CITY|STATE|POSTALCODE|COUNTRY|TERRITORY|CONTACTLASTNAME|CONTACTFIRSTNAME|DEALSIZE|
+-----------+---------------+---------+---------------+-------+---------------+-------+------+--------+-------+-----------+----+-----------+--------------------+----------------+--------------------+------------+-------------+-----+----------+-------+---------+---------------+----------------+--------+
|      10107|             30|     95.7|              2| 2871.0| 2/24/2003 0:00|Shipped| 

In [0]:
df.printSchema()

root
 |-- ORDERNUMBER: integer (nullable = true)
 |-- QUANTITYORDERED: integer (nullable = true)
 |-- PRICEEACH: double (nullable = true)
 |-- ORDERLINENUMBER: integer (nullable = true)
 |-- SALES: double (nullable = true)
 |-- ORDERDATE: string (nullable = true)
 |-- STATUS: string (nullable = true)
 |-- QTR_ID: integer (nullable = true)
 |-- MONTH_ID: integer (nullable = true)
 |-- YEAR_ID: integer (nullable = true)
 |-- PRODUCTLINE: string (nullable = true)
 |-- MSRP: integer (nullable = true)
 |-- PRODUCTCODE: string (nullable = true)
 |-- CUSTOMERNAME: string (nullable = true)
 |-- PHONE: string (nullable = true)
 |-- ADDRESSLINE1: string (nullable = true)
 |-- ADDRESSLINE2: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- POSTALCODE: string (nullable = true)
 |-- COUNTRY: string (nullable = true)
 |-- TERRITORY: string (nullable = true)
 |-- CONTACTLASTNAME: string (nullable = true)
 |-- CONTACTFIRSTNAME: string (nullable = tr

In [0]:
df.createOrReplaceTempView("productssales")

##### 1. Summary of Product Sales 

In [0]:
%sql
select count(*) as salescount from productssales

salescount
2823


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
select ROUND(sum(SALES),2) as Total_Sales from productssales

Total_Sales
10032628.85


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
select ROUND(SUM(SALES - PRICEEACH*QUANTITYORDERED),2) AS TOTALPROFIT from productssales

TOTALPROFIT
1741742.06


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
select count(distinct CUSTOMERNAME) as TotalCustomers from productssales

TotalCustomers
92


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
select count(distinct PRODUCTCODE) as ProductCount from productssales

ProductCount
109


Databricks visualization. Run in Databricks to view.

##### 2. Quantity Sold Per Product

In [0]:
%sql
select PRODUCTLINE, SUM(QUANTITYORDERED) as ProductSold from productssales group by PRODUCTLINE ORDER BY PRODUCTLINE

PRODUCTLINE,ProductSold
Classic Cars,33992
Motorcycles,11663
Planes,10727
Ships,8127
Trains,2712
Trucks and Buses,10777
Vintage Cars,21069


Databricks visualization. Run in Databricks to view.

#####3. Quantity Sold Per Month

In [0]:
%sql
select MONTH_ID, SUM(QUANTITYORDERED) as ProductSold from productssales group by MONTH_ID ORDER BY MONTH_ID

MONTH_ID,ProductSold
1,7997
2,7903
3,7585
4,6704
5,8992
6,4620
7,4899
8,6538
9,5681
10,10998


Databricks visualization. Run in Databricks to view.

##### 4. Month on Month Sales Growth

In [0]:
%sql
select round(sum(sales),3) as total_sales, day(to_timestamp(orderdate, 'M/d/yyyy H:mm')) as dayofmonth, month_id
from productssales group by dayofmonth, month_id

total_sales,dayofmonth,month_id
67893.95,10,2
148418.29,17,2
5307.98,27,9
54251.66,13,10
27257.79,4,10
65824.18,6,1
20321.53,17,3
14066.8,7,4
27489.45,2,2
102666.6,25,11


Databricks visualization. Run in Databricks to view.

##### 5. Monthly and Weekly Sales Growth.

In [0]:
%sql
select round(sum(sales),3) as total_sales, dayofweek(to_timestamp(orderdate, 'M/d/yyyy H:mm')) as dayofweek, month_id
from productssales group by dayofweek, month_id

total_sales,dayofweek,month_id
141056.39,4,10
206098.67,6,1
9874.82,7,4
92822.98,2,2
153520.57,2,3
173801.23,4,7
50432.55,1,2
85229.4,1,9
57178.15,5,7
106951.47,5,6


Databricks visualization. Run in Databricks to view.

##### 6. Top 20 City with Most Quantity Orders

In [0]:
%sql
select city, round(sum(quantityordered),3) as quantity
from productssales group by city order by quantity  desc limit 20

city,quantity
Madrid,10958
San Rafael,6366
NYC,5294
Singapore,2760
Paris,2521
San Francisco,2139
Nantes,2102
New Bedford,2043
Melbourne,1926
Manchester,1778


Databricks visualization. Run in Databricks to view.

##### 7. Customer Sales Contribution

In [0]:
spark.sql("select CUSTOMERNAME, SUM(SALES) as sum_sales, SUM(SALES)/(select SUM(SALES) from productssales) as contribution_pcnt from productssales group by CUSTOMERNAME order by sum_sales DESC").show(10)

+--------------------+------------------+--------------------+
|        CUSTOMERNAME|         sum_sales|   contribution_pcnt|
+--------------------+------------------+--------------------+
|Euro Shopping Cha...| 912294.1100000002| 0.09093270803095634|
|Mini Gifts Distri...|         654858.06| 0.06527282826773756|
|Australian Collec...|200995.40999999997|0.020034171801341955|
|  Muscle Machine Inc|197736.93999999997|0.019709384544809486|
|   La Rochelle Gifts|          180124.9|0.017953908461389938|
|Dragon Souveniers...|172989.68000000008|0.017242707029872822|
|   Land of Toys Inc.|164069.44000000003|0.016353584135627607|
|The Sharp Gifts W...|160010.26999999996| 0.01594898728861078|
|      AV Stores, Co.|157807.80999999997| 0.01572945758877543|
|Anna's Decoration...|153996.13000000003|0.015349529251249026|
+--------------------+------------------+--------------------+
only showing top 10 rows



In [0]:
%sql
select CUSTOMERNAME as Customer_Name, SUM(SALES) as Sum_Sales, SUM(SALES)/(select SUM(SALES) from productssales) as Contribution_Pcnt from productssales group by CUSTOMERNAME order by sum_sales DESC LIMIT 10

Customer_Name,Sum_Sales,Contribution_Pcnt
Euro Shopping Channel,912294.1100000002,0.0909327080309563
Mini Gifts Distributors Ltd.,654858.06,0.0652728282677375
"Australian Collectors, Co.",200995.41,0.0200341718013419
Muscle Machine Inc,197736.94,0.0197093845448094
La Rochelle Gifts,180124.9,0.0179539084613899
"Dragon Souveniers, Ltd.",172989.68000000008,0.0172427070298728
Land of Toys Inc.,164069.44000000003,0.0163535841356276
The Sharp Gifts Warehouse,160010.26999999996,0.0159489872886107
"AV Stores, Co.",157807.80999999997,0.0157294575887754
"Anna's Decorations, Ltd",153996.13000000003,0.015349529251249


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

#####8. Find Seasonal Sales Trends

In [0]:
spark.sql("""
select MONTH_ID,
CASE 
  WHEN QTR_ID == 1 THEN 'Winter'
  WHEN QTR_ID == 2 THEN 'Spring'
  WHEN QTR_ID == 3 THEN 'Summer'
  WHEN QTR_ID == 4 THEN 'Fall'
END AS Season, sum(SALES) from productssales group by MONTH_ID, QTR_ID ORDER BY MONTH_ID ASC
""").show()

+--------+------+------------------+
|MONTH_ID|Season|        sum(SALES)|
+--------+------+------------------+
|       1|Winter| 785874.4400000008|
|       2|Winter|          810441.9|
|       3|Winter| 754501.3900000001|
|       4|Spring| 669390.9600000003|
|       5|Spring|         923972.56|
|       6|Spring|454756.77999999985|
|       7|Summer| 514875.9700000001|
|       8|Summer| 659310.5699999998|
|       9|Summer| 584724.2699999999|
|      10|  Fall|1121215.2199999997|
|      11|  Fall|        2118885.67|
|      12|  Fall| 634679.1199999998|
+--------+------+------------------+



In [0]:
%sql
select MONTH_ID,
CASE 
  WHEN QTR_ID == 1 THEN 'Winter'
  WHEN QTR_ID == 2 THEN 'Spring'
  WHEN QTR_ID == 3 THEN 'Summer'
  WHEN QTR_ID == 4 THEN 'Fall'
END AS Season, sum(SALES) from productssales group by MONTH_ID, QTR_ID ORDER BY MONTH_ID ASC

MONTH_ID,Season,sum(SALES)
1,Winter,785874.4400000008
2,Winter,810441.9
3,Winter,754501.3900000001
4,Spring,669390.9600000003
5,Spring,923972.56
6,Spring,454756.7799999999
7,Summer,514875.9700000001
8,Summer,659310.5699999998
9,Summer,584724.2699999999
10,Fall,1121215.2199999995


Databricks visualization. Run in Databricks to view.

##### 9. Categorize ProductLine as 'Hot', or 'Cold' based on their ORDER Quantity in each quarter, using a CASE WHEN statement

In [0]:
%sql
select ProductLine, QTR_ID, sum(quantity),
case
when quantity=max_qty then 'hot'
when quantity=min_qty then 'cold'
else 'medium'
end as category from
(select ProductLine, QTR_ID, sum(QUANTITYORDERED) quantity, MAX(sum(QUANTITYORDERED))over(partition by qtr_id) max_qty, MIN(sum(QUANTITYORDERED)) over(partition by qtr_id) min_qty from productssales group by ProductLine, QTR_ID ORDER BY QTR_ID) as fact group by
ProductLine, QTR_ID,category having category='hot' or category='cold'

ProductLine,QTR_ID,sum(quantity),category
Classic Cars,1,7784,hot
Trains,1,693,cold
Classic Cars,2,6447,hot
Trains,2,439,cold
Classic Cars,3,6190,hot
Trains,3,531,cold
Trains,4,1049,cold
Classic Cars,4,13571,hot


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
select ProductLine, QTR_ID, sum(QUANTITYORDERED) quantity from productssales group by ProductLine, QTR_ID ORDER BY QTR_ID

ProductLine,QTR_ID,quantity
Planes,1,2625
Classic Cars,1,7784
Ships,1,2163
Trucks and Buses,1,2068
Vintage Cars,1,5481
Trains,1,693
Motorcycles,1,2671
Ships,2,1620
Classic Cars,2,6447
Trucks and Buses,2,2286
