# Análises com Apache Spark

### 1. Configuração

In [0]:
username = 'thais'

northwind = f'/letscode/{username}/northwind/'

spark.sql(f'USE letscode_{username}_refined')

Out[1]: DataFrame[]

### 2. Análises

### 2.1 Quais são os três produtos MENOS vendidos?

In [0]:
from pyspark.sql.functions import col, count, sum

df_ft_orders = spark.read.table('ft_orders')

prod_menos_vend = (
  df_ft_orders
  .groupby('sk_products')
  .agg((count(col('ft_orders_id'))).alias('qtde_vend'))
  .sort(('qtde_vend'))
)

display(prod_menos_vend)

sk_products,qtde_vend
37,3
66,4
9,4
48,4
74,5
27,6
5,6
50,6
67,9
3,9


In [0]:
df_dm_products = spark.read.table('dm_products')
df_dm_products.toPandas()[(df_dm_products.toPandas()['sk_products'] == 37) |
                         (df_dm_products.toPandas()['sk_products'] == 66) |
                         (df_dm_products.toPandas()['sk_products'] == 9) |
                         (df_dm_products.toPandas()['sk_products'] == 48)]

Unnamed: 0,sk_products,nk_products,product_name,category_name,description,picture,quantity_per_unit,unit_price,units_in_stock,units_on_order,...,sup_contact_name,sup_contact_title,sup_address,sup_city,sup_region,sup_postal_code,sup_country,sup_phone,sup_fax,sup_homepage
8,9,9,Mishi Kobe Niku,Meat/Poultry,Prepared meats,binary data,18 - 500 g pkgs.,97.0,29,0,...,Yoshi Nagase,Marketing Manager,9-8 Sekimai Musashino-shi,Tokyo,,100,Japan,(03) 3555-5011,,
36,37,37,Gravad lax,Seafood,Seaweed and fish,binary data,12 - 500 g pkgs.,26.0,11,50,...,Michael Björn,Sales Representative,Brovallavägen 231,Stockholm,,S-123 45,Sweden,08-123 45 67,,
47,48,48,Chocolade,Confections,"Desserts, candies, and sweet breads",binary data,10 pkgs.,12.75,15,70,...,Dirk Luchte,Accounting Manager,Verkoop Rijnweg 22,Zaandam,,9999 ZZ,Netherlands,(12345) 1212,(12345) 1210,
65,66,66,Louisiana Hot Spiced Okra,Condiments,"Sweet and savory sauces, relishes, spreads, an...",binary data,24 - 8 oz jars,17.0,4,100,...,Shelley Burke,Order Administrator,P.O. Box 78934,New Orleans,LA,70117,USA,(100) 555-4822,,#CAJUN.HTM#


__Resposta:__
Os três (ou quatro) produtos menos vendidos foram: "Gravad lax", "Louisiana Hot Spiced Okra", "Mishi Kobe Niku" e "Chocolade".

### 2.2 - Quais são os cinco clientes que mais compras fizeram?

In [0]:
client_mais_compr = (
  df_ft_orders
  .groupby('sk_customers')
  .agg((count(col('ft_orders_id'))).alias('qtde_vend'))
  .sort(['qtde_vend'], ascending=False)
)

display(client_mais_compr)

sk_customers,qtde_vend
71,97
20,67
63,62
65,45
5,37
9,34
24,34
37,34
35,31
25,31


In [0]:
df_dm_customers = spark.read.table('dm_customers')
df_dm_customers.toPandas()[(df_dm_customers.toPandas()['sk_customers'] == 71) |
                         (df_dm_customers.toPandas()['sk_customers'] == 20) |
                         (df_dm_customers.toPandas()['sk_customers'] == 63) |
                         (df_dm_customers.toPandas()['sk_customers'] == 65) |
                          (df_dm_customers.toPandas()['sk_customers'] == 5)]

Unnamed: 0,nk_customers,company_name,contact_name,contact_title,address,city,region,postal_code,country,phone,fax,sk_customers
4,BERGS,Berglunds snabbköp,Christina Berglund,Order Administrator,Berguvsvägen 8,Luleå,,S-958 22,Sweden,0921-12 34 65,0921-12 34 67,5
19,ERNSH,Ernst Handel,Roland Mendel,Sales Manager,Kirchgasse 6,Graz,,8010,Austria,7675-3425,7675-3426,20
62,QUICK,QUICK-Stop,Horst Kloss,Accounting Manager,Taucherstraße 10,Cunewalde,,01307,Germany,0372-035188,,63
64,RATTC,Rattlesnake Canyon Grocery,Paula Wilson,Assistant Sales Representative,2817 Milton Dr.,Albuquerque,NM,87110,USA,(505) 555-5939,(505) 555-3620,65
70,SAVEA,Save-a-lot Markets,Jose Pavarotti,Sales Representative,187 Suffolk Ln.,Boise,ID,83720,USA,(208) 555-8097,,71


__Resposta:__
Os cinco clientes que mais compras fizeram foram: Jose Pavarotti, Roland Mendel, Horst Kloss, Paula Wilson e Christina Berglund.

### 2.3 - Quais são os cinco clientes com maior total de vendas?

In [0]:
client_maior_total_vend = (
  df_ft_orders
  .groupby('sk_customers')
  .agg((sum(col('unit_price') * col('quantity') - col('discount'))).alias('total_vend'))
  .sort(['total_vend'], ascending=False)
)

display(client_maior_total_vend)

sk_customers,total_vend
63,98668.24
71,94807.54000000002
20,84754.73
37,45002.89
34,30841.5
24,28868.3
39,27125.5
65,27018.210000000006
89,23710.1
5,21434.15


In [0]:
df_dm_customers = spark.read.table('dm_customers')
df_dm_customers.toPandas()[(df_dm_customers.toPandas()['sk_customers'] == 63) |
                         (df_dm_customers.toPandas()['sk_customers'] == 71) |
                         (df_dm_customers.toPandas()['sk_customers'] == 20) |
                         (df_dm_customers.toPandas()['sk_customers'] == 37) |
                          (df_dm_customers.toPandas()['sk_customers'] == 34)]

Unnamed: 0,nk_customers,company_name,contact_name,contact_title,address,city,region,postal_code,country,phone,fax,sk_customers
19,ERNSH,Ernst Handel,Roland Mendel,Sales Manager,Kirchgasse 6,Graz,,8010,Austria,7675-3425,7675-3426,20
33,HANAR,Hanari Carnes,Mario Pontes,Accounting Manager,"Rua do Paço, 67",Rio de Janeiro,RJ,05454-876,Brazil,(21) 555-0091,(21) 555-8765,34
36,HUNGO,Hungry Owl All-Night Grocers,Patricia McKenna,Sales Associate,8 Johnstown Road,Cork,Co. Cork,,Ireland,2967 542,2967 3333,37
62,QUICK,QUICK-Stop,Horst Kloss,Accounting Manager,Taucherstraße 10,Cunewalde,,01307,Germany,0372-035188,,63
70,SAVEA,Save-a-lot Markets,Jose Pavarotti,Sales Representative,187 Suffolk Ln.,Boise,ID,83720,USA,(208) 555-8097,,71


__Resposta:__
Os cinco clientes com o maior valor total de vendas foram: Horst Kloss, Jose Pavarotti, Roland Mendel, Patricia McKenna e Mario Pontes.

### 2.4 - Qual o melhor funcionário do último mês registrado? (total de vendas)

In [0]:
func_maior_total_vend = (
  df_ft_orders
  .groupby('sk_employees', 'order_year', 'order_month')
  .agg((sum(col('unit_price') * col('quantity') - col('discount'))).alias('total_vend'))
  .sort(['order_year', 'order_month', 'total_vend'], ascending=False)
)

display(func_maior_total_vend)

sk_employees,order_year,order_month,total_vend
1,1998,5,7052.659999999998
4,1998,5,6274.25
8,1998,5,3221.96
2,1998,5,2173.05
7,1998,5,1173.0
7,1998,4,34790.40000000001
2,1998,4,32679.15
3,1998,4,14296.949999999995
8,1998,4,14054.45
1,1998,4,13619.449999999995


In [0]:
df_dm_employees = spark.read.table('dm_employees')
df_dm_employees.toPandas()[df_dm_employees.toPandas()['sk_employees'] == 1]

Unnamed: 0,sk_employees,nk_employees,last_name,first_name,title,title_of_courtesy,birth_date,hire_date,address,city,...,region,region_description,postal_code,country,home_phone,extension,photo,notes,reports_to,photo_path
0,1,1,Davolio,Nancy,Sales Representative,Ms.,1948-12-08,1992-05-01,507 - 20th Ave. E.\nApt. 2A,Seattle,...,WA,Eastern,98122,USA,(206) 555-9857,5467,binary data,Education includes a BA in psychology from Col...,2,http://accweb/emmployees/davolio.bmp
1,1,1,Davolio,Nancy,Sales Representative,Ms.,1948-12-08,1992-05-01,507 - 20th Ave. E.\nApt. 2A,Seattle,...,WA,Eastern,98122,USA,(206) 555-9857,5467,binary data,Education includes a BA in psychology from Col...,2,http://accweb/emmployees/davolio.bmp


__Resposta:__
A melhor funcionária do último mês registrado foi: Nancy Davolio.

### 2.5 - Quais as regiões com menos clientes cadastrados?

In [0]:
reg_menos_client = (
  df_dm_customers
  .groupby('region')
  .agg((count(col('sk_customers'))).alias('qtde_client'))
  .sort(['qtde_client'])
)

display(reg_menos_client)

region,qtde_client
MT,1
NM,1
Québec,1
WY,1
Táchira,1
DF,1
Lara,1
AK,1
Co. Cork,1
ID,1


__Resposta:__
As regiões com menos clientes cadastrados são: MT, NM, Québec, WY, Táchira, DF, Lara, AK, Co. Cork, ID, Isle of Wight, CA e Nueva Esparta.