In [1]:
import numpy as np
import pandas as pd

In [2]:
# Purchase log: one row per purchase, expressed only with numeric ids.
purchase_rows = [
    [1, 1, 1],
    [1, 2, 6],
    [3, 2, 7],
    [2, 2, 3],
    [3, 2, 3],
    [4, 3, 2],
    [2, 3, 1],
    [1, 1, 3],
    [1, 3, 3],
    [5, 1, 4],
    [5, 1, 1],
    [3, 2, 1],
    [1, 2, 1],
    [2, 2, 2],
    [3, 2, 2],
    [4, 2, 3],
    [4, 1, 5],
    [5, 1, 6],
    [3, 1, 6],
    [2, 3, 7],
]
df1 = pd.DataFrame(purchase_rows, columns=["user_id", "store_id", "product_id"])

# Lookup table: store id -> store name.
store_rows = [
    [1, "Pão de Açúcar"],
    [2, "Dia"],
    [3, "Extra"],
]
df2 = pd.DataFrame(store_rows, columns=["store_id", "store_name"])

# Lookup table: product id -> product name.
product_rows = [
    [1, "Leite"],
    [2, "Ovos"],
    [3, "Arroz"],
    [4, "Feijão"],
    [5, "Carne"],
    [6, "Frango"],
    [7, "Peixe"],
]
df3 = pd.DataFrame(product_rows, columns=["product_id", "product_name"])

# Resolve both ids to their names with two joins, then keep only the
# human-readable columns used by the rest of the notebook.
named_columns = ["user_id", "store_name", "product_name"]
df = df1.merge(df2, on="store_id").merge(df3, on="product_id")[named_columns]
df

Unnamed: 0,user_id,store_name,product_name
0,1,Pão de Açúcar,Leite
1,5,Pão de Açúcar,Leite
2,3,Dia,Leite
3,1,Dia,Leite
4,2,Extra,Leite
5,1,Pão de Açúcar,Arroz
6,2,Dia,Arroz
7,3,Dia,Arroz
8,4,Dia,Arroz
9,1,Extra,Arroz


# pivot table

In [3]:
# Reshape without aggregating: the row index is kept, every store becomes a
# column, and each row carries its product only under its own store (NaN elsewhere).
products_by_store = df.pivot(values="product_name", columns="store_name")
products_by_store

store_name,Dia,Extra,Pão de Açúcar
0,,,Leite
1,,,Leite
2,Leite,,
3,Leite,,
4,,Leite,
5,,,Arroz
6,Arroz,,
7,Arroz,,
8,Arroz,,
9,,Arroz,


In [4]:
# Unlike pivot, pivot_table aggregates: here it counts the product_name
# entries that fall under each store.
purchases_per_store = df.pivot_table(
    values="product_name",
    columns="store_name",
    aggfunc="count",
)
purchases_per_store

store_name,Dia,Extra,Pão de Açúcar
product_name,9,4,7


# crosstab

In [5]:
# Frequency table: rows are stores, columns are products, and each cell is
# how many purchases of that product happened in that store.
pd.crosstab(index=df["store_name"], columns=df["product_name"])

product_name,Arroz,Carne,Feijão,Frango,Leite,Ovos,Peixe
store_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dia,3,0,0,1,2,2,1
Extra,1,0,0,0,1,1,1
Pão de Açúcar,1,1,1,2,2,0,0


In [None]:
# Top 5 most purchased products of each department on Tuesdays.
# NOTE(review): `dataset` is not defined in the visible part of this notebook;
# presumably it is the full orders table with `order_dow`, `department` and
# `product_name` columns - confirm it is loaded before this cell runs.
tuesday_orders = dataset[dataset.order_dow == "tuesday"]

# Rows: departments, columns: products, cells: number of purchases.
depart_prod_count = pd.crosstab(tuesday_orders.department, tuesday_orders.product_name)

# For every department (axis=1 hands each row to the lambda) sort its product
# counts in descending order and keep the names of the first five.
# The arguments must sit inside the call's parentheses: starting them on a new
# indented line after a bare `.apply` is a syntax error.
top_5_prods_per_dept = depart_prod_count.apply(
    lambda x: x.sort_values(ascending=False).head(5).index.tolist(),
    axis=1,
)
top_5_prods_per_dept.head()
output:
department
alcohol      [Cabernet Sauvignon, Sauvignon Blanc, India Pa...
babies       [Baby Food Stage 2 Blueberry Pear & Purple Car...
bakery       [100% Whole Wheat Bread, Organic Bread with 21...
beverages    [Sparkling Water Grapefruit, Spring Water, Sod...
breakfast    [Honey Nut Cheerios, Organic Old Fashioned Rol...
dtype: object

# Print every department followed by a dash-bulleted list of its top products.
for dept, products in top_5_prods_per_dept.items():
    header = "Department '{}':".format(dept)
    bullets = " - " + "\n - ".join(products)
    print(header)
    print(bullets)

output:
Department 'alcohol':
- Cabernet Sauvignon
- Sauvignon Blanc
- India Pale Ale
- Beer
- Chardonnay
Department 'babies':
- Baby Food Stage 2 Blueberry Pear & Purple Carrot
...

In [None]:
# The 2 aisles most often visited together in the same order on a Friday.
# NOTE(review): `dataset` is not defined in the visible part of this notebook;
# presumably it is the full orders table with `order_dow`, `order_id` and
# `aisle` columns - confirm it is loaded before this cell runs.
friday_orders = dataset[dataset.order_dow == "friday"]

# Rows: orders, columns: aisles, cells: number of items bought from that aisle.
aisles_visit_count = pd.crosstab(friday_orders.order_id, friday_orders.aisle)

# Collapse the counts to a 0/1 "aisle was visited" flag. A vectorised
# comparison replaces the per-cell lambda of `applymap`, which is deprecated
# in recent pandas and much slower on a large orders x aisles table.
aisles_visit_count = (aisles_visit_count > 0).astype("int64")
aisles_visit_countT = aisles_visit_count.T

# (aisles x orders) . (orders x aisles): cell [a, b] is the number of orders
# in which both aisle a and aisle b were visited.
co_occurrent_aisles = aisles_visit_countT.dot(aisles_visit_count)

# Remove the upper triangle and the diagonal so that every pair is counted
# once and an aisle is never paired with itself.
co_occurrent_aisles -= np.triu(co_occurrent_aisles)

# Flatten to (aisle, aisle) -> count and keep the single most frequent pair.
co_occurrent_aisles.stack().sort_values(ascending=False).head(1)

output:
aisle             aisle       
fresh vegetables  fresh fruits    5809
dtype: int64

# group by

In [8]:
# groupby creates a "little monster": a DataFrameGroupBy object that cannot be
# used the same way as a DataFrame (displaying it only shows its repr).
gb = df.groupby(by="product_name")
gb

<pandas.core.groupby.DataFrameGroupBy object at 0x000001E4385260B8>

In [11]:
# Methods of the "little monster": pick one column of the grouped object and
# reduce it - here, the number of user_id entries per product.
gb["user_id"].count()

product_name
Arroz     5
Carne     1
Feijão    1
Frango    3
Leite     5
Ovos      3
Peixe     2
Name: user_id, dtype: int64

In [12]:
# Largest value of each remaining column within every product group.
# user_id is compared numerically; store_name is compared as text, so its
# "max" is the store name that sorts last, not the most frequent one.
gb.max()

Unnamed: 0_level_0,user_id,store_name
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Arroz,4,Pão de Açúcar
Carne,4,Pão de Açúcar
Feijão,5,Pão de Açúcar
Frango,5,Pão de Açúcar
Leite,5,Pão de Açúcar
Ovos,4,Extra
Peixe,3,Extra


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) per product.
# Only user_id is summarised in the output, and std is NaN for products
# that have a single row.
gb.describe()

Unnamed: 0_level_0,user_id,user_id,user_id,user_id,user_id,user_id,user_id,user_id
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
product_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Arroz,5.0,2.2,1.30384,1.0,1.0,2.0,3.0,4.0
Carne,1.0,4.0,,4.0,4.0,4.0,4.0,4.0
Feijão,1.0,5.0,,5.0,5.0,5.0,5.0,5.0
Frango,3.0,3.0,2.0,1.0,2.0,3.0,4.0,5.0
Leite,5.0,2.4,1.67332,1.0,1.0,2.0,3.0,5.0
Ovos,3.0,3.0,1.0,2.0,2.5,3.0,3.5,4.0
Peixe,2.0,2.5,0.707107,2.0,2.25,2.5,2.75,3.0


In [17]:
# Several aggregations at once; the result has one column level for the source
# column and one for the statistic.
# user_id is selected explicitly because `mean`/`median` are undefined for the
# text column store_name: older pandas silently dropped such columns, newer
# versions raise instead. Selecting with a list keeps the (user_id, <stat>)
# column layout.
gb[["user_id"]].aggregate(["min", "mean", "median", "max", "sum"])

Unnamed: 0_level_0,user_id,user_id,user_id,user_id,user_id
Unnamed: 0_level_1,min,mean,median,max,sum
product_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Arroz,1,2.2,2.0,4,11
Carne,4,4.0,4.0,4,4
Feijão,5,5.0,5.0,5,5
Frango,1,3.0,3.0,5,9
Leite,1,2.4,2.0,5,12
Ovos,2,3.0,3.0,4,9
Peixe,2,2.5,2.5,3,5


In [9]:
# Trying to turn the "little monster" directly into a table does not work:
# a DataFrameGroupBy has no `unstack`, it has to be aggregated first.
# The expected AttributeError is caught and printed so that the demonstration
# stays visible without stopping a Restart & Run All of the notebook.
try:
    gb.unstack()
except AttributeError as err:
    print("{}: {}".format(type(err).__name__, err))

AttributeError: Cannot access callable attribute 'unstack' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [None]:
# Number of (non-null) product entries sold per aisle.
# NOTE(review): `dataset` is not defined in the visible part of this notebook;
# presumably it is the full orders table with `aisle` and `product_name`
# columns - confirm it is loaded before this cell runs.
aisles_sales_count = dataset.groupby("aisle").product_name.count()

# Lower / upper cut-offs: the 5th and 95th percentiles of the per-aisle counts.
# `quantile` is asked for exactly these two values instead of building a full
# `describe()` summary and looking them up by its formatted "5%"/"95%" labels.
l_inf, l_sup = aisles_sales_count.quantile([0.05, 0.95])
output: 
(458.44999999999999, 28386.649999999991)

# Worst aisles: sales count below the lower cut-off, smallest first.
below_lower_bound = aisles_sales_count.lt(l_inf)
aisles_sales_count.loc[below_lower_bound].sort_values()
output:
aisle
beauty 247
frozen juice 251
baby accessories 273
baby bath body care 286
kitchen supplies 410
specialty wines champagnes 416
ice cream toppings 450
Name: product_name, dtype: int64

# Best aisles: sales count above the upper cut-off, largest first.
above_upper_bound = aisles_sales_count.gt(l_sup)
aisles_sales_count.loc[above_upper_bound].sort_values(ascending=False)
output:
aisle
fresh fruits                     134576
fresh vegetables                 134559
packaged vegetables fruits        70176
yogurt                            49372
packaged cheese                   37390
water seltzer sparkling water     32647
milk                              29192
Name: product_name, dtype: int64