# Summarizing Retail Data - Grouping and aggregating data

For this module, we will be exploring the techniques for grouping and aggregating data. You will learn about:

* Speeding up data loading with Feather
* Grouping data using the pandas library
* Plotting Aggregations
* Limiting Groups
* Aggregating data using built-in functions
* Manipulating Plots

## Load Data

In [None]:
#https://archive.ics.uci.edu/ml/datasets/online+retail
import pandas as pd

In [None]:
# Read the raw Excel workbook into `sales`; dtype_backend='pyarrow' asks
# pandas for Arrow-backed column dtypes.
sales = pd.read_excel('data/Online Retail.xlsx',
                     dtype_backend='pyarrow')

In [None]:
# Display the loaded frame.
sales

## Using Feather to speed up Loading

In [None]:
# Cache the frame as a Feather file so it can be reloaded without parsing
# the Excel workbook again.
# The three text columns are cast to an Arrow-backed string dtype before
# writing. A plain 'str' cast would turn missing entries into literal text,
# whereas the string dtype keeps them as missing values in the cached file.
(sales
 .astype({'InvoiceNo': 'string[pyarrow]',
          'StockCode': 'string[pyarrow]',
          'Description': 'string[pyarrow]'})
 .to_feather('data/Online Retail.fth')
)

In [None]:
%%time 
# Time a load from the Feather file written by the export cell.
sales_f = pd.read_feather('data/Online Retail.fth',
                         dtype_backend='pyarrow')

In [None]:
%%time
# Time the same load from the original Excel workbook for comparison.
# This also rebinds `sales`, which the rest of the notebook uses.
sales = pd.read_excel('data/Online Retail.xlsx',
                      dtype_backend='pyarrow')

In [None]:
# My Macbook Pro
# NOTE(review): presumably the Excel and Feather load times (in ms) observed
# on the author's machine, divided to give the speed-up factor - confirm.
26_700 / 20

## EDA

In [None]:
# Summary statistics for the frame's columns.
sales.describe()

In [None]:
# Histogram of UnitPrice using 30 bins.
sales.UnitPrice.hist(figsize=(8,3), bins=30)

In [None]:
# Rows whose UnitPrice is negative.
(sales
 .query('UnitPrice < 0')
)

In [None]:
# Rows whose Quantity is negative.
(sales
 .query('Quantity < 0')
)

In [None]:
# All rows for a single customer (CustomerID 17548).
(sales
 .query('CustomerID == 17548')
)

In [None]:
# Only the string-typed columns.
sales.select_dtypes('string')

In [None]:
# Number of rows per Country, most frequent first.
sales.Country.value_counts()

In [None]:
# Number of rows per StockCode, most frequent first.
sales.StockCode.value_counts()

## Sales by Year

In [None]:
# Add a `total` column: Quantity multiplied by UnitPrice for each row.
sales.assign(total=lambda df: df['Quantity'] * df['UnitPrice'])

In [None]:
# Group by invoice year and sum every column (no numeric_only restriction).
# NOTE(review): the next cell repeats this with numeric_only=True, so this
# version presumably exists to show what a plain .sum() does with the
# non-numeric columns - confirm before changing it.
(sales
 .assign(total=sales.Quantity * sales.UnitPrice,
        year=sales.InvoiceDate.dt.year)
 .groupby('year')
 .sum()
)

In [None]:
# Per-year sums of the numeric columns, including the derived `total`.
(
    sales
    .assign(
        total=lambda df: df['Quantity'] * df['UnitPrice'],
        year=lambda df: df['InvoiceDate'].dt.year,
    )
    .groupby('year')
    .sum(numeric_only=True)
)

In [None]:
# Bar chart of the summed `total` for each invoice year.
(
    sales
    .assign(
        total=lambda df: df['Quantity'] * df['UnitPrice'],
        year=lambda df: df['InvoiceDate'].dt.year,
    )
    .groupby('year')
    .sum(numeric_only=True)
    ['total']
    .plot.bar(figsize=(8, 3), title='Sales by Year')
)

## Sales by Country

In [None]:
# Bar chart of the summed `total` (Quantity * UnitPrice) for each country.
# The chart is grouped by Country, so it is titled accordingly; no `year`
# column is derived because nothing in this chain uses it.
(sales
 .assign(total=lambda df: df.Quantity * df.UnitPrice)
 .groupby('Country')
 .sum(numeric_only=True)
 .total
 .plot.bar(title='Sales by Country', figsize=(8,3))
)

In [None]:
# Remove UK, then chart the summed `total` for each remaining country.
# `total` is computed with a lambda so it is derived from the filtered
# frame flowing through the chain, not from the global `sales`.
(sales
 .query('Country != "United Kingdom"')
 .assign(total=lambda df: df.Quantity * df.UnitPrice)
 .groupby('Country')
 .sum(numeric_only=True)
 .total
 .plot.bar(title='Sales by Country', figsize=(8,3))
)

In [None]:
# Same per-country totals (UK excluded), sorted ascending and drawn as a
# horizontal bar chart so the country labels are readable.
# `total` is computed with a lambda so it is derived from the filtered
# frame flowing through the chain, not from the global `sales`.
(sales
 .query('Country != "United Kingdom"')
 .assign(total=lambda df: df.Quantity * df.UnitPrice)
 .groupby('Country')
 .sum(numeric_only=True)
 .total
 .sort_values()
 .plot.barh(title='Sales by Country')
)

In [None]:
# Row counts per country: the starting point for limiting how many
# countries are shown.
sales['Country'].value_counts()

In [None]:
# Keep the n most frequent countries and relabel every other row 'Other'.
n = 10
top = sales['Country'].value_counts()

# value_counts() sorts by frequency, so the first n labels are the top n.
topn = top.index[:n]

(
    sales
    .assign(Country=sales['Country'].where(sales['Country'].isin(topn), 'Other'))
    ['Country']
    .value_counts()
)

In [None]:
# add this to helpers
def limit_n(df, col, n=20, other='Other'):
    """Return ``df[col]`` with all but its ``n`` most frequent values replaced.

    Args:
        df: Frame holding the column.
        col: Label of the column to limit.
        n: Number of most frequent values to keep unchanged.
        other: Replacement for every value outside the top ``n``.

    Returns:
        A Series aligned with ``df``; ``df`` itself is not modified.
    """
    values = df[col]
    # value_counts() orders labels by descending frequency.
    keep = values.value_counts().index[:n]
    return values.where(values.isin(keep), other)

# Try the helper with its defaults (top 20 kept, the rest 'Other') and
# count the resulting labels.
limit_n(sales, 'Country').value_counts()

In [None]:
# Use the function to create/update a column: with the UK removed, collapse
# the less frequent countries into 'Other' via limit_n, then chart the
# summed `total` per country as a sorted horizontal bar chart.
# The chart is grouped by Country, so it is titled accordingly; no `year`
# column is derived because nothing in this chain uses it.
(sales
 .query('Country != "United Kingdom"')
 .assign(Country=lambda df: limit_n(df, 'Country'),
         total=lambda df: df.Quantity * df.UnitPrice)
 .groupby('Country')
 .sum(numeric_only=True)
 .total
 .sort_values()
 .plot.barh(title='Sales by Country')
)

## Sales by Month

In [None]:
# Monthly sums, grouping directly on InvoiceDate with pd.Grouper.
# NOTE(review): the following cells cast InvoiceDate to datetime64[ns]
# before this same groupby - presumably because this version does not work
# on the Arrow-backed timestamp column; confirm before changing it.
(sales
 .assign(total=lambda df: df.Quantity * df.UnitPrice)
 .groupby(pd.Grouper(key='InvoiceDate', freq='M'))
 .sum(numeric_only=True)
)

In [None]:
# Inspect the dtype of every column.
sales.dtypes

In [None]:
# Monthly sums of the numeric columns, including the derived `total`.
(sales
 .astype({'InvoiceDate':'datetime64[ns]'})
 # InvoiceDate is now a NumPy datetime64[ns] column (check with .dtypes),
 # which is what the Grouper below is given as its key.
 .assign(total=lambda df: df.Quantity * df.UnitPrice)
 .groupby(pd.Grouper(key='InvoiceDate', freq='M'))
 .sum(numeric_only=True)
)

In [None]:
# Check whether old format uses more memory: per-column memory with
# InvoiceDate cast to datetime64[ns] ...
(sales
 .astype({'InvoiceDate':'datetime64[ns]'})
 .memory_usage(deep=True)
)

In [None]:
# ... versus per-column memory of the frame as loaded, for comparison.
(sales
 .memory_usage(deep=True)
)

In [None]:
# Line plot of the summed `total` per calendar month.
(
    sales
    .astype({'InvoiceDate': 'datetime64[ns]'})
    .assign(total=lambda df: df['Quantity'] * df['UnitPrice'])
    .groupby(pd.Grouper(freq='M', key='InvoiceDate'))
    .sum(numeric_only=True)
    ['total']
    .plot(figsize=(8, 3))
)

In [None]:
# Line plot of the summed `total` per week.
# The frequency uses the canonical uppercase alias 'W'; it selects the same
# weekly bins as the lowercase spelling.
(sales
 .astype({'InvoiceDate':'datetime64[ns]'})
 .assign(total=lambda df: df.Quantity * df.UnitPrice)
 .groupby(pd.Grouper(key='InvoiceDate', freq='W'))
 .sum(numeric_only=True)
 .total
 .plot(figsize=(8,3))
)

In [None]:
# Line plot of the summed `total` in three-day bins.
# The frequency uses the canonical uppercase alias '3D'; it selects the
# same bins as the lowercase spelling.
(sales
 .astype({'InvoiceDate':'datetime64[ns]'})
 .assign(total=lambda df: df.Quantity * df.UnitPrice)
 .groupby(pd.Grouper(key='InvoiceDate', freq='3D'))
 .sum(numeric_only=True)
 .total
 .plot(figsize=(8,3))
)

## Sales by Month by Top N Countries

In [None]:
# Summed `total` per (day, country) pair, returned as a Series with a
# two-level index.
# The frequency uses the canonical uppercase alias 'D' (daily bins).
(sales
 .astype({'InvoiceDate':'datetime64[ns]'})
 .assign(total=lambda df: df.Quantity * df.UnitPrice)
 .groupby([pd.Grouper(key='InvoiceDate', freq='D'),
           'Country'])
 .sum(numeric_only=True)
 .total
)

In [None]:
# Summed `total` per (day, country), then unstacked so each country becomes
# its own column and each row is one day.
# The frequency uses the canonical uppercase alias 'D' (daily bins).
(sales
 .astype({'InvoiceDate':'datetime64[ns]'})
 .assign(total=lambda df: df.Quantity * df.UnitPrice)
 .groupby([pd.Grouper(key='InvoiceDate', freq='D'),
           'Country'])
 .sum(numeric_only=True)
 .total
 .unstack()
)

In [None]:
# Day-by-country table of summed `total`; day/country combinations with no
# rows come out of unstack() as missing and are filled with 0.
# The frequency uses the canonical uppercase alias 'D' (daily bins).
(sales
 .astype({'InvoiceDate':'datetime64[ns]'})
 .assign(total=lambda df: df.Quantity * df.UnitPrice)
 .groupby([pd.Grouper(key='InvoiceDate', freq='D'),
           'Country'])
 .sum(numeric_only=True)
 .total
 .unstack()
 .fillna(0)
)

In [None]:
# Plot the day-by-country table of summed `total`, one line per country.
# The frequency uses the canonical uppercase alias 'D' (daily bins).
(sales
 .astype({'InvoiceDate':'datetime64[ns]'})
 .assign(total=lambda df: df.Quantity * df.UnitPrice)
 .groupby([pd.Grouper(key='InvoiceDate', freq='D'),
           'Country'])
 .sum(numeric_only=True)
 .total
 .unstack()
 .fillna(0)
 .plot()
)

In [None]:
# Daily summed `total` per country with the UK removed and the less
# frequent countries collapsed into 'Other' by limit_n, one line each.
# The frequency uses the canonical uppercase alias 'D' (daily bins).
(sales
 .astype({'InvoiceDate':'datetime64[ns]'})
 .query('Country != "United Kingdom"')
 .assign(Country=lambda df: limit_n(df, 'Country'),
         total=lambda df: df.Quantity * df.UnitPrice)
 .groupby([pd.Grouper(key='InvoiceDate', freq='D'),
           'Country'])
 .sum(numeric_only=True)
 .total
 .unstack()
 .fillna(0)
 .plot()
)

In [None]:
# Weekly summed `total` per country (UK removed, less frequent countries
# collapsed into 'Other'), with a two-column legend anchored at the axes'
# top-right corner via bbox_to_anchor=(1,1).
# The frequency uses the canonical uppercase alias 'W' (weekly bins).
(sales
 .astype({'InvoiceDate':'datetime64[ns]'})
 .query('Country != "United Kingdom"')
 .assign(Country=lambda df: limit_n(df, 'Country'),
         total=lambda df: df.Quantity * df.UnitPrice)
 .groupby([pd.Grouper(key='InvoiceDate', freq='W'),
           'Country'])
 .sum(numeric_only=True)
 .total
 .unstack()
 .fillna(0)
 .plot()
 .legend(bbox_to_anchor=(1,1), ncols=2)
)

In [None]:
# Palette shared with plot(): one entry per column of the frame most
# recently passed through set_colors, in that frame's column order.
colors = []


def set_colors(df, country, normal='#999999', hl='#990000'):
    """Move ``country`` to the last column and rebuild the shared palette.

    The module-level ``colors`` list is replaced in place (not appended to),
    so calling this more than once never leaves stale entries behind and the
    palette always has exactly one colour per returned column.

    Args:
        df: Frame with one column per country.
        country: Column to highlight; it is placed last in the result.
        normal: Colour assigned to every other column.
        hl: Colour assigned to ``country``.

    Returns:
        ``df`` with its columns reordered so ``country`` comes last.

    Raises:
        KeyError: If ``country`` is not a column of ``df``. ``colors`` is
            left untouched in that case.
    """
    # Validate before touching the shared palette so a failed call cannot
    # leave it out of step with the frame that plot() will draw.
    if country not in df.columns:
        raise KeyError(country)

    others = [col for col in df.columns if col != country]
    # Slice assignment keeps the same list object that plot() reads.
    colors[:] = [normal] * len(others) + [hl]
    return df.loc[:, others + [country]]

def plot(df):
    """Draw ``df`` with the shared ``colors`` palette and return ``df``.

    Returning the frame unchanged lets this be used as a ``.pipe`` step
    without ending the chain.
    """
    axes = df.plot(title='Sales by Country', color=colors)
    # NOTE(review): the UCI description of this dataset gives unit prices in
    # sterling - confirm whether 'USD' is the intended axis label.
    axes.set_ylabel('USD')
    axes.legend(ncols=2, bbox_to_anchor=(1, 1))
    return df

# Weekly summed `total` per country (UK removed, less frequent countries
# collapsed into 'Other'), with Finland highlighted by set_colors and the
# chart drawn by plot. plot returns the frame it was given, so `final`
# holds the week-by-country table that was plotted.
# The frequency uses the canonical uppercase alias 'W' (weekly bins).
final = (sales
 .astype({'InvoiceDate':'datetime64[ns]'})
 .query('Country != "United Kingdom"')
 .assign(Country=lambda df: limit_n(df, 'Country'),
         total=lambda df: df.Quantity * df.UnitPrice)
 .groupby([pd.Grouper(key='InvoiceDate', freq='W'),
           'Country'])
 .sum(numeric_only=True)
 .total
 .unstack()
 .fillna(0)
 .pipe(set_colors, country='Finland')
 .pipe(plot)
)
