### The purpose of this notebook is to work through data transformation, again

In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from datetime import datetime, timedelta
import collections

import plotly as py
import plotly.express as px
import plotly.offline as pyo
from plotly.offline import iplot, plot, init_notebook_mode, download_plotlyjs
import plotly.graph_objs as go
init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')



 - first, prepare the data

In [79]:
from math import floor
def prep_data(filename, dropna, datecol):
    """Load the baskets CSV and add the derived columns used by the notebook.

    Parameters
    ----------
    filename : str or path-like
        CSV file handed to ``pd.read_csv``.
    dropna : bool
        When truthy, rows containing any missing value are dropped.
    datecol : str or None
        Name of the timestamp column to parse. When falsy, no date parsing is
        requested and none of the calendar columns are added.

    Returns
    -------
    pandas.DataFrame
        The loaded rows with:
        * every column whose name contains ``"id"`` floored and stored as a
          categorical column;
        * when ``datecol`` is given: ``date``, ``year``, ``month``,
          ``month_num`` (months counted from January 2021 = 1), ``week_num``
          (weeks counted from the week starting Monday 2021-01-04 = 1),
          ``day``, ``hour`` and ``weekday``;
        * ``spent`` = ``qty * price``.
    """
    # Only ask read_csv to parse a date column when one was actually named.
    baskets = pd.read_csv(filename, parse_dates=[datecol] if datecol else False)
    if dropna:
        baskets.dropna(inplace=True)
    for col in baskets.columns:
        if "id" in col:
            # Assign the whole column (not ``.loc[:, col]``) so the categorical
            # dtype replaces the original one; missing values are left alone
            # instead of being passed to ``floor``.
            baskets[col] = pd.Categorical(baskets[col].map(floor, na_action="ignore"))
    if datecol:
        stamps = baskets[datecol]
        baskets['date'] = stamps.dt.date
        baskets['year'] = stamps.dt.year
        baskets['month'] = stamps.dt.month
        baskets['month_num'] = (baskets['year'] - 2021) * 12 + baskets['month']
        # ISO week numbers belong to the ISO year, which differs from the
        # calendar year around New Year (2022-01-01 is ISO week 52 of 2021).
        # Counting whole weeks from the Monday that opens ISO week 1 of 2021
        # keeps the numbering continuous across year boundaries.
        week_one_monday = pd.Timestamp("2021-01-04", tz=stamps.dt.tz)
        baskets['week_num'] = (stamps.dt.normalize() - week_one_monday).dt.days // 7 + 1
        baskets['day'] = stamps.dt.day
        baskets['hour'] = stamps.dt.hour
        baskets['weekday'] = stamps.dt.weekday
    # Line total; independent of the date column, so computed unconditionally.
    baskets["spent"] = baskets["qty"] * baskets["price"]
    return baskets

def make_merchants(baskets):
    """Summarise the basket rows per merchant.

    Returns one row per ``merchant_id`` with the summed ``spent``, the number
    of distinct orders, dates, SKUs, top categories and sub categories, plus
    the average amount spent per order.
    """
    summary_spec = {
        'total_spent': ('spent', 'sum'),
        'num_orders': ('order_id', 'nunique'),
        'num_days': ('date', 'nunique'),
        'num_skus': ('sku_id', 'nunique'),
        'num_top_cats': ('top_cat_id', 'nunique'),
        'num_sub_cats': ('sub_cat_id', 'nunique'),
    }
    per_merchant = baskets.groupby(['merchant_id']).agg(**summary_spec)
    per_merchant = per_merchant.reset_index()
    # Average order value: total spend divided by distinct orders.
    per_merchant['avg_spent_per_order'] = (
        per_merchant['total_spent'] / per_merchant['num_orders']
    )
    return per_merchant

def make_skus(baskets):
    """Summarise the basket rows per SKU and date.

    Returns one row per (``sku_id``, ``date``) pair with the mean price, the
    number of distinct orders and the number of distinct merchants.
    """
    group_keys = ['sku_id', 'date']
    daily = baskets.groupby(group_keys).agg(
        avg_price_by_day=('price', 'mean'),
        num_order_by_day=('order_id', 'nunique'),
        num_merchants_by_day=('merchant_id', 'nunique'),
    )
    return daily.reset_index()

def make_top_cats(baskets):
    """Summarise the basket rows per top-level category.

    Returns one row per ``top_cat_id`` with the mean price, summed ``spent``
    and ``qty``, and the number of distinct orders, dates and merchants.
    """
    summary_spec = dict(
        avg_price=('price', 'mean'),
        total_spent=('spent', 'sum'),
        total_quantity=('qty', 'sum'),
        num_orders=('order_id', 'nunique'),
        num_days=('date', 'nunique'),
        num_merchants=('merchant_id', 'nunique'),
    )
    grouped = baskets.groupby(['top_cat_id'])
    return grouped.agg(**summary_spec).reset_index()


In [80]:
# Load settings for the baskets extract.
filename = 'new_baskets_full.csv'
datecol = 'placed_at'
dropna = True

baskets = prep_data(filename=filename, dropna=dropna, datecol=datecol)
# The summary tables are not built in this cell; the calls stay disabled.
#merchants = make_merchants(baskets)
#skus = make_skus(baskets)
#top_cats = make_top_cats(baskets)

In [81]:
# Largest week index present in the data.
baskets['week_num'].max()

104

### subsetting

In [73]:
# Selecting a list of columns yields a DataFrame; show the class name.
type(baskets[['sku_id', 'spent']]).__name__

'DataFrame'

 - selecting columns
 - baskets.loc[:,['sku_id','spent']]
 - baskets[['sku_id','spent']]

 - selecting cells
  - baskets.loc[1:3, 'year':'weekday'] selecting based on labels
  - baskets.iloc[1:3,7:9] selecting based on position
 - check on the type of an object: obj.__class__
  - selecting row
   - baskets.loc[baskets['year'] == 2022]

### long expression, wrap around with parentheses
 - (baskets[baskets['year'] == 2022]
   .sort_values('spent', ascending = False)
   .head(7))

In [82]:
# Application: total spend per week, drawn as a line chart.
df = baskets.groupby('week_num')[['spent']].sum().reset_index()
px.line(df, x='week_num', y='spent', width=800, height=300)

In [31]:
# Rows of sub category 43 placed in the first four months of 2022.
df = baskets.query('sub_cat_id == 43 and year == 2022 and month < 5')
# NOTE(review): rows are plotted as they are, without grouping, so a date with
# several basket lines contributes several points — aggregate by date first if
# a daily total is what should be shown.
px.line(df, x='date', y='spent', width=800, height=300)

In [88]:
def data_range(counts):
    """Return the spread of ``counts``: its maximum minus its minimum."""
    highest = counts.max()
    lowest = counts.min()
    return highest - lowest

# Weekly price spread (max - min) within top category 4.
df = (
    baskets[baskets['top_cat_id'] == 4]
    .groupby('week_num')['price']
    .agg(data_range)  # custom aggregation defined above in this cell
    .reset_index()
)
px.line(df, x='week_num', y='price')

In [92]:
def count_unique(s):
    """Return how many distinct values ``s`` holds, counting a missing value as one."""
    return s.nunique(dropna=False)

# Number of distinct SKUs sold in each week, via the custom count_unique aggregator.
unique_sku_id_by_week = (
    baskets.groupby('week_num')['sku_id']
    .agg(count_unique)
    .reset_index()
)

px.line(unique_sku_id_by_week, x='week_num', y='sku_id')

In [100]:
# Weekly spend per top category: one row per week, one column per category.
df = pd.pivot_table(
    baskets,
    index='week_num',      # column that becomes the row index
    columns='top_cat_id',  # column whose values become the new columns
    values='spent',        # column being aggregated
    # Name the aggregation as a string: passing the builtin ``sum`` is
    # deprecated by pandas and slated to stop mapping to the pandas sum.
    aggfunc='sum')
px.line(df, width=800, height=800)

In [120]:
# Total spend per top category and month, one bar-chart facet per month.
df = baskets.groupby(['top_cat_id', 'month_num'])[['spent']].sum().reset_index()
fig = px.bar(
    df,
    x='top_cat_id',
    y='spent',
    facet_col='month_num',
    facet_col_wrap=3,
    facet_row_spacing=0.10,
    width=800,
    height=1000,
)
#fig.update_layout(margin=dict(t=30))
# Let every facet use its own y scale and hide the tick labels.
fig.update_yaxes(matches=None, showticklabels=False)


`apply` works on a Series, while `assign` works on a DataFrame and returns a new DataFrame without altering the input, e.g.
letters = baby.assign(Firsts=names.apply(first_letter))


baskets.iloc[0:5] selects rows by position; the range 0:5 is inclusive on the left and exclusive on the right
vs. baskets.loc[0:5], which selects rows by label with both ends inclusive




Find the five SKUs with the highest standard deviation of total monthly qty. What might a large standard deviation tell us about these SKUs over time?
