### Import pandas and read in the csv file and set it to a dataframe called baskets

In [56]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

### put the utility functions together

In [58]:
def add_date_cols(baskets):
    baskets['datetime'] = pd.to_datetime(baskets['placed_at'])
    baskets['year'] = baskets["datetime"].dt.year
    baskets['month'] = baskets["datetime"].dt.month
    baskets['date'] = baskets["datetime"].dt.date
    baskets['day'] = baskets["datetime"].dt.day
    baskets['hour'] = baskets["datetime"].dt.hour
    baskets['weekday'] = baskets["datetime"].dt.weekday
    baskets['year_month'] = baskets["datetime"].apply(lambda t: t.strftime("%Y-%m"))
    baskets['month_num'] = (baskets['year'] - 2021) * 12 + baskets['month']
    baskets['year_week'] = baskets["datetime"].apply(lambda t: t.strftime("%Y-%W")) # this makes the beginning of Jan 2022 as week 2022-00 , not 2022-52
    baskets['week_num'] = baskets["datetime"].apply(lambda t: int(t.strftime("%W"))) 
    baskets['iso_week_num'] = baskets["datetime"].dt.isocalendar().week # this returns week number 52 for Jan 1, 2021, not 0 which is what we want
    baskets['cum_week_num'] = (baskets['year'] - 2021) * 52 + baskets['week_num']
    return baskets
def get_merchant_attributes(baskets):
    merchant_attributes = baskets.groupby(['merchant_id']).agg(
        total_spent = ('spent', 'sum'), 
        num_orders = ('order_id', 'nunique'), 
        first_month = ('month_num', 'min'), 
        last_month = ('month_num', 'max'), 
        num_months = ('month_num', 'nunique'), 
        num_weeks = ('week_num', 'nunique'), 
        num_days = ('date', 'nunique'), 
        num_skus = ('sku_id','nunique'), 
        num_top_cats = ('top_cat','nunique'), 
        num_sub_cats = ('sub_cat','nunique'),
    ).reset_index()
    merchant_attributes['avg_spent_per_order'] = merchant_attributes.total_spent / merchant_attributes.num_orders
    merchant_attributes['tenure_month'] = merchant_attributes.last_month - merchant_attributes.first_month +1
    return merchant_attributes

In [59]:
filename = './AwanTunai_transaction_data.csv'
baskets = pd.read_csv(filename)
baskets["spent"] = baskets["qty"] * baskets["price"] # column wide multiplication
baskets = add_date_cols(baskets)
merchant_attributes = get_merchant_attributes(baskets)

### which merchants have the most orders

In [68]:
baskets.groupby(['merchant_id']).order_id.nunique().sort_values(ascending = False).head()

merchant_id
191    23864
11      3238
59       440
133      422
56       408
Name: order_id, dtype: int64

In [69]:
merchant_attributes.sort_values(by="num_orders", ascending=False).head()

Unnamed: 0,merchant_id,total_spent,num_orders,first_month,last_month,num_months,num_weeks,num_days,num_skus,num_top_cats,num_sub_cats,avg_spent_per_order,tenure_month
190,191,18910870000.0,23864,10,24,15,53,432,1332,33,95,792443.3,15
10,11,3554656000.0,3238,6,22,8,20,118,667,33,81,1097794.0,17
58,59,1525686000.0,440,9,24,16,53,430,718,30,85,3467468.0,16
132,133,1027903000.0,422,10,24,15,53,397,625,30,80,2435788.0,15
55,56,651282900.0,408,9,24,16,52,387,365,25,66,1596282.0,16


In [71]:
baskets.groupby(['merchant_id']).spent.sum().sort_values(ascending = False).head()


merchant_id
191    1.891087e+10
11     3.554656e+09
207    2.786035e+09
206    2.737438e+09
16     2.684756e+09
Name: spent, dtype: float64

In [72]:
baskets.groupby(['merchant_id']).date.nunique().sort_values(ascending = False)

merchant_id
191    432
59     430
133    397
56     387
61     381
      ... 
7        1
249      1
305      1
225      1
8        1
Name: date, Length: 357, dtype: int64

In [81]:
df = merchant_attributes.query("num_orders >= 408")["merchant_id"]
df


10      11
55      56
58      59
132    133
190    191
Name: merchant_id, dtype: int64

In [85]:
df1 = merchant_attributes.merge(df).sort_values(by="num_orders",ascending = False)
df1

Unnamed: 0,merchant_id,total_spent,num_orders,first_month,last_month,num_months,num_weeks,num_days,num_skus,num_top_cats,num_sub_cats,avg_spent_per_order,tenure_month
4,191,18910870000.0,23864,10,24,15,53,432,1332,33,95,792443.3,15
0,11,3554656000.0,3238,6,22,8,20,118,667,33,81,1097794.0,17
2,59,1525686000.0,440,9,24,16,53,430,718,30,85,3467468.0,16
3,133,1027903000.0,422,10,24,15,53,397,625,30,80,2435788.0,15
1,56,651282900.0,408,9,24,16,52,387,365,25,66,1596282.0,16


In [30]:
#Counting the frequency
freq = baskets['merchant_id'].value_counts()
print(freq)

191    128546
11      28605
59      10478
133      6882
52       6871
        ...  
186         6
305         3
225         2
8           1
7           1
Name: merchant_id, Length: 357, dtype: int64


In [31]:
frequency_df = baskets.groupby(
    by=['merchant_id'], as_index=False)['date'].count()
frequency_df.columns = ['merchant_id', 'Frequency']
frequency_df.head()

Unnamed: 0,merchant_id,Frequency
0,1,69
1,2,60
2,3,244
3,4,11
4,5,20


In [32]:
merchant_sku = baskets.groupby(['merchant_id', 'sku_id', 'month'], as_index= True).agg({'count'})
merchant_sku

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,order_id,placed_at,top_cat,sub_cat,qty,price,spent,datetime,year,date,day,hour,weekday,year_month,month_num,year_week,week_num,iso_week_num,cum_week_num
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
merchant_id,sku_id,month,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,659,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,697,9,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,718,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,804,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,843,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,1386,12,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
357,1392,12,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
357,1418,12,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
357,1503,12,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2


In [33]:
merchant_sku = baskets.groupby(['merchant_id', 'sku_id'], as_index= True).agg({'count'})
merchant_sku

Unnamed: 0_level_0,Unnamed: 1_level_0,id,order_id,placed_at,top_cat,sub_cat,qty,price,spent,datetime,year,...,date,day,hour,weekday,year_month,month_num,year_week,week_num,iso_week_num,cum_week_num
Unnamed: 0_level_1,Unnamed: 1_level_1,count,count,count,count,count,count,count,count,count,count,...,count,count,count,count,count,count,count,count,count,count
merchant_id,sku_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,659,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,697,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,718,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,804,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,843,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,1386,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
357,1392,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
357,1418,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
357,1503,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
