### Import pandas and read in the csv file and set it to a dataframe called baskets

## Setting up python

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the transaction export into a DataFrame; the path is resolved relative to
# the current working directory.
csv_path = './AwanTunai_transaction_data.csv'
baskets = pd.read_csv(csv_path)

## Conduct some simple data inspection

 - take a look at the first ten rows, and last 7 rows

In [4]:
# Preview the first 10 rows of the frame.
baskets.head(10)

Unnamed: 0,id,order_id,placed_at,merchant_id,sku_id,top_cat,sub_cat,qty,price
0,1,1,2021-06-17 14:00:31.373,9,10,27.0,86.0,2,343000.0
1,2,1,2021-06-17 14:00:31.373,9,6,27.0,86.0,1,159000.0
2,3,1,2021-06-17 14:00:31.373,9,14,27.0,86.0,2,225000.0
3,4,1,2021-06-17 14:00:31.373,9,5,27.0,86.0,1,246000.0
4,5,2,2021-06-17 14:29:31.918,10,15,27.0,86.0,5,19500.0
5,6,2,2021-06-17 14:29:31.918,10,9,27.0,86.0,5,17600.0
6,7,3,2021-06-17 15:06:17.597,11,11,27.0,86.0,2,68500.0
7,8,3,2021-06-17 15:06:17.597,11,13,27.0,86.0,2,159500.0
8,9,3,2021-06-17 15:06:17.597,11,27,27.0,86.0,2,116000.0
9,10,3,2021-06-17 15:06:17.597,11,7,27.0,86.0,1,193000.0


In [5]:
# Same preview with a smaller window: only the first 3 rows.
baskets.head(3)

Unnamed: 0,id,order_id,placed_at,merchant_id,sku_id,top_cat,sub_cat,qty,price
0,1,1,2021-06-17 14:00:31.373,9,10,27.0,86.0,2,343000.0
1,2,1,2021-06-17 14:00:31.373,9,6,27.0,86.0,1,159000.0
2,3,1,2021-06-17 14:00:31.373,9,14,27.0,86.0,2,225000.0


In [4]:
# Preview the last 7 rows of the frame.
baskets.tail(7)

Unnamed: 0,id,order_id,placed_at,merchant_id,sku_id,top_cat,sub_cat,qty,price
492430,492431,53438,2022-12-31 18:14:45.659,184,973,27.0,86.0,1,79000.0
492431,492432,53438,2022-12-31 18:14:45.659,184,971,27.0,86.0,5,21300.0
492432,492433,53438,2022-12-31 18:14:45.659,184,1500,27.0,86.0,5,21000.0
492433,492434,53438,2022-12-31 18:14:45.659,184,1655,18.0,61.0,1,38000.0
492434,492435,53438,2022-12-31 18:14:45.659,184,1504,27.0,86.0,1,274000.0
492435,492436,53438,2022-12-31 18:14:45.659,184,984,27.0,86.0,5,19400.0
492436,492437,53438,2022-12-31 18:14:45.659,184,866,27.0,86.0,5,21200.0


### dataframe dimensions, column names, column data types, ranges of column values

In [5]:
# Dimensions of the frame as a (rows, columns) tuple.
baskets.shape

(492437, 9)

In [6]:
# Column labels of the frame.
baskets.columns

Index(['id', 'order_id', 'placed_at', 'merchant_id', 'sku_id', 'top_cat',
       'sub_cat', 'qty', 'price'],
      dtype='object')

 - noticed that column "placed_at" is not numeric, while the rest are numeric columns

In [7]:
# Print a per-column summary: non-null counts, dtypes, and memory usage.
baskets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492437 entries, 0 to 492436
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           492437 non-null  int64  
 1   order_id     492437 non-null  int64  
 2   placed_at    492437 non-null  object 
 3   merchant_id  492437 non-null  int64  
 4   sku_id       492437 non-null  int64  
 5   top_cat      491449 non-null  float64
 6   sub_cat      491449 non-null  float64
 7   qty          492437 non-null  int64  
 8   price        492437 non-null  float64
dtypes: float64(3), int64(5), object(1)
memory usage: 33.8+ MB


 - another way to check the data types

In [8]:
# Data type of each column, returned as a Series indexed by column name.
baskets.dtypes

id               int64
order_id         int64
placed_at       object
merchant_id      int64
sku_id           int64
top_cat        float64
sub_cat        float64
qty              int64
price          float64
dtype: object

 - one way to get an overview of the data values

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; by default non-numeric columns are left out of the result.
baskets.describe()

Unnamed: 0,id,order_id,merchant_id,sku_id,top_cat,sub_cat,qty,price
count,492437.0,492437.0,492437.0,492437.0,491449.0,491449.0,492437.0,492437.0
mean,246219.0,25313.575371,143.411031,1080.489256,21.282263,67.775401,3.059088,113102.0
std,142154.461592,15459.726935,82.515747,387.372125,7.73527,28.466744,17.958188,192318.1
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,123110.0,11579.0,61.0,873.0,15.0,45.0,1.0,19100.0
50%,246219.0,24300.0,164.0,1025.0,27.0,86.0,1.0,76500.0
75%,369328.0,38773.0,191.0,1400.0,27.0,86.0,3.0,172000.0
max,492437.0,53438.0,357.0,1860.0,35.0,108.0,6000.0,4240000.0


 - question: what can you observe from the above result?
 - why are the counts for top_cat and sub_cat different from the others?

 - noticed that the "placed_at" column was not shown in the above result, maybe due to its type?  
 - wondering that ID columns' statistics may not make sense other than count, min, max, since they are supposed to be identifiers

 - take a look at 3 random rows, it is important to inspect data beyond the head and tail

In [9]:
# Seed NumPy's global random generator so that re-running this cell draws the
# same three row positions.
np.random.seed(17)
n_rows = len(baskets)
np.random.randint(0, n_rows, size=3)


array([469615,  64753, 297103])

In [10]:
# No reseed here: this draw continues from the generator's current state.
np.random.randint(low=0, high=len(baskets), size=3)

array([491926, 304441, 125680])

In [8]:
# Fix the seed so the same rows come back every time this cell is re-run, then
# pick three random row positions (drawn with replacement) and show those rows.
np.random.seed(17)
sample_positions = np.random.randint(0, len(baskets), size=3)
baskets.iloc[sample_positions]

Unnamed: 0,id,order_id,placed_at,merchant_id,sku_id,top_cat,sub_cat,qty,price
469615,469620,50662,2022-12-10 10:47:32.698,257,971,27.0,86.0,5,20700.0
64753,64758,6479,2021-11-03 11:53:01.836,48,1338,27.0,86.0,1,180500.0
297103,297115,30089,2022-07-01 13:14:03.976,191,1504,27.0,86.0,2,257000.0


 - take a look at transactions for a specific merchant_id, say merchant 48 from above result

In [11]:
# Keep only the rows that belong to merchant 48.
baskets.query("merchant_id == 48")

Unnamed: 0,id,order_id,placed_at,merchant_id,sku_id,top_cat,sub_cat,qty,price
29583,29597,3236,2021-09-29 16:30:06.949,48,562,27.0,86.0,10,18000.0
29584,29598,3236,2021-09-29 16:30:06.949,48,406,27.0,86.0,10,23450.0
29585,29599,3236,2021-09-29 16:30:06.949,48,180,27.0,86.0,20,18200.0
29586,29600,3236,2021-09-29 16:30:06.949,48,401,27.0,86.0,10,10700.0
29587,29601,3236,2021-09-29 16:30:06.949,48,160,27.0,86.0,10,17900.0
...,...,...,...,...,...,...,...,...,...
267279,267315,26685,2022-06-02 13:27:47.872,48,991,27.0,86.0,3,25600.0
267280,267316,26685,2022-06-02 13:27:47.872,48,984,27.0,86.0,5,17200.0
294265,294271,29751,2022-06-28 12:48:58.711,48,973,27.0,86.0,1,73000.0
294266,294272,29751,2022-06-28 12:48:58.711,48,1503,27.0,86.0,5,25700.0


In [11]:
# The boolean mask behind the filter: True where the row's merchant_id is 48.
baskets['merchant_id'].eq(48)

0         False
1         False
2         False
3         False
4         False
          ...  
492432    False
492433    False
492434    False
492435    False
492436    False
Name: merchant_id, Length: 492437, dtype: bool

## Conduct some more data inspection at your own time

 - what are the mean, min, and max of price across the whole dataset?

In [12]:
# Mean, minimum and maximum of the price column over every row of the frame.
price = baskets['price']
(price.mean(), price.min(), price.max())

(113101.97735813889, 1.0, 4240000.0)

 - what is the average price for the first order?

In [13]:
# Take the order_id from the first row, then average the price over every row
# that shares that order_id (an unweighted mean: qty is not taken into account).
o_id = baskets['order_id'].iloc[0]
baskets.loc[baskets['order_id'] == o_id, 'price'].mean()

243250.0

 - how much did it cost in total for a particular merchant?

baskets.loc[baskets['merchant_id'] == 48, 'price'].sum()

 - do the above two queries make sense? why and why not? what do the results really mean? 
 - notice the min price is 1, does this make sense?

 - notice the min price of the whole dataset is 1; how many rows have a price of 1?
 - question: Why would some items have price of 1? (1 rupiah is USD 0.000064)

In [14]:
# Per-column count of non-null values among the rows whose price is exactly 1.
baskets.query("price == 1").count()

id             2
order_id       2
placed_at      2
merchant_id    2
sku_id         2
top_cat        2
sub_cat        2
qty            2
price          2
dtype: int64

 - hmm, let us also check the rows with max price

In [15]:
# Per-column count of non-null values among the rows priced at the maximum.
# The maximum is read from the data instead of being hard-coded, so this cell
# stays correct if the underlying CSV changes.
max_price = baskets['price'].max()
baskets[baskets['price'] == max_price].count()

id             2
order_id       2
placed_at      2
merchant_id    2
sku_id         2
top_cat        2
sub_cat        2
qty            2
price          2
dtype: int64

 - can the two be on same rows?

In [16]:
# Show the rows whose price is either 1 or 4240000, to see whether they overlap.
baskets[baskets['price'].isin([1, 4240000])]

Unnamed: 0,id,order_id,placed_at,merchant_id,sku_id,top_cat,sub_cat,qty,price
29328,29342,3207,2021-09-29 11:49:17.746,23,233,14.0,34.0,1,1.0
29329,29343,3207,2021-09-29 11:49:17.746,23,186,27.0,86.0,1,1.0
482817,482819,52262,2022-12-22 14:39:52.383,355,970,27.0,86.0,1,4240000.0
486896,486931,52741,2022-12-26 13:01:10.968,161,970,27.0,86.0,2,4240000.0


 - check columns' number of unique values

In [17]:
# Number of distinct values in each column (missing values are not counted by default).
baskets.nunique()

id             492437
order_id        53438
placed_at       53434
merchant_id       357
sku_id           1860
top_cat            35
sub_cat           108
qty                97
price            1607
dtype: int64

- question: what can you observe from the above result? what might seem to be peculiar? 

 - let us check the min and max of the "placed_at" column

In [18]:
# Smallest and largest placed_at values. The column holds strings (dtype object),
# so the comparison is lexicographic; that agrees with chronological order only
# if every timestamp uses the same zero-padded format.
placed_at = baskets['placed_at']
(placed_at.min(), placed_at.max())

('2021-06-17 14:00:31.373', '2022-12-31 18:14:45.659')

 - how many merchants transacted on a particular day, say December 31, 2022?
 - what is the type "object" anyways?

In [19]:
# Take the placed_at value of the last row and show it with its Python type.
last_placed_at = baskets['placed_at'].iloc[-1]
(last_placed_at, type(last_placed_at))

('2022-12-31 18:14:45.659', str)

In [20]:
# All rows whose placed_at string is identical to that of the last row
# (an exact timestamp match, not a match on the calendar day).
last_placed_at = baskets['placed_at'].iloc[-1]
baskets[baskets['placed_at'] == last_placed_at]

Unnamed: 0,id,order_id,placed_at,merchant_id,sku_id,top_cat,sub_cat,qty,price
492430,492431,53438,2022-12-31 18:14:45.659,184,973,27.0,86.0,1,79000.0
492431,492432,53438,2022-12-31 18:14:45.659,184,971,27.0,86.0,5,21300.0
492432,492433,53438,2022-12-31 18:14:45.659,184,1500,27.0,86.0,5,21000.0
492433,492434,53438,2022-12-31 18:14:45.659,184,1655,18.0,61.0,1,38000.0
492434,492435,53438,2022-12-31 18:14:45.659,184,1504,27.0,86.0,1,274000.0
492435,492436,53438,2022-12-31 18:14:45.659,184,984,27.0,86.0,5,19400.0
492436,492437,53438,2022-12-31 18:14:45.659,184,866,27.0,86.0,5,21200.0


 - how do we work with a string object and get the date, hour, min, second, millisecond?

## gather all observations, questions, and TODOs

 - column "placed_at" is of string type and the rest are numeric columns
 - why are the counts for top_cat and sub_cat different from the others?
 - ID columns' statistics may not make sense other than count, min, max, since they are supposed to be identifiers; should we treat them as categorical?
 - why would some items have price of 1? Is there a data issue here?
 - unique "placed_at" is 4 less than unique order_id, what can this possibly imply?
 - is it possible that 4 orders were made in exactly the same millisecond as some other orders? In theory it is possible, but might there be potential fraud?
 - how can we find out which 4 orders happened on the exact same millisecond? 
 - how many merchants transacted on a particular day, say December 31, 2022?
 - how do we work with a string object and get the date, hour, min, second, millisecond? 