# Order Brushing - Data Analysis
Shopee code league round 1

### Given Assumption and Definition 
- Brushing shop = concentrate rate greater of equal to 3 at any instances
- Concentrate rate = Number of orders within 1 hour/ Number of unique Buyers within 1 hours
- Brushing buyer = buyer that contributed the ```highest proportion of orders to a shop```
- Highest proportion of orders should include the ordres that occured in instances when brushing

### Basic Concepts
- Each ```orderid``` represnets a distinct transation on Shopee.
- Each unique ```shopid``` is a distinct seller on Shopee.
- Each unique ```userid``` ia a distinct buyer on Shopee.
- Event time refers to the exact time that an order was replaced on Shopee.

In [222]:
import pandas as pd

In [223]:
df = pd.read_csv('order_brush_order.csv')

In [224]:
df.columns

Index(['orderid', 'shopid', 'userid', 'event_time'], dtype='object')

In [225]:
df.head(5)

Unnamed: 0,orderid,shopid,userid,event_time
0,31076582227611,93950878,30530270,2019-12-27 00:23:03
1,31118059853484,156423439,46057927,2019-12-27 11:54:20
2,31123355095755,173699291,67341739,2019-12-27 13:22:35
3,31122059872723,63674025,149380322,2019-12-27 13:01:00
4,31117075665123,127249066,149493217,2019-12-27 11:37:55


In [226]:
df['event_time'] = pd.to_datetime(df['event_time'])

In [227]:
df.shape

(222750, 4)

In [228]:
df.isna().sum()

orderid       0
shopid        0
userid        0
event_time    0
dtype: int64

In [229]:
def get_sorted_unique_by_column_name(column_name):
    t = df[column_name].unique()
    t.sort()
    return t

unique_shop_ids = get_sorted_unique_by_column_name('shopid')

In [230]:
groupby_shop_id_df = df.groupby('shopid')

In [237]:
def get_1hour_window(t):
    return t + pd.Timedelta(hours = 1)

In [242]:
def concentration_rate(lower_time, upper_time, same_shop_id_df):
    windowed_df = same_shop_id_df[(lower_time <= same_shop_id_df['event_time']) & 
                                  (same_shop_id_df['event_time'] <= upper_time)]
    
    number_of_order = len(windowed_df)
    unique_user_id = windowed_df['userid'].unique()
    
    return float(number_of_order)/float(unique_user_id) >= 3.0

In [243]:
def get_time_range_to_brushing_user_ids_to_max_number_of_orders_pair(same_shop_id_df, time_range):
    #TBD
    
    return (time_range, [], 0)

In [244]:
def flatten(list_of_list):
    return [val for sublist in list_of_list for val in sublist]

In [231]:
def get_brushing_use_id(shop_id):
    same_shope_id = groupby_shop_id_df.get_group(shop_id)
    
    time_ranges = list(map(lambda event_time : 
                           (event_time, get_1hour_window(event_time)), 
                           same_shop_id _df['event_time']))
    
    
    time_ranges_to_brushing_use_ids_to_max_number = [
        get_time_range_to_brushing_user_ids_to_max_number_of_orders_pair(same_shop_id_df, time_range) for time_range in time_ranges()  
    ]
    
    ultimate_max_number_of_orders = max(list(map(
        (lambda _, _, max_number_of_orders: max_number_of_orders), 
        time_ranges_to_brushing_use_ids_to_max_number)))
    
    ps = filter((lambda _, _, max_number_of_orders: 
                 max_number_of_orders  == ultimate_max_number_of_orders),
                time_ranges_to_brushing_use_ids_to_max_number)
    
    brushing_user_ids = np.array([brushing_uise_ids for _, brushing_user_ids, _ in ps]).flatten().unique()
    brushing_user_ids.sort()
    
    return 0

In [232]:
result = list(map(
    lambda shop_id: (shop_id,get_brushing_use_id(shop_id)), unique_shop_ids)
             )

In [234]:
result_df = pd.DataFrame(result, columns = ['shopid', 'userid'])

In [235]:
result_df

Unnamed: 0,shopid,userid
0,10009,0
1,10051,0
2,10061,0
3,10084,0
4,10100,0
...,...,...
18765,214662358,0
18766,214949521,0
18767,214964814,0
18768,215175775,0


## Trial

In [159]:
temp_df = df[df['shopid'] == 147941492].sort_values(by = 'event_time')
temp_df = temp_df.set_index('event_time')

In [160]:
count_df = temp_df.groupby([temp_df.index.date, temp_df.index.hour])['orderid'].count().reset_index()

In [161]:
count_df

Unnamed: 0,level_0,event_time,orderid
0,2019-12-27,0,268
1,2019-12-27,1,104
2,2019-12-27,2,52
3,2019-12-27,3,34
4,2019-12-27,4,9
...,...,...,...
115,2019-12-31,19,89
116,2019-12-31,20,73
117,2019-12-31,21,59
118,2019-12-31,22,72


In [106]:
(count_df['orderid'] >= 3).any()

True

In [107]:
print(temp_df['shopid'].unique())

[147941492]


### Acquire shopid

In [108]:
shopid_list = []
for i in df['shopid'].unique():
    temp_df = df[df['shopid'] == i].sort_values(by = 'event_time').set_index('event_time')
    count_df = temp_df.groupby([temp_df.index.date, temp_df.index.hour])['orderid'].count().reset_index()
    if (count_df['orderid'] >= 3).any():
        print(i)
        shopid_list.append(i)

93950878
173699291
127249066
107921853
178400128
147941492
9374147
145694343
30988921
67162407
65883234
33242381
3285661
95138572
12662873
152569117
8051258
12480907
51526935
43719124
161269907
1175477
39938958
93363430
96757689
90339629
137754804
152871252
67960532
64625969
4669871
62713846
39554718
47415942
173718481
112904482
24759976
54615708
64492135
175116620
641249
172185419
66861410
140937896
8715449
129257169
99787848
158448460
115294353
84547411
8121841
61556313
137762642
147398485
66342314
25924280
65885656
53613428
760726
131198971
171947507
182347785
117919981
129926253
51487475
5354812
17911218
107932444
76199702
32504026
27998060
141605375
122218940
201015462
130128043
85948343
22502375
84301799
41282916
83102588
151865497
23958390
58543771
156883302
1532569
17749654
12568
129736159
27476241
96869191
10208
26056025
40429317
58835561
6285560
26218382
28091290
10891360
116499414
101590068
96953349
121742138
160305762
140589385
80049863
135054910
64909454
120981896
21019792

64237119
150218066
17678304
182869608
117986593
50969547
109823168
27430506
104593417
139268621
26299009
96679836
118139770
27301695
195639347
162227755
162297388
30876744
21976491
84321142
81072776
53479739
83767775
148821394
52440733
79069823
45084184
64917226
95745694
143246658
8566282
62398447
161153952
72112289
53762796
137739819
183207190
20025878
45084333
86799255
91679894
28090352
19517813
129649797
129916443
73692987
10228730
118122329
140151614
160135616
188348865
66391375
91681600
130168935
88202387
188553471
91242967
135111138
165500538
69575583
97084677
823357
100029879
79694639
8926646
121017883
10311
45869875
138168262
50969033
130118966
108064389
66862214
89948967
98768262
195870375
110868427
32303717
64375750
201503467
64516548
187532720
3983267
50968827
70524908
16618662
71350056
161161969
104245736
64376561
11870800
40205423
11612863
63369471
173508019
111231655
128299931
135054432
31510663
26681577
2831130
79735070
47368929
54593630
139003724
16175883
87621172
11342

14147284
45107936
94636
156057821
180787455
129916452
181814823
133252554
118949170
104955370
157830762
137762841
89718620
64394225
195068512
132928428
130117533
98705037
112253949
173318671
178841321
45559491
153335627
127473736
165809171
83366265
22132975
731606
144365607
107907122
94154189
131974582
133345670
169878610
188359661
66232769
134138844
35551166
204225676
162226376
105064701
26958215
37189296
100685685
102289947
16666326
149729734
43860220
159241151
156766195
152254221
9109593
177817260
63370266
182937
3215227
86572198
162285147
14710
48961570
142031478
129924062
599533
106051591
18092986
141828266
87845635
29056338
192608876
316121
5696946
6379564
60587222
6042309
102338025
159283439
143153670
28802071
17219984
35582678
26213989
135041716
187107283
103715156
151853506
41056024
148856653
168062274
102371504
100446829
160220186
70509505
135644492
21033575
32621245
155288806
156995951
133255677
173186657
140300040
127663510
15611828
173155199
65500189
191674006
206164408
68

In [109]:
len(shopid_list)

1946

In [110]:
len(df['shopid'].unique())

18770

### Trial2

In [175]:
import datetime 

temp_df = df[df['shopid'] == 147941492].sort_values(by = 'event_time')
temp_df = temp_df.set_index('event_time')

In [176]:
temp_df['Count order in last hour'] = temp_df['orderid'].rolling('3600s').count()

In [177]:
temp_df.reset_index()

Unnamed: 0,event_time,orderid,shopid,userid,Count order in last hour
0,2019-12-27 00:00:08,31075208826809,147941492,195903090,1.0
1,2019-12-27 00:00:12,31075212032449,147941492,210365192,2.0
2,2019-12-27 00:00:31,31075231334015,147941492,306545,3.0
3,2019-12-27 00:00:32,31075232254299,147941492,202519981,4.0
4,2019-12-27 00:00:33,31075232409868,147941492,200500884,5.0
...,...,...,...,...,...
11698,2019-12-31 23:58:17,31507096850803,147941492,120006484,91.0
11699,2019-12-31 23:58:28,31507108119827,147941492,12390455,91.0
11700,2019-12-31 23:58:59,31507139137451,147941492,197985002,92.0
11701,2019-12-31 23:59:42,31507181180213,147941492,167249884,93.0


In [178]:
temp_df['Count unique_user in last hour'] = temp_df.transform(lambda x:\
                                                              len(temp_df[(temp_df['event_time'].between(x['event_time'] - datetime.timedelta(hours=1),
                                                                                                      x['event_time']))]['userid'].unique()), axis=1)                                                       

KeyError: ('event_time', 'occurred at index 2019-12-27 00:00:08')

In [168]:
temp_df['userid'].unique()

array([195903090, 210365192,    306545, ..., 120006484,  12390455,
       197985002])