In [1]:
from dask.distributed import Client, progress
import dask.dataframe as dd
import pandas as pd

In [2]:
client = Client(n_workers=4, memory_limit='8GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:34027  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 32.00 GB


In [3]:
!ls ../data

city_id_dummy.parquet				order_data.parquet
customer_features.json				orders_table_dtypes.json
day_of_week_dummy.parquet			orders_table_dtypes_log.json
hour_of_day_dummy.parquet			parquet
machine_learning_challenge_labeled_data.csv.gz	payment_id_dummy.parquet
machine_learning_challenge_order_data.csv.gz	platform_id_dummy.parquet
order_data_batch.json				restaurant_id_dummy.parquet
order_data_dtypes.json				transmission_id_dummy.parquet
order_data_dtypes_log.json


In [4]:
orders = dd.from_pandas(
    pd.read_json(
        "../data/order_data_dtypes.json",
        orient='table',
    ).sample(frac=.1, random_state=42).set_index('customer_id'),
    npartitions=128,
).categorize()

In [6]:
# # Customer ID is NOT a category
# orders['customer_id'] = orders['customer_id'].astype('str')

In [7]:
orders.head()

Unnamed: 0_level_0,order_datetime,customer_order_rank,is_failed,voucher_amount,delivery_fee,amount_paid,restaurant_id,city_id,payment_id,platform_id,transmission_id,hour_of_day,day_of_week,is_holiday
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0000e2c6d9be,2016-01-29 20:00:00,1,False,0.0,0.0,9.558,239303498,76547,1619,30359,4356,20,4,False
0005f8cd826d,2015-10-23 18:00:00,6,False,0.0,0.493,7.434,41453498,31157,1619,30231,4228,18,4,False
0005f8cd826d,2015-09-15 19:00:00,3,False,0.0,0.493,7.434,41453498,31157,1619,30231,4356,19,1,False
0005f8cd826d,2015-12-20 15:00:00,7,False,0.0,0.493,7.434,41453498,31157,1619,30231,4356,15,6,False
00066870ca92,2016-02-22 14:00:00,10,False,0.0,0.493,14.868,1413498,10346,1619,29815,4356,14,0,False


In [8]:
# orders = orders.sample(frac=.1).compute()

In [9]:
orders.count()  # .compute()

Dask Series Structure:
npartitions=1
amount_paid       int64
voucher_amount      ...
dtype: int64
Dask Name: dataframe-count-agg, 386 tasks

In [10]:
from dask_ml.preprocessing import Categorizer, DummyEncoder

In [11]:
orders.npartitions

128

In [12]:
orders.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 14 entries, order_datetime to is_holiday
dtypes: category(1), category(1), category(1), category(1), category(1), category(1), category(1), datetime64[ns](1), bool(2), float64(3), int64(1)

In [13]:
orders.columns

Index(['order_datetime', 'customer_order_rank', 'is_failed', 'voucher_amount',
       'delivery_fee', 'amount_paid', 'restaurant_id', 'city_id', 'payment_id',
       'platform_id', 'transmission_id', 'hour_of_day', 'day_of_week',
       'is_holiday'],
      dtype='object')

In [14]:
categorical_columns = ['restaurant_id',
       'city_id', 'payment_id', 'platform_id', 'transmission_id',
       'hour_of_day', 'day_of_week']
other_columns = ['order_datetime', 'customer_order_rank', 'is_failed',
       'voucher_amount', 'delivery_fee', 'amount_paid', 'is_holiday']
groupby_column = 'customer_id'

In [17]:
dum = DummyEncoder(columns=['restaurant_id'])

In [18]:
transformed = dum.fit_transform(orders[['restaurant_id']])

In [19]:
dum.non_categorical_columns_

Index([], dtype='object')

In [20]:
dum.categorical_columns_

['restaurant_id']

In [21]:
transformed.head()

Unnamed: 0_level_0,restaurant_id_73498,restaurant_id_123498,restaurant_id_153498,restaurant_id_173498,restaurant_id_193498,restaurant_id_223498,restaurant_id_233498,restaurant_id_243498,restaurant_id_263498,restaurant_id_273498,...,restaurant_id_339593498,restaurant_id_339713498,restaurant_id_339763498,restaurant_id_339773498,restaurant_id_339823498,restaurant_id_339913498,restaurant_id_339983498,restaurant_id_340033498,restaurant_id_340093498,restaurant_id_340453498
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000e2c6d9be,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0005f8cd826d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0005f8cd826d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0005f8cd826d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00066870ca92,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
transformed.columns

Index(['restaurant_id_73498', 'restaurant_id_123498', 'restaurant_id_153498',
       'restaurant_id_173498', 'restaurant_id_193498', 'restaurant_id_223498',
       'restaurant_id_233498', 'restaurant_id_243498', 'restaurant_id_263498',
       'restaurant_id_273498',
       ...
       'restaurant_id_339593498', 'restaurant_id_339713498',
       'restaurant_id_339763498', 'restaurant_id_339773498',
       'restaurant_id_339823498', 'restaurant_id_339913498',
       'restaurant_id_339983498', 'restaurant_id_340033498',
       'restaurant_id_340093498', 'restaurant_id_340453498'],
      dtype='object', length=13569)

In [23]:
aggregate_dummies = transformed.groupby('customer_id').sum().repartition(128)

In [24]:
aggregate_dummies.npartitions

128

In [25]:
aggregate_dummies.head()



KilledWorker: ("('dataframe-groupby-sum-combine-c9d4d3ea5ae2ff2d5aa3c993555ddcbb', 1, 0, 0)", <Worker 'tcp://127.0.0.1:34649', name: 3, memory: 0, processing: 18>)

In [6]:
dummies = dd.read_parquet(
    "../data/restaurant_id_dummy.parquet"
).repartition(64)

In [7]:
dummies.head()



KilledWorker: ("('read-parquet-repartition-split-head-1-5-repartition-64-69a623bda712b6f043482dded9437d05', 0)", <Worker 'tcp://127.0.0.1:36331', name: 0, memory: 0, processing: 1>)

In [3]:
# df_orders = pd.read_json(
#     "../data/order_data_dtypes.json",
#     orient='table',
# )

In [4]:
# df_orders = df_orders.repartition(npartitions=64)

In [5]:
# df_orders.head()

In [6]:
# df_orders.info()

In [7]:
# # Customer order rank
# max_customer_order_rank = df_orders.groupby(
#     'customer_id'
# )['customer_order_rank'].max()

In [8]:
# max_customer_order_rank

In [9]:
# df_orders['is_holiday'] = (
#     (
#         (df_orders['order_datetime'].dt.month == 1) & (df_orders['order_datetime'].dt.day == 1)
#     ) | (
#         (df_orders['order_datetime'].dt.month == 12) & ((df_orders['order_datetime'].dt.day == 25) | (df_orders['order_datetime'].dt.day == 31))
#     )
# )

### Customer features

In [10]:
# last_df_orders = df_orders.groupby('customer_id')['order_datetime'].max()
# max_datetime = df_orders['order_datetime'].max()
# max_datetime

In [11]:
# # customer_features['last_order_age_days'] = 
# last_order_age_days = (max_datetime - last_df_orders).dt.days

In [17]:
last_order_age_days.name = 'last_order_age_days'

In [18]:
customer_features = last_order_age_days.to_frame()

In [19]:
first_df_orders = df_orders.groupby('customer_id')['order_datetime'].min()

In [20]:
# customer_features['first_order_age_days'] = 
first_order_age_days = (max_datetime - first_df_orders).dt.days

In [21]:
first_order_age_days.name = 'first_order_age_days'

In [22]:
customer_features = customer_features.join(first_order_age_days)

In [23]:
number_of_df_orders_per_customer = df_orders.groupby('customer_id')['amount_paid'].count()
number_of_df_orders_per_customer.name = 'n_df_orders'

In [24]:
number_of_df_orders_per_customer

customer_id
000097eabfd9    1
0000e2c6d9be    1
000133bb597f    1
00018269939b    1
0001a00468a6    1
               ..
fffd696eaedd    1
fffe9d5a8d41    3
ffff347c3cfa    2
ffff4519b52d    1
ffffccbfc8a4    1
Name: n_df_orders, Length: 245453, dtype: int64

In [25]:
customer_features = customer_features.join(number_of_df_orders_per_customer)

In [26]:
max_customer_order_rank = df_orders.groupby(
    'customer_id'
)['customer_order_rank'].max()
max_customer_order_rank

customer_id
000097eabfd9    1
0000e2c6d9be    1
000133bb597f    1
00018269939b    1
0001a00468a6    1
               ..
fffd696eaedd    1
fffe9d5a8d41    1
ffff347c3cfa    2
ffff4519b52d    1
ffffccbfc8a4    1
Name: customer_order_rank, Length: 245453, dtype: int64

In [27]:
max_customer_order_rank.name = 'max_customer_order_rank'

In [28]:
customer_features = customer_features.join(max_customer_order_rank)

In [29]:
failed_df_orders_per_customer = df_orders.groupby('customer_id')['is_failed'].sum()
failed_df_orders_per_customer.name = 'n_failed'

In [30]:
customer_features = customer_features.join(failed_df_orders_per_customer)
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 5 columns):
 #   Column                   Non-Null Count   Dtype
---  ------                   --------------   -----
 0   last_order_age_days      245453 non-null  int64
 1   first_order_age_days     245453 non-null  int64
 2   n_df_orders              245453 non-null  int64
 3   max_customer_order_rank  245453 non-null  int64
 4   n_failed                 245453 non-null  int64
dtypes: int64(5)
memory usage: 21.2+ MB


In [31]:
max_voucher_amount = df_orders.groupby('customer_id')['voucher_amount'].max()
max_voucher_amount.name = 'max_voucher_amount'

In [32]:
customer_features = customer_features.join(max_voucher_amount)
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 6 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_df_orders              245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 23.1+ MB


In [33]:
tot_voucher_amount = df_orders.groupby('customer_id')['voucher_amount'].sum()
tot_voucher_amount.name = 'tot_voucher_amount'

In [34]:
customer_features = customer_features.join(tot_voucher_amount)
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_df_orders              245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
 6   tot_voucher_amount       245453 non-null  float64
dtypes: float64(2), int64(5)
memory usage: 25.0+ MB


In [35]:
n_vouchers = (df_orders['voucher_amount']>0).groupby(df_orders['customer_id']).sum()
n_vouchers.name = 'n_vouchers'

In [36]:
customer_features = customer_features.join(n_vouchers)

In [37]:
tot_delivery_fee = df_orders.groupby('customer_id')['delivery_fee'].sum()
max_delivery_fee = df_orders.groupby('customer_id')['delivery_fee'].max()
# How many times a delivery fee was paid
n_delivery_fee = (df_orders['delivery_fee']>0).groupby(df_orders['customer_id']).sum()

In [38]:
tot_delivery_fee.name = 'tot_delivery_fee'
max_delivery_fee.name = 'max_delivery_fee'
n_delivery_fee.name = 'n_delivery_fee'

In [39]:
customer_features = customer_features.join(tot_delivery_fee)
customer_features = customer_features.join(max_delivery_fee)
customer_features = customer_features.join(n_delivery_fee)
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 11 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_df_orders              245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
 6   tot_voucher_amount       245453 non-null  float64
 7   n_vouchers               245453 non-null  int64  
 8   tot_delivery_fee         245453 non-null  float64
 9   max_delivery_fee         245453 non-null  float64
 10  n_delivery_fee           245453 non-null  int64  
dtypes: float64(4), int64(7)
memory usage: 32.5+ MB


In [40]:
tot_amount_paid = df_orders.groupby('customer_id')['amount_paid'].sum()
avg_amount_paid = df_orders.groupby('customer_id')['amount_paid'].mean()
max_amount_paid = df_orders.groupby('customer_id')['amount_paid'].max()
min_amount_paid = df_orders.groupby('customer_id')['amount_paid'].min()

In [41]:
tot_amount_paid.name = 'tot_amount_paid'
avg_amount_paid.name = 'avg_amount_paid'
max_amount_paid.name = 'max_amount_paid'
min_amount_paid.name = 'min_amount_paid'

In [42]:
customer_features = customer_features.join(tot_amount_paid)
customer_features = customer_features.join(avg_amount_paid)
customer_features = customer_features.join(max_amount_paid)
customer_features = customer_features.join(min_amount_paid)

In [43]:
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_df_orders              245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
 6   tot_voucher_amount       245453 non-null  float64
 7   n_vouchers               245453 non-null  int64  
 8   tot_delivery_fee         245453 non-null  float64
 9   max_delivery_fee         245453 non-null  float64
 10  n_delivery_fee           245453 non-null  int64  
 11  tot_amount_paid          245453 non-null  float64
 12  avg_amount_paid          245453 non-null  float64
 13  max_amount_paid          245453 non-null  float

In [44]:
n_restaurants = df_orders.groupby('customer_id')['restaurant_id'].nunique()
n_restaurants.name = 'n_restaurants'
n_restaurants

customer_id
000097eabfd9    1
0000e2c6d9be    1
000133bb597f    1
00018269939b    1
0001a00468a6    1
               ..
fffd696eaedd    1
fffe9d5a8d41    2
ffff347c3cfa    2
ffff4519b52d    1
ffffccbfc8a4    1
Name: n_restaurants, Length: 245453, dtype: int64

In [45]:
customer_features = customer_features.join(n_restaurants)
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_df_orders              245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
 6   tot_voucher_amount       245453 non-null  float64
 7   n_vouchers               245453 non-null  int64  
 8   tot_delivery_fee         245453 non-null  float64
 9   max_delivery_fee         245453 non-null  float64
 10  n_delivery_fee           245453 non-null  int64  
 11  tot_amount_paid          245453 non-null  float64
 12  avg_amount_paid          245453 non-null  float64
 13  max_amount_paid          245453 non-null  float

In [48]:
n_cities = df_orders.groupby('customer_id')['city_id'].nunique()
n_cities.name = 'n_cities'
n_cities

customer_id
000097eabfd9    1
0000e2c6d9be    1
000133bb597f    1
00018269939b    1
0001a00468a6    1
               ..
fffd696eaedd    1
fffe9d5a8d41    1
ffff347c3cfa    1
ffff4519b52d    1
ffffccbfc8a4    1
Name: n_cities, Length: 245453, dtype: int64

In [49]:
customer_features = customer_features.join(n_cities)

In [50]:
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_df_orders              245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
 6   tot_voucher_amount       245453 non-null  float64
 7   n_vouchers               245453 non-null  int64  
 8   tot_delivery_fee         245453 non-null  float64
 9   max_delivery_fee         245453 non-null  float64
 10  n_delivery_fee           245453 non-null  int64  
 11  tot_amount_paid          245453 non-null  float64
 12  avg_amount_paid          245453 non-null  float64
 13  max_amount_paid          245453 non-null  float