# Feature analysis and extraction

This notebooks contain a more detailed analysis of each column.
For each column of the table we'll create as many features as possible accordingly
to the type of data
(e.g. one column could lend itself to both a numerical feature and
a binary or categorical feature).

### Read the data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns

%matplotlib inline

In [2]:
sns.set()

Read the data:

In [3]:
order_data = pd.read_json(
    "../data/order_data_dtypes.json",
    orient='table',
).sample(frac=.1)

In [4]:
order_data.head()

Unnamed: 0,order_datetime,customer_id,customer_order_rank,is_failed,voucher_amount,delivery_fee,amount_paid,restaurant_id,city_id,payment_id,platform_id,transmission_id,hour_of_day,day_of_week,is_holiday
9988,2015-03-27 17:00:00,afec99bb6feb,1,False,0.0,0.986,6.372,76913498,74413,1779,29463,4324,17,4,False
437760,2016-07-16 18:00:00,ded30afababa,3,False,0.0,0.0,10.0359,48183498,75403,1619,30231,4356,18,5,False
223106,2016-01-10 20:00:00,7a349918f0ab,6,False,0.0,0.0,5.2569,64403498,84259,1619,30231,4356,20,6,False
785704,2017-02-27 18:00:00,5847712da812,2,False,0.0,0.0,6.2127,49263498,41109,1619,29463,4356,18,0,False
650099,2016-12-14 18:00:00,8273921e3166,10,False,0.0,0.0,14.6025,319403498,47282,1619,30231,4996,18,2,False


In [5]:
order_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78655 entries, 9988 to 767626
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   order_datetime       78655 non-null  datetime64[ns]
 1   customer_id          78655 non-null  object        
 2   customer_order_rank  78655 non-null  int64         
 3   is_failed            78655 non-null  bool          
 4   voucher_amount       78655 non-null  float64       
 5   delivery_fee         78655 non-null  float64       
 6   amount_paid          78655 non-null  float64       
 7   restaurant_id        78655 non-null  category      
 8   city_id              78655 non-null  category      
 9   payment_id           78655 non-null  category      
 10  platform_id          78655 non-null  category      
 11  transmission_id      78655 non-null  category      
 12  hour_of_day          78655 non-null  category      
 13  day_of_week          78655 

### Orders features

Let's store our newly-created features in separate tables.

Let's create a table for order features, including one-hot representation of categorical variables (dummy variables):

In [6]:
orders_features = pd.DataFrame(index=order_data.index)

In [7]:
orders_features['customer_id'] = order_data['customer_id']

In [8]:
orders_features.head()

Unnamed: 0,customer_id
9988,afec99bb6feb
437760,ded30afababa
223106,7a349918f0ab
785704,5847712da812
650099,8273921e3166


### Customer features

We'll be extracting customer features and storing them in a separate dataframe:

In [9]:
customer_features = pd.DataFrame(index=order_data['customer_id'].unique())
customer_features.index.name = 'customer_id'

In [10]:
customer_features

afec99bb6feb
ded30afababa
7a349918f0ab
5847712da812
8273921e3166
...
39513751d31a
3818deeab913
39724ce8de15
622f36932377
6640df589b47


#### Datetime features

In [11]:
# orders_features['hour_of_day'] = order_data['order_datetime'].dt.hour
# orders_features['day_of_week'] = order_data['order_datetime'].dt.dayofweek

In [12]:
# orders_features['is_holiday'] = (
#     (
#         (order_data['order_datetime'].dt.month == 1) & (order_data['order_datetime'].dt.day == 1)
#     ) | (
#         (order_data['order_datetime'].dt.month == 12) & ((order_data['order_datetime'].dt.day == 25) | (order_data['order_datetime'].dt.day == 31))
#     )
# )

In [13]:
orders_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78655 entries, 435090 to 360246
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  78655 non-null  object
dtypes: object(1)
memory usage: 1.2+ MB


#### Dummy variables

Let's produce dummy variables from these:

In [14]:
# col_name = 'hour_of_day'
# pd.get_dummies(
#     orders_features[col_name],
#     prefix=col_name,
# ).to_parquet(f"../data/{col_name}_dummy.parquet")

In [15]:
# !ls ../data/

In [16]:
col_name = 'hour_of_day'
hour_dummies = pd.get_dummies(
    orders_features[col_name],
    prefix=col_name,
)

KeyError: 'hour_of_day'

In [20]:
orders_features[hour_dummies.columns] = hour_dummies

In [21]:
col_name = 'day_of_week'
day_of_week_dummies = pd.get_dummies(
    orders_features[col_name],
    prefix=col_name,
)

In [22]:
orders_features[day_of_week_dummies.columns] = day_of_week_dummies

In [23]:
orders_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786547 entries, 0 to 786546
Data columns (total 35 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   customer_id     786547 non-null  object
 1   hour_of_day     786547 non-null  int64 
 2   day_of_week     786547 non-null  int64 
 3   is_holiday      786547 non-null  bool  
 4   hour_of_day_0   786547 non-null  uint8 
 5   hour_of_day_1   786547 non-null  uint8 
 6   hour_of_day_2   786547 non-null  uint8 
 7   hour_of_day_3   786547 non-null  uint8 
 8   hour_of_day_4   786547 non-null  uint8 
 9   hour_of_day_5   786547 non-null  uint8 
 10  hour_of_day_6   786547 non-null  uint8 
 11  hour_of_day_7   786547 non-null  uint8 
 12  hour_of_day_8   786547 non-null  uint8 
 13  hour_of_day_9   786547 non-null  uint8 
 14  hour_of_day_10  786547 non-null  uint8 
 15  hour_of_day_11  786547 non-null  uint8 
 16  hour_of_day_12  786547 non-null  uint8 
 17  hour_of_day_13  786547 non-nu

In [24]:
last_orders = order_data.groupby('customer_id')['order_datetime'].max()
max_datetime = order_data['order_datetime'].max()
max_datetime

Timestamp('2017-02-27 23:00:00')

In [25]:
customer_features['last_order_age_days'] = (max_datetime - last_orders).dt.days

In [26]:
first_orders = order_data.groupby('customer_id')['order_datetime'].min()

In [27]:
customer_features['first_order_age_days'] = (max_datetime - first_orders).dt.days

In [28]:
customer_features.head()

Unnamed: 0_level_0,last_order_age_days,first_order_age_days
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000097eabfd9,618,618
0000e2c6d9be,395,395
000133bb597f,1,1
00018269939b,22,22
0001a00468a6,573,573


### Number of orders

We'll throw in the number of orders per customer as well:

In [29]:
number_of_orders_per_customer = order_data.groupby('customer_id')['amount_paid'].count()
number_of_orders_per_customer.name = 'n_orders'

In [30]:
customer_features = customer_features.join(number_of_orders_per_customer)
customer_features.head()

Unnamed: 0_level_0,last_order_age_days,first_order_age_days,n_orders
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000097eabfd9,618,618,1
0000e2c6d9be,395,395,1
000133bb597f,1,1,1
00018269939b,22,22,1
0001a00468a6,573,573,1


### Customer order rank

The customer order rank can show us how many orders a customer has
successfully submitted in the considered time frame.
We find this by taking the maximum of the quantity.

In [31]:
max_customer_order_rank = order_data.groupby(
    'customer_id'
)['customer_order_rank'].max().sort_values(ascending=False)
max_customer_order_rank

customer_id
15edce943edd    369
8745a335e9cf    281
d956116d863d    272
0063666607bb    266
ae60dce05485    266
               ... 
4d262f1d2382      0
c0a84c0ffb7a      0
2cab60314f16      0
570d64fd10ed      0
7c0e6c387d48      0
Name: customer_order_rank, Length: 245453, dtype: int64

In [32]:
max_customer_order_rank.describe()

count    245453.000000
mean          3.103771
std           6.770718
min           0.000000
25%           1.000000
50%           1.000000
75%           2.000000
max         369.000000
Name: customer_order_rank, dtype: float64

In [33]:
max_customer_order_rank.quantile(.99)

31.0

As we can see, 99% or customers haven't ordered more than 31 times,
and the majority haven't ordered more than one.

In [34]:
max_customer_order_rank.name = 'max_customer_order_rank'
customer_features = customer_features.join(max_customer_order_rank)
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 4 columns):
 #   Column                   Non-Null Count   Dtype
---  ------                   --------------   -----
 0   last_order_age_days      245453 non-null  int64
 1   first_order_age_days     245453 non-null  int64
 2   n_orders                 245453 non-null  int64
 3   max_customer_order_rank  245453 non-null  int64
dtypes: int64(4)
memory usage: 19.4+ MB


### is_failed

Here we can see how many orders have failed.
Using the number of orders we can also calculate a rate of success.

In [35]:
failed_orders_per_customer = order_data.groupby('customer_id')['is_failed'].sum()
failed_orders_per_customer.name = 'n_failed'

In [36]:
failed_orders_per_customer.head()

customer_id
000097eabfd9    0
0000e2c6d9be    0
000133bb597f    0
00018269939b    0
0001a00468a6    0
Name: n_failed, dtype: int64

In [37]:
customer_features = customer_features.join(failed_orders_per_customer)
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 5 columns):
 #   Column                   Non-Null Count   Dtype
---  ------                   --------------   -----
 0   last_order_age_days      245453 non-null  int64
 1   first_order_age_days     245453 non-null  int64
 2   n_orders                 245453 non-null  int64
 3   max_customer_order_rank  245453 non-null  int64
 4   n_failed                 245453 non-null  int64
dtypes: int64(5)
memory usage: 21.2+ MB


### Voucher amount

We can extract the number of vouchers used, the total amount from vouchers,
and the maximum value of a customer's voucher.

In [38]:
max_voucher_amount = order_data.groupby('customer_id')['voucher_amount'].max()
max_voucher_amount.name = 'max_voucher_amount'

In [39]:
max_voucher_amount.describe()

count    245453.000000
mean          0.171549
std           0.717387
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          93.398900
Name: max_voucher_amount, dtype: float64

In [40]:
max_voucher_amount.quantile(.9999)

16.102162439983935

In [41]:
customer_features = customer_features.join(max_voucher_amount)
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 6 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_orders                 245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 23.1+ MB


In [42]:
tot_voucher_amount = order_data.groupby('customer_id')['voucher_amount'].sum()
tot_voucher_amount.name = 'tot_voucher_amount'

In [43]:
tot_voucher_amount.describe()

count    245453.000000
mean          0.293150
std           1.824942
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         191.994250
Name: tot_voucher_amount, dtype: float64

In [44]:
customer_features = customer_features.join(tot_voucher_amount)
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_orders                 245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
 6   tot_voucher_amount       245453 non-null  float64
dtypes: float64(2), int64(5)
memory usage: 25.0+ MB


In [45]:
n_vouchers = (order_data['voucher_amount']>0).groupby(order_data['customer_id']).sum()
n_vouchers.name = 'n_vouchers'

In [46]:
n_vouchers.describe()

count    245453.000000
mean          0.175728
std           1.024720
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: n_vouchers, dtype: float64

In [47]:
customer_features = customer_features.join(n_vouchers)

In [48]:
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 8 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_orders                 245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
 6   tot_voucher_amount       245453 non-null  float64
 7   n_vouchers               245453 non-null  int64  
dtypes: float64(2), int64(6)
memory usage: 26.9+ MB


### Delivery fee

here we follow an identical procedure as for vouchers.

In [49]:
tot_delivery_fee = order_data.groupby('customer_id')['delivery_fee'].sum()
max_delivery_fee = order_data.groupby('customer_id')['delivery_fee'].max()
# How many times a delivery fee was paid
n_delivery_fee = (order_data['delivery_fee']>0).groupby(order_data['customer_id']).sum()

In [50]:
tot_delivery_fee.name = 'tot_delivery_fee'
max_delivery_fee.name = 'max_delivery_fee'
n_delivery_fee.name = 'n_delivery_fee'

In [51]:
customer_features = customer_features.join(tot_delivery_fee)
customer_features = customer_features.join(max_delivery_fee)
customer_features = customer_features.join(n_delivery_fee)
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 11 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_orders                 245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
 6   tot_voucher_amount       245453 non-null  float64
 7   n_vouchers               245453 non-null  int64  
 8   tot_delivery_fee         245453 non-null  float64
 9   max_delivery_fee         245453 non-null  float64
 10  n_delivery_fee           245453 non-null  int64  
dtypes: float64(4), int64(7)
memory usage: 32.5+ MB


### Amount paid

Here we can use the average amount paid on top of everything else.

In [52]:
tot_amount_paid = order_data.groupby('customer_id')['amount_paid'].sum()
avg_amount_paid = order_data.groupby('customer_id')['amount_paid'].mean()
max_amount_paid = order_data.groupby('customer_id')['amount_paid'].max()
min_amount_paid = order_data.groupby('customer_id')['amount_paid'].min()

In [53]:
tot_amount_paid.name = 'tot_amount_paid'
avg_amount_paid.name = 'avg_amount_paid'
max_amount_paid.name = 'max_amount_paid'
min_amount_paid.name = 'min_amount_paid'

In [54]:
customer_features = customer_features.join(tot_amount_paid)
customer_features = customer_features.join(avg_amount_paid)
customer_features = customer_features.join(max_amount_paid)
customer_features = customer_features.join(min_amount_paid)

In [55]:
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_orders                 245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
 6   tot_voucher_amount       245453 non-null  float64
 7   n_vouchers               245453 non-null  int64  
 8   tot_delivery_fee         245453 non-null  float64
 9   max_delivery_fee         245453 non-null  float64
 10  n_delivery_fee           245453 non-null  int64  
 11  tot_amount_paid          245453 non-null  float64
 12  avg_amount_paid          245453 non-null  float64
 13  max_amount_paid          245453 non-null  float

### Restaurant ID

Categorical variables lend themselves naturally to produce dummy variables.

In [56]:
n_restaurants = order_data.groupby('customer_id')['restaurant_id'].nunique()
n_restaurants.name = 'n_restaurants'
n_restaurants

customer_id
000097eabfd9    1
0000e2c6d9be    1
000133bb597f    1
00018269939b    1
0001a00468a6    1
               ..
fffd696eaedd    1
fffe9d5a8d41    2
ffff347c3cfa    2
ffff4519b52d    1
ffffccbfc8a4    1
Name: n_restaurants, Length: 245453, dtype: int64

In [57]:
n_restaurants.describe()

count    245453.000000
mean          1.760740
std           1.843813
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max          51.000000
Name: n_restaurants, dtype: float64

In [58]:
customer_features = customer_features.join(n_restaurants)
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_orders                 245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
 6   tot_voucher_amount       245453 non-null  float64
 7   n_vouchers               245453 non-null  int64  
 8   tot_delivery_fee         245453 non-null  float64
 9   max_delivery_fee         245453 non-null  float64
 10  n_delivery_fee           245453 non-null  int64  
 11  tot_amount_paid          245453 non-null  float64
 12  avg_amount_paid          245453 non-null  float64
 13  max_amount_paid          245453 non-null  float

#### Dummy variables

In [59]:
# col_name = 'restaurant_id'
# rest_id_dummies = pd.get_dummies(
#     order_data[col_name],
#     prefix=col_name,
#     sparse=True,
# )

In [60]:
# rest_id_dummies.info()

In [61]:
# orders_features[rest_id_dummies.columns] = rest_id_dummies

In [62]:
# orders_features.info()

In [63]:
orders_features.head()

Unnamed: 0,customer_id,hour_of_day,day_of_week,is_holiday,hour_of_day_0,hour_of_day_1,hour_of_day_2,hour_of_day_3,hour_of_day_4,hour_of_day_5,...,hour_of_day_21,hour_of_day_22,hour_of_day_23,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,000097eabfd9,19,5,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0000e2c6d9be,20,4,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,000133bb597f,19,6,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,00018269939b,17,6,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0001a00468a6,19,1,False,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


### City ID

In [64]:
n_cities = order_data.groupby('customer_id')['city_id'].nunique()
n_cities.name = 'n_cities'
n_cities

customer_id
000097eabfd9    1
0000e2c6d9be    1
000133bb597f    1
00018269939b    1
0001a00468a6    1
               ..
fffd696eaedd    1
fffe9d5a8d41    1
ffff347c3cfa    1
ffff4519b52d    1
ffffccbfc8a4    1
Name: n_cities, Length: 245453, dtype: int64

In [65]:
n_cities.describe()

count    245453.000000
mean          1.044888
std           0.263277
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          26.000000
Name: n_cities, dtype: float64

In [66]:
customer_features = customer_features.join(n_cities)

In [67]:
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245453 entries, 000097eabfd9 to ffffccbfc8a4
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   last_order_age_days      245453 non-null  int64  
 1   first_order_age_days     245453 non-null  int64  
 2   n_orders                 245453 non-null  int64  
 3   max_customer_order_rank  245453 non-null  int64  
 4   n_failed                 245453 non-null  int64  
 5   max_voucher_amount       245453 non-null  float64
 6   tot_voucher_amount       245453 non-null  float64
 7   n_vouchers               245453 non-null  int64  
 8   tot_delivery_fee         245453 non-null  float64
 9   max_delivery_fee         245453 non-null  float64
 10  n_delivery_fee           245453 non-null  int64  
 11  tot_amount_paid          245453 non-null  float64
 12  avg_amount_paid          245453 non-null  float64
 13  max_amount_paid          245453 non-null  float

#### Dummy vars

In [68]:
col_name = 'city_id'
city_id_dummies = pd.get_dummies(
    order_data[col_name],
    prefix=col_name,
#     sparse=True,
)

In [69]:
city_id_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786547 entries, 0 to 786546
Columns: 3749 entries, city_id_230 to city_id_100205
dtypes: uint8(3749)
memory usage: 2.7 GB


In [70]:
orders_features[city_id_dummies.columns] = city_id_dummies

In [71]:
orders_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786547 entries, 0 to 786546
Columns: 3784 entries, customer_id to city_id_100205
dtypes: bool(1), int64(2), object(1), uint8(3780)
memory usage: 2.8+ GB


### Payment ID

In [72]:
col_name = 'payment_id'
payment_id_dummies = pd.get_dummies(
    order_data[col_name],
    prefix=col_name,
#     sparse=True,
)

In [73]:
payment_id_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786547 entries, 0 to 786546
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype
---  ------           --------------   -----
 0   payment_id_1491  786547 non-null  uint8
 1   payment_id_1523  786547 non-null  uint8
 2   payment_id_1619  786547 non-null  uint8
 3   payment_id_1779  786547 non-null  uint8
 4   payment_id_1811  786547 non-null  uint8
dtypes: uint8(5)
memory usage: 3.8 MB


In [74]:
orders_features[payment_id_dummies.columns] = payment_id_dummies

In [75]:
orders_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786547 entries, 0 to 786546
Columns: 3789 entries, customer_id to payment_id_1811
dtypes: bool(1), int64(2), object(1), uint8(3785)
memory usage: 2.8+ GB


### Platform ID

In [76]:
col_name = 'platform_id'
platform_id_dummies = pd.get_dummies(
    order_data[col_name],
    prefix=col_name,
#     sparse=True,
)

In [77]:
platform_id_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786547 entries, 0 to 786546
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   platform_id_525    786547 non-null  uint8
 1   platform_id_22167  786547 non-null  uint8
 2   platform_id_22263  786547 non-null  uint8
 3   platform_id_22295  786547 non-null  uint8
 4   platform_id_29463  786547 non-null  uint8
 5   platform_id_29495  786547 non-null  uint8
 6   platform_id_29751  786547 non-null  uint8
 7   platform_id_29815  786547 non-null  uint8
 8   platform_id_30135  786547 non-null  uint8
 9   platform_id_30199  786547 non-null  uint8
 10  platform_id_30231  786547 non-null  uint8
 11  platform_id_30359  786547 non-null  uint8
 12  platform_id_30391  786547 non-null  uint8
 13  platform_id_30423  786547 non-null  uint8
dtypes: uint8(14)
memory usage: 10.5 MB


In [78]:
orders_features[platform_id_dummies.columns] = platform_id_dummies

In [79]:
orders_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786547 entries, 0 to 786546
Columns: 3803 entries, customer_id to platform_id_30423
dtypes: bool(1), int64(2), object(1), uint8(3799)
memory usage: 2.8+ GB


### Transmission ID

In [80]:
col_name = 'transmission_id'
transmission_id_dummies = pd.get_dummies(
    order_data[col_name],
    prefix=col_name,
#     sparse=True,
)

In [81]:
transmission_id_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786547 entries, 0 to 786546
Data columns (total 10 columns):
 #   Column                 Non-Null Count   Dtype
---  ------                 --------------   -----
 0   transmission_id_212    786547 non-null  uint8
 1   transmission_id_1988   786547 non-null  uint8
 2   transmission_id_2020   786547 non-null  uint8
 3   transmission_id_4196   786547 non-null  uint8
 4   transmission_id_4228   786547 non-null  uint8
 5   transmission_id_4260   786547 non-null  uint8
 6   transmission_id_4324   786547 non-null  uint8
 7   transmission_id_4356   786547 non-null  uint8
 8   transmission_id_4996   786547 non-null  uint8
 9   transmission_id_21124  786547 non-null  uint8
dtypes: uint8(10)
memory usage: 7.5 MB


In [82]:
orders_features[transmission_id_dummies.columns] = transmission_id_dummies

In [83]:
orders_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786547 entries, 0 to 786546
Columns: 3813 entries, customer_id to transmission_id_21124
dtypes: bool(1), int64(2), object(1), uint8(3809)
memory usage: 2.8+ GB


### Store data

This time a space-efficient format is preferrable.

In [84]:
customer_features

Unnamed: 0_level_0,last_order_age_days,first_order_age_days,n_orders,max_customer_order_rank,n_failed,max_voucher_amount,tot_voucher_amount,n_vouchers,tot_delivery_fee,max_delivery_fee,n_delivery_fee,tot_amount_paid,avg_amount_paid,max_amount_paid,min_amount_paid,n_restaurants,n_cities
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
000097eabfd9,618,618,1,1,0,0.0,0.0,0,0.0000,0.0000,0,11.46960,11.46960,11.46960,11.46960,1,1
0000e2c6d9be,395,395,1,1,0,0.0,0.0,0,0.0000,0.0000,0,9.55800,9.55800,9.55800,9.55800,1,1
000133bb597f,1,1,1,1,0,0.0,0.0,0,0.4930,0.4930,1,5.93658,5.93658,5.93658,5.93658,1,1
00018269939b,22,22,1,1,0,0.0,0.0,0,0.4930,0.4930,1,9.82350,9.82350,9.82350,9.82350,1,1
0001a00468a6,573,573,1,1,0,0.0,0.0,0,0.4930,0.4930,1,5.15070,5.15070,5.15070,5.15070,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffd696eaedd,532,532,1,1,0,0.0,0.0,0,1.4297,1.4297,1,24.13395,24.13395,24.13395,24.13395,1,1
fffe9d5a8d41,150,211,3,1,2,0.0,0.0,0,0.0000,0.0000,0,29.89530,9.96510,10.72620,8.44290,2,1
ffff347c3cfa,165,194,2,2,0,0.0,0.0,0,0.0000,0.0000,0,13.54050,6.77025,7.59330,5.94720,2,1
ffff4519b52d,331,331,1,1,0,0.0,0.0,0,0.0000,0.0000,0,21.77100,21.77100,21.77100,21.77100,1,1


### Customer features from dummy features

Here we distill the extracted order feature to customer features.

In [85]:
orders_features

Unnamed: 0,customer_id,hour_of_day,day_of_week,is_holiday,hour_of_day_0,hour_of_day_1,hour_of_day_2,hour_of_day_3,hour_of_day_4,hour_of_day_5,...,transmission_id_212,transmission_id_1988,transmission_id_2020,transmission_id_4196,transmission_id_4228,transmission_id_4260,transmission_id_4324,transmission_id_4356,transmission_id_4996,transmission_id_21124
0,000097eabfd9,19,5,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0000e2c6d9be,20,4,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,000133bb597f,19,6,False,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,00018269939b,17,6,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0001a00468a6,19,1,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786542,fffe9d5a8d41,20,4,False,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
786543,ffff347c3cfa,21,2,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
786544,ffff347c3cfa,21,3,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
786545,ffff4519b52d,19,5,False,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [86]:
orders_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786547 entries, 0 to 786546
Columns: 3813 entries, customer_id to transmission_id_21124
dtypes: bool(1), int64(2), object(1), uint8(3809)
memory usage: 2.8+ GB


In [None]:
# customer_features_dummy = orders_features.drop(
#     columns=['hour_of_day', 'day_of_week']
# ).groupby(['customer_id']).agg(np.sum)