# Import Libraries


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import datetime
import warnings
warnings.filterwarnings("ignore")

# Import Datasets

In [2]:
path_to_orders_data='./input_data/machine_learning_challenge_order_data.csv'
path_to_labelled_data='./input_data/machine_learning_challenge_labeled_data.csv'


In [71]:
#Import orders data and display few rows
df_orders=pd.read_csv(path_to_orders_data)
df_orders.head()

Unnamed: 0,customer_id,order_date,order_hour,customer_order_rank,is_failed,voucher_amount,delivery_fee,amount_paid,restaurant_id,city_id,payment_id,platform_id,transmission_id
0,000097eabfd9,2015-06-20,19,1.0,0,0.0,0.0,11.4696,5803498,20326,1779,30231,4356
1,0000e2c6d9be,2016-01-29,20,1.0,0,0.0,0.0,9.558,239303498,76547,1619,30359,4356
2,000133bb597f,2017-02-26,19,1.0,0,0.0,0.493,5.93658,206463498,33833,1619,30359,4324
3,00018269939b,2017-02-05,17,1.0,0,0.0,0.493,9.8235,36613498,99315,1619,30359,4356
4,0001a00468a6,2015-08-04,19,1.0,0,0.0,0.493,5.1507,225853498,16456,1619,29463,4356


In [72]:
#Import labelled data and display few rows
df_target_label=pd.read_csv(path_to_labelled_data)
df_target_label.head()

Unnamed: 0,customer_id,is_returning_customer
0,000097eabfd9,0
1,0000e2c6d9be,0
2,000133bb597f,1
3,00018269939b,0
4,0001a00468a6,0


In [73]:
df_orders.shape

(786600, 13)

In [74]:
df_target_label.shape

(245455, 2)

In [75]:

df_orders.customer_id.nunique()

245455

We have 786600 records of orders for 245,455 unique customers in  2yrs span of time (2015-03-01,2017-02-28) 

In [77]:
##Checking for any duplicate rows. If any will be removed accordingly.
df_orders=df_orders.drop_duplicates()
df_orders.shape

(786054, 13)

In [70]:

df_orders.customer_id.nunique()

245455

It seems we have almost 546 records duplicate entries.it needs to removed.
The customer_ids remain same.

In [12]:
#Checking out  missing values if any in the columns of order data.
df_orders.isnull().any()

customer_id            False
order_date             False
order_hour             False
customer_order_rank     True
is_failed              False
voucher_amount         False
delivery_fee           False
amount_paid            False
restaurant_id          False
city_id                False
payment_id             False
platform_id            False
transmission_id        False
dtype: bool

As mentioned in data dictionary that 'customer_order_rank' is empty for failed orders.

In [14]:
#Datatypes of columns of the orders dataframe
df_orders.dtypes

customer_id             object
order_date              object
order_hour               int64
customer_order_rank    float64
is_failed                int64
voucher_amount         float64
delivery_fee           float64
amount_paid            float64
restaurant_id            int64
city_id                  int64
payment_id               int64
platform_id              int64
transmission_id          int64
dtype: object

'order_date' need to be converted to datetime format as its one of the imporant column to understand the order patterns with respect to days etc. rest all columns seems to be fine

In [15]:
##Converting into Timestamp(datetime format)
df_orders['order_date']=pd.to_datetime(df_orders['order_date'])
df_orders.head()

Unnamed: 0,customer_id,order_date,order_hour,customer_order_rank,is_failed,voucher_amount,delivery_fee,amount_paid,restaurant_id,city_id,payment_id,platform_id,transmission_id
0,000097eabfd9,2015-06-20,19,1.0,0,0.0,0.0,11.4696,5803498,20326,1779,30231,4356
1,0000e2c6d9be,2016-01-29,20,1.0,0,0.0,0.0,9.558,239303498,76547,1619,30359,4356
2,000133bb597f,2017-02-26,19,1.0,0,0.0,0.493,5.93658,206463498,33833,1619,30359,4324
3,00018269939b,2017-02-05,17,1.0,0,0.0,0.493,9.8235,36613498,99315,1619,30359,4356
4,0001a00468a6,2015-08-04,19,1.0,0,0.0,0.493,5.1507,225853498,16456,1619,29463,4356


Describing the data to see count,min,max values of the various numerical columns 

In [18]:
df_orders[['customer_order_rank','voucher_amount','delivery_fee','amount_paid']].describe()

Unnamed: 0,customer_order_rank,voucher_amount,delivery_fee,amount_paid
count,761833.0,786054.0,786054.0,786054.0
mean,9.43681,0.091476,0.181161,10.182875
std,17.772322,0.479488,0.369661,5.605253
min,1.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,6.65343
50%,3.0,0.0,0.0,9.027
75%,10.0,0.0,0.0,12.213
max,369.0,93.3989,9.86,1131.03


In [19]:
df_orders[df_orders.customer_order_rank==369]

Unnamed: 0,customer_id,order_date,order_hour,customer_order_rank,is_failed,voucher_amount,delivery_fee,amount_paid,restaurant_id,city_id,payment_id,platform_id,transmission_id
69538,15edce943edd,2017-02-26,19,369.0,0,0.0,0.0,7.5402,332363498,31506,1779,29463,4996


# keytakeaways:
- customer_order_rank has missing values and with average of 9-10 orders and also max number of orders being 369 by one of the customer_id(15edce943edd).

- voucher_amount seems to be 0 for most of them and ofcourse max value for one of the customer being 93.39.

- delivery_fee seems to be low to 0 for most of them and on average being 0.18 collected.

- amount_paid per order is in range of 0 to 1131 .its been  10 on average though

In [20]:
##Categorical variables distribution
cat_columns = ['restaurant_id','city_id','payment_id','transmission_id','platform_id','order_hour','is_failed']
df_orders[cat_columns] = df_orders[cat_columns].astype(str)
df_orders.describe(include='O')

Unnamed: 0,customer_id,order_hour,is_failed,restaurant_id,city_id,payment_id,platform_id,transmission_id
count,786054,786054,786054,786054,786054,786054,786054,786054
unique,245455,24,2,13569,3749,5,14,10
top,15edce943edd,19,0,37623498,10346,1619,29463,4356
freq,386,133959,761833,1314,86574,476507,241351,341675


top:Most commonly occuring value among all values in a column.
freq:Frequency of most commonly occuring value amoong all values in a column

# Keytakeaways:
- orders data is from 3,749 unique cities,14 unique platforms,5 unique payment methods,10 different transmissions.

- Most commonly order_hour seems to 19 hrs(but depends on timezone of data)

- Failed orders represented by is_failed=1 and seems to be less amount of data. As is_failed=0 has more values.


In [22]:
# Labelled data understanding
#Count of customers by target response variable 'is_returning_customer'
df_target_label.is_returning_customer.value_counts()

0    189948
1     55507
Name: is_returning_customer, dtype: int64

In [24]:
#Percentage of customers
df_target_label.is_returning_customer.value_counts(normalize=True)*100

0    77.386079
1    22.613921
Name: is_returning_customer, dtype: float64

# Keytakeaways

- we have almost 77.3% of customers who churn out and dont return to the platform(did not order again in the 6 months after 2017-02-28).

- 22.6% of customers are placing orders again at least once after 2017-02-28.                                                                     
- Hence it becomes a class imbalance problem which needs to be balanced during train and test split of datasets and further evaluations.

                                                                           

# Feature Engineering and Transformations

Lets transform the data into rows per customer aggreagted information by computing various features through feature engineering.


In [26]:
#Input dataset
df_orders.head()

Unnamed: 0,customer_id,order_date,order_hour,customer_order_rank,is_failed,voucher_amount,delivery_fee,amount_paid,restaurant_id,city_id,payment_id,platform_id,transmission_id
0,000097eabfd9,2015-06-20,19,1.0,0,0.0,0.0,11.4696,5803498,20326,1779,30231,4356
1,0000e2c6d9be,2016-01-29,20,1.0,0,0.0,0.0,9.558,239303498,76547,1619,30359,4356
2,000133bb597f,2017-02-26,19,1.0,0,0.0,0.493,5.93658,206463498,33833,1619,30359,4324
3,00018269939b,2017-02-05,17,1.0,0,0.0,0.493,9.8235,36613498,99315,1619,30359,4356
4,0001a00468a6,2015-08-04,19,1.0,0,0.0,0.493,5.1507,225853498,16456,1619,29463,4356


Compute  
total orders placed per customer and also count of successful and failed orders 

In [27]:
#total orders
df_count_orders=df_orders[['customer_id','order_date']].groupby('customer_id')['order_date'].count().reset_index(name='total_orders')
df_count_orders.head()


Unnamed: 0,customer_id,total_orders
0,000097eabfd9,1
1,0000e2c6d9be,1
2,000133bb597f,1
3,00018269939b,1
4,0001a00468a6,1


In [29]:
#total successful orders ie filtering orders with customer_order_rank is null (failed orders)
df_count_sucess_orders=df_orders[~df_orders.customer_order_rank.isnull()][['customer_id','customer_order_rank']]
df_count_sucess_orders=df_count_sucess_orders.groupby('customer_id')['customer_order_rank'].count().reset_index(name='total_sucess_orders')
df_count_sucess_orders.head()


Unnamed: 0,customer_id,total_sucess_orders
0,000097eabfd9,1
1,0000e2c6d9be,1
2,000133bb597f,1
3,00018269939b,1
4,0001a00468a6,1


In [33]:
#Total failed orders per customer
df_orders['is_failed']=df_orders['is_failed'].astype(int)
df_count_failed_orders=df_orders[['customer_id','is_failed']].groupby('customer_id')['is_failed'].sum().reset_index(name='total_failed_orders')
df_count_failed_orders.head()


Unnamed: 0,customer_id,total_failed_orders
0,000097eabfd9,0
1,0000e2c6d9be,0
2,000133bb597f,0
3,00018269939b,0
4,0001a00468a6,0


In [39]:
#Just cross checking the computation for one random customer with orders information as well
df_count_failed_orders[df_count_failed_orders.customer_id=='fffe9d5a8d41'].head()

Unnamed: 0,customer_id,total_failed_orders
245451,fffe9d5a8d41,2


In [37]:
df_orders[df_orders.customer_id=='fffe9d5a8d41'].head()

Unnamed: 0,customer_id,order_date,order_hour,customer_order_rank,is_failed,voucher_amount,delivery_fee,amount_paid,restaurant_id,city_id,payment_id,platform_id,transmission_id
786593,fffe9d5a8d41,2016-07-31,21,,1,0.0,0.0,8.4429,156133498,10346,1811,29463,212
786594,fffe9d5a8d41,2016-09-30,20,1.0,0,0.0,0.0,10.7262,983498,10346,1779,29463,4228
786595,fffe9d5a8d41,2016-09-30,20,,1,0.0,0.0,10.7262,983498,10346,1779,29463,212


In [30]:
df_count_sucess_orders.customer_id.nunique()

244937

# Keytakeaways

- we have 244937 customers of 245455 ie almost 99.7% customers having placed atleast one successful order.
- Remaining 518 customers have placed orders but havent been successful maybe platform issues/restaurant decline or change of user choice etc could be reasons/ data capturing problems

- For further analysis because of time constraints and also 518 customers seems to be of small magnitude .I would consider the data as it and won't filter those failed orders customers from the dataset.
In future areas of imporvement ,this would need further investigation to filter and build datasets separately to see if it has impact on model performances etc

Feature Set 2:

- When did the customer order for 1st and last time and also the days before end of this dataset(2017-03-01) did customer placed the order ?
Recency of orders in terms of days..like on average how often he placed order

This metrics has been crucial in past experiences and also reflects the engagement of customer .

In [41]:
df_customer_ordered_dates=df_orders[['customer_id','order_date']].drop_duplicates()
df_customer_ordered_dates.head()

Unnamed: 0,customer_id,order_date
0,000097eabfd9,2015-06-20
1,0000e2c6d9be,2016-01-29
2,000133bb597f,2017-02-26
3,00018269939b,2017-02-05
4,0001a00468a6,2015-08-04


In [42]:
##compute the first and last ordered date per customer
df_customer_min_date=df_customer_ordered_dates.groupby('customer_id')['order_date'].min().reset_index(name='min_order_date')
df_customer_max_date=df_customer_ordered_dates.groupby('customer_id')['order_date'].max().reset_index(name='max_order_date')

df_customer_min_max_dates=pd.merge(df_customer_min_date,df_customer_max_date,on='customer_id')
df_customer_min_max_dates.head()

Unnamed: 0,customer_id,min_order_date,max_order_date
0,000097eabfd9,2015-06-20,2015-06-20
1,0000e2c6d9be,2016-01-29,2016-01-29
2,000133bb597f,2017-02-26,2017-02-26
3,00018269939b,2017-02-05,2017-02-05
4,0001a00468a6,2015-08-04,2015-08-04


In [43]:
#compute the difference in dates :days
df_customer_min_max_dates['diff_days_in_first_last_order']=(df_customer_min_max_dates['max_order_date']-df_customer_min_max_dates['min_order_date']).dt.days
df_customer_min_max_dates.head()


Unnamed: 0,customer_id,min_order_date,max_order_date,diff_days_in_first_last_order
0,000097eabfd9,2015-06-20,2015-06-20,0
1,0000e2c6d9be,2016-01-29,2016-01-29,0
2,000133bb597f,2017-02-26,2017-02-26,0
3,00018269939b,2017-02-05,2017-02-05,0
4,0001a00468a6,2015-08-04,2015-08-04,0


In [46]:
df_customer_min_max_dates.customer_id.nunique()

245455

In [44]:
df_customer_min_max_dates['diff_days_in_first_last_order'].value_counts(normalize=True)*100

0       60.180074
1        0.647777
7        0.496221
2        0.431851
14       0.376036
          ...    
1204     0.000407
1106     0.000407
1163     0.000407
1091     0.000407
1530     0.000407
Name: diff_days_in_first_last_order, Length: 741, dtype: float64

- we see that almost 60% of customers had zero days gap between there orders.

In [47]:
#Define the lastdate as per dataset mentioned 
last_date = datetime.datetime(2017,3,1)
print(last_date)

2017-03-01 00:00:00


In [48]:
#num_days_before_end of last_date placed order

df_customer_min_max_dates['num_days_before_lastdate']=df_customer_min_max_dates['max_order_date'].apply(lambda x:(last_date-x).days)
df_customer_min_max_dates.head()

Unnamed: 0,customer_id,min_order_date,max_order_date,diff_days_in_first_last_order,num_days_before_lastdate
0,000097eabfd9,2015-06-20,2015-06-20,0,620
1,0000e2c6d9be,2016-01-29,2016-01-29,0,397
2,000133bb597f,2017-02-26,2017-02-26,0,3
3,00018269939b,2017-02-05,2017-02-05,0,24
4,0001a00468a6,2015-08-04,2015-08-04,0,575


In [99]:
##num_days_before_end of date placed order

df_customer_min_max_dates['num_days_before_firstdate']=df_customer_min_max_dates['min_order_date'].apply(lambda x:(last_date-x).days)
df_customer_min_max_dates.head()

Unnamed: 0,customer_id,min_order_date,max_order_date,diff_days_in_first_last_order,num_days_before_lastdate,num_days_before_firstdate
0,000097eabfd9,2015-06-20,2015-06-20,0,620,620
1,0000e2c6d9be,2016-01-29,2016-01-29,0,397,397
2,000133bb597f,2017-02-26,2017-02-26,0,3,3
3,00018269939b,2017-02-05,2017-02-05,0,24,24
4,0001a00468a6,2015-08-04,2015-08-04,0,575,575


In [100]:
#for verification
df_customer_min_max_dates[df_customer_min_max_dates.customer_id=='fffe9d5a8d41'].head()

Unnamed: 0,customer_id,min_order_date,max_order_date,diff_days_in_first_last_order,num_days_before_lastdate,num_days_before_firstdate
245451,fffe9d5a8d41,2016-07-31,2016-09-30,61,152,213


In [101]:
#Dropping unncessary columns from the dataframe as we would need when we joined together all computed features together 
df_customer_min_max_dates_final=df_customer_min_max_dates.drop(['min_order_date','max_order_date'],axis=1)
df_customer_min_max_dates_final.head()

Unnamed: 0,customer_id,diff_days_in_first_last_order,num_days_before_lastdate,num_days_before_firstdate
0,000097eabfd9,0,620,620
1,0000e2c6d9be,0,397,397
2,000133bb597f,0,3,3
3,00018269939b,0,24,24
4,0001a00468a6,0,575,575


In [51]:
#Recency of Orders on average 
##assuming unique timezone..

df_recency=df_customer_ordered_dates.copy()
df_recency['timestamp']=df_recency['order_date'].apply(lambda x:x.strftime("%s"))

df_recency=df_recency.drop('order_date',axis=1)
df_recency=df_recency.sort_values(['customer_id','timestamp'],ascending=True)
df_recency['timestamp']=df_recency['timestamp'].astype(int)

df_tmp=df_recency.groupby('customer_id')['timestamp'].diff()
df_tmp.columns=['diff_timestamp']
df_recency['diff_timestamp']=df_tmp
df_recency['diff_timestamp']=df_recency['diff_timestamp'].fillna(0)
df_recency['recency_days']=df_recency['diff_timestamp'].apply(lambda x:x/(3600*24))
df_recency.head()


Unnamed: 0,customer_id,timestamp,diff_timestamp,recency_days
0,000097eabfd9,1434744000,0.0,0.0
1,0000e2c6d9be,1454011200,0.0,0.0
2,000133bb597f,1488052800,0.0,0.0
3,00018269939b,1486238400,0.0,0.0
4,0001a00468a6,1438632000,0.0,0.0


In [52]:
#Average recency of orders per customer in terms of days
df_recency_avg=df_recency[['customer_id','recency_days']].groupby('customer_id')['recency_days'].mean().reset_index(name='avg_recency_days_orders')

df_recency_avg.head()


Unnamed: 0,customer_id,avg_recency_days_orders
0,000097eabfd9,0.0
1,0000e2c6d9be,0.0
2,000133bb597f,0.0
3,00018269939b,0.0
4,0001a00468a6,0.0


FeatureSet 3

- Did customer used voucher anytime? If so can we flag them accordingly
- Did customer pay for delivery fee or not ? If so we can flag them accordingly
- Did customer pay any amount for orders.if so we can flag them accordingly.


In [55]:
#Voucher flag
df_voucher=df_orders[['customer_id','voucher_amount']]
df_voucher['used_voucher']=df_voucher['voucher_amount'].apply(lambda x: 1 if x>0 else 0)
df_voucher=df_voucher[['customer_id','used_voucher']].groupby('customer_id')['used_voucher'].max().reset_index(name='used_voucher')
df_voucher.head()


Unnamed: 0,customer_id,used_voucher
0,000097eabfd9,0
1,0000e2c6d9be,0
2,000133bb597f,0
3,00018269939b,0
4,0001a00468a6,0


In [56]:
df_voucher['used_voucher'].value_counts(normalize=True)*100

0    90.969832
1     9.030168
Name: used_voucher, dtype: float64

- Almost 91% of customers havent used any voucher for there orders

In [57]:
#Deliveryfee flag
df_deliveryfee=df_orders[['customer_id','delivery_fee']]
df_deliveryfee['paid_deliveryfee']=df_deliveryfee['delivery_fee'].apply(lambda x: 1 if x>0 else 0)
df_deliveryfee=df_deliveryfee[['customer_id','paid_deliveryfee']].groupby('customer_id')['paid_deliveryfee'].max().reset_index(name='paid_deliveryfee')
df_deliveryfee.head()

Unnamed: 0,customer_id,paid_deliveryfee
0,000097eabfd9,0
1,0000e2c6d9be,0
2,000133bb597f,1
3,00018269939b,1
4,0001a00468a6,1


In [62]:
df_deliveryfee['paid_deliveryfee'].value_counts()


0    165778
1     79677
Name: paid_deliveryfee, dtype: int64

In [58]:
#Percentage of Customers who pay or didnt pay delivery fee
df_deliveryfee['paid_deliveryfee'].value_counts(normalize=True)*100


0    67.53906
1    32.46094
Name: paid_deliveryfee, dtype: float64

- Almost 67% of customers have paid delivery fee for atleast one of there orders.

In [63]:
#Amount Paid flag

df_orders_paid=df_orders[['customer_id','amount_paid']]
df_orders_paid['paid_order']=df_orders_paid['amount_paid'].apply(lambda x: 1 if x>0 else 0)
df_orders_paid=df_orders_paid[['customer_id','paid_order']].groupby('customer_id')['paid_order'].max().reset_index(name='paid_order')
df_orders_paid.head()

Unnamed: 0,customer_id,paid_order
0,000097eabfd9,1
1,0000e2c6d9be,1
2,000133bb597f,1
3,00018269939b,1
4,0001a00468a6,1


In [64]:
df_orders_paid['paid_order'].value_counts(normalize=True)*100

1    99.852926
0     0.147074
Name: paid_order, dtype: float64

- Almost 99.8% of customers have paid for there orders atleast once.

FeatureSet 4: Dealing with Categorical variables

- convert the categorical presence into more interpretable format to understand the importance & impact of each categorical value per customer orders overall

In [69]:
df_orders.count()

customer_id            786054
order_date             786054
order_hour             786054
customer_order_rank    761833
is_failed              786054
voucher_amount         786054
delivery_fee           786054
amount_paid            786054
restaurant_id          786054
city_id                786054
payment_id             786054
platform_id            786054
transmission_id        786054
dtype: int64

In [65]:
def compute_cat_dummies(df_orders):
    df_pymt_dum=pd.get_dummies(df_orders['payment_id'],prefix='payment_id')
    df_transmission_dum=pd.get_dummies(df_orders['transmission_id'],prefix='transmission_id')
    df_platform_dum=pd.get_dummies(df_orders['platform_id'],prefix='platform_id')
    list_all_dfs=[df_orders,df_pymt_dum,df_transmission_dum,df_platform_dum]
    df_all=pd.concat(list_all_dfs,axis=1)
    return df_all
    

In [66]:
df_transform_orders=compute_cat_dummies(df_orders)
df_transform_orders.head()

Unnamed: 0,customer_id,order_date,order_hour,customer_order_rank,is_failed,voucher_amount,delivery_fee,amount_paid,restaurant_id,city_id,...,platform_id_29495,platform_id_29751,platform_id_29815,platform_id_30135,platform_id_30199,platform_id_30231,platform_id_30359,platform_id_30391,platform_id_30423,platform_id_525
0,000097eabfd9,2015-06-20,19,1.0,0,0.0,0.0,11.4696,5803498,20326,...,0,0,0,0,0,1,0,0,0,0
1,0000e2c6d9be,2016-01-29,20,1.0,0,0.0,0.0,9.558,239303498,76547,...,0,0,0,0,0,0,1,0,0,0
2,000133bb597f,2017-02-26,19,1.0,0,0.0,0.493,5.93658,206463498,33833,...,0,0,0,0,0,0,1,0,0,0
3,00018269939b,2017-02-05,17,1.0,0,0.0,0.493,9.8235,36613498,99315,...,0,0,0,0,0,0,1,0,0,0
4,0001a00468a6,2015-08-04,19,1.0,0,0.0,0.493,5.1507,225853498,16456,...,0,0,0,0,0,0,0,0,0,0


In [67]:
df_transform_orders.shape

(786054, 42)

In [68]:
df_transform_orders.columns

Index(['customer_id', 'order_date', 'order_hour', 'customer_order_rank',
       'is_failed', 'voucher_amount', 'delivery_fee', 'amount_paid',
       'restaurant_id', 'city_id', 'payment_id', 'platform_id',
       'transmission_id', 'payment_id_1491', 'payment_id_1523',
       'payment_id_1619', 'payment_id_1779', 'payment_id_1811',
       'transmission_id_1988', 'transmission_id_2020', 'transmission_id_21124',
       'transmission_id_212', 'transmission_id_4196', 'transmission_id_4228',
       'transmission_id_4260', 'transmission_id_4324', 'transmission_id_4356',
       'transmission_id_4996', 'platform_id_22167', 'platform_id_22263',
       'platform_id_22295', 'platform_id_29463', 'platform_id_29495',
       'platform_id_29751', 'platform_id_29815', 'platform_id_30135',
       'platform_id_30199', 'platform_id_30231', 'platform_id_30359',
       'platform_id_30391', 'platform_id_30423', 'platform_id_525'],
      dtype='object')

-Aggregations on the data to compute sum,mean,std of numerical values and capture unique values of categorical values
like restaurant,platform,payment,transmission and city

In [79]:
def q75(x):
            return x.quantile(0.75)
def q25(x):
            return x.quantile(0.25)


In [80]:
aggregations = {
    'restaurant_id':lambda x:x.nunique(),
    'city_id':lambda x:x.nunique(),
    'platform_id':lambda x:x.nunique(),
    'payment_id' : lambda x:x.nunique(),
    'transmission_id' :lambda x:x.nunique(),
    'amount_paid':['mean','std','sum',q25,q75],
    'delivery_fee':['mean','sum'],
    'voucher_amount':['mean','sum'],
    
}
df_agg_orders=df_orders.groupby('customer_id',as_index=False).agg(aggregations)
df_agg_orders.columns = ["_".join(x) for x in df_agg_orders.columns.ravel()]
df_agg_orders.head()


Unnamed: 0,customer_id_,restaurant_id_<lambda>,city_id_<lambda>,platform_id_<lambda>,payment_id_<lambda>,transmission_id_<lambda>,amount_paid_mean,amount_paid_std,amount_paid_sum,amount_paid_q25,amount_paid_q75,delivery_fee_mean,delivery_fee_sum,voucher_amount_mean,voucher_amount_sum
0,000097eabfd9,1,1,1,1,1,11.4696,,11.4696,11.4696,11.4696,0.0,0.0,0.0,0.0
1,0000e2c6d9be,1,1,1,1,1,9.558,,9.558,9.558,9.558,0.0,0.0,0.0,0.0
2,000133bb597f,1,1,1,1,1,5.93658,,5.93658,5.93658,5.93658,0.493,0.493,0.0,0.0
3,00018269939b,1,1,1,1,1,9.8235,,9.8235,9.8235,9.8235,0.493,0.493,0.0,0.0
4,0001a00468a6,1,1,1,1,1,5.1507,,5.1507,5.1507,5.1507,0.493,0.493,0.0,0.0


In [81]:
#Renaming columns to meaningful values
df_agg_orders=df_agg_orders.rename(columns={'customer_id_':'customer_id','restaurant_id_<lambda>':'num_restaurant_id_unique','city_id_<lambda>':'num_city_id_unique','platform_id_<lambda>':'num_platform_id_unique','payment_id_<lambda>':'num_payment_id_unique','transmission_id_<lambda>':'num_transmission_id_unique'})

df_agg_orders.head()
                              
                              

Unnamed: 0,customer_id,num_restaurant_id_unique,num_city_id_unique,num_platform_id_unique,num_payment_id_unique,num_transmission_id_unique,amount_paid_mean,amount_paid_std,amount_paid_sum,amount_paid_q25,amount_paid_q75,delivery_fee_mean,delivery_fee_sum,voucher_amount_mean,voucher_amount_sum
0,000097eabfd9,1,1,1,1,1,11.4696,,11.4696,11.4696,11.4696,0.0,0.0,0.0,0.0
1,0000e2c6d9be,1,1,1,1,1,9.558,,9.558,9.558,9.558,0.0,0.0,0.0,0.0
2,000133bb597f,1,1,1,1,1,5.93658,,5.93658,5.93658,5.93658,0.493,0.493,0.0,0.0
3,00018269939b,1,1,1,1,1,9.8235,,9.8235,9.8235,9.8235,0.493,0.493,0.0,0.0
4,0001a00468a6,1,1,1,1,1,5.1507,,5.1507,5.1507,5.1507,0.493,0.493,0.0,0.0


In [82]:
#verification
df_agg_orders.loc[df_agg_orders["customer_id"]=="ffff347c3cfa"]


Unnamed: 0,customer_id,num_restaurant_id_unique,num_city_id_unique,num_platform_id_unique,num_payment_id_unique,num_transmission_id_unique,amount_paid_mean,amount_paid_std,amount_paid_sum,amount_paid_q25,amount_paid_q75,delivery_fee_mean,delivery_fee_sum,voucher_amount_mean,voucher_amount_sum
245452,ffff347c3cfa,2,1,1,1,1,6.77025,1.163968,13.5405,6.358725,7.181775,0.0,0.0,0.0,0.0


- Compute the distribution of platform,payment,transmission and how much each percentage contribute to orders per customer.

In [83]:
#cols
payment_cols = df_transform_orders.filter(regex='^payment_id_', axis=1).columns.tolist()
platform_cols = df_transform_orders.filter(regex='^platform_id_', axis=1).columns.tolist()
transmission_cols = df_transform_orders.filter(regex='^transmission_id_', axis=1).columns.tolist()

In [84]:


##appending customer_id as a reference for further joins 
payment_cols.append('customer_id')
platform_cols.append('customer_id')
transmission_cols.append('customer_id')


In [85]:
payment_cols

['payment_id_1491',
 'payment_id_1523',
 'payment_id_1619',
 'payment_id_1779',
 'payment_id_1811',
 'customer_id']

In [86]:
def compute_percentage_contribution(df_transform_orders,cols):
    df=df_transform_orders[cols].groupby('customer_id').apply(np.sum).drop('customer_id',axis=1)
    df_res=round(df.div(df.sum(axis=1), axis=0)*100)
    return df_res.reset_index()
    

In [87]:
# aggregates computed
df_payment_agg=compute_percentage_contribution(df_transform_orders,payment_cols)
df_payment_agg.head()

Unnamed: 0,customer_id,payment_id_1491,payment_id_1523,payment_id_1619,payment_id_1779,payment_id_1811
0,000097eabfd9,0.0,0.0,0.0,100.0,0.0
1,0000e2c6d9be,0.0,0.0,100.0,0.0,0.0
2,000133bb597f,0.0,0.0,100.0,0.0,0.0
3,00018269939b,0.0,0.0,100.0,0.0,0.0
4,0001a00468a6,0.0,0.0,100.0,0.0,0.0


In [89]:

df_payment_agg[df_payment_agg.customer_id=='ffcdbbc627fe']

Unnamed: 0,customer_id,payment_id_1491,payment_id_1523,payment_id_1619,payment_id_1779,payment_id_1811
245283,ffcdbbc627fe,70.0,0.0,30.0,0.0,0.0


In [91]:
df_transmission_agg=compute_percentage_contribution(df_transform_orders,transmission_cols)
df_platform_agg=compute_percentage_contribution(df_transform_orders,platform_cols)


In [92]:
df_platform_agg.head()

Unnamed: 0,customer_id,platform_id_22167,platform_id_22263,platform_id_22295,platform_id_29463,platform_id_29495,platform_id_29751,platform_id_29815,platform_id_30135,platform_id_30199,platform_id_30231,platform_id_30359,platform_id_30391,platform_id_30423,platform_id_525
0,000097eabfd9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
1,0000e2c6d9be,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
2,000133bb597f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
3,00018269939b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
4,0001a00468a6,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Combine all independent feature dataframes together 

#List of dataframes computed to be merged
- df_platform_agg
- df_transmission_agg
- df_payment_agg
- df_count_failed_orders
- df_count_orders
- df_count_sucess_orders
- df_customer_min_max_dates_final
- df_recency_avg
- df_voucher
- df_deliveryfee
- df_orders_paid
- df_agg_orders
- df_target_label

In [93]:
df_agg_orders.head()

Unnamed: 0,customer_id,num_restaurant_id_unique,num_city_id_unique,num_platform_id_unique,num_payment_id_unique,num_transmission_id_unique,amount_paid_mean,amount_paid_std,amount_paid_sum,amount_paid_q25,amount_paid_q75,delivery_fee_mean,delivery_fee_sum,voucher_amount_mean,voucher_amount_sum
0,000097eabfd9,1,1,1,1,1,11.4696,,11.4696,11.4696,11.4696,0.0,0.0,0.0,0.0
1,0000e2c6d9be,1,1,1,1,1,9.558,,9.558,9.558,9.558,0.0,0.0,0.0,0.0
2,000133bb597f,1,1,1,1,1,5.93658,,5.93658,5.93658,5.93658,0.493,0.493,0.0,0.0
3,00018269939b,1,1,1,1,1,9.8235,,9.8235,9.8235,9.8235,0.493,0.493,0.0,0.0
4,0001a00468a6,1,1,1,1,1,5.1507,,5.1507,5.1507,5.1507,0.493,0.493,0.0,0.0


In [94]:
df_agg_orders.customer_id.nunique()

245455

In [102]:
#Consider df_agg_orders as base table and merge other tables together using left join on customer_id
df_final=df_agg_orders.merge(df_voucher,on='customer_id',how='left')
df_final.head()

Unnamed: 0,customer_id,num_restaurant_id_unique,num_city_id_unique,num_platform_id_unique,num_payment_id_unique,num_transmission_id_unique,amount_paid_mean,amount_paid_std,amount_paid_sum,amount_paid_q25,amount_paid_q75,delivery_fee_mean,delivery_fee_sum,voucher_amount_mean,voucher_amount_sum,used_voucher
0,000097eabfd9,1,1,1,1,1,11.4696,,11.4696,11.4696,11.4696,0.0,0.0,0.0,0.0,0
1,0000e2c6d9be,1,1,1,1,1,9.558,,9.558,9.558,9.558,0.0,0.0,0.0,0.0,0
2,000133bb597f,1,1,1,1,1,5.93658,,5.93658,5.93658,5.93658,0.493,0.493,0.0,0.0,0
3,00018269939b,1,1,1,1,1,9.8235,,9.8235,9.8235,9.8235,0.493,0.493,0.0,0.0,0
4,0001a00468a6,1,1,1,1,1,5.1507,,5.1507,5.1507,5.1507,0.493,0.493,0.0,0.0,0


In [103]:
df_final=df_final.merge(df_deliveryfee,on='customer_id',how='left')
df_final=df_final.merge(df_orders_paid,on='customer_id',how='left')
df_final=df_final.merge(df_recency_avg,on='customer_id',how='left')
df_final=df_final.merge(df_customer_min_max_dates_final,on='customer_id',how='left')
df_final=df_final.merge(df_count_orders,on='customer_id',how='left')
df_final=df_final.merge(df_count_sucess_orders,on='customer_id',how='left')
df_final=df_final.merge(df_count_failed_orders,on='customer_id',how='left')
df_final=df_final.merge(df_payment_agg,on='customer_id',how='left')
df_final=df_final.merge(df_transmission_agg,on='customer_id',how='left')
df_final=df_final.merge(df_platform_agg,on='customer_id',how='left')
df_final=df_final.merge(df_target_label,on='customer_id',how='left')
df_final.head()


Unnamed: 0,customer_id,num_restaurant_id_unique,num_city_id_unique,num_platform_id_unique,num_payment_id_unique,num_transmission_id_unique,amount_paid_mean,amount_paid_std,amount_paid_sum,amount_paid_q25,...,platform_id_29751,platform_id_29815,platform_id_30135,platform_id_30199,platform_id_30231,platform_id_30359,platform_id_30391,platform_id_30423,platform_id_525,is_returning_customer
0,000097eabfd9,1,1,1,1,1,11.4696,,11.4696,11.4696,...,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0
1,0000e2c6d9be,1,1,1,1,1,9.558,,9.558,9.558,...,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0
2,000133bb597f,1,1,1,1,1,5.93658,,5.93658,5.93658,...,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,1
3,00018269939b,1,1,1,1,1,9.8235,,9.8235,9.8235,...,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0
4,0001a00468a6,1,1,1,1,1,5.1507,,5.1507,5.1507,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [104]:
df_final.shape

(245455, 55)

In [105]:
df_final.dtypes

customer_id                       object
num_restaurant_id_unique           int64
num_city_id_unique                 int64
num_platform_id_unique             int64
num_payment_id_unique              int64
num_transmission_id_unique         int64
amount_paid_mean                 float64
amount_paid_std                  float64
amount_paid_sum                  float64
amount_paid_q25                  float64
amount_paid_q75                  float64
delivery_fee_mean                float64
delivery_fee_sum                 float64
voucher_amount_mean              float64
voucher_amount_sum               float64
used_voucher                       int64
paid_deliveryfee                   int64
paid_order                         int64
avg_recency_days_orders          float64
diff_days_in_first_last_order      int64
num_days_before_lastdate           int64
num_days_before_firstdate          int64
total_orders                       int64
total_sucess_orders              float64
total_failed_ord

In [106]:
df_final.isnull().any()

customer_id                      False
num_restaurant_id_unique         False
num_city_id_unique               False
num_platform_id_unique           False
num_payment_id_unique            False
num_transmission_id_unique       False
amount_paid_mean                 False
amount_paid_std                   True
amount_paid_sum                  False
amount_paid_q25                  False
amount_paid_q75                  False
delivery_fee_mean                False
delivery_fee_sum                 False
voucher_amount_mean              False
voucher_amount_sum               False
used_voucher                     False
paid_deliveryfee                 False
paid_order                       False
avg_recency_days_orders          False
diff_days_in_first_last_order    False
num_days_before_lastdate         False
num_days_before_firstdate        False
total_orders                     False
total_sucess_orders               True
total_failed_orders              False
payment_id_1491          

In [107]:
#Filling empty values with 0
df_final['amount_paid_std']=df_final['amount_paid_std'].fillna(0)
df_final['total_sucess_orders']=df_final['total_sucess_orders'].fillna(0)

In [108]:
###save final dataset for further analysis

df_final.to_csv('./input_data/peruser_order_attributes.csv',index=None)