In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cascade-cup-22/sample_submission.csv
/kaggle/input/cascade-cup-22/train.csv
/kaggle/input/cascade-cup-22/test.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

VALID_TRAIN_SPLIT = 0.2
TEST_SPLIT_RATIO  = 0.2
SEED = 42

In [3]:
data_df = pd.read_csv('/kaggle/input/cascade-cup-22/train.csv')
test_df = pd.read_csv('/kaggle/input/cascade-cup-22/test.csv')
print('DATA loaded ...')
print(f'Train Columns : {data_df.shape[1]}   Rows : {data_df.shape[0]}')
print(f'Test Columns : {test_df.shape[1]}   Rows : {test_df.shape[0]}')


DATA loaded ...
Train Columns : 20   Rows : 450000
Test Columns : 16   Rows : 144844


In [4]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype.name
        
        if col_type not in ['object', 'category', 'datetime64[ns, UTC]', 'datetime64[ns]']:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [5]:
data_df = reduce_mem_usage(data_df)
test_df = reduce_mem_usage(test_df)

Memory usage of dataframe is 68.66 MB
Memory usage after optimization is: 41.20 MB
Decreased by 40.0%
Memory usage of dataframe is 17.68 MB
Memory usage after optimization is: 9.67 MB
Decreased by 45.3%


In [6]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450000 entries, 0 to 449999
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_time            450000 non-null  object 
 1   order_id              450000 non-null  int32  
 2   order_date            450000 non-null  object 
 3   allot_time            450000 non-null  object 
 4   accept_time           449843 non-null  object 
 5   pickup_time           447579 non-null  object 
 6   delivered_time        444782 non-null  object 
 7   rider_id              450000 non-null  int16  
 8   first_mile_distance   450000 non-null  float16
 9   last_mile_distance    450000 non-null  float16
 10  alloted_orders        433052 non-null  float16
 11  delivered_orders      432659 non-null  float16
 12  cancelled             450000 non-null  int16  
 13  undelivered_orders    432659 non-null  float16
 14  lifetime_order_count  449947 non-null  float16
 15  

In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144844 entries, 0 to 144843
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_time            144844 non-null  object 
 1   order_id              144844 non-null  int32  
 2   order_date            144844 non-null  object 
 3   allot_time            144844 non-null  object 
 4   accept_time           144776 non-null  object 
 5   rider_id              144844 non-null  int16  
 6   first_mile_distance   144844 non-null  float16
 7   last_mile_distance    144844 non-null  float16
 8   alloted_orders        140071 non-null  float16
 9   delivered_orders      139960 non-null  float16
 10  undelivered_orders    139960 non-null  float16
 11  lifetime_order_count  144066 non-null  float16
 12  reassignment_method   4632 non-null    object 
 13  reassignment_reason   4635 non-null    object 
 14  reassigned_order      4635 non-null    float16
 15  

In [8]:
def get_features(df ,test = False ,normalize = False):
    if test :
        df = df[['order_time','order_id','allot_time','accept_time','first_mile_distance', 'last_mile_distance','alloted_orders', 'delivered_orders', 'undelivered_orders','lifetime_order_count' ,'session_time' ,'reassigned_order']]
    else:    
        df = df[['order_time','allot_time','accept_time','first_mile_distance', 'last_mile_distance','alloted_orders', 'delivered_orders', 'undelivered_orders','lifetime_order_count' , 'session_time','reassigned_order']]
    days = ['Monday' , 'Saturday' , 'Sunday' , 'Thursday' , 'Tuesday', 'Wednesday' ,'Friday']
    for i in df.columns[5:]:
        #print(i)
        #print(df[[i]][~np.isnan(df[i])][i].quantile(0.5))
        if i == 'reassigned_order':
            df[[i]] = df[[i]].fillna(0)
            continue
        df[[i]] = df[[i]].fillna(df[[i]][~np.isnan(df[i])][i].quantile(0.5) )
    ## Processing Date 
    df.order_time = pd.to_datetime(df.order_time ,yearfirst =True)
    df.allot_time = pd.to_datetime(df.allot_time ,yearfirst =True)
    df.accept_time = pd.to_datetime(df.accept_time ,yearfirst =True)
    df['elapse1'] = (df.allot_time - df.order_time).dt.total_seconds()
    df['elapse2'] = (df.accept_time - df.allot_time).dt.total_seconds()
    df['elapse3'] = df['elapse1'] + df['elapse2']
    df[['elapse1']] = df[['elapse1']].fillna(df[['elapse1']][~np.isnan(df['elapse1'])]['elapse1'].quantile(0.5) )
    df[['elapse2']] = df[['elapse2']].fillna(df[['elapse2']][~np.isnan(df['elapse2'])]['elapse2'].quantile(0.5) )
    df[['elapse3']] = df[['elapse3']].fillna(df[['elapse3']][~np.isnan(df['elapse3'])]['elapse3'].quantile(0.5) )
    df['elapse4'] = df['elapse2']/df['elapse1']
    df['tot_dist'] = df['first_mile_distance'] + df['last_mile_distance']
    df['del_ratio'] = df['delivered_orders']/df['alloted_orders']
    df['undel_ratio'] = 1 - df['del_ratio']
    df['order_ratio'] = df['alloted_orders']/df['lifetime_order_count']
    df['diff_dist'] = df['first_mile_distance'] - df['last_mile_distance']
    df['dist_ratio'] = df['first_mile_distance']/df['last_mile_distance']
    #df[['dist_ratio']] = df[['dist_ratio']].astype(np.float16)
    df['dist_ratio_2'] = df['last_mile_distance']/df['first_mile_distance']
    df['Day_of_the_week'] = df.order_time.dt.day_name()
    df['Hour'] = df.order_time.dt.hour
    df['Month'] = df.order_time.dt.month
    df.drop(columns = ['order_time'] ,inplace =True)
    df = pd.concat([df,pd.get_dummies(df['Day_of_the_week']).astype(int)] ,axis = 1)
    df.drop(columns = [ 'Day_of_the_week' ,'allot_time' , 'accept_time'] ,inplace = True) #'Day_of_the_week' ,
    for i in days:
        if i not in df.columns:
            df[i] = 0
    if normalize:
        df[['first_mile_distance','elapse1','elapse2', 'last_mile_distance','tot_dist','alloted_orders', 'delivered_orders','del_ratio','undelivered_orders','lifetime_order_count','Hour','session_time','dist_ratio']] = StandardScaler().fit_transform(df[['first_mile_distance','elapse1','elapse2', 'last_mile_distance','tot_dist','alloted_orders', 'delivered_orders','del_ratio','undelivered_orders','lifetime_order_count','Hour','session_time','dist_ratio']])
    if test:        
        df = df[['order_id','first_mile_distance','elapse1','elapse2', 'last_mile_distance','tot_dist','alloted_orders', 'delivered_orders','del_ratio','undelivered_orders','lifetime_order_count','Hour','session_time','Monday' , 'Saturday' , 'Sunday' , 'Thursday' , 'Tuesday', 'Wednesday','dist_ratio']]   #'Monday' , 'Saturday' , 'Sunday' , 'Thursday' , 'Tuesday', 'Wednesday'
    else:
        df = df[['first_mile_distance','elapse1','elapse2', 'last_mile_distance','tot_dist','alloted_orders', 'delivered_orders','del_ratio','undelivered_orders','lifetime_order_count','Hour','session_time','Monday' , 'Saturday' , 'Sunday' , 'Thursday' , 'Tuesday', 'Wednesday','dist_ratio']] #'undelivered_orders'  #,'del_ratio'  #,'order_ratio' #,'order_ratio' #,'diff_dist' #,'reassigned_order'#,'dist_ratio_2'#,'Hour'#,'Month'#,'tot_dist'
    df = reduce_mem_usage(df)
    return df

In [9]:
X ,y = get_features(data_df) , data_df[['cancelled']]
test_df = get_features(test_df ,test =True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https

Memory usage of dataframe is 39.48 MB
Memory usage after optimization is: 18.88 MB
Decreased by 52.2%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https

Memory usage of dataframe is 13.26 MB
Memory usage after optimization is: 6.63 MB
Decreased by 50.0%


In [10]:
X.shape ,y.shape ,test_df.shape

((450000, 19), (450000, 1), (144844, 20))

In [11]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450000 entries, 0 to 449999
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   first_mile_distance   450000 non-null  float16
 1   elapse1               450000 non-null  float16
 2   elapse2               450000 non-null  float16
 3   last_mile_distance    450000 non-null  float16
 4   tot_dist              450000 non-null  float16
 5   alloted_orders        450000 non-null  float16
 6   delivered_orders      450000 non-null  float16
 7   del_ratio             450000 non-null  float16
 8   undelivered_orders    450000 non-null  float16
 9   lifetime_order_count  450000 non-null  float16
 10  Hour                  450000 non-null  int16  
 11  session_time          450000 non-null  float16
 12  Monday                450000 non-null  int16  
 13  Saturday              450000 non-null  int16  
 14  Sunday                450000 non-null  int16  
 15  

In [12]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450000 entries, 0 to 449999
Data columns (total 1 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   cancelled  450000 non-null  int16
dtypes: int16(1)
memory usage: 879.0 KB


In [13]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144844 entries, 0 to 144843
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              144844 non-null  int32  
 1   first_mile_distance   144844 non-null  float16
 2   elapse1               144844 non-null  float16
 3   elapse2               144844 non-null  float16
 4   last_mile_distance    144844 non-null  float16
 5   tot_dist              144844 non-null  float16
 6   alloted_orders        144844 non-null  float16
 7   delivered_orders      144844 non-null  float16
 8   del_ratio             144844 non-null  float16
 9   undelivered_orders    144844 non-null  float16
 10  lifetime_order_count  144844 non-null  float16
 11  Hour                  144844 non-null  int16  
 12  session_time          144844 non-null  float16
 13  Monday                144844 non-null  int16  
 14  Saturday              144844 non-null  int16  
 15  

In [14]:
X.head(10)

Unnamed: 0,first_mile_distance,elapse1,elapse2,last_mile_distance,tot_dist,alloted_orders,delivered_orders,del_ratio,undelivered_orders,lifetime_order_count,Hour,session_time,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,dist_ratio
0,1.566406,24.0,9.0,2.650391,4.21875,46.0,46.0,1.0,0.0,621.0,2,175.5,0,0,0,0,1,0,0.59082
1,2.521484,41.0,48.0,2.759766,5.28125,8.0,8.0,1.0,0.0,105.0,2,3.267578,0,0,0,0,1,0,0.913574
2,2.207031,8.0,16.0,4.800781,7.007812,1.0,1.0,1.0,0.0,66.0,2,9.820312,0,0,0,0,1,0,0.459717
3,2.189453,32.0,41.0,6.378906,8.570312,1.0,1.0,1.0,0.0,127.0,2,17.53125,0,0,0,0,1,0,0.343262
4,2.787109,51.0,36.0,4.011719,6.796875,34.0,34.0,1.0,0.0,84.0,3,1.349609,0,0,0,0,1,0,0.694824
5,2.482422,298.0,13.0,5.179688,7.664062,296.0,294.0,0.993164,2.0,1506.0,3,175.5,0,0,0,0,1,0,0.479248
6,2.808594,28.0,47.0,3.400391,6.210938,45.0,45.0,1.0,0.0,1460.0,3,175.5,0,0,0,0,1,0,0.826172
7,0.025681,28.0,6.0,0.160034,0.185669,54.0,53.0,0.981445,1.0,270.0,3,44.15625,0,0,0,0,1,0,0.160522
8,2.443359,30.0,24.0,2.859375,5.304688,29.0,29.0,1.0,0.0,955.0,3,2.5,0,0,0,0,1,0,0.854492
9,2.878906,369.0,37.0,2.609375,5.488281,81.0,81.0,1.0,0.0,1.0,3,175.5,0,0,0,0,1,0,1.103516


In [15]:
test_df.head(10)

Unnamed: 0,order_id,first_mile_distance,elapse1,elapse2,last_mile_distance,tot_dist,alloted_orders,delivered_orders,del_ratio,undelivered_orders,lifetime_order_count,Hour,session_time,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,dist_ratio
0,130231,1.658203,25.0,26.0,4.539062,6.195312,216.0,215.0,0.995605,1.0,747.0,10,273.5,0,1,0,0,0,0,0.365234
1,130232,2.070312,1.0,9.0,5.839844,7.910156,52.0,52.0,1.0,0.0,75.0,10,252.125,0,1,0,0,0,0,0.354492
2,130233,1.388672,47.0,80.0,0.990234,2.378906,289.0,289.0,1.0,0.0,2214.0,10,241.375,0,1,0,0,0,0,1.402344
3,130234,1.904297,1.0,23.0,2.589844,4.492188,125.0,122.0,0.976074,3.0,1020.0,10,292.0,0,1,0,0,0,0,0.735352
4,130235,0.827637,8.0,60.0,0.939941,1.767578,352.0,350.0,0.994141,2.0,7284.0,10,247.125,0,1,0,0,0,0,0.880371
5,130237,0.879883,22.0,88.0,5.488281,6.367188,5.0,5.0,1.0,0.0,261.0,10,184.125,0,1,0,0,0,0,0.160278
6,130236,0.601074,0.0,63.0,0.970215,1.571289,59.0,58.0,0.98291,1.0,889.0,10,79.8125,0,1,0,0,0,0,0.619629
7,130238,1.467773,55.0,26.0,1.450195,2.917969,178.0,178.0,1.0,0.0,348.0,10,211.75,0,1,0,0,0,0,1.011719
8,130239,1.833984,2.0,52.0,1.230469,3.064453,30.0,30.0,1.0,0.0,87.0,10,208.0,0,1,0,0,0,0,1.490234
9,130240,2.279297,41.0,13.0,2.419922,4.699219,81.0,81.0,1.0,0.0,1470.0,10,213.0,0,1,0,0,0,0,0.941895


In [16]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier

In [17]:
params = {'n_estimators': 10000,
 'learning_rate': 0.1791352346972678,
 'num_leaves': 2484,
 'max_depth': 7,
 'min_data_in_leaf': 1000,
 'lambda_l1': 0.001416929031249321,
 'lambda_l2': 5,
 'min_gain_to_split': 6.936460637257937,
 'bagging_fraction': 0.9,
 'bagging_freq': 1,
 'feature_fraction': 0.5}

In [18]:
clf = BaggingClassifier(LGBMClassifier(seed=SEED ,**params,n_jobs= -1) ,n_estimators=100 ,random_state =SEED,verbose =2)

In [19]:
clf.fit(X, y)

  return f(**kwargs)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Building estimator 1 of 100 for this parallel run (total 100)...
Building estimator 2 of 100 for this parallel run (total 100)...
Building estimator 3 of 100 for this parallel run (total 100)...
Building estimator 4 of 100 for this parallel run (total 100)...
Building estimator 5 of 100 for this parallel run (total 100)...
Building estimator 6 of 100 for this parallel run (total 100)...
Building estimator 7 of 100 for this parallel run (total 100)...
Building estimator 8 of 100 for this parallel run (total 100)...
Building estimator 9 of 100 for this parallel run (total 100)...
Building estimator 10 of 100 for this parallel run (total 100)...
Building estimator 11 of 100 for this parallel run (total 100)...
Building estimator 12 of 100 for this parallel run (total 100)...
Building estimator 13 of 100 for this parallel run (total 100)...
Building estimator 14 of 100 for this parallel run (total 100)...
Building estimator 15 of 100 for this parallel run (total 100)...
Building estimator 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 193.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 193.5min finished


BaggingClassifier(base_estimator=LGBMClassifier(bagging_fraction=0.9,
                                                bagging_freq=1,
                                                feature_fraction=0.5,
                                                lambda_l1=0.001416929031249321,
                                                lambda_l2=5,
                                                learning_rate=0.1791352346972678,
                                                max_depth=7,
                                                min_data_in_leaf=1000,
                                                min_gain_to_split=6.936460637257937,
                                                n_estimators=10000,
                                                num_leaves=2484, seed=42),
                  n_estimators=100, random_state=42, verbose=2)

In [20]:
out = clf.predict_proba(test_df[list(test_df.columns[1:])])[:,1]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   25.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   25.8s finished


In [21]:
out_df = pd.DataFrame({'cancelled' : out} , index = test_df.order_id)

In [22]:
out_df.reset_index(inplace =True)

In [23]:
out_df.head()

Unnamed: 0,order_id,cancelled
0,130231,0.005242
1,130232,0.006907
2,130233,0.003712
3,130234,0.007496
4,130235,0.002979


In [24]:
out_df.shape

(144844, 2)

In [25]:
out_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144844 entries, 0 to 144843
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   order_id   144844 non-null  int64  
 1   cancelled  144844 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 2.2 MB


In [26]:
out_df.to_csv('/kaggle/working/best_submission1.csv' ,index =False)