# Seminar BA & QM
### Code Bol.com Case

Below you will find the code for the Bol.com case from group 8. 

Table of contents:

*  [Data preparation](#Data-Preparation)
*  [Functions](#Functions)

****

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import date, timedelta
import time

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Data Preparation

In [302]:
#Load the yearly order files and stack them into a single dataframe
df_2019 = pd.read_csv(
    '/Users/LV/Documents/ECONOMETRIE-ECO/MASTER/Seminar/2021_Case_Bol.com_Happy_Matches/data/data_2019.csv',
    low_memory = False,
)
df_2020 = pd.read_csv(
    '/Users/LV/Documents/ECONOMETRIE-ECO/MASTER/Seminar/2021_Case_Bol.com_Happy_Matches/data/data_2020.csv',
    low_memory = False,
)

#ignore_index gives the combined frame a fresh 0..n-1 index
df_full = pd.concat([df_2019, df_2020], ignore_index = True)

print('Total # records: ',len(df_full))

In [303]:
#Correct misspelled column names and swap the two match classification labels
#NOTE(review): general/detailed are exchanged on purpose here, presumably because
#the raw files have the two headers the wrong way around - confirm against the data.
rename_columns = {
    'datetTimeFirstDeliveryMoment': 'dateTimeFirstDeliveryMoment',
    'generalMatchClassification': 'detailedMatchClassification',
    'detailedMatchClassification': 'generalMatchClassification',
    'quanityReturnd': 'quantityReturned',
}

df_full = df_full.rename(rename_columns, axis = 'columns')

In [304]:
#Parse every date(-time) column into a pandas datetime column
date_columns = ['orderDate',
                'cancellationDate',
                'promisedDeliveryDate',
                'shipmentDate',
                'dateTimeFirstDeliveryMoment',
                'startDateCase',
                'returnDateTime',
                'registrationDateSeller']

for date_column in date_columns:
    df_full[date_column] = pd.to_datetime(df_full[date_column])

In [306]:
#Remove non-sensible rows
#A row is flagged as noise when at least one of the following holds:
#  - the case start, cancellation, promised delivery, shipment, first delivery
#    (date part only) or return lies before the order date
#  - the order date lies before the registration date of the seller
#  - the cancellation lies after the first delivery (date part only) or after the return
#  - the shipment lies after the return or after the first delivery (date part only)
#  - registrationDateSeller or promisedDeliveryDate is missing
#  - the return lies before the first delivery (date part only) while both are known
#  - the cancellation lies after the shipment and the reason code is CUST_FE or CUST_CS
#    (NOTE(review): presumably the customer-initiated cancellation codes - confirm)
#Comparisons against a missing date evaluate to False, so a missing date by itself
#only flags a row through the two explicit isnull() checks.
#dt.normalize() sets the time of dateTimeFirstDeliveryMoment to midnight before comparing.
noise = df_full.loc[(df_full['startDateCase'] < df_full['orderDate']) | 
                   (df_full['cancellationDate'] < df_full['orderDate']) |
                   (df_full['promisedDeliveryDate'] < df_full['orderDate']) |
                   (df_full['shipmentDate'] < df_full['orderDate']) |
                   (df_full['dateTimeFirstDeliveryMoment'].dt.normalize() < df_full['orderDate']) |
                   (df_full['returnDateTime'] < df_full['orderDate']) |
                   (df_full['orderDate'] < df_full['registrationDateSeller']) |
                   (df_full['cancellationDate'] > df_full['dateTimeFirstDeliveryMoment'].dt.normalize()) |
                   (df_full['cancellationDate'] > df_full['returnDateTime']) |
                   (df_full['shipmentDate'] > df_full['returnDateTime']) |
                   (df_full['shipmentDate'] > df_full['dateTimeFirstDeliveryMoment'].dt.normalize()) |
                   (df_full['registrationDateSeller'].isnull()) |
                   (df_full['promisedDeliveryDate'].isnull()) |
                   ((df_full['returnDateTime'] < df_full['dateTimeFirstDeliveryMoment'].dt.normalize()) & (df_full['dateTimeFirstDeliveryMoment'].notnull()) & (df_full['returnDateTime'].notnull())) |
                   ((df_full['cancellationDate'] > df_full['shipmentDate']) & ((df_full['cancellationReasonCode'] == 'CUST_FE') | (df_full['cancellationReasonCode'] == 'CUST_CS')))].index

#Drop noise data (df_full itself is left untouched; df is the cleaned copy)
df = df_full.drop(index = noise)

#Remove inconsistent cancellation data:
#rows marked as noCancellation that still carry a cancellation date get their
#cancellation date and reason code blanked out
df.loc[(df['noCancellation'] == True) & (df['cancellationDate'].notnull()), 
       ['cancellationDate','cancellationReasonCode']] = None

#Sort rows on orderDate and create new index
df = df.sort_values(by = 'orderDate')
df = df.reset_index(drop = True)

print('Cleaned # records: ',df.shape[0])

Cleaned # records:  4772950


In [11]:
#Create new variables related to periods (whole days between the order date and the event)
df['caseDays'] = (df['startDateCase'] - df['orderDate']).dt.days
df['returnDays'] = (df['returnDateTime'] - df['orderDate']).dt.days
df['cancellationDays'] = (df['cancellationDate'] - df['orderDate']).dt.days
df['actualDeliveryDays'] = (df['dateTimeFirstDeliveryMoment'].dt.normalize() - df['orderDate']).dt.days
df['shipmentDays'] = (df['shipmentDate'] - df['orderDate']).dt.days
#NOTE(review): despite its name this column holds DAYS since the seller registered,
#not months. The value is left unchanged because the intended unit is unclear -
#either rename the column or convert to months.
df['partnerSellingMonths'] = (df['orderDate'] - df['registrationDateSeller']).dt.days
df['promisedDeliveryDays'] = (df['promisedDeliveryDate'] - df['orderDate']).dt.days

#Time related variables
df['orderYear'] = df['orderDate'].dt.year
df['orderMonth'] = df['orderDate'].dt.month
df['orderWeekday'] = df['orderDate'].dt.weekday
#TODO: orderCorona dummy (order placed after 2020-03-20) is not implemented yet

#Create dummy variables for weekday and months
#dt.weekday numbers the days Monday = 0 ... Sunday = 6, so the dummies are matched
#on that numbering (matching on 1..7 would shift every day by one and leave
#orderSunday always False).
weekday_names = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
for weekday_number, weekday_name in enumerate(weekday_names):
    df['order' + weekday_name] = df['orderWeekday'] == weekday_number

#dt.month numbers the months January = 1 ... December = 12
month_names = ['January','February','March','April','May','June',
               'July','August','September','October','November','December']
for month_number, month_name in enumerate(month_names, start = 1):
    df['order' + month_name] = df['orderMonth'] == month_number

#Other variables
#Number of characters per product title (missing titles stay missing);
#len(df['productTitle']) would put the number of rows of the dataframe in every row.
df['productTitleLength'] = df['productTitle'].str.len()

df['countryCodeNL'] = df['countryCode'] == 'NL'
df['fulfilmentByBol'] = df['fulfilmentType'] == 'FBB'
df['countryOriginNL'] = df['countryOriginSeller'] == 'NL'
df['countryOriginBE'] = df['countryOriginSeller'] == 'BE'
df['countryOriginDE'] = df['countryOriginSeller'] == 'DE'

#TODO: determinantClassification

In [16]:
#Cast the indicator columns to boolean
#NOTE(review): astype(bool) turns missing values into True - confirm that these
#columns contain no missing values.
bool_columns = ['calculationDefinitive',
                'noCancellation',
                'noCase',
                'hasOneCase',
                'hasMoreCases',
                'noReturn']
dtype = dict.fromkeys(bool_columns, bool)

df = df.astype(dtype)
df.dtypes

orderDate          datetime64[ns]
productId                   int64
sellerId                    int64
totalPrice                float64
quantityOrdered             int64
                        ...      
countryCodeNL                bool
fulfilmentByBol              bool
countryOriginNL              bool
countryOriginBE              bool
countryOriginDE              bool
Length: 73, dtype: object

### Functions

In [108]:
def addPeriodColumns(df,X):
    """
    Function to create dynamic columns based on the prediction period.

    For each of 'caseDays', 'returnDays', 'cancellationDays' and
    'actualDeliveryDays' a column '<name>_<X>' is added that holds the number
    of days when it is at most X and a missing value otherwise, i.e. only
    events that are already known X days after the order date are kept.

    df = dataframe containing the four day-count columns (modified in place)
    X = number of days after order date

    Returns the same dataframe with the four extra columns.
    """
    
    dynamic_cols = ['caseDays','returnDays','cancellationDays','actualDeliveryDays']
    
    for col in dynamic_cols:
        
        days = df[col]
        
        #The columns are normally stored as whole day counts already; the .dt
        #accessor only exists on timedelta columns and raises on numeric ones.
        if pd.api.types.is_timedelta64_dtype(days):
            days = days.dt.days
        
        #Keep values <= X; everything else (later events and missing values) becomes NaN
        df[col+'_'+str(X)] = days.where(days <= X)
        
    return df

In [264]:
def addProductCount_OLD(df):
    """
    Function to add a column to your dataframe which counts the number of occurrences of
    the productId up until that date.

    Orders of the same product on the same orderDate all receive the same
    value: the number of orders of that product with an orderDate on or before
    that date (so same-date orders are counted for each other).

    df = dataframe with at least the columns 'productId' and 'orderDate'

    Returns a copy of the dataframe, sorted on productId and orderDate with a
    fresh index, with the extra float column 'productCount'. Rows without an
    orderDate get a missing productCount.
    """

    df = df.sort_values(by = ['productId','orderDate'])
    df = df.reset_index(drop = True)
    
    #Within a product, the 'max' rank of an orderDate equals the number of rows
    #with an orderDate <= that date. Ranking per product keeps the counts of
    #different products separated and also covers the last rows of the data.
    df['productCount'] = df.groupby('productId')['orderDate'].rank(method = 'max')
    
    return(df)

In [272]:
def addProductCount_HARD(df):
    """
    Function to add columns to your dataframe which describe the order history of
    the productId up until the order date of each row.

    For every row the following is computed over all rows of the same product
    with an orderDate on or before the orderDate of the row (the row itself and
    other orders on the same date included):
        productOrderCount     = number of those rows with a known quantityOrdered
        productTotalCount     = sum of quantityOrdered of those rows
        productTotalReturned  = sum of quantityReturned of those rows whose
                                returnDateTime is also on or before the orderDate
        productReturnFraction = productTotalReturned / productTotalCount

    df = dataframe with the columns 'productId', 'orderDate', 'quantityOrdered',
         'quantityReturned' and 'returnDateTime'

    Returns a copy of the dataframe, sorted on productId and orderDate with a
    fresh index, with the four extra columns.
    """
    df = df.sort_values(by = ['productId','orderDate'])
    df = df.reset_index(drop = True)
    
    df_ = df[['productId','orderDate','quantityOrdered','quantityReturned','returnDateTime']]
    
    #Every row that has a productId is overwritten in the loop below
    productOrderCount = np.zeros(df_.shape[0])
    productTotalCount = np.zeros(df_.shape[0])
    productTotalReturned = np.zeros(df_.shape[0])
    
    #Split the data once per product instead of filtering the full dataframe
    #again for every product; all rows (the very first one included) follow the
    #same rule.
    for _, df_P in df_.groupby('productId', sort = False):
        
        for row in df_P.itertuples():
            
            #Orders of this product placed on or before the current order date
            extract = df_P.loc[df_P['orderDate'] <= row.orderDate]
            #...of which the return was already received on the current order date
            extract2 = extract.loc[extract['returnDateTime'] <= row.orderDate]
            
            #After reset_index the index label equals the row position
            productOrderCount[row.Index] = extract.quantityOrdered.count()
            productTotalCount[row.Index] = extract.quantityOrdered.sum()
            productTotalReturned[row.Index] = extract2.quantityReturned.sum()
               
    df['productOrderCount'] = productOrderCount
    df['productTotalCount'] = productTotalCount
    df['productTotalReturned'] = productTotalReturned
    
    #A total count of zero yields NaN or inf; only the numpy warning is silenced
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        df['productReturnFraction'] = productTotalReturned / productTotalCount
    
    return(df)

In [271]:
#Measure how long addProductCount_HARD needs for the first 10000 rows
#(the elapsed wall-clock time in seconds is printed)
t0 = time.time()
a = addProductCount_HARD(df.head(10000))
t1 = time.time()
total = t1-t0
print(total)

#To inspect the result:
#a[['productId','orderDate','quantityOrdered','quantityReturned','returnDateTime','productOrderCount','productTotalCount','productTotalReturned','productReturnFraction']].iloc[:40]

23.347223043441772


In [295]:
def addProductCount(df):
    """
    Function to add a column to your dataframe which counts the number of occurrences of
    the productId up until that order.

    The rows of each product are numbered 1, 2, 3, ... in order of orderDate
    (orders on the same date keep their current relative order) and that
    number is stored with the row it belongs to.

    df = dataframe with at least the columns 'productId' and 'orderDate'
         (modified in place; the row order is left as it is)
         NOTE(review): assumes productId has no missing values - confirm.

    Returns the same dataframe with the extra float column 'productCount'.
    """
    
    #Remember the original position of every row so that the counts, which are
    #computed in sorted order, can be written back to the right rows
    df_ = df[['productId','orderDate']].copy()
    df_['position'] = np.arange(df_.shape[0])
    df_ = df_.sort_values(by = ['productId','orderDate'])
    
    #Running number of the order within its product, starting at 1
    runningCount = df_.groupby('productId').cumcount().to_numpy() + 1
    
    productCount = np.ones(df_.shape[0])
    productCount[df_['position'].to_numpy()] = runningCount
        
    df['productCount'] = productCount
    
    return(df)

In [85]:
#DOES NOT WORK! (unfinished experiment, kept for reference)
def knownReturns(df):
    """
    Function to create column which includes the number of returns.

    Unfinished: the function is hard-coded to product 9200000105019922, only
    looks at the first 300 rows of that product, builds a local dataframe and
    then returns None, so the input dataframe is not changed and nothing is
    handed back to the caller.

    df = dataframe with the columns 'orderDate', 'productId', 'quantityOrdered',
         'returnDateTime' and 'quantityReturned'
    """
    #Orders of the product: order date and ordered quantity, numbered by rowId
    df1 = df.loc[df['productId']==9200000105019922, ['orderDate','productId','quantityOrdered']].iloc[:300]
    df1 = df1.sort_values(by = ['productId','orderDate'])
    df1 = df1.reset_index(drop = True)
    df1['rowId'] = df1.index

    #Returns of the same product: return moment and returned quantity, numbered by rowId
    df2 = df.loc[df['productId']==9200000105019922, ['orderDate','productId','returnDateTime','quantityReturned']].iloc[:300]
    df2 = df2.sort_values(by = ['productId','orderDate'])
    df2 = df2.reset_index(drop = True)
    df2['rowId'] = df2.index

    #Attach to every order the returns whose returnDateTime is exactly equal to
    #the orderDate of that order (left join, so orders without a match are kept)
    #NOTE(review): an exact match between an order date and a return moment is
    #probably not what was intended (returns up to the order date?) - confirm.
    df3 = df1.merge(df2, 
                    left_on=['orderDate','productId'], 
                    right_on=['returnDateTime','productId'], 
                    how = 'left')
    df3 = df3.sort_values(by = ['orderDate_x','productId'])
    #Share of each row in the total returned quantity of its
    #(orderDate_x, productId, rowId_y) group
    df3['fraction'] = df3['quantityReturned'] / df3.groupby(by = ['orderDate_x','productId','rowId_y']).quantityReturned.transform(np.sum)
    
    #df3 is discarded here; the function returns None
    return