In [64]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import date

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [50]:
#Import data
df_2019 = pd.read_csv('data_2019.csv', low_memory = False)
df_2020 = pd.read_csv('data_2020.csv', low_memory = False)

df_full = pd.concat([df_2019, df_2020])

In [51]:
#Rename columns
rename_columns = {'datetTimeFirstDeliveryMoment': 'dateTimeFirstDeliveryMoment',
                  'generalMatchClassification': 'detailedMatchClassification',
                  'detailedMatchClassification': 'generalMatchClassification'}

df_full = df_full.rename(columns = rename_columns)

In [52]:
#Transform dates to date-type
df_full['orderDate'] = pd.to_datetime(df_full['orderDate'])
df_full['cancellationDate'] = pd.to_datetime(df_full['cancellationDate'])
df_full['promisedDeliveryDate'] = pd.to_datetime(df_full['promisedDeliveryDate'])
df_full['shipmentDate'] = pd.to_datetime(df_full['shipmentDate'])
df_full['dateTimeFirstDeliveryMoment'] = pd.to_datetime(df_full['dateTimeFirstDeliveryMoment'])
df_full['startDateCase'] = pd.to_datetime(df_full['startDateCase'])
df_full['returnDateTime'] = pd.to_datetime(df_full['returnDateTime'])
df_full['registrationDateSeller'] = pd.to_datetime(df_full['registrationDateSeller'])

In [55]:
df_full.shape

(4779466, 38)

(returnDateTime < (CONVERT(date,dateTimeFirstDeliveryMoment)) AND dateTimeFirstDeliveryMoment IS NOT NULL AND returnDateTime IS NOT NULL) OR (shipmentDate > returnDateTime)

(cancellationDate > shipmentDate AND (cancellationReasonCode = 'CUST_FE' OR cancellationReasonCode = 'CUST_CS'))

In [86]:
#(df['returnDateTime'] < df['dateTimeFirstDeliveryMoment'] & df['dateTimeFirstDeliveryMoment'].notnull() & df['returnDateTime'].notnull())
#(df['cancellationDate'] > df['shipmentDate'] & (df['cancellationReasonCode'] == 'CUST_FE' | df['cancellationReasonCode'] == 'CUST_CS'))
df = df_full.drop(df_full[(df_full['startDateCase'] < df_full['orderDate']) | 
                   (df_full['cancellationDate'] < df_full['orderDate']) |
                   (df_full['promisedDeliveryDate'] < df_full['orderDate']) |
                   (df_full['shipmentDate'] < df_full['orderDate']) |
                   (df_full['dateTimeFirstDeliveryMoment'].dt.normalize() < df_full['orderDate']) |
                   (df_full['returnDateTime'] < df_full['orderDate']) |
                   (df_full['orderDate'] < df_full['registrationDateSeller']) |
                   (df_full['cancellationDate'] > df_full['dateTimeFirstDeliveryMoment'].dt.normalize()) |
                   (df_full['cancellationDate'] > df_full['returnDateTime']) |
                   (df_full['shipmentDate'] > df_full['returnDateTime']) |
                   (df_full['shipmentDate'] > df_full['dateTimeFirstDeliveryMoment'].dt.normalize()) |
                   (df_full['registrationDateSeller'].isnull()) |
                   (df_full['promisedDeliveryDate'].isnull())].index)

#Sort rows on orderDate and create new index
df = df.sort_values(by = 'orderDate')
df = df.reset_index(drop = True)

In [100]:
#df['Discounted_Price'] = df.apply(lambda row: row.Cost - (row.Cost * 0.1), axis = 1) 

#Create new variables related to periods (days difference)
df['caseDays'] = (df['startDateCase'] - df['orderDate']).dt.days
df['returnDays'] = (df['returnDateTime'] - df['orderDate']).dt.days
df['cancellationDays'] = (df['cancellationDate'] - df['orderDate']).dt.days
df['actualDeliveryDays'] = (df['dateTimeFirstDeliveryMoment'].dt.normalize() - df['orderDate']).dt.days
df['shipmentDays'] = (df['shipmentDate'] - df['orderDate']).dt.days
df['partnerSellingMonths'] = (df['orderDate'] - df['registrationDateSeller']).dt.days
df['promisedDeliveryDays'] = (df['promisedDeliveryDate'] - df['orderDate']).dt.days

#Time related variables
df['orderYear'] = df['orderDate'].dt.year
df['orderMonth'] = df['orderDate'].dt.month
df['orderWeekday'] = df['orderDate'].dt.weekday
df['orderCorona'] = df['orderDate'].apply(lambda x: True if x > '2020-03-20' else False)

#Create dummy variables for weeday and months
df['orderMonday'] = df['orderWeekday'].apply(lambda x: True if x == 1 else False)
df['orderTuesday'] = df['orderWeekday'].apply(lambda x: True if x == 2 else False)
df['orderWednesday'] = df['orderWeekday'].apply(lambda x: True if x == 3 else False)
df['orderThursday'] = df['orderWeekday'].apply(lambda x: True if x == 4 else False)
df['orderFriday'] = df['orderWeekday'].apply(lambda x: True if x == 5 else False)
df['orderSaturday'] = df['orderWeekday'].apply(lambda x: True if x == 6 else False)
df['orderSunday'] = df['orderWeekday'].apply(lambda x: True if x == 7 else False)

df['orderJanuary'] = df['orderMonth'].apply(lambda x: True if x == 1 else False)
df['orderFebruary'] = df['orderMonth'].apply(lambda x: True if x == 2 else False)
df['orderMarch'] = df['orderMonth'].apply(lambda x: True if x == 3 else False)
df['orderApril'] = df['orderMonth'].apply(lambda x: True if x == 4 else False)
df['orderMay'] = df['orderMonth'].apply(lambda x: True if x == 5 else False)
df['orderJune'] = df['orderMonth'].apply(lambda x: True if x == 6 else False)
df['orderJuly'] = df['orderMonth'].apply(lambda x: True if x == 7 else False)
df['orderAugust'] = df['orderMonth'].apply(lambda x: True if x == 8 else False)
df['orderSeptember'] = df['orderMonth'].apply(lambda x: True if x == 9 else False)
df['orderOctober'] = df['orderMonth'].apply(lambda x: True if x == 10 else False)
df['orderNovember'] = df['orderMonth'].apply(lambda x: True if x == 11 else False)
df['orderDecember'] = df['orderMonth'].apply(lambda x: True if x == 12 else False)

#Other variables
df['productTitleLength'] = len(df['productTitle'])

df['countryCodeNL'] = df['countryCode'].apply(lambda x: True if x == 'NL' else False)
df['fulfilmentByBol'] = df['fulfilmentType'].apply(lambda x: True if x == 'FBB' else False)
df['countryOriginNL'] = df['countryOriginSeller'].apply(lambda x: True if x == 'NL' else False)
df['countryOriginBE'] = df['countryOriginSeller'].apply(lambda x: True if x == 'BE' else False)
df['countryOriginDE'] = df['countryOriginSeller'].apply(lambda x: True if x == 'DE' else False)

#determinantClassification

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['caseDays'] = (df['startDateCase'] - df['orderDate']).dt.days
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['returnDays'] = (df['returnDateTime'] - df['orderDate']).dt.days
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cancellationDays'] = (df['cancellationDate'] - df['orderDate']).dt.d

In [105]:
#Change type of columns
dtype = {'calculationDefinitive': bool,
         'noCancellation': bool,
         'noCase': bool,
         'hasOneCase': bool,
         'hasMoreCases': bool,
         'noReturn': bool}

df = df.astype(dtype)
df.dtypes

In [108]:
def createColumns(df,X):
    """
    Function to create dynamic columns based on the prediction period.
    X = number of days after order date
    """
    
    dynamic_cols = ['caseDays','returnDays','cancellationDays','actualDeliveryDays']
    
    for col in dynamic_cols:
        
        dynamic_colname = col+'_'+str(X)
        df[dynamic_colname] = df[col].dt.days
        df[dynamic_colname] = df[dynamic_colname].apply(lambda x: x if x <= X else None)
        
    return df

In [204]:
def addProductCount_HARD(df):
    """
    Fuction to add a column to your dataframe which counts the number of occurances of
    the productId uptil that date.
    """

    df = df.sort_values(by = ['productId','orderDate'])
    
    productCount = np.ones(df.shape[0])
    
    previousID = None
    previousDate = None
    sameDate = False
    countSameDate = 1
    firstIndexSameDate = None

    for index, row in df.iterrows():
        
        if (previousID == row.productId) and (previousDate != row.orderDate) and (sameDate == False):
            productCount[index] = productCount[index-1] + 1
            previousID = row.productId
            previousDate = row.orderDate
            
        elif (previousID == row.productId) and (previousDate != row.orderDate) and (sameDate == True):
            productCount[firstIndexSameDate:index] = productCount[firstIndexSameDate-1] + countSameDate
            productCount[index] = productCount[index-1] + 1
            previousID = row.productId
            previousDate = row.orderDate
            sameDate = False
            firstIndexSameDate = None
            countSameDate = 1
            
        elif (previousID == row.productId) and (previousDate == row.orderDate) and (sameDate == False):
            sameDate = True
            firstIndexSameDate = index - 1
            countSameDate += 1
            previousID = row.productId
            previousDate = row.orderDate
            
        elif (previousID == row.productId) and (previousDate == row.orderDate) and (sameDate == True):
            countSameDate +=1
            previousID = row.productId
            previousDate = row.orderDate
            
        elif (previousID != row.productId) and (sameDate == True):
            productCount[firstIndexSameDate:index] = productCount[firstIndexSameDate-1] + countSameDate
            sameDate = False
            firstIndexSameDate = None
            countSameDate = 1
            previousID = row.productId
            previousDate = row.orderDate
            
        elif (previousID != row.productId) and (sameDate == False):
            previousID = row.productId
            previousDate = row.orderDate
        
    df['productCount'] = productCount
    
    return(df)

In [279]:
def addProductCount1(df):
    """
    Fuction to add a column to your dataframe which counts the number of occurances of
    the productId uptil that date.
    """

    df_ = df[['productId','orderDate']].sort_values(by = ['productId','orderDate'])
    df_ = df_.reset_index(drop = True)
    
    productCount = np.ones(df_.shape[0])
    
    previousID = None
    previousDate = None
    
    print('Check 1')

    for index, row in df_.iterrows():
        
        if index == 0:
            print('Check 2')
            previousID = row.productId
        elif (previousID == row.productId):
            productCount[index] = productCount[index - 1] + 1
            previousID = row.productId
        else:
            previousID = row.productId
            
        clear_output(wait=True)
        print(index, flush=True)
        
    df['productCount'] = productCount
    
    return(df)

In [295]:
def addProductCount2(df):
    """
    Fuction to add a column to your dataframe which counts the number of occurances of
    the productId uptil that date.
    """

    df_ = df[['productId','orderDate']].sort_values(by = ['productId','orderDate'])
    df_ = df_.reset_index(drop = True)
    
    productCount = np.ones(df_.shape[0])
    
    previousID = None
    previousDate = None
    
    print('Check 1')

    for row in df_.itertuples():
        
        if row[0] == 0:
            print('Check 2')
            previousID = row[1]
        elif (previousID == row[1]):
            productCount[row[0]] = productCount[row[0] - 1] + 1
            previousID = row[1]
        else:
            previousID = row[1]
            
        clear_output(wait=True)
        print(row[0], flush=True)
        
    df['productCount'] = productCount
    
    return(df)

In [305]:
def addProductCount3(df):
    """
    Fuction to add a column to your dataframe which counts the number of occurances of
    the productId uptil that date.
    """

    df_ = df[['productId','orderDate']].sort_values(by = ['productId','orderDate'])
    df_ = df_.to_numpy()
    
    N_rows = df_.shape[0]
    
    productCount = np.ones(N_rows)
    
    previousID = None
    previousDate = None
    
    print('Check 1')

    for ix in range(N_rows):
        
        if ix == 0:
            print('Check 2')
            previousID = df_[ix][1]
        elif (previousID == row[1]):
            productCount[ix] = productCount[ix - 1] + 1
            previousID = df_[ix][1]
        else:
            previousID = df_[ix][1]
            
        clear_output(wait=True)
        print(ix, flush=True)
        
    df['productCount'] = productCount
    
    return(df)

In [278]:
productCountDay = df[['productId','orderDate']].value_counts().sort_index()

In [234]:
from IPython.display import clear_output

for i in range(1000):
    clear_output(wait=True)
    print(i, flush=True)

999


In [293]:
for row in df_.itertuples():
    print(row[2])

1
2
3
3
4
4
2
1
2


In [306]:
addProductCount3(df)

33871


KeyboardInterrupt: 

In [301]:
df_test = df[['productId','orderDate']].iloc[:100].to_numpy()