In [116]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [117]:
import pandas as pd
import numpy as np
import pyodbc as py

from datetime import date, timedelta, datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn import neighbors
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import SMOTE

import keras

pd.set_option('display.max_columns', None)

In [118]:
connection_string = (    
    r'Driver={SQL Server};'
    r'Server=LAPTOP-LD74USH0\SQLEXPRESS;'
    r'Integrated Security=SSPI;'
    r'Trusted_Connection=yes;'
)
pd.set_option('display.max_columns', None)

In [119]:
def sql2df(query, params=[], parse_dates=None, dsn='SQLEXPRESS'):
        with py.connect(connection_string, readonly=True) as conn:
            return pd.read_sql(query, conn, params=params, parse_dates=parse_dates)

In [97]:
# First work with random top 100.000 (to reduce computation time) - 45secs

df = sql2df('''
SELECT TOP 500000 * FROM Seminar.dbo.cleaned_bol_data_full
ORDER BY newid();
''')


In [120]:
# 9.5 minutes 

df = sql2df('''
SELECT * FROM Seminar.dbo.cleaned_bol_data_full;
''')

In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4772950 entries, 0 to 4772949
Data columns (total 78 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   orderDate                         object        
 1   productId                         object        
 2   sellerId                          object        
 3   totalPrice                        float64       
 4   quantityOrdered                   int64         
 5   countryCode                       object        
 6   cancellationDate                  object        
 7   cancellationReasonCode            object        
 8   promisedDeliveryDate              object        
 9   shipmentDate                      object        
 10  transporterCode                   object        
 11  transporterName                   object        
 12  transporterNameOther              object        
 13  dateTimeFirstDeliveryMoment       datetime64[ns]
 14  fulfilmentType    

In [122]:
#Change type of columns
dtype = {'calculationDefinitive': bool,
         'noCancellation': bool,
         'noCase': bool,
         'hasOneCase': bool,
         'hasMoreCases': bool,
         'noReturn': bool,
         'orderWeekend': bool,
         'orderCorona': bool,
         'countryCodeNL': bool,
         'fulfilmentByBol': bool,
         'countryOriginNL': bool,
         'countryOriginBE': bool,
         'countryOriginDE': bool,
         'orderMonday': bool,
         'orderTuesday': bool,
         'orderWednesday': bool,
         'orderThursday': bool,
         'orderFriday': bool,
         'orderSaturday': bool,
         'orderSunday': bool,
         'orderJanuary': bool,
         'orderFebruary': bool,
         'orderMarch': bool,
         'orderApril': bool,
         'orderMay': bool,
         'orderJune': bool,
         'orderJuly': bool,
         'orderAugust': bool,
         'orderSeptember': bool,
         'orderOctober': bool,
         'orderNovember': bool,
         'orderDecember': bool}

df = df.astype(dtype)

#Transform dates to date-type
df['orderDate'] = pd.to_datetime(df['orderDate'], errors='coerce')
df['cancellationDate'] = pd.to_datetime(df['cancellationDate'], errors='coerce')
df['promisedDeliveryDate'] = pd.to_datetime(df['promisedDeliveryDate'], errors='coerce')
df['shipmentDate'] = pd.to_datetime(df['shipmentDate'], errors='coerce')
df['dateTimeFirstDeliveryMoment'] = pd.to_datetime(df['dateTimeFirstDeliveryMoment'], errors='coerce')
df['startDateCase'] = pd.to_datetime(df['startDateCase'], errors='coerce')
df['returnDateTime'] = pd.to_datetime(df['returnDateTime'], errors='coerce')
df['registrationDateSeller'] = pd.to_datetime(df['registrationDateSeller'], errors='coerce')

df.dtypes

orderDate             datetime64[ns]
productId                     object
sellerId                      object
totalPrice                   float64
quantityOrdered                int64
                           ...      
orderSeptember                  bool
orderOctober                    bool
orderNovember                   bool
orderDecember                   bool
productTitleLength             int64
Length: 78, dtype: object

In [123]:
df.head()

Unnamed: 0,orderDate,productId,sellerId,totalPrice,quantityOrdered,countryCode,cancellationDate,cancellationReasonCode,promisedDeliveryDate,shipmentDate,transporterCode,transporterName,transporterNameOther,dateTimeFirstDeliveryMoment,fulfilmentType,startDateCase,cntDistinctCaseIds,returnDateTime,quantityReturned,returnCode,productTitle,brickName,chunkName,productGroup,productSubGroup,productSubSubGroup,registrationDateSeller,countryOriginSeller,currentCountryAvailabilitySeller,calculationDefinitive,noCancellation,onTimeDelivery,noCase,hasOneCase,hasMoreCases,noReturn,detailedMatchClassification,generalMatchClassification,determinantClassification,orderYear,orderMonth,orderYearMonth,orderWeekday,orderWeekend,orderCorona,transporterFeature,partnerSellingMonths,cancellationDays,shipmentDays,promisedDeliveryDays,actualDeliveryDays,caseDays,returnDays,countryCodeNL,fulfilmentByBol,countryOriginNL,countryOriginBE,countryOriginDE,orderMonday,orderTuesday,orderWednesday,orderThursday,orderFriday,orderSaturday,orderSunday,orderJanuary,orderFebruary,orderMarch,orderApril,orderMay,orderJune,orderJuly,orderAugust,orderSeptember,orderOctober,orderNovember,orderDecember,productTitleLength
0,2019-12-06,9000000012387632,656525,53.0,1,NL,NaT,,2019-12-09,2019-12-06,UPS,UPS,,NaT,FBR,NaT,,NaT,,,Bresser Optics Hunter 16 x 50 verrekijker BK-7...,Verrekijkers,Verrekijker,Sound and Vision Accessories,Beeld en Geluid Accessoires,Camera Accessoires,2012-07-04,NL,ALL,True,True,,True,False,False,True,UNKNOWN,UNKNOWN,Unknown delivery,2019,12,2019-12,5,False,True,UPS,89,,0.0,3,,,,True,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,52
1,2019-07-07,9200000011323565,656525,56.95,1,NL,NaT,,2019-07-09,2019-07-08,UPS,UPS,,NaT,FBR,NaT,,NaT,,,National Geographic Verrekijker 10x50 Porro,Verrekijkers,Verrekijker,Sound and Vision Accessories,Beeld en Geluid Accessoires,Camera Accessoires,2012-07-04,NL,ALL,True,True,,True,False,False,True,UNKNOWN,UNKNOWN,Unknown delivery,2019,7,2019-07,7,True,True,UPS,84,,1.0,2,,,,True,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,43
2,2019-11-25,9200000046125841,656525,16.95,1,NL,NaT,,2019-11-27,2019-11-26,UPS,UPS,,NaT,FBR,NaT,,NaT,,,"National Geographic 4x30 BK-7 Zwart, Geel verr...",Verrekijkers,Verrekijker,Sound and Vision Accessories,Beeld en Geluid Accessoires,Camera Accessoires,2012-07-04,NL,ALL,True,True,,True,False,False,True,UNKNOWN,UNKNOWN,Unknown delivery,2019,11,2019-11,1,False,True,UPS,88,,1.0,2,,,,True,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,53
3,2019-08-04,9200000086468758,656525,34.95,1,NL,NaT,,2019-08-06,2019-08-05,UPS,UPS,,NaT,FBR,NaT,,NaT,,,Bresser Microscoop preparaten 25 stuks - Biologie,Verrekijkers,Verrekijker,Recreational and Outdoor Toys,Leren en Experimenteren,Leren en Experimenteren,2012-07-04,NL,ALL,True,True,,True,False,False,True,UNKNOWN,UNKNOWN,Unknown delivery,2019,8,2019-08,7,True,True,UPS,85,,1.0,2,,,,True,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,49
4,2019-04-12,1000004001422158,742787,14.99,1,NL,NaT,,2019-04-16,2019-04-13,TNT_BRIEF,PostNL Briefpost,,NaT,FBR,NaT,,NaT,,,The Very Best of the Eagles,Voorbespeelde Audio,Muziek,Music,CD,CD,2013-01-28,NL,NL,True,True,,True,False,False,True,UNKNOWN,UNKNOWN,Unknown delivery,2019,4,2019-04,5,False,True,TNT_BRIEF,75,,1.0,4,,,,True,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,27


#### Add variables

In [124]:
# Binary classification variable
df['binaryMatchClassification'] = df['generalMatchClassification'].apply(lambda x: 'UNKNOWN' if x == 'UNKNOWN' else 'KNOWN')

# Dummy for year = 2020
df['orderYear2020'] = df['orderYear'].apply(lambda x: True if x == 2020 else False)

#### Transporter Groups

In [125]:
def transporterCluster(transporterCode):
    """
    Function to create a new manually clustered transporter variable: 28 -> 5 categories
    """
    if transporterCode in ['AH-NL','TNT','TNT-EXPRESS','TNT-EXTRA']:
        return 'POSTNL'
    elif transporterCode in ['DHL','DHL_DE','DHLFORYOU']:
        return 'DHL'
    elif transporterCode in ['DPD-NL','DPD-BE']:
        return 'DPD'
    elif transporterCode in ['BRIEFPOST','BPOST_BE','BPOST_BRIEF','DHL-GLOBAL-MAIL','TNT_BRIEF']:
        return 'BRIEFPOST'
    else:
        return 'OTHER'

In [126]:
df['transporterCodeGeneral'] = df['transporterCode'].apply(transporterCluster)
df['transporterCodeGeneral'].value_counts()

POSTNL       2110753
BRIEFPOST    1576462
DHL           436975
DPD           329746
OTHER         319014
Name: transporterCodeGeneral, dtype: int64

#### Product Groups

In [127]:
def productGroupCluster(productGroup):
    """
    Function to create a new manually clustered product group variable based on categories bol.com
    60 -> 14 groups.
    """
    if productGroup in ['Dutch Books PG','Ebooks and Audiobooks','International Books PG']:
        return 'Books'
    elif productGroup in ['Games Accessories','Games Consoles','Games Software Physical',
                          'Movies','Music']:
        return 'Music, Film & Games'
    elif productGroup in ['Camera','Desktop Monitor and Beamer','Ereaders and Accessories',
                          'Laptop Computers','PC Accessories','Personal Audio',
                          'Sound and Vision Accessories','Storage and Network',
                          'Telephone and Tablet Accessories','Telephones and Tablets','Television']:
        return 'Computer & Electronics'
    elif productGroup in ['General Toys','Recreational and Outdoor Toys']:
        return 'Toys & Hobby'
    elif productGroup in ['Baby and Kids Fashion','Baby PG']:
        return 'Baby & Kids'
    elif productGroup in ['Daily Care PG','Health PG','Perfumery PG','Personal Care']:
        return 'Health & Care'
    elif productGroup in ['Footwear','Jewelry and Watches','Mens and Womens Fashion','Wearables']:
        return 'Fashion, Shoes & Accessories'
    elif productGroup in ['Bodyfashion and Beachwear','Camping and Outdoor','Cycling',
                          'Sporting Equipment','Sportswear','Travel Bags and Accessories']:
        return 'Sports, Outdoor & Travel'
    elif productGroup in ['Educational Dutch','Educational International','Printing and Ink']:
        return 'Office & School'
    elif productGroup in ['Supermarket PG'] :
        return 'Food & Beverage'
    elif productGroup in ['Furniture','Heating and Air','Home Decoration','Home Entertainment',
                          'Household','Household Appliances','Kitchen','Kitchen Machines',
                          'Lighting','Major Domestic Appliances PG','Plumbing and Safety']:
        return 'Home, Cooking & Household'
    elif productGroup in ['Garden','Pet PG','Textiles','Tools and Paint']:
        return 'Pets, Garden & Jobs'
    elif productGroup in ['Car and Motorcycle'] :
        return 'Car & Motor'
    else:
        return 'Other'

In [128]:
df['productGroupGeneral'] = df['productGroup'].apply(productGroupCluster)
df['productGroupGeneral'].value_counts()

Computer & Electronics          1387679
Home, Cooking & Household        797874
Sports, Outdoor & Travel         522098
Toys & Hobby                     500977
Pets, Garden & Jobs              339813
Health & Care                    299049
Food & Beverage                  258769
Books                            184581
Music, Film & Games              163842
Baby & Kids                      113707
Fashion, Shoes & Accessories     110067
Office & School                   52270
Car & Motor                       29753
Other                             12471
Name: productGroupGeneral, dtype: int64

In [129]:
#Create dummies of new product grouping
for group in df['productGroupGeneral'].unique():
    
    columnName = 'group' + group.split(' ')[0].replace(',','')
    df[columnName] = df['productGroupGeneral'].apply(lambda x: True if x == group else False)

In [130]:
print(df.columns)
print('Total: ',len(df.columns),' columns')

Index(['orderDate', 'productId', 'sellerId', 'totalPrice', 'quantityOrdered',
       'countryCode', 'cancellationDate', 'cancellationReasonCode',
       'promisedDeliveryDate', 'shipmentDate', 'transporterCode',
       'transporterName', 'transporterNameOther',
       'dateTimeFirstDeliveryMoment', 'fulfilmentType', 'startDateCase',
       'cntDistinctCaseIds', 'returnDateTime', 'quantityReturned',
       'returnCode', 'productTitle', 'brickName', 'chunkName', 'productGroup',
       'productSubGroup', 'productSubSubGroup', 'registrationDateSeller',
       'countryOriginSeller', 'currentCountryAvailabilitySeller',
       'calculationDefinitive', 'noCancellation', 'onTimeDelivery', 'noCase',
       'hasOneCase', 'hasMoreCases', 'noReturn', 'detailedMatchClassification',
       'generalMatchClassification', 'determinantClassification', 'orderYear',
       'orderMonth', 'orderYearMonth', 'orderWeekday', 'orderWeekend',
       'orderCorona', 'transporterFeature', 'partnerSellingMonths',
   

#### Functions

In [131]:
def createColumns(df,X):
    """
    Function to create dynamic columns based on the prediction period.
    X = number of days after order date
    """
    
    dynamic_cols = ['caseDays','returnDays','cancellationDays','actualDeliveryDays']
    
    for col in dynamic_cols:
        
        dynamic_colname = col+'_'+str(X)
        df[dynamic_colname] = df[col].dt.days
        df[dynamic_colname] = df[dynamic_colname].apply(lambda x: x if x <= X else None)
        
    return df

In [132]:
def addPeriodColumns(df,X):
    """
    Function to create columns which indicate whether determinants are known after X days.
    Input: X = number of days after order date at which the prediction is made
           df = dataFrame
    TO-DO: add transporter dummies if known after shipmentDate
    """
    
    df['caseKnown']           = df.apply(lambda row: True if row.caseDays <= X else False, axis = 1)
    df['returnKnown']         = df.apply(lambda row: True if row.returnDays <= X else False, axis = 1)
    df['cancellationKnown']   = df.apply(lambda row: True if row.cancellationDays <= X else False, axis = 1)
    df['onTimeDeliveryKnown'] = df.apply(lambda row: True if ((row.actualDeliveryDays <= X) and (row.onTimeDelivery == True)) else False, axis = 1)
    df['lateDeliveryKnown']   = df.apply(lambda row: True if ((row.actualDeliveryDays <= X) and (row.onTimeDelivery == False)) else False, axis = 1)
    
#     df['transporterPostNL'] = df.apply(lambda row: True if ((row.shipmentDays <= X) and (row. == 'PostNL')) else False, axis = 1)
#     df['transporterDHL']    = df.apply(lambda row: True if ((row.shipmentDays <= X) and (row. == 'DHL')) else False, axis = 1)
#     df['transporterGLS']    = df.apply(lambda row: True if ((row.shipmentDays <= X) and (row. == 'GLS')) else False, axis = 1)
#     df['transporterBrief']  = df.apply(lambda row: True if ((row.shipmentDays <= X) and (row. == 'Brief')) else False, axis = 1)
#     df['transporterOther']  = df.apply(lambda row: True if ((row.shipmentDays <= X) and (row. == 'Other')) else False, axis = 1)

    return df

In [133]:
def addProductColumns(df):
    """
    Function to add 4 columns: productOrderCount, productTotalCount, productTotalReturned and productReturnFraction.
    Input: dataFrame with columns: 'productId','orderDate','quantityOrdered','quantityReturned','returnDateTime'.
    """
    df = df.sort_values(by = ['productId','orderDate'])
    df = df.reset_index(drop = True)
    
    df_ = df[['productId','orderDate','quantityOrdered','quantityReturned','returnDateTime']]
    
    #ProductTotalCount
    pivot = df_.groupby(['productId','orderDate']).quantityOrdered.sum().groupby('productId').cumsum()
    productTotalCount = df_.merge(pivot, 
                                left_on=['productId','orderDate'], 
                                right_index=True, 
                                how = 'left').quantityOrdered_y
    
    #ProductOrderCount
    pivot = df_.groupby(['productId','orderDate']).quantityOrdered.count().groupby('productId').cumsum()
    productOrderCount = df_.merge(pivot, 
                                left_on=['productId','orderDate'], 
                                right_index=True, 
                                how = 'left').quantityOrdered_y
    
    #ProductTotalReturned
    productTotalReturned = np.zeros(df_.shape[0])
    
    previousID = None
    
    returnDic = {}
    
    for row in df_.itertuples(): #iterate through dataFrame
        
        if row[0] == 0: 
            
            #update return dictionary if this product is returned
            if row[4] != None:
                if row[5] in returnDic:
                    returnDic[row[5]] += row[4]
                else:
                    returnDic[row[5]] = row[4]

            previousID = row[1]
            
        elif (previousID == row[1]):
            
            #update return dictionary if this product is returned
            if row[4] != None:
                if row[5] in returnDic:
                    returnDic[row[5]] += row[4]
                else:
                    returnDic[row[5]] = row[4]
            
            #add returned products to new dictionary if known
            known = {k: v for k, v in returnDic.items() if k <= row[2]}
            productTotalReturned[row[0]] = sum(known.values())
            
            #update the dictionary by removing the returns which are now known
            returnDic = {k: v for k, v in returnDic.items() if k > row[2]}
                        
            previousID = row[1]
            
        else:
            returnDic = {} #new productId, hence empty the return dictionary
            
            #update return dictionary if this product is returned
            if row[4] != None:
                if row[5] in returnDic:
                    returnDic[row[5]] += row[4]
                else:
                    returnDic[row[5]] = row[4]
                    
            previousID = row[1]
    
    df_['productTotalReturned'] = productTotalReturned
    pivot = df_.groupby(by = ['productId','orderDate']).productTotalReturned.sum().groupby('productId').cumsum()
    productTotalReturned = df_.merge(pivot, 
                                left_on=['productId','orderDate'], 
                                right_index=True, 
                                how = 'left').productTotalReturned_y
     
    #Add new columns to dataFrame    
    df['productOrderCount'] = productOrderCount
    df['productTotalCount'] = productTotalCount
    df['productTotalReturned'] = productTotalReturned
    df['productReturnFraction'] = productTotalReturned / productTotalCount
    
    return(df)

In [134]:
def classifyLabels(classifier, X, y, n, split = 'TimeSeries', smote = False, scale = None, days = 0):
    """
    Function to classify match labels using a pre-specified classifier with X and y variables. 
    
    Input:
    - classifier: can be any supported classifier. E.g. DecisionTreeClassifier(random_state=0, class_weight='balanced', max_depth=10). Necessary!
    - X: dataframe input on explanatory features. Necessary!
    - y: dataframe input on labels. Necessary!
    - n: number of folds to be evaluated.
    - split: object that can take value 'Random' to make K-fold random train/test split. Default is to apply time series split.
    - smote: boolean, if true Synthetic Minority Oversampling will be applied. Default = False.
    - scale: object that can take values 'MinMax' or 'Standard' to scale X correspondingly. Any other input will not scale X. Default = None.
    - days: integer number of days after orderDate that should be considered. Default = 0.
    
    Output: 
    - accuracy: list of accuracies for the n evaluated classifiers.
    - class_report: report of performance measures for the n evaluated classifiers.
    """
    
    accuracy = {}
    class_report = {}
    count = 1
    
    if split == 'Random':
        
        kf = StratifiedKFold(n_splits = n, random_state = 0, shuffle = True)
        for train_index, test_index in kf.split(X, y):

            if scale == 'MinMax':
                scaler = preprocessing.MinMaxScaler()
                X_scaled = pd.DataFrame(scaler.fit_transform(X))
                X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            elif scale == 'Standard':
                scaler = preprocessing.StandardScaler()
                X_scaled = pd.DataFrame(scaler.fit_transform(X))
                X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            else:
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            if smote == True:
                smote = SMOTE('not majority')
                X_train, y_train = smote.fit_sample(X_train,y_train)
            else:
                X_train, y_train = X_train, y_train
            
            clf = classifier
            clf = clf.fit(X_train,y_train)
            prediction = clf.predict(X_test)
            accuracy[count] = metrics.accuracy_score(y_test, prediction)
            class_report[count] = metrics.classification_report(y_test, prediction)
    
            print(count)
            count +=1
    
    else:
        
        tscv = TimeSeriesSplit(n_splits = n)
        
        for train_index, test_index in tscv.split(X):
        
            if scale == 'MinMax':
                scaler = preprocessing.MinMaxScaler()
                X_scaled = pd.DataFrame(scaler.fit_transform(X))
                X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            elif scale == 'Standard':
                scaler = preprocessing.StandardScaler()
                X_scaled = pd.DataFrame(scaler.fit_transform(X))
                X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            else:
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            if smote == True:
                smote = SMOTE('not majority')
                X_train, y_train = smote.fit_sample(X_train,y_train)
            else:
                X_train, y_train = X_train, y_train
            
            clf = classifier
            clf = clf.fit(X_train,y_train)
            prediction = clf.predict(X_test)
            accuracy[count] = metrics.accuracy_score(y_test, prediction)
            class_report[count] = metrics.classification_report(y_test, prediction)
    
            print(count)
            count +=1

    return(accuracy, class_report)

In [112]:
df = addProductColumns(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['productTotalReturned'] = productTotalReturned


In [113]:
df = addPeriodColumns(df,2)

In [135]:
#Categorical variables
s = (df.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)

Categorical variables:
['productId', 'sellerId', 'countryCode', 'cancellationReasonCode', 'transporterCode', 'transporterName', 'transporterNameOther', 'fulfilmentType', 'returnCode', 'productTitle', 'brickName', 'chunkName', 'productGroup', 'productSubGroup', 'productSubSubGroup', 'countryOriginSeller', 'currentCountryAvailabilitySeller', 'onTimeDelivery', 'detailedMatchClassification', 'generalMatchClassification', 'determinantClassification', 'orderMonth', 'orderYearMonth', 'transporterFeature', 'binaryMatchClassification', 'transporterCodeGeneral', 'productGroupGeneral']


# Function-based Classification

## Define X and y variables for classification

In [136]:
index = ['orderDate']
X_col_base = ['totalPrice','quantityOrdered','promisedDeliveryDays','orderCorona','partnerSellingMonths',
        'countryCodeNL', 'fulfilmentByBol', 'countryOriginNL', 'countryOriginBE', 'countryOriginDE', 'orderWeekend',
        'orderMonday','orderTuesday', 'orderWednesday', 'orderThursday', 'orderFriday', 'orderSaturday', 'orderSunday',
        'orderJanuary', 'orderFebruary', 'orderMarch', 'orderApril', 'orderMay', 'orderJune', 'orderJuly',
        'orderAugust', 'orderSeptember', 'orderOctober', 'orderNovember', 'orderDecember', 'productTitleLength',
        'productOrderCount', 'productReturnFraction', 'orderYear2020', 'groupComputer', 'groupFood', 'groupBooks',
        'groupHealth', 'groupToys', 'groupSports', 'groupHome', 'groupOffice', 'groupPets', 'groupMusic', 'groupFashion',
        'groupBaby', 'groupOther', 'groupCar']
#y_col = ['binaryMatchClassification']
y_col = ['generalMatchClassification']

In [137]:
def dataX(df,days):
    
    if days == 0:
        X_col = X_col_base # + 'productOrderCount(0)' + 'productReturnFraction(0)'
    else:
        df = addPeriodColumns(df,days)
        X_col = [X_col_base + 'caseKnown' + 'cancellationKnown' + 'returnKnown' + 'onTimeDeliveryKnown' + 'lateDeliveryKnown']
        # + 'productOrderCount(days)' + 'productReturnFraction(days)' + 'transporterPostNL' + 'transporterDHL' 
        # + 'transporterGLS' + 'transporterBrief' + 'transporterOther'

    df_test = df[index+X_col+y_col].dropna()
    df_test = df_test.sort_values(by = 'orderDate')
    df_test = df_test.reset_index(drop = True)

    X = df_test[X_col]
    y = df_test[y_col]
    
    return(X,y)

## Classification

### Function: classifyLabels(classifier, X, y, n, split = 'TimeSeries', smote = False, scale = None, days = 0)

In [79]:
# Naive Bayes Bernoulli
(accuracy,class_report) = classifyLabels(BernoulliNB(), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))


2


  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))


3
{1: 0.588248, 2: 0.689048, 3: 0.675984}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.61      0.68     68195
     UNHAPPY       0.15      0.29      0.20     15437
     UNKNOWN       0.66      0.66      0.66     41368

    accuracy                           0.59    125000
   macro avg       0.53      0.52      0.51    125000
weighted avg       0.66      0.59      0.62    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.69      0.94      0.80     75861
     UNHAPPY       0.00      0.00      0.00     14564
     UNKNOWN       0.67      0.44      0.53     34575

    accuracy                           0.69    125000
   macro avg       0.45      0.46      0.44    125000
weighted avg       0.61      0.69      0.63    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.66      0.94      0.78     70405
     UNHAPPY       0.00      0.00      0.00     14378
     UNKNOWN       0.72      0.46

In [83]:
# Naive Bayes Gaussian
(accuracy,class_report) = classifyLabels(GaussianNB(), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.648288, 2: 0.646056, 3: 0.644936}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.68      0.84      0.75     68195
     UNHAPPY       0.18      0.08      0.11     15437
     UNKNOWN       0.67      0.55      0.61     41368

    accuracy                           0.65    125000
   macro avg       0.51      0.49      0.49    125000
weighted avg       0.62      0.65      0.62    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.71      0.83      0.76     75861
     UNHAPPY       0.18      0.10      0.13     14564
     UNKNOWN       0.59      0.48      0.53     34575

    accuracy                           0.65    125000
   macro avg       0.49      0.47      0.47    125000
weighted avg       0.61      0.65      0.62    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.72      0.78      0.75     70405
     UNHAPPY       0.16      0.10      0.12     14378
     UNKNOWN       0.62      0.60

In [70]:
# K-NN
(accuracy,class_report) = classifyLabels(neighbors.KNeighborsClassifier(n_neighbors = 3), X, y, n = 3, scale = 'MinMax')
print(accuracy)
for item in class_report.values():
    print(item)

  clf = clf.fit(X_train,y_train)


1


  clf = clf.fit(X_train,y_train)


2


  clf = clf.fit(X_train,y_train)


3
{1: 0.776368, 2: 0.757728, 3: 0.762664}
              precision    recall  f1-score   support

       KNOWN       0.84      0.83      0.83     83632
     UNKNOWN       0.66      0.68      0.67     41368

    accuracy                           0.78    125000
   macro avg       0.75      0.75      0.75    125000
weighted avg       0.78      0.78      0.78    125000

              precision    recall  f1-score   support

       KNOWN       0.83      0.83      0.83     90425
     UNKNOWN       0.56      0.56      0.56     34575

    accuracy                           0.76    125000
   macro avg       0.70      0.70      0.70    125000
weighted avg       0.76      0.76      0.76    125000

              precision    recall  f1-score   support

       KNOWN       0.81      0.84      0.83     84783
     UNKNOWN       0.64      0.59      0.62     40217

    accuracy                           0.76    125000
   macro avg       0.73      0.72      0.72    125000
weighted avg       0.76      0.7

In [85]:
# Logistic Regression
(accuracy,class_report) = classifyLabels(LogisticRegression(random_state=0,
                                                            class_weight='balanced',
                                                            fit_intercept=False,
                                                            solver='liblinear'), X, y, n = 3, scale = 'MinMax')
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.678176, 2: 0.685968, 3: 0.7108}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.75      0.76     68195
     UNHAPPY       0.20      0.03      0.05     15437
     UNKNOWN       0.59      0.80      0.68     41368

    accuracy                           0.68    125000
   macro avg       0.52      0.53      0.50    125000
weighted avg       0.64      0.68      0.65    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.83      0.79     75861
     UNHAPPY       0.21      0.05      0.08     14564
     UNKNOWN       0.58      0.64      0.61     34575

    accuracy                           0.69    125000
   macro avg       0.52      0.51      0.49    125000
weighted avg       0.64      0.69      0.66    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.81      0.79     70405
     UNHAPPY       0.21      0.03      0.05     14378
     UNKNOWN       0.64      0.79  

In [None]:
# SVM (very slow!)
(accuracy,class_report) = classifyLabels(svm.SVC(random_state=0,
                                                 class_weight='balanced'), X, y, n = 3, scale = 'MinMax')
print(accuracy)
for item in class_report.values():
    print(item)

In [87]:
# Decision Tree
(accuracy,class_report) = classifyLabels(DecisionTreeClassifier(random_state=0,
                                                                class_weight='balanced'), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)



1
2
3
{1: 0.603464, 2: 0.587064, 3: 0.603976}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.74      0.67      0.70     68195
     UNHAPPY       0.14      0.18      0.16     15437
     UNKNOWN       0.63      0.64      0.64     41368

    accuracy                           0.60    125000
   macro avg       0.50      0.50      0.50    125000
weighted avg       0.63      0.60      0.61    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.66      0.70     75861
     UNHAPPY       0.13      0.16      0.14     14564
     UNKNOWN       0.52      0.61      0.56     34575

    accuracy                           0.59    125000
   macro avg       0.47      0.48      0.47    125000
weighted avg       0.62      0.59      0.60    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.72      0.69      0.71     70405
     UNHAPPY       0.13      0.18      0.15     14378
     UNKNOWN       0.64      

In [88]:
# AdaBoost
(accuracy,class_report) = classifyLabels(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
                                                            n_estimators=50,
                                                            random_state=0), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.73088, 2: 0.740224, 3: 0.751032}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.87      0.81     68195
     UNHAPPY       0.15      0.02      0.03     15437
     UNKNOWN       0.71      0.77      0.74     41368

    accuracy                           0.73    125000
   macro avg       0.54      0.55      0.52    125000
weighted avg       0.66      0.73      0.69    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.76      0.90      0.83     75861
     UNHAPPY       0.25      0.01      0.01     14564
     UNKNOWN       0.69      0.70      0.69     34575

    accuracy                           0.74    125000
   macro avg       0.57      0.54      0.51    125000
weighted avg       0.68      0.74      0.69    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.74      0.93      0.83     70405
     UNHAPPY       0.30      0.01      0.02     14378
     UNKNOWN       0.77      0.71 

In [89]:
# Gradient Boosting
(accuracy,class_report) = classifyLabels(GradientBoostingClassifier(random_state=0), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.747288, 2: 0.75016, 3: 0.74272}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.92      0.82     68195
     UNHAPPY       0.23      0.00      0.00     15437
     UNKNOWN       0.75      0.75      0.75     41368

    accuracy                           0.75    125000
   macro avg       0.58      0.55      0.52    125000
weighted avg       0.68      0.75      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.76      0.93      0.83     75861
     UNHAPPY       0.45      0.00      0.00     14564
     UNKNOWN       0.72      0.68      0.70     34575

    accuracy                           0.75    125000
   macro avg       0.65      0.54      0.51    125000
weighted avg       0.71      0.75      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.73      0.94      0.82     70405
     UNHAPPY       0.32      0.00      0.00     14378
     UNKNOWN       0.78      0.66  

In [90]:
# Hist Gradient Boosting
(accuracy,class_report) = classifyLabels(HistGradientBoostingClassifier(random_state=0), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.75276, 2: 0.751056, 3: 0.757056}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.92      0.83     68195
     UNHAPPY       0.25      0.00      0.01     15437
     UNKNOWN       0.75      0.76      0.76     41368

    accuracy                           0.75    125000
   macro avg       0.59      0.56      0.53    125000
weighted avg       0.69      0.75      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.92      0.83     75861
     UNHAPPY       0.22      0.00      0.00     14564
     UNKNOWN       0.71      0.71      0.71     34575

    accuracy                           0.75    125000
   macro avg       0.56      0.54      0.52    125000
weighted avg       0.69      0.75      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.94      0.83     70405
     UNHAPPY       0.26      0.00      0.00     14378
     UNKNOWN       0.79      0.71 

In [91]:
# Bagging
(accuracy,class_report) = classifyLabels(BaggingClassifier(n_estimators=10,
                                                           random_state=0), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.704248, 2: 0.691872, 3: 0.691832}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.74      0.84      0.79     68195
     UNHAPPY       0.15      0.05      0.07     15437
     UNKNOWN       0.70      0.72      0.71     41368

    accuracy                           0.70    125000
   macro avg       0.53      0.54      0.52    125000
weighted avg       0.66      0.70      0.67    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.76      0.82      0.79     75861
     UNHAPPY       0.16      0.06      0.09     14564
     UNKNOWN       0.62      0.68      0.65     34575

    accuracy                           0.69    125000
   macro avg       0.51      0.52      0.51    125000
weighted avg       0.65      0.69      0.67    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.74      0.82      0.78     70405
     UNHAPPY       0.14      0.07      0.10     14378
     UNKNOWN       0.70      0.68

In [92]:
# Random Forest
(accuracy,class_report) = classifyLabels(RandomForestClassifier(n_estimators=10,
                                                                random_state=0,
                                                                class_weight='balanced'), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  clf = clf.fit(X_train,y_train)


1


  clf = clf.fit(X_train,y_train)


2


  clf = clf.fit(X_train,y_train)


3
{1: 0.741472, 2: 0.743208, 3: 0.752912}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.90      0.82     68195
     UNHAPPY       0.18      0.03      0.05     15437
     UNKNOWN       0.75      0.74      0.75     41368

    accuracy                           0.74    125000
   macro avg       0.56      0.56      0.54    125000
weighted avg       0.68      0.74      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.91      0.83     75861
     UNHAPPY       0.17      0.03      0.05     14564
     UNKNOWN       0.72      0.69      0.70     34575

    accuracy                           0.74    125000
   macro avg       0.55      0.54      0.53    125000
weighted avg       0.68      0.74      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.93      0.83     70405
     UNHAPPY       0.18      0.03      0.05     14378
     UNKNOWN       0.79      0.71