In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [2]:
import pandas as pd
import numpy as np
import pyodbc as py

import warnings

from datetime import date, timedelta, datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import preprocessing
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn import neighbors
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import SMOTE

import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

import importlib
import functions
importlib.reload(functions)


pd.set_option('display.max_columns', None)

In [3]:
warnings.filterwarnings('ignore')

# Data Loading and preparation

## Option 1: via .csv files (preferred)

### Load files into dataframes

In [7]:
df_2019 = pd.read_csv('/Users/thoma/Documents/seminar_data/data_2019.csv')
df_2020 = pd.read_csv('/Users/thoma/Documents/seminar_data/data_2020.csv')

In [8]:
#Concat files and create new index
df_full = pd.concat([df_2019, df_2020])
df_full = df_full.reset_index(drop = True)

print('Total # records: ',df_full.shape[0])

Total # records:  4779466


### Rename existing columns

In [9]:
#Rename columns
rename_columns = {'datetTimeFirstDeliveryMoment': 'dateTimeFirstDeliveryMoment',
                  'generalMatchClassification'  : 'detailedMatchClassification',
                  'detailedMatchClassification' : 'generalMatchClassification',
                  'quanityReturned'             : 'quantityReturned'}

df_full = df_full.rename(columns = rename_columns)

### Change data type of relevant columns

In [10]:
#Transform dates to date-type
df_full['orderDate']                   = pd.to_datetime(df_full['orderDate'])
df_full['cancellationDate']            = pd.to_datetime(df_full['cancellationDate'])
df_full['promisedDeliveryDate']        = pd.to_datetime(df_full['promisedDeliveryDate'])
df_full['shipmentDate']                = pd.to_datetime(df_full['shipmentDate'])
df_full['dateTimeFirstDeliveryMoment'] = pd.to_datetime(df_full['dateTimeFirstDeliveryMoment'])
df_full['startDateCase']               = pd.to_datetime(df_full['startDateCase'])
df_full['returnDateTime']              = pd.to_datetime(df_full['returnDateTime'])
df_full['registrationDateSeller']      = pd.to_datetime(df_full['registrationDateSeller'])

#Change type of columns
dtype = {'calculationDefinitive': bool,
         'noCancellation'       : bool,
         'noCase'               : bool,
         'hasOneCase'           : bool,
         'hasMoreCases'         : bool,
         'noReturn'             : bool}

df_full = df_full.astype(dtype)

### Remove noise and irrelevant data

In [11]:
#Remove nonsensical rows
noise = df_full.loc[(df_full['startDateCase']        < df_full['orderDate']) | 
                   (df_full['cancellationDate']      < df_full['orderDate']) |
                   (df_full['promisedDeliveryDate']  < df_full['orderDate']) |
                   (df_full['shipmentDate']          < df_full['orderDate']) |
                   (df_full['returnDateTime']        < df_full['orderDate']) |
                   (df_full['cancellationDate']      > df_full['returnDateTime']) |
                   (df_full['shipmentDate']          > df_full['returnDateTime']) |
                   (df_full['orderDate']             < df_full['registrationDateSeller']) |
                   (df_full['orderDate']             > df_full['dateTimeFirstDeliveryMoment'].dt.normalize()) |
                   (df_full['cancellationDate']      > df_full['dateTimeFirstDeliveryMoment'].dt.normalize()) |
                   (df_full['shipmentDate']          > df_full['dateTimeFirstDeliveryMoment'].dt.normalize()) |
                   ((df_full['returnDateTime']       < df_full['dateTimeFirstDeliveryMoment'].dt.normalize()) & 
                                                       (df_full['dateTimeFirstDeliveryMoment'].notnull()) &
                                                       (df_full['returnDateTime'].notnull())) |
                   ((df_full['cancellationDate']     > df_full['shipmentDate']) &
                                                       ((df_full['cancellationReasonCode'] == 'CUST_FE') |
                                                       (df_full['cancellationReasonCode'] == 'CUST_CS'))) |
                   (df_full['registrationDateSeller'].isnull()) |
                   (df_full['promisedDeliveryDate'].isnull())].index

#Drop noise data
df = df_full.drop(index = noise)
print(len(noise), 'complete records removed from the data')
print('Cleaned # records: ',df.shape[0],'\n')

#Sort rows on orderDate and create new index
df = df.sort_values(by = 'orderDate')
df = df.reset_index(drop = True)

#Remove inconsistent values -> we fill in all data as known by bol during their lookback periods
cancellationNoise = df.loc[(df['noCancellation'] == True) & (df['cancellationDate'].notnull())].index
returnNoise       = df.loc[(df['noReturn'] == True) & (df['returnDateTime'].notnull())].index
caseNoise         = df.loc[(df['noCase'] == True) & (df['startDateCase'].notnull())].index
quantityNoise     = df.loc[df['quantityReturned'] > df['quantityOrdered']].index
deliveryNoise     = df.loc[(df['dateTimeFirstDeliveryMoment'].notnull()) & (df['onTimeDelivery'].isnull())].index

df.loc[cancellationNoise, ['cancellationDate','cancellationReasonCode']] = None
df.loc[returnNoise,       ['returnDateTime','quantityReturned','returnCode']] = None
df.loc[caseNoise,         ['startDateCase','cntDistinctCaseIds','hasOneCase','hasMoreCases']] = None
df.loc[quantityNoise,     ['quantityReturned']] = df.loc[quantityNoise, ['quantityOrdered']]
df.loc[deliveryNoise,     ['dateTimeFirstDeliveryMoment']] = None

print('# Records where cancellation values are emptied:',len(cancellationNoise))
print('# Records where return values are emptied: \t',len(returnNoise))
print('# Records where case values are emptied: \t',len(caseNoise))
print('# Records where quantity values are equalized: \t',len(quantityNoise))
print('# Records where delivery values are emptied: \t',len(deliveryNoise))

6516 complete records removed from the data
Cleaned # records:  4772950 

# Records where cancellation values are emptied: 53780
# Records where return values are emptied: 	 8208
# Records where case values are emptied: 	 0
# Records where quantity values are equalized: 	 14722
# Records where delivery values are emptied: 	 4103


### Create variables

#### Time-related 

In [17]:
#Create new variables related to periods (days difference)
df['caseDays']             = (df['startDateCase'] - df['orderDate']).dt.days
df['returnDays']           = (df['returnDateTime'] - df['orderDate']).dt.days
df['cancellationDays']     = (df['cancellationDate'] - df['orderDate']).dt.days
df['actualDeliveryDays']   = (df['dateTimeFirstDeliveryMoment'].dt.normalize() - df['orderDate']).dt.days
df['shipmentDays']         = (df['shipmentDate'] - df['orderDate']).dt.days
df['partnerSellingDays']   = (df['orderDate'] - df['registrationDateSeller']).dt.days
df['promisedDeliveryDays'] = (df['promisedDeliveryDate'] - df['orderDate']).dt.days

#Time related variables
df['orderYear']    = df['orderDate'].dt.year
df['orderMonth']   = df['orderDate'].dt.month
df['orderWeekday'] = df['orderDate'].dt.weekday
df['orderCorona']  = df['orderDate'].apply(lambda x: True if x > datetime.strptime('2020-03-20','%Y-%m-%d') else False)
# Weekend?

#Create dummy variables for weekdays, months and years
df['orderMonday']    = df['orderWeekday'].apply(lambda x: True if x == 0 else False)
df['orderTuesday']   = df['orderWeekday'].apply(lambda x: True if x == 1 else False)
df['orderWednesday'] = df['orderWeekday'].apply(lambda x: True if x == 2 else False)
df['orderThursday']  = df['orderWeekday'].apply(lambda x: True if x == 3 else False)
df['orderFriday']    = df['orderWeekday'].apply(lambda x: True if x == 4 else False)
df['orderSaturday']  = df['orderWeekday'].apply(lambda x: True if x == 5 else False)
df['orderSunday']    = df['orderWeekday'].apply(lambda x: True if x == 6 else False)

df['orderJanuary']   = df['orderMonth'].apply(lambda x: True if x == 1 else False)
df['orderFebruary']  = df['orderMonth'].apply(lambda x: True if x == 2 else False)
df['orderMarch']     = df['orderMonth'].apply(lambda x: True if x == 3 else False)
df['orderApril']     = df['orderMonth'].apply(lambda x: True if x == 4 else False)
df['orderMay']       = df['orderMonth'].apply(lambda x: True if x == 5 else False)
df['orderJune']      = df['orderMonth'].apply(lambda x: True if x == 6 else False)
df['orderJuly']      = df['orderMonth'].apply(lambda x: True if x == 7 else False)
df['orderAugust']    = df['orderMonth'].apply(lambda x: True if x == 8 else False)
df['orderSeptember'] = df['orderMonth'].apply(lambda x: True if x == 9 else False)
df['orderOctober']   = df['orderMonth'].apply(lambda x: True if x == 10 else False)
df['orderNovember']  = df['orderMonth'].apply(lambda x: True if x == 11 else False)
df['orderDecember']  = df['orderMonth'].apply(lambda x: True if x == 12 else False)

df['orderYear2019'] = df['orderYear'].apply(lambda x: True if x == 2019 else False)
df['orderYear2020'] = df['orderYear'].apply(lambda x: True if x == 2020 else False)

#### Other

In [18]:
df['productTitleLength'] = len(df['productTitle'])

df['fulfilmentByBol'] = df['fulfilmentType'].apply(lambda x: True if x == 'FBB' else False)

df['countryCodeNL']   = df['countryCode'].apply(lambda x: True if x == 'NL' else False)
df['countryOriginNL'] = df['countryOriginSeller'].apply(lambda x: True if x == 'NL' else False)
df['countryOriginBE'] = df['countryOriginSeller'].apply(lambda x: True if x == 'BE' else False)
df['countryOriginDE'] = df['countryOriginSeller'].apply(lambda x: True if x == 'DE' else False)

#### Determinant classification

In [43]:
df_values = df[['noCancellation','noReturn','noCase','onTimeDelivery']].values

determinantClassification = np.empty(df_values.shape[0], dtype='object')

for ix,df_ in enumerate(df_values):
    if ((df_[0] == 1) & (df_[1] == 1) & (df_[2] == 1) & (df_[3] == True)): 
        determinantClassification[ix] = 'All good'
    elif ((df_[0] == 1) & (df_[1] == 1) & (df_[2] == 1) & (np.isnan(df_[3]) == True)):
        determinantClassification[ix] = 'Unknown delivery'
    elif ((df_[0] == 1) & (df_[1] == 1) & (df_[2] == 1) & (df_[3] == False)):
        determinantClassification[ix] = 'Late delivery'
    elif ((df_[0] == 1) & (df_[1] == 1) & (df_[2] == 0) & (df_[3] == True)):
        determinantClassification[ix] = 'Case'
    elif ((df_[0] == 1) & (df_[1] == 1) & (df_[2] == 0) & (np.isnan(df_[3]) == True)):
        determinantClassification[ix] = 'Case + Unknown delivery'
    elif ((df_[0] == 1) & (df_[1] == 1) & (df_[2] == 0) & (df_[3] == False)):
        determinantClassification[ix] = 'Case + Late delivery'
    elif ((df_[0] == 1) & (df_[1] == 0) & (df_[2] == 1) & (df_[3] == True)):
        determinantClassification[ix] = 'Return'
    elif ((df_[0] == 1) & (df_[1] == 0) & (df_[2] == 1) & (np.isnan(df_[3]) == True)):
        determinantClassification[ix] = 'Return + Unknown delivery'
    elif ((df_[0] == 1) & (df_[1] == 0) & (df_[2] == 1) & (df_[3] == False)):
        determinantClassification[ix] = 'Return + Late delivery'
    elif ((df_[0] == 1) & (df_[1] == 0) & (df_[2] == 0) & (df_[3] == True)):
        determinantClassification[ix] = 'Return + Case'
    elif ((df_[0] == 1) & (df_[1] == 0) & (df_[2] == 0) & (np.isnan(df_[3]) == True)):
        determinantClassification[ix] = 'Return + Case + Unknown delivery'
    elif ((df_[0] == 1) & (df_[1] == 0) & (df_[2] == 0) & (df_[3] == False)):
        determinantClassification[ix] = 'Return + Case + Late delivery'
    elif (df_[0] == 0):
        determinantClassification[ix] = 'Cancellation'
        
df['determinantClassification'] = determinantClassification
df['determinantClassification'].value_counts()

In [44]:
df['determinantClassification'] = df.apply(determinantClassification, axis = 1)
df['determinantClassification'].value_counts()

All good                            2685606
Unknown delivery                    1517648
Late delivery                        146725
Return                               145556
Return + Unknown delivery             82968
Case + Unknown delivery               67732
Case                                  42643
Return + Case                         24204
Cancellation                          23693
Return + Case + Unknown delivery      17263
Return + Late delivery                 8426
Case + Late delivery                   8352
Return + Case + Late delivery          2134
Name: determinantClassification, dtype: int64

#### Binary labels

In [31]:
df['binaryMatchClassification'] = df['generalMatchClassification'].apply(lambda x: 'UNKNOWN' if x == 'UNKNOWN' else 'KNOWN')

#### Transporter

In [32]:
def transporterCluster(transporterCode):
    """
    Function to create a new manually clustered transporter variable: 28 -> 5 categories
    """
    if transporterCode in ['AH-NL','TNT','TNT-EXPRESS','TNT-EXTRA']:
        return 'POSTNL'
    elif transporterCode in ['DHL','DHL_DE','DHLFORYOU']:
        return 'DHL'
    elif transporterCode in ['DPD-NL','DPD-BE']:
        return 'DPD'
    elif transporterCode in ['BRIEFPOST','BPOST_BRIEF','DHL-GLOBAL-MAIL','TNT_BRIEF']:
        return 'BRIEF'
    else:
        return 'OTHER'

In [33]:
df['transporterCodeGeneral'] = df['transporterCode'].apply(transporterCluster)
df['transporterCodeGeneral'].value_counts()

POSTNL    2110753
BRIEF     1488153
DHL        436975
OTHER      407323
DPD        329746
Name: transporterCodeGeneral, dtype: int64

#### Product group

In [34]:
def productGroupCluster(productGroup):
    """
    Function to create a new manually clustered product group variable based on categories bol.com
    60 -> 14 groups.
    """
    if productGroup in ['Dutch Books PG','Ebooks and Audiobooks','International Books PG']:
        return 'Books'
    elif productGroup in ['Games Accessories','Games Consoles','Games Software Physical',
                          'Movies','Music']:
        return 'Music, Film & Games'
    elif productGroup in ['Camera','Desktop Monitor and Beamer','Ereaders and Accessories',
                          'Laptop Computers','PC Accessories','Personal Audio',
                          'Sound and Vision Accessories','Storage and Network',
                          'Telephone and Tablet Accessories','Telephones and Tablets','Television']:
        return 'Computer & Electronics'
    elif productGroup in ['General Toys','Recreational and Outdoor Toys']:
        return 'Toys & Hobby'
    elif productGroup in ['Baby and Kids Fashion','Baby PG']:
        return 'Baby & Kids'
    elif productGroup in ['Daily Care PG','Health PG','Perfumery PG','Personal Care']:
        return 'Health & Care'
    elif productGroup in ['Footwear','Jewelry and Watches','Mens and Womens Fashion','Wearables']:
        return 'Fashion, Shoes & Accessories'
    elif productGroup in ['Bodyfashion and Beachwear','Camping and Outdoor','Cycling',
                          'Sporting Equipment','Sportswear','Travel Bags and Accessories']:
        return 'Sports, Outdoor & Travel'
    elif productGroup in ['Educational Dutch','Educational International','Printing and Ink']:
        return 'Office & School'
    elif productGroup in ['Supermarket PG'] :
        return 'Food & Beverage'
    elif productGroup in ['Furniture','Heating and Air','Home Decoration','Home Entertainment',
                          'Household','Household Appliances','Kitchen','Kitchen Machines',
                          'Lighting','Major Domestic Appliances PG','Plumbing and Safety']:
        return 'Home, Cooking & Household'
    elif productGroup in ['Garden','Pet PG','Textiles','Tools and Paint']:
        return 'Pets, Garden & Jobs'
    elif productGroup in ['Car and Motorcycle'] :
        return 'Car & Motor'
    else:
        return 'Other'

In [35]:
df['productGroupGeneral'] = df['productGroup'].apply(productGroupCluster)
df['productGroupGeneral'].value_counts()

Computer & Electronics          1387679
Home, Cooking & Household        797874
Sports, Outdoor & Travel         522098
Toys & Hobby                     500977
Pets, Garden & Jobs              339813
Health & Care                    299049
Food & Beverage                  258769
Books                            184581
Music, Film & Games              163842
Baby & Kids                      113707
Fashion, Shoes & Accessories     110067
Office & School                   52270
Car & Motor                       29753
Other                             12471
Name: productGroupGeneral, dtype: int64

In [36]:
#Create dummies of new product grouping
for group in df['productGroupGeneral'].unique():
    
    columnName = 'group' + group.split(' ')[0].replace(',','')
    df[columnName] = df['productGroupGeneral'].apply(lambda x: True if x == group else False)

#### Cleaned total data

In [37]:
print(df.columns)
print('Total: ',len(df.columns),' columns')

Index(['orderDate', 'productId', 'sellerId', 'totalPrice', 'quantityOrdered',
       'countryCode', 'cancellationDate', 'cancellationReasonCode',
       'promisedDeliveryDate', 'shipmentDate', 'transporterCode',
       'transporterName', 'transporterNameOther',
       'dateTimeFirstDeliveryMoment', 'fulfilmentType', 'startDateCase',
       'cntDistinctCaseIds', 'returnDateTime', 'quantityReturned',
       'returnCode', 'productTitle', 'brickName', 'chunkName', 'productGroup',
       'productSubGroup', 'productSubSubGroup', 'registrationDateSeller',
       'countryOriginSeller', 'currentCountryAvailabilitySeller',
       'calculationDefinitive', 'noCancellation', 'onTimeDelivery', 'noCase',
       'hasOneCase', 'hasMoreCases', 'noReturn', 'detailedMatchClassification',
       'generalMatchClassification', 'caseDays', 'returnDays',
       'cancellationDays', 'actualDeliveryDays', 'shipmentDays',
       'partnerSellingDays', 'promisedDeliveryDays', 'orderYear', 'orderMonth',
       'order

In [38]:
#Fixed Columns:
DATE = ['orderDate']
BASIC = ['totalPrice','quantityOrdered','fulfilmentByBol','countryCodeNL','countryOriginNL','countryOriginBE',
        'countryOriginDE','productTitleLength','promisedDeliveryDays','partnerSellingMonths']
WEEK = ['orderMonday','orderTuesday','orderWednesday','orderThursday','orderFriday','orderSaturday','orderSunday']
MONTH = ['orderJanuary','orderFebruary','orderMarch','orderApril','orderMay','orderJune',
         'orderJuly','orderAugust','orderSeptember','orderOctober','orderNovember','orderDecember']
YEAR = ['orderYear2019','orderYear2020']
GROUP = ['groupHealth','groupHome','groupSports','groupComputer','groupPets','groupToys','groupBooks', 
         'groupBaby', 'groupMusic', 'groupFood','groupOffice','groupFashion','groupOther','groupCar']

#Dynamic Columns:
TRANSPORTERX = ['transporterPOSTNL/X','transporterDHL/X','transporterDPD/X','transporterBRIEF/X','transporterOTHER/X']
KNOWNX = ['caseKnownX','returnKnownX','cancellationKnownX','onTimeDeliveryKnownX','lateDeliveryKnownX']
PRODUCTX = ['productOrderCountX','productTotalCountX','productTotalReturnedX','productReturnFractionX']
SELLERX = ['sellerDailyOrdersX']

#Classifications
CLASSIFICATION = ['generalMatchClassification','detailedMatchClassification','binaryMatchClassification','determinantClassification']

In [39]:
df.head()

Unnamed: 0,orderDate,productId,sellerId,totalPrice,quantityOrdered,countryCode,cancellationDate,cancellationReasonCode,promisedDeliveryDate,shipmentDate,transporterCode,transporterName,transporterNameOther,dateTimeFirstDeliveryMoment,fulfilmentType,startDateCase,cntDistinctCaseIds,returnDateTime,quantityReturned,returnCode,productTitle,brickName,chunkName,productGroup,productSubGroup,productSubSubGroup,registrationDateSeller,countryOriginSeller,currentCountryAvailabilitySeller,calculationDefinitive,noCancellation,onTimeDelivery,noCase,hasOneCase,hasMoreCases,noReturn,detailedMatchClassification,generalMatchClassification,caseDays,returnDays,cancellationDays,actualDeliveryDays,shipmentDays,partnerSellingDays,promisedDeliveryDays,orderYear,orderMonth,orderWeekday,orderCorona,orderMonday,orderTuesday,orderWednesday,orderThursday,orderFriday,orderSaturday,orderSunday,orderJanuary,orderFebruary,orderMarch,orderApril,orderMay,orderJune,orderJuly,orderAugust,orderSeptember,orderOctober,orderNovember,orderDecember,orderYear2019,orderYear2020,productTitleLength,fulfilmentByBol,countryCodeNL,countryOriginNL,countryOriginBE,countryOriginDE,determinantClassification,binaryMatchClassification,transporterCodeGeneral,productGroupGeneral,groupHealth,groupHome,groupSports,groupComputer,groupPets,groupToys,groupBooks,groupBaby,groupMusic,groupFood,groupOffice,groupFashion,groupOther,groupCar
0,2019-01-01,9200000103390344,1244284,4.95,1,NL,NaT,,2019-01-04,2019-01-02,TNT_BRIEF,PostNL Briefpost,,NaT,FBR,NaT,,NaT,,,Velvet Scrunchie Pale Pink,Haar – Accessoires,Haaraccessoire,Daily Care PG,Haar,Haar Accessoires,2016-12-19,NL,NL,True,True,,True,0.0,0.0,True,UNKNOWN,UNKNOWN,,,,,1.0,743,3,2019,1,1,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,4772950,False,True,True,False,False,,UNKNOWN,BRIEF,Health & Care,True,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2019-01-01,9200000065456100,1167286,158.0,1,NL,NaT,,2019-01-07,2019-01-02,DPD-NL,DPD Nederland,,NaT,FBR,2019-01-03,2.0,NaT,,,Inventum MN306C - Combi-magnetron,Magnetrons,Vrijstaande magnetron,Major Domestic Appliances PG,Cooking,Cooking,2016-04-15,NL,NL,True,True,,False,0.0,1.0,True,KNOWN HEAVILY UNHAPPY,UNHAPPY,2.0,,,,1.0,991,6,2019,1,1,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,4772950,False,True,True,False,False,,KNOWN,DPD,"Home, Cooking & Household",False,True,False,False,False,False,False,False,False,False,False,False,False,False
2,2019-01-01,9200000024539481,888610,30.0,1,BE,NaT,,2019-01-03,2019-01-02,BPOST_BE,Bpost Belgie,,2019-01-03 08:17:00,FBB,NaT,,NaT,,,Perfect Push Up V2,Fitness Accessoires,Opdruksteun,Sporting Equipment,Fitness Klein,Fitnessmaterialen,2013-12-21,NL,NL,True,True,True,True,0.0,0.0,True,KNOWN HAPPY,KNOWN HAPPY,,,,2.0,1.0,1837,2,2019,1,1,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,4772950,True,False,True,False,False,All good,KNOWN,OTHER,"Sports, Outdoor & Travel",False,False,True,False,False,False,False,False,False,False,False,False,False,False
3,2019-01-01,9200000056338594,1308208,32.2,2,NL,2019-01-02,SELLER_UNDELIV,2019-01-03,NaT,,,,NaT,FBB,NaT,,NaT,,,RGB led strip - 5m - Set RGB - kleuren - Inclu...,Verlichting – Vast,Led-strip,Lighting,Slimme Verlichting,Slimme Led-Strips,2017-09-15,NL,NL,True,False,,True,0.0,0.0,True,KNOWN HEAVILY UNHAPPY,UNHAPPY,,,1.0,,,473,2,2019,1,1,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,4772950,True,True,True,False,False,Cancellation,KNOWN,OTHER,"Home, Cooking & Household",False,True,False,False,False,False,False,False,False,False,False,False,False,False
4,2019-01-01,9200000085152942,1274000,24.5,1,NL,NaT,,2019-01-03,2019-01-02,TNT,PostNL,,2019-01-03 17:06:38,FBB,2019-01-18,1.0,2019-01-04,1.0,,Deltaco GT-174D 4-poort stekkerdoos met 2 x US...,Verdeelborden/-kasten,Stekkerdoos,Plumbing and Safety,Elektra,Verlengmateriaal,2017-05-31,NL,ALL,True,True,True,False,1.0,0.0,False,KNOWN MEDIUM UNHAPPY,UNHAPPY,17.0,3.0,,2.0,1.0,580,2,2019,1,1,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,4772950,True,True,True,False,False,Return + Case,KNOWN,POSTNL,"Home, Cooking & Household",False,True,False,False,False,False,False,False,False,False,False,False,False,False


## Option 2: via direct sql connection

In [5]:
connection_string = (    
    r'Driver={SQL Server};'
    r'Server=LAPTOP-LD74USH0\SQLEXPRESS;'
    r'Integrated Security=SSPI;'
    r'Trusted_Connection=yes;'
)
pd.set_option('display.max_columns', None)

In [6]:
def sql2df(query, params=[], parse_dates=None, dsn='SQLEXPRESS'):
        with py.connect(connection_string, readonly=True) as conn:
            return pd.read_sql(query, conn, params=params, parse_dates=parse_dates)

In [7]:
# First work with random top 100.000 (to reduce computation time) - 45secs

df = sql2df('''
SELECT TOP 500000 * FROM Seminar.dbo.cleaned_bol_data_full
ORDER BY newid();
''')


In [321]:
# 9.5 minutes 

df = sql2df('''
SELECT * FROM Seminar.dbo.cleaned_bol_data_full;
''')

In [8]:
#Change type of columns
dtype = {'calculationDefinitive': bool,
         'noCancellation': bool,
         'noCase': bool,
         'hasOneCase': bool,
         'hasMoreCases': bool,
         'noReturn': bool,
         'orderWeekend': bool,
         'orderCorona': bool,
         'countryCodeNL': bool,
         'fulfilmentByBol': bool,
         'countryOriginNL': bool,
         'countryOriginBE': bool,
         'countryOriginDE': bool,
         'orderMonday': bool,
         'orderTuesday': bool,
         'orderWednesday': bool,
         'orderThursday': bool,
         'orderFriday': bool,
         'orderSaturday': bool,
         'orderSunday': bool,
         'orderJanuary': bool,
         'orderFebruary': bool,
         'orderMarch': bool,
         'orderApril': bool,
         'orderMay': bool,
         'orderJune': bool,
         'orderJuly': bool,
         'orderAugust': bool,
         'orderSeptember': bool,
         'orderOctober': bool,
         'orderNovember': bool,
         'orderDecember': bool}

df = df.astype(dtype)

#Transform dates to date-type
df['orderDate'] = pd.to_datetime(df['orderDate'], errors='coerce')
df['cancellationDate'] = pd.to_datetime(df['cancellationDate'], errors='coerce')
df['promisedDeliveryDate'] = pd.to_datetime(df['promisedDeliveryDate'], errors='coerce')
df['shipmentDate'] = pd.to_datetime(df['shipmentDate'], errors='coerce')
df['dateTimeFirstDeliveryMoment'] = pd.to_datetime(df['dateTimeFirstDeliveryMoment'], errors='coerce')
df['startDateCase'] = pd.to_datetime(df['startDateCase'], errors='coerce')
df['returnDateTime'] = pd.to_datetime(df['returnDateTime'], errors='coerce')
df['registrationDateSeller'] = pd.to_datetime(df['registrationDateSeller'], errors='coerce')

df.dtypes

orderDate             datetime64[ns]
productId                     object
sellerId                      object
totalPrice                   float64
quantityOrdered                int64
                           ...      
orderSeptember                  bool
orderOctober                    bool
orderNovember                   bool
orderDecember                   bool
productTitleLength             int64
Length: 78, dtype: object

#### Add variables

In [9]:
# Binary classification variable
df['binaryMatchClassification'] = df['generalMatchClassification'].apply(lambda x: 'UNKNOWN' if x == 'UNKNOWN' else 'KNOWN')

# Dummy for year = 2020
df['orderYear2020'] = df['orderYear'].apply(lambda x: True if x == 2020 else False)

#### Transporter Groups

In [10]:
def transporterCluster(transporterCode):
    """
    Function to create a new manually clustered transporter variable: 28 -> 5 categories
    """
    if transporterCode in ['AH-NL','TNT','TNT-EXPRESS','TNT-EXTRA']:
        return 'POSTNL'
    elif transporterCode in ['DHL','DHL_DE','DHLFORYOU']:
        return 'DHL'
    elif transporterCode in ['DPD-NL','DPD-BE']:
        return 'DPD'
    elif transporterCode in ['BRIEFPOST','BPOST_BE','BPOST_BRIEF','DHL-GLOBAL-MAIL','TNT_BRIEF']:
        return 'BRIEF'
    else:
        return 'OTHER'

In [11]:
df['transporterCodeGeneral'] = df['transporterCode'].apply(transporterCluster)
df['transporterCodeGeneral'].value_counts()

POSTNL    221508
BRIEF     165035
DHL        45527
DPD        34475
OTHER      33455
Name: transporterCodeGeneral, dtype: int64

#### Product Groups

In [12]:
def productGroupCluster(productGroup):
    """
    Function to create a new manually clustered product group variable based on categories bol.com
    60 -> 14 groups.
    """
    if productGroup in ['Dutch Books PG','Ebooks and Audiobooks','International Books PG']:
        return 'Books'
    elif productGroup in ['Games Accessories','Games Consoles','Games Software Physical',
                          'Movies','Music']:
        return 'Music, Film & Games'
    elif productGroup in ['Camera','Desktop Monitor and Beamer','Ereaders and Accessories',
                          'Laptop Computers','PC Accessories','Personal Audio',
                          'Sound and Vision Accessories','Storage and Network',
                          'Telephone and Tablet Accessories','Telephones and Tablets','Television']:
        return 'Computer & Electronics'
    elif productGroup in ['General Toys','Recreational and Outdoor Toys']:
        return 'Toys & Hobby'
    elif productGroup in ['Baby and Kids Fashion','Baby PG']:
        return 'Baby & Kids'
    elif productGroup in ['Daily Care PG','Health PG','Perfumery PG','Personal Care']:
        return 'Health & Care'
    elif productGroup in ['Footwear','Jewelry and Watches','Mens and Womens Fashion','Wearables']:
        return 'Fashion, Shoes & Accessories'
    elif productGroup in ['Bodyfashion and Beachwear','Camping and Outdoor','Cycling',
                          'Sporting Equipment','Sportswear','Travel Bags and Accessories']:
        return 'Sports, Outdoor & Travel'
    elif productGroup in ['Educational Dutch','Educational International','Printing and Ink']:
        return 'Office & School'
    elif productGroup in ['Supermarket PG'] :
        return 'Food & Beverage'
    elif productGroup in ['Furniture','Heating and Air','Home Decoration','Home Entertainment',
                          'Household','Household Appliances','Kitchen','Kitchen Machines',
                          'Lighting','Major Domestic Appliances PG','Plumbing and Safety']:
        return 'Home, Cooking & Household'
    elif productGroup in ['Garden','Pet PG','Textiles','Tools and Paint']:
        return 'Pets, Garden & Jobs'
    elif productGroup in ['Car and Motorcycle'] :
        return 'Car & Motor'
    else:
        return 'Other'

In [13]:
df['productGroupGeneral'] = df['productGroup'].apply(productGroupCluster)
df['productGroupGeneral'].value_counts()

Computer & Electronics          145687
Home, Cooking & Household        83348
Sports, Outdoor & Travel         54598
Toys & Hobby                     52834
Pets, Garden & Jobs              35484
Health & Care                    31315
Food & Beverage                  26802
Books                            19435
Music, Film & Games              17343
Baby & Kids                      11784
Fashion, Shoes & Accessories     11453
Office & School                   5450
Car & Motor                       3149
Other                             1318
Name: productGroupGeneral, dtype: int64

In [14]:
#Create dummies of new product grouping
for group in df['productGroupGeneral'].unique():
    
    columnName = 'group' + group.split(' ')[0].replace(',','')
    df[columnName] = df['productGroupGeneral'].apply(lambda x: True if x == group else False)

In [15]:
print(df.columns)
print('Total: ',len(df.columns),' columns')

Index(['orderDate', 'productId', 'sellerId', 'totalPrice', 'quantityOrdered',
       'countryCode', 'cancellationDate', 'cancellationReasonCode',
       'promisedDeliveryDate', 'shipmentDate', 'transporterCode',
       'transporterName', 'transporterNameOther',
       'dateTimeFirstDeliveryMoment', 'fulfilmentType', 'startDateCase',
       'cntDistinctCaseIds', 'returnDateTime', 'quantityReturned',
       'returnCode', 'productTitle', 'brickName', 'chunkName', 'productGroup',
       'productSubGroup', 'productSubSubGroup', 'registrationDateSeller',
       'countryOriginSeller', 'currentCountryAvailabilitySeller',
       'calculationDefinitive', 'noCancellation', 'onTimeDelivery', 'noCase',
       'hasOneCase', 'hasMoreCases', 'noReturn', 'detailedMatchClassification',
       'generalMatchClassification', 'determinantClassification', 'orderYear',
       'orderMonth', 'orderYearMonth', 'orderWeekday', 'orderWeekend',
       'orderCorona', 'transporterFeature', 'partnerSellingMonths',
   

In [16]:
#Fixed Columns:
BASIC = ['totalPrice','quantityOrdered','fulfilmentByBol','countryCodeNL','countryOriginNL','countryOriginBE',
        'countryOriginDE','productTitleLength']
WEEK = ['orderMonday','orderTuesday','orderWednesday','orderThursday','orderFriday','orderSaturday','orderSunday']
MONTH = ['orderJanuary','orderFebruary','orderMarch','orderApril','orderMay','orderJune',
         'orderJuly','orderAugust','orderSeptember','orderOctober','orderNovember','orderDecember']
GROUP = ['groupHealth','groupHome','groupSports','groupComputer','groupPets','groupToys','groupBooks', 
         'groupBaby', 'groupMusic', 'groupFood','groupOffice','groupFashion','groupOther','groupCar']

#Dynamic Columns:
TRANSPORTERX = ['transporterPOSTNL/X','transporterDHL/X','transporterDPD/X','transporterBRIEF/X','transporterOTHER/X']
KNOWNX = ['caseKnownX','returnKnownX','cancellationKnownX','onTimeDeliveryKnownX','lateDeliveryKnownX']
PRODUCTX = ['productOrderCountX','productTotalCountX','productTotalReturnedX','productReturnFractionX']
SELLERX = ['sellerDailyOrdersX']

#Classifications
CLASS = ['generalMatchClassification']

#### Functions

In [17]:
def addKnownColumns(df,X):
    """
    Function to create columns which indicate whether determinants are known after X days.
    Input: X = number of days after order date at which the prediction is made
           df = dataFrame
    """
#     df_ = df[['actualDeliveryDays','onTimeDelivery','shipmentDays','transporterCodeGeneral']]
    
    df['caseKnownX']           = df['caseDays'].apply(lambda x: True if x <= X else False)
    df['returnKnownX']         = df['returnDays'].apply(lambda x: True if x <= X else False)
    df['cancellationKnownX']   = df['cancellationDays'].apply(lambda x: True if x <= X else False)
    
#     df_['actualDeliveryKnown'] = df['actualDeliveryDays'].apply(lambda x: True if x <= X else False)
#     df_['shipmentDaysKnown']   = df['shipmentDays'].apply(lambda x: True if x <= X else False)
    
    df['onTimeDeliveryKnownX'] = df.apply(lambda row: True if ((row.actualDeliveryDays <= X) and (row.onTimeDelivery == True)) else False, axis = 1)
    df['lateDeliveryKnownX']   = df.apply(lambda row: True if ((row.actualDeliveryDays <= X) and (row.onTimeDelivery == False)) else False, axis = 1)
    
    for transporter in df['transporterCodeGeneral'].unique():
        dummyColumn = 'transporter' + transporter +'/X'
        df[dummyColumn] = df.apply(lambda row: True if ((row.shipmentDays <= X) and (row.transporterCodeGeneral == transporter)) else False, axis = 1)

    return df

In [18]:
def addProductColumns(df,X):
    
    if ['productOrderCount0','productTotalCount0','productTotalReturned0','productReturnFraction0'] not in list(df.columns):
    
        df = addProductColumns0(df)
    
    if X > 0:
        
        df = addProductColumnsX(df,X)
        
    else:
        
        df['productOrderCountX'] = df['productOrderCount0']
        df['productTotalCountX'] = df['productTotalCount0']
        df['productTotalReturnedX'] = df['productTotalReturned0']
        df['productReturnFractionX'] = df['productReturnFraction0']
        
    return df

In [19]:
def addSellerColumns(df,X):
    
    if 'sellerDailyOrders0' not in list(df.columns):
    
        df = addSellerColumns0(df)
    
    if X > 0:
        
        df = addSellerColumnsX(df,X)
    
    else:
        
        df['sellerDailyOrdersX'] = df['sellerDailyOrders0']
        
    return df

In [20]:
def addProductColumns0(df): 
    """
    Function to add 4 columns: productOrderCount, productTotalCount, productTotalReturned and productReturnFraction.
    Input: dataFrame with columns: 'productId','orderDate','quantityOrdered','quantityReturned','returnDateTime'.
    """
    df = df.sort_values(by = ['productId','orderDate'])
    df = df.reset_index(drop = True)
    
    df_ = df[['productId','orderDate','quantityOrdered','quantityReturned','returnDateTime']]
    
    #ProductTotalCount
    pivot = df_.groupby(['productId','orderDate']).quantityOrdered.sum().groupby('productId').cumsum()
    productTotalCount = df_.merge(pivot, 
                                left_on=['productId','orderDate'], 
                                right_index=True, 
                                how = 'left').quantityOrdered_y
    
    #ProductOrderCount
    pivot = df_.groupby(['productId','orderDate']).quantityOrdered.count().groupby('productId').cumsum()
    productOrderCount = df_.merge(pivot, 
                                left_on=['productId','orderDate'], 
                                right_index=True, 
                                how = 'left').quantityOrdered_y
    
    #ProductTotalReturned
    productTotalReturned = np.zeros(df_.shape[0])
    
    previousID = None
    
    returnDic = {}
    
    for row in df_.itertuples(): #iterate through dataFrame: row[0] = index, row[1] = productId, row[2] = orderDate
                                                           # row[3] = quantityOrdered, row[4] = quantityReturned
        if row[0] == 0:                                    # row[5] = returnDateTime
            
            #update return dictionary if this product is returned
            if row[4] != None:
                if row[5] in returnDic:
                    returnDic[row[5]] += row[4]
                else:
                    returnDic[row[5]] = row[4]

            previousID = row[1]
            
        elif (previousID == row[1]):
            
            #update return dictionary if this product is returned
            if row[4] != None:
                if row[5] in returnDic:
                    returnDic[row[5]] += row[4]
                else:
                    returnDic[row[5]] = row[4]
            
            #add returned products to new dictionary if known
            known = {k: v for k, v in returnDic.items() if k <= row[2]}
            productTotalReturned[row[0]] = sum(known.values())
            
            #update the dictionary by removing the returns which are now known
            returnDic = {k: v for k, v in returnDic.items() if k > row[2]}
                        
            previousID = row[1]
            
        else:
            returnDic = {} #new productId, hence empty the return dictionary
            
            #update return dictionary if this product is returned
            if row[4] != None:
                if row[5] in returnDic:
                    returnDic[row[5]] += row[4]
                else:
                    returnDic[row[5]] = row[4]
                    
            previousID = row[1]
    
    df_['productTotalReturned'] = productTotalReturned
    pivot = df_.groupby(by = ['productId','orderDate']).productTotalReturned.sum().groupby('productId').cumsum()
    productTotalReturned = df_.merge(pivot, 
                                left_on=['productId','orderDate'], 
                                right_index=True, 
                                how = 'left').productTotalReturned_y
     
    #Add new columns to dataFrame    
    df['productOrderCount0'] = productOrderCount
    df['productTotalCount0'] = productTotalCount
    df['productTotalReturned0'] = productTotalReturned
    df['productReturnFraction0'] = productTotalReturned / productTotalCount
    
    return(df)

In [21]:
def addProductColumnsX(df,X):
    """
    Function to add 4 columns: productOrderCountX, productTotalCountX, productTotalReturnedX and productReturnFractionX.
    Input: dataFrame with columns: 'productId','orderDate','productOrderCount','productTotalCount','productTotalReturned'
    """
    df = df.sort_values(by = ['productId','orderDate'], ascending = [True, False]) #reverse ordering on Orderdate!
    df = df.reset_index(drop = True)
    
    df_ = df[['productId','orderDate','productOrderCount0','productTotalCount0','productTotalReturned0']]
    #            row[1]       row[2]        row[3]               row[4]                 row[5]    
    
    df_['orderDateX'] = df_['orderDate'] + timedelta(X)
    #      row[6]

    knownProductInfo = np.zeros((df_.shape[0],3))
    
    previousID = None
    previousMaxDate = None
    
    dic = {}
    
    for row in df_.itertuples(): #iterate  
                                                                  
        if row[0] == 0:                                          
            
            knownProductInfo[[row[0]]] = (row[3],row[4],row[5]) 
            
            dic[row[2]] = (row[3],row[4],row[5])

            previousMaxDate = row[2]
            previousID = row[1]
            
        elif (previousID == row[1]):
            
            if row[6] >= previousMaxDate:
                dic[row[2]] = (row[3],row[4],row[5])
                knownProductInfo[[row[0]]] = dic[max(dic)]
            else:
                dic[row[2]] = (row[3],row[4],row[5])
                dic = {k: v for k, v in dic.items() if k <= row[6]}
                
                knownProductInfo[[row[0]]] = dic[max(dic)]
                previousMaxDate = max(dic)
                 
            previousID = row[1]
            
        else:
            dic = {} #new productId -> empty the dictionary
            
            knownProductInfo[[row[0]]] = (row[3],row[4],row[5])
            dic[row[2]] = (row[3],row[4],row[5])
                    
            previousMaxDate = row[2]
            previousID = row[1]

    df['productOrderCountX'] = knownProductInfo[:,0]
    df['productTotalCountX'] = knownProductInfo[:,1]
    df['productTotalReturnedX'] = knownProductInfo[:,2]
    df['productReturnFractionX'] = knownProductInfo[:,2] / knownProductInfo[:,1]
    
    #Reverse to natural order
    df = df.sort_values(by = ['productId','orderDate'], ascending = [True, True])
    df = df.reset_index(drop = True)
    
    return(df)

In [22]:
def addSellerColumns0(df):
    """
    Function to add 4 columns: 
    Input: dataFrame with columns: 'sellerId','orderDate','quantityOrdered','partnerSellingMonths'
    """
    df = df.sort_values(by = ['sellerId','orderDate'])
    df = df.reset_index(drop = True)
    
    df_ = df[['sellerId','orderDate','quantityOrdered','partnerSellingMonths']]
    
    firstOrder = df_.groupby('sellerId').orderDate.min()
    df_['firstOrder'] = df_.merge(firstOrder,
                                  left_on = 'sellerId',
                                  right_index = True,
                                  how = 'left').orderDate_y
    df_['daysFirstOrder'] = (df_['orderDate'] - df_['firstOrder']).dt.days + 1
    
    pivot = df_.groupby(['sellerId','orderDate']).quantityOrdered.count().groupby('sellerId').cumsum()
    sellerTotalCount = df_.merge(pivot, 
                                left_on=['sellerId','orderDate'], 
                                right_index=True, 
                                how = 'left').quantityOrdered_y
    
    df['sellerDailyOrders0'] = np.log(sellerTotalCount / df_['daysFirstOrder'])
    
    return df

In [23]:
def addSellerColumnsX(df,X):
    """
    Function to add 4 columns: 
    Input: dataFrame with columns: 'sellerId','orderDate','quantityOrdered','partnerSellingMonths'
    """
        
    df = df.sort_values(by = ['sellerId','orderDate'], ascending = [True, False]) #reverse ordering orderdate!
    df = df.reset_index(drop = True)

    df_ = df[['sellerId','orderDate','sellerDailyOrders0']]
    #            row[1]       row[2]        row[3]        

    df_['orderDateX'] = df_['orderDate'] + timedelta(X)
    #      row[4]

    knownSellerInfo = np.zeros(df_.shape[0])

    previousID = None
    previousMaxDate = None

    dic = {}

    for row in df_.itertuples(): #iterate  

        if row[0] == 0:                                          

            knownSellerInfo[[row[0]]] = row[3]

            dic[row[2]] = row[3]

            previousMaxDate = row[2]
            previousID = row[1]

        elif (previousID == row[1]):

            if row[4] >= previousMaxDate:
                dic[row[2]] = row[3]
                knownSellerInfo[[row[0]]] = dic[max(dic)]
            else:
                dic[row[2]] = row[3]
                dic = {k: v for k, v in dic.items() if k <= row[4]}

                knownSellerInfo[[row[0]]] = dic[max(dic)]
                previousMaxDate = max(dic)

            previousID = row[1]

        else:
            dic = {} #new productId -> empty the dictionary

            knownSellerInfo[[row[0]]] = row[3]
            dic[row[2]] = row[3]

            previousMaxDate = row[2]
            previousID = row[1]

    df['sellerDailyOrdersX'] = knownSellerInfo

    #Reverse to natural order
    df = df.sort_values(by = ['sellerId','orderDate'], ascending = [True, True])
    df = df.reset_index(drop = True)

    return df

In [24]:
def classifyLabels(classifier, X, y, n, split = 'TimeSeries', smote = False, scale = None, days = 0):
    """
    Function to classify match labels using a pre-specified classifier with X and y variables. 
    
    Input:
    - classifier: can be any supported classifier. E.g. DecisionTreeClassifier(random_state=0, class_weight='balanced', max_depth=10). Necessary!
    - X: dataframe input on explanatory features. Necessary!
    - y: dataframe input on labels. Necessary!
    - n: number of folds to be evaluated.
    - split: object that can take value 'Random' to make K-fold random train/test split. Default is to apply time series split.
    - smote: boolean, if true Synthetic Minority Oversampling will be applied. Default = False.
    - scale: object that can take values 'MinMax' or 'Standard' to scale X correspondingly. Any other input will not scale X. Default = None.
    - days: integer number of days after orderDate that should be considered. Default = 0.
    
    Output: 
    - accuracy: list of accuracies for the n evaluated classifiers.
    - class_report: report of performance measures for the n evaluated classifiers.
    """
    
    accuracy = {}
    class_report = {}
    count = 1
    
    if split == 'Random':
        
        kf = StratifiedKFold(n_splits = n, random_state = 0, shuffle = True)
        for train_index, test_index in kf.split(X, y):

            if scale == 'MinMax':
                scaler = preprocessing.MinMaxScaler()
                X_scaled = pd.DataFrame(scaler.fit_transform(X))
                X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            elif scale == 'Standard':
                scaler = preprocessing.StandardScaler()
                X_scaled = pd.DataFrame(scaler.fit_transform(X))
                X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            else:
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            if smote == True:
                smote = SMOTE('not majority')
                X_train, y_train = smote.fit_sample(X_train,y_train)
            else:
                X_train, y_train = X_train, y_train
            
            clf = classifier
            clf = clf.fit(X_train,y_train)
            prediction = clf.predict(X_test)
            accuracy[count] = metrics.accuracy_score(y_test, prediction)
            class_report[count] = metrics.classification_report(y_test, prediction)
    
            print(count)
            count +=1
    
    else:
        
        tscv = TimeSeriesSplit(n_splits = n)
        
        for train_index, test_index in tscv.split(X):
        
            if scale == 'MinMax':
                scaler = preprocessing.MinMaxScaler()
                X_scaled = pd.DataFrame(scaler.fit_transform(X))
                X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            elif scale == 'Standard':
                scaler = preprocessing.StandardScaler()
                X_scaled = pd.DataFrame(scaler.fit_transform(X))
                X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            else:
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            if smote == True:
                smote = SMOTE('not majority')
                X_train, y_train = smote.fit_sample(X_train,y_train)
            else:
                X_train, y_train = X_train, y_train
            
            clf = classifier
            clf = clf.fit(X_train,y_train)
            prediction = clf.predict(X_test)
            accuracy[count] = metrics.accuracy_score(y_test, prediction)
            class_report[count] = metrics.classification_report(y_test, prediction)
    
            print(count)
            count +=1

    return(accuracy, class_report)

In [25]:
#Categorical variables
s = (df.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)

Categorical variables:
['productId', 'sellerId', 'countryCode', 'cancellationReasonCode', 'transporterCode', 'transporterName', 'transporterNameOther', 'fulfilmentType', 'returnCode', 'productTitle', 'brickName', 'chunkName', 'productGroup', 'productSubGroup', 'productSubSubGroup', 'countryOriginSeller', 'currentCountryAvailabilitySeller', 'onTimeDelivery', 'detailedMatchClassification', 'generalMatchClassification', 'determinantClassification', 'orderMonth', 'orderYearMonth', 'transporterFeature', 'binaryMatchClassification', 'transporterCodeGeneral', 'productGroupGeneral']


# Function-based Classification

## Define X and y variables for classification

In [26]:
index = ['orderDate']
X_col_base = ['totalPrice','quantityOrdered','promisedDeliveryDays','orderCorona','partnerSellingMonths',
        'countryCodeNL', 'fulfilmentByBol', 'countryOriginNL', 'countryOriginBE', 'countryOriginDE', 'orderWeekend',
        'orderMonday','orderTuesday', 'orderWednesday', 'orderThursday', 'orderFriday', 'orderSaturday', 'orderSunday',
        'orderJanuary', 'orderFebruary', 'orderMarch', 'orderApril', 'orderMay', 'orderJune', 'orderJuly',
        'orderAugust', 'orderSeptember', 'orderOctober', 'orderNovember', 'orderDecember', 'productTitleLength',
        'orderYear2020', 'groupComputer', 'groupFood', 'groupBooks', 'groupHealth', 'groupToys', 'groupSports', 
        'groupHome', 'groupOffice', 'groupPets', 'groupMusic', 'groupFashion', 'groupBaby', 'groupOther', 'groupCar']
#y_col = ['binaryMatchClassification']
y_col = ['generalMatchClassification']
#'productOrderCount', 'productReturnFraction', 

## Function to return X and y for a pre-specified number of days after orderDate

In [27]:
def dataX(df,days):
    """
    Function to return features and output labels for a pre-specified number of days after orderDate. 
    
    Input:
    - df: dataframe containing all features available at the time of ordering.
    - days: integer number of days after orderDate that should be considered.
    
    Output: 
    - X: dataframe output of features that can be used the number of days after orderDate. E.g. information on cases and deliveries are added.
    - y: dataframe output of output labels that can be used the number of days after orderDate.
    """    
    
    df = addKnownColumns(df,days)
    df = addProductColumns(df,days)
    df = addSellerColumns(df,days)
    
    if days == 0:
        X_col = X_col_base + ['productOrderCountX', 'productTotalCountX',
                 'productTotalReturnedX', 'productReturnFractionX', 'sellerDailyOrdersX']
    else:
        X_col = X_col_base + ['caseKnownX', 'returnKnownX', 'cancellationKnownX', 'onTimeDeliveryKnownX',
                 'lateDeliveryKnownX', 'transporterPOSTNL/X', 'transporterDHL/X', 'transporterDPD/X', 
                 'transporterBRIEF/X', 'transporterOTHER/X', 'productOrderCountX', 'productTotalCountX',
                 'productTotalReturnedX', 'productReturnFractionX', 'sellerDailyOrdersX']

    df_test = df[index+X_col+y_col].dropna()
    df_test = df_test.sort_values(by = 'orderDate')
    df_test = df_test.reset_index(drop = True)

    X = df_test[X_col]
    y = df_test[y_col]
    
    return(X, y)

In [32]:
def neuralNetwork():
    
    model = Sequential()

    model.add(Dense(units=25,activation='relu'))
    model.add(Dense(units=3,activation='softmax')) #units should equal number of labels

    model.compile(optimizer='adam', 
                  loss='categorical_crossentropy', 
                  metrics=['accuracy'])
    return model

#estimator = KerasClassifier(build_fn = neuralNetwork, epochs = 20, class_weight = class_weights, verbose = 1)
#history = estimator.fit(X_train, y_train)
#pd.DataFrame(history.history).plot()

In [33]:
def classifyLabelsQuick(classifier, X, y, n, split = 'TimeSeries', smote = False, scale = None, days = 0):
    
    accuracy = {}
    class_report = {}
    
    if split == 'Random':
        split_type = StratifiedKFold(n_splits = n, random_state = 0, shuffle = True)
        
    else:
        split_type = TimeSeriesSplit(n_splits = n)
    
    #Create pipeline -> everything in the pipeline is executed after eachother
    pipe = Pipeline([('scaler', preprocessing.MinMaxScaler()), ('classifier', classifier)])
    
    #Cross validation function which outputs the accuracy and average (unweighted) precision and recall of labels
    y_pred = cross_validate(pipe, X, y, cv = split_type, scoring = ('precision_macro', 'recall_macro', 'accuracy'), return_train_score = True)
        
    return y_pred

## Classification

### Neural Net

In [34]:
# Training Neural Network:

# 1. Use MinMaxScaler
# 2. Multilabel output should be converted into dummies
le = LabelEncoder()
le.fit(y)
encoded_y = le.transform(y)
dummy_y = pd.DataFrame(np_utils.to_categorical(encoded_y))
# 3. Class-weights have to be computed beforehand (only the training weights actually)
class_weights = class_weight.compute_class_weight('balanced',
                                                   np.unique(encoded_y),
                                                   encoded_y)
class_weights = dict(enumerate(class_weights))
# 4. Use the folowwing estimator
estimator = KerasClassifier(build_fn = neuralNetwork, #model defined below
                            epochs = 20, 
                            class_weight = class_weights,
                            verbose = 0)
# 5. In order to create classification report you need the encoded_y or
#    if you want the label names use le.inverse_transform
metrics.classification_report(encoded_y_test, prediction)

NameError: name 'encoded_y_test' is not defined

### Function: classifyLabels(classifier, X, y, n, split = 'TimeSeries', smote = False, scale = None, days = 0)

In [28]:
A = 5

resultsAcc = {}
resultsClass = {}

for DAYS in range(A):
    
    X, y = dataX(df,DAYS)
    print('Lets Go')
    
    accuracy, class_report = classifyLabels(DecisionTreeClassifier(random_state=0,
                                                                   class_weight='balanced'), X, y, n = 3)
        
    resultsAcc[DAYS] = accuracy
    resultsClass[DAYS] = class_report
    
    print(DAYS)

Lets Go
1
2
3
0
Lets Go
1
2
3
1
Lets Go
1
2
3
2
Lets Go
1
2
3
3
Lets Go
1
2
3
4


In [29]:
resultsAcc

{0: {1: 0.604816, 2: 0.6168, 3: 0.640976},
 1: {1: 0.736872, 2: 0.72452, 3: 0.738128},
 2: {1: 0.767872, 2: 0.755368, 3: 0.778936},
 3: {1: 0.78952, 2: 0.778312, 3: 0.79404},
 4: {1: 0.80728, 2: 0.786088, 3: 0.795896}}

In [283]:
random_forest_accuracies_binary = resultsAcc

In [284]:
random_forest_report_binary = resultsClass

In [286]:
for i in range(A):
    for item in resultsClass[i].values():
        print(item)

              precision    recall  f1-score   support

       KNOWN       0.84      0.91      0.87     16759
     UNKNOWN       0.77      0.65      0.71      8241

    accuracy                           0.82     25000
   macro avg       0.81      0.78      0.79     25000
weighted avg       0.82      0.82      0.82     25000

              precision    recall  f1-score   support

       KNOWN       0.86      0.94      0.90     18233
     UNKNOWN       0.78      0.59      0.67      6767

    accuracy                           0.84     25000
   macro avg       0.82      0.76      0.78     25000
weighted avg       0.84      0.84      0.84     25000

              precision    recall  f1-score   support

       KNOWN       0.82      0.95      0.88     16847
     UNKNOWN       0.85      0.57      0.68      8153

    accuracy                           0.83     25000
   macro avg       0.83      0.76      0.78     25000
weighted avg       0.83      0.83      0.82     25000

              preci

In [242]:
# Naive Bayes Bernoulli 1 day
(X, y) = dataX(df,5)
(accuracy,class_report) = classifyLabels(BernoulliNB(), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['productTotalReturned'] = productTotalReturned
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['orderDateX'] = df_['orderDate'] + timedelta(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['daysFirstOrder'] = (df_['orderDate'] - df_['firstOrder']).dt.days + 1
A value is trying to be set

1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.8126, 2: 0.83996, 3: 0.8458}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.88      0.87      0.88     13647
     UNHAPPY       0.49      0.56      0.53      3112
     UNKNOWN       0.83      0.81      0.82      8241

    accuracy                           0.81     25000
   macro avg       0.74      0.75      0.74     25000
weighted avg       0.82      0.81      0.82     25000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.86      0.92      0.89     15320
     UNHAPPY       0.87      0.43      0.57      2913
     UNKNOWN       0.78      0.84      0.81      6767

    accuracy                           0.84     25000
   macro avg       0.84      0.73      0.76     25000
weighted avg       0.84      0.84      0.83     25000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.87      0.90      0.89     14006
     UNHAPPY       0.85      0.41      0.55      2841
     UNKNOWN       0.80      0.90     

In [None]:
warnings.filterwarnings("ignore")

In [79]:
# Naive Bayes Bernoulli
(accuracy,class_report) = classifyLabels(BernoulliNB(), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))


2


  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))


3
{1: 0.588248, 2: 0.689048, 3: 0.675984}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.61      0.68     68195
     UNHAPPY       0.15      0.29      0.20     15437
     UNKNOWN       0.66      0.66      0.66     41368

    accuracy                           0.59    125000
   macro avg       0.53      0.52      0.51    125000
weighted avg       0.66      0.59      0.62    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.69      0.94      0.80     75861
     UNHAPPY       0.00      0.00      0.00     14564
     UNKNOWN       0.67      0.44      0.53     34575

    accuracy                           0.69    125000
   macro avg       0.45      0.46      0.44    125000
weighted avg       0.61      0.69      0.63    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.66      0.94      0.78     70405
     UNHAPPY       0.00      0.00      0.00     14378
     UNKNOWN       0.72      0.46

In [83]:
# Naive Bayes Gaussian
(accuracy,class_report) = classifyLabels(GaussianNB(), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.648288, 2: 0.646056, 3: 0.644936}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.68      0.84      0.75     68195
     UNHAPPY       0.18      0.08      0.11     15437
     UNKNOWN       0.67      0.55      0.61     41368

    accuracy                           0.65    125000
   macro avg       0.51      0.49      0.49    125000
weighted avg       0.62      0.65      0.62    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.71      0.83      0.76     75861
     UNHAPPY       0.18      0.10      0.13     14564
     UNKNOWN       0.59      0.48      0.53     34575

    accuracy                           0.65    125000
   macro avg       0.49      0.47      0.47    125000
weighted avg       0.61      0.65      0.62    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.72      0.78      0.75     70405
     UNHAPPY       0.16      0.10      0.12     14378
     UNKNOWN       0.62      0.60

In [70]:
# K-NN
(accuracy,class_report) = classifyLabels(neighbors.KNeighborsClassifier(n_neighbors = 3), X, y, n = 3, scale = 'MinMax')
print(accuracy)
for item in class_report.values():
    print(item)

  clf = clf.fit(X_train,y_train)


1


  clf = clf.fit(X_train,y_train)


2


  clf = clf.fit(X_train,y_train)


3
{1: 0.776368, 2: 0.757728, 3: 0.762664}
              precision    recall  f1-score   support

       KNOWN       0.84      0.83      0.83     83632
     UNKNOWN       0.66      0.68      0.67     41368

    accuracy                           0.78    125000
   macro avg       0.75      0.75      0.75    125000
weighted avg       0.78      0.78      0.78    125000

              precision    recall  f1-score   support

       KNOWN       0.83      0.83      0.83     90425
     UNKNOWN       0.56      0.56      0.56     34575

    accuracy                           0.76    125000
   macro avg       0.70      0.70      0.70    125000
weighted avg       0.76      0.76      0.76    125000

              precision    recall  f1-score   support

       KNOWN       0.81      0.84      0.83     84783
     UNKNOWN       0.64      0.59      0.62     40217

    accuracy                           0.76    125000
   macro avg       0.73      0.72      0.72    125000
weighted avg       0.76      0.7

In [85]:
# Logistic Regression
(accuracy,class_report) = classifyLabels(LogisticRegression(random_state=0,
                                                            class_weight='balanced',
                                                            fit_intercept=False,
                                                            solver='liblinear'), X, y, n = 3, scale = 'MinMax')
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.678176, 2: 0.685968, 3: 0.7108}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.75      0.76     68195
     UNHAPPY       0.20      0.03      0.05     15437
     UNKNOWN       0.59      0.80      0.68     41368

    accuracy                           0.68    125000
   macro avg       0.52      0.53      0.50    125000
weighted avg       0.64      0.68      0.65    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.83      0.79     75861
     UNHAPPY       0.21      0.05      0.08     14564
     UNKNOWN       0.58      0.64      0.61     34575

    accuracy                           0.69    125000
   macro avg       0.52      0.51      0.49    125000
weighted avg       0.64      0.69      0.66    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.81      0.79     70405
     UNHAPPY       0.21      0.03      0.05     14378
     UNKNOWN       0.64      0.79  

In [None]:
# SVM (very slow!)
(accuracy,class_report) = classifyLabels(svm.SVC(random_state=0,
                                                 class_weight='balanced'), X, y, n = 3, scale = 'MinMax')
print(accuracy)
for item in class_report.values():
    print(item)

In [87]:
# Decision Tree
(accuracy,class_report) = classifyLabels(DecisionTreeClassifier(random_state=0,
                                                                class_weight='balanced'), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)



1
2
3
{1: 0.603464, 2: 0.587064, 3: 0.603976}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.74      0.67      0.70     68195
     UNHAPPY       0.14      0.18      0.16     15437
     UNKNOWN       0.63      0.64      0.64     41368

    accuracy                           0.60    125000
   macro avg       0.50      0.50      0.50    125000
weighted avg       0.63      0.60      0.61    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.66      0.70     75861
     UNHAPPY       0.13      0.16      0.14     14564
     UNKNOWN       0.52      0.61      0.56     34575

    accuracy                           0.59    125000
   macro avg       0.47      0.48      0.47    125000
weighted avg       0.62      0.59      0.60    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.72      0.69      0.71     70405
     UNHAPPY       0.13      0.18      0.15     14378
     UNKNOWN       0.64      

In [88]:
# AdaBoost
(accuracy,class_report) = classifyLabels(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
                                                            n_estimators=50,
                                                            random_state=0), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.73088, 2: 0.740224, 3: 0.751032}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.87      0.81     68195
     UNHAPPY       0.15      0.02      0.03     15437
     UNKNOWN       0.71      0.77      0.74     41368

    accuracy                           0.73    125000
   macro avg       0.54      0.55      0.52    125000
weighted avg       0.66      0.73      0.69    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.76      0.90      0.83     75861
     UNHAPPY       0.25      0.01      0.01     14564
     UNKNOWN       0.69      0.70      0.69     34575

    accuracy                           0.74    125000
   macro avg       0.57      0.54      0.51    125000
weighted avg       0.68      0.74      0.69    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.74      0.93      0.83     70405
     UNHAPPY       0.30      0.01      0.02     14378
     UNKNOWN       0.77      0.71 

In [89]:
# Gradient Boosting
(accuracy,class_report) = classifyLabels(GradientBoostingClassifier(random_state=0), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.747288, 2: 0.75016, 3: 0.74272}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.92      0.82     68195
     UNHAPPY       0.23      0.00      0.00     15437
     UNKNOWN       0.75      0.75      0.75     41368

    accuracy                           0.75    125000
   macro avg       0.58      0.55      0.52    125000
weighted avg       0.68      0.75      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.76      0.93      0.83     75861
     UNHAPPY       0.45      0.00      0.00     14564
     UNKNOWN       0.72      0.68      0.70     34575

    accuracy                           0.75    125000
   macro avg       0.65      0.54      0.51    125000
weighted avg       0.71      0.75      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.73      0.94      0.82     70405
     UNHAPPY       0.32      0.00      0.00     14378
     UNKNOWN       0.78      0.66  

In [90]:
# Hist Gradient Boosting
(accuracy,class_report) = classifyLabels(HistGradientBoostingClassifier(random_state=0), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.75276, 2: 0.751056, 3: 0.757056}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.92      0.83     68195
     UNHAPPY       0.25      0.00      0.01     15437
     UNKNOWN       0.75      0.76      0.76     41368

    accuracy                           0.75    125000
   macro avg       0.59      0.56      0.53    125000
weighted avg       0.69      0.75      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.92      0.83     75861
     UNHAPPY       0.22      0.00      0.00     14564
     UNKNOWN       0.71      0.71      0.71     34575

    accuracy                           0.75    125000
   macro avg       0.56      0.54      0.52    125000
weighted avg       0.69      0.75      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.94      0.83     70405
     UNHAPPY       0.26      0.00      0.00     14378
     UNKNOWN       0.79      0.71 

In [91]:
# Bagging
(accuracy,class_report) = classifyLabels(BaggingClassifier(n_estimators=10,
                                                           random_state=0), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.704248, 2: 0.691872, 3: 0.691832}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.74      0.84      0.79     68195
     UNHAPPY       0.15      0.05      0.07     15437
     UNKNOWN       0.70      0.72      0.71     41368

    accuracy                           0.70    125000
   macro avg       0.53      0.54      0.52    125000
weighted avg       0.66      0.70      0.67    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.76      0.82      0.79     75861
     UNHAPPY       0.16      0.06      0.09     14564
     UNKNOWN       0.62      0.68      0.65     34575

    accuracy                           0.69    125000
   macro avg       0.51      0.52      0.51    125000
weighted avg       0.65      0.69      0.67    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.74      0.82      0.78     70405
     UNHAPPY       0.14      0.07      0.10     14378
     UNKNOWN       0.70      0.68

In [92]:
# Random Forest
(accuracy,class_report) = classifyLabels(RandomForestClassifier(n_estimators=10,
                                                                random_state=0,
                                                                class_weight='balanced'), X, y, n = 3)
print(accuracy)
for item in class_report.values():
    print(item)

  clf = clf.fit(X_train,y_train)


1


  clf = clf.fit(X_train,y_train)


2


  clf = clf.fit(X_train,y_train)


3
{1: 0.741472, 2: 0.743208, 3: 0.752912}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.90      0.82     68195
     UNHAPPY       0.18      0.03      0.05     15437
     UNKNOWN       0.75      0.74      0.75     41368

    accuracy                           0.74    125000
   macro avg       0.56      0.56      0.54    125000
weighted avg       0.68      0.74      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.91      0.83     75861
     UNHAPPY       0.17      0.03      0.05     14564
     UNKNOWN       0.72      0.69      0.70     34575

    accuracy                           0.74    125000
   macro avg       0.55      0.54      0.53    125000
weighted avg       0.68      0.74      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.93      0.83     70405
     UNHAPPY       0.18      0.03      0.05     14378
     UNKNOWN       0.79      0.71