In [245]:
import pandas as pd
import numpy as np
import pyodbc as py

from datetime import date, timedelta, datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import preprocessing
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.feature_selection import chi2, SelectKBest, SelectPercentile, mutual_info_classif, RFE, RFECV, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn import neighbors
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

# Project-local helper module (its dataX function is called further down).
# importlib.reload re-executes the module so the kernel sees the current
# contents of functions.py.
import functions
import importlib
importlib.reload(functions)

# NOTE(review): this silences every warning for the rest of the session,
# including pandas chained-assignment warnings; consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

#### Load Data

In [10]:
# Re-execute functions.py so the kernel uses the module's current contents.
importlib.reload(functions)

<module 'functions' from '/Users/LV/Documents/GitHub/Seminar-QM-BA/functions.py'>

In [2]:
# Load the dataset CSV into df.
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere until this points at a configurable data directory.
df = pd.read_csv('/Users/LV/Desktop/data_bol_complete.csv', low_memory = True)

In [3]:
# Columns that hold dates/timestamps; each one is parsed with pd.to_datetime
# and written back under the same name.
datetime_columns = [
    'orderDate',
    'cancellationDate',
    'promisedDeliveryDate',
    'shipmentDate',
    'dateTimeFirstDeliveryMoment',
    'startDateCase',
    'returnDateTime',
    'registrationDateSeller',
]
for datetime_column in datetime_columns:
    df[datetime_column] = pd.to_datetime(df[datetime_column])

In [None]:
# Strip the 'KNOWN ' prefix from the detailed labels; values that are not
# keys of the mapping are left untouched by Series.replace.
known_label_map = {
    'KNOWN HAPPY': 'HAPPY',
    'KNOWN MILDLY UNHAPPY': 'MILDLY UNHAPPY',
    'KNOWN MEDIUM UNHAPPY': 'MEDIUM UNHAPPY',
    'KNOWN HEAVILY UNHAPPY': 'HEAVILY UNHAPPY',
}
df['detailedMatchClassification'] = df['detailedMatchClassification'].replace(known_label_map)

In [4]:
# Fixed columns
DATE = ['orderDate']
BASIC = [
    'totalPrice', 'quantityOrdered', 'fulfilmentByBol',
    'countryCodeNL', 'countryOriginNL', 'countryOriginBE', 'countryOriginDE',
    'productTitleLength', 'promisedDeliveryDays', 'partnerSellingDays',
    'orderCorona',
]
WEEK = [
    'orderMonday', 'orderTuesday', 'orderWednesday', 'orderThursday',
    'orderFriday', 'orderSaturday', 'orderSunday',
]
MONTH = [
    'orderJanuary', 'orderFebruary', 'orderMarch', 'orderApril',
    'orderMay', 'orderJune', 'orderJuly', 'orderAugust',
    'orderSeptember', 'orderOctober', 'orderNovember', 'orderDecember',
]
YEAR = ['orderYear2020']
GROUP = [
    'groupHealth', 'groupHome', 'groupSports', 'groupComputer', 'groupPets',
    'groupToys', 'groupBooks', 'groupBaby', 'groupMusic', 'groupFood',
    'groupOffice', 'groupFashion', 'groupOther', 'groupCar',
]

# Dynamic columns (every name carries an 'X' suffix)
TRANSPORTERX = [
    'transporterPOSTNL/X', 'transporterDHL/X', 'transporterDPD/X',
    'transporterBRIEF/X', 'transporterOTHER/X',
]
KNOWNX = [
    'caseKnownX', 'returnKnownX', 'cancellationKnownX',
    'onTimeDeliveryKnownX', 'lateDeliveryKnownX',
]
PRODUCTX = [
    'productOrderCountX', 'productTotalCountX',
    'productTotalReturnedX', 'productReturnFractionX',
]
SELLERX = ['sellerDailyOrdersX']

# For each historic variable, three columns: <variable>HistoricHappyX,
# <variable>HistoricUnhappyX and <variable>HistoricUnknownX, in that order.
historic_variable = ['transporterCode', 'sellerId', 'productGroup']
HISTORICX = [
    variable_name + suffix
    for variable_name in historic_variable
    for suffix in ('HistoricHappyX', 'HistoricUnhappyX', 'HistoricUnknownX')
]

# Determinants
DETERMINANT = ['noReturn', 'noCase', 'noCancellation', 'onTimeDelivery']

# Classifications
CLASSIFICATION = [
    'generalMatchClassification',
    'detailedMatchClassification',
    'binaryMatchClassification',
    'determinantClassification',
]

In [329]:
# Feature columns: all fixed and dynamic column groups, concatenated in this order.
X_col = [
    *BASIC, *WEEK, *MONTH, *YEAR, *GROUP,
    *TRANSPORTERX, *KNOWNX, *PRODUCTX, *SELLERX, *HISTORICX,
]
# Target columns, one per level of the label hierarchy.
Y_col = [
    'generalMatchClassification',
    'detailedMatchClassification',
    'binaryMatchClassification',
]

# Reproducible random subset of 1,000,000 rows, drawn without replacement.
df_ = df.sample(n=1_000_000, replace=False, random_state=1)

In [338]:
# Build the feature frame X and the label frame y from the sampled rows.
# NOTE(review): the meaning of the trailing argument 5 is defined by
# functions.dataX, which is not visible in this notebook.
X, y = functions.dataX(df_, DATE, X_col, Y_col, historic_variable, 5)

# 80/20 split without shuffling, so rows keep the order dataX returned them in.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=0,
)

#### Hierarchical Classification Models

In [339]:
# One separate, identically configured random forest per hierarchy layer.
# Alternative per-layer model:
#   LogisticRegression(random_state=0, class_weight='balanced')
classifier1, classifier2, classifier3 = (
    RandomForestClassifier(n_estimators=10, class_weight='balanced', random_state=0)
    for _ in range(3)
)

In [340]:
# Fit the three hierarchy layers on the training split.
# NOTE(review): trainHierarchy is defined in a later cell (In [292]); on a
# fresh kernel that cell has to be executed before this one.
clf1,clf2,clf3 = trainHierarchy(X_train,y_train,classifier1,classifier2,classifier3)

In [341]:
# Run the fitted hierarchy on the held-out split.
# NOTE(review): testHierarchy is defined in a later cell (In [323]); on a
# fresh kernel that cell has to be executed before this one.
predictions = testHierarchy(X_test,y_test,clf1,clf2,clf3)

In [342]:
# Per-class precision/recall/F1 for the binary level (true labels vs. binaryPrediction).
print(metrics.classification_report(y_test['binaryMatchClassification'], predictions['binaryPrediction']))

              precision    recall  f1-score   support

       KNOWN       0.98      0.98      0.98    134874
     UNKNOWN       0.95      0.96      0.96     65126

    accuracy                           0.97    200000
   macro avg       0.97      0.97      0.97    200000
weighted avg       0.97      0.97      0.97    200000



In [343]:
# Per-class precision/recall/F1 for the general level (true labels vs. generalPrediction).
print(metrics.classification_report(y_test['generalMatchClassification'], predictions['generalPrediction']))

              precision    recall  f1-score   support

       HAPPY       0.97      1.00      0.98    111157
     UNHAPPY       0.90      0.74      0.81     23717
     UNKNOWN       0.95      0.96      0.96     65126

    accuracy                           0.96    200000
   macro avg       0.94      0.90      0.92    200000
weighted avg       0.95      0.96      0.95    200000



In [344]:
# Per-class precision/recall/F1 for the detailed level (true labels vs. detailedPrediction).
print(metrics.classification_report(y_test['detailedMatchClassification'], predictions['detailedPrediction']))

                 precision    recall  f1-score   support

          HAPPY       0.97      1.00      0.98    111157
HEAVILY UNHAPPY       0.79      0.47      0.59      2026
 MEDIUM UNHAPPY       0.69      0.49      0.57      3780
 MILDLY UNHAPPY       0.85      0.75      0.79     17911
        UNKNOWN       0.95      0.96      0.96     65126

       accuracy                           0.95    200000
      macro avg       0.85      0.73      0.78    200000
   weighted avg       0.95      0.95      0.95    200000



In [292]:
# Train Model
def trainHierarchy(X_train,y_train,classifier1,classifier2,classifier3):
    """Fit the three layers of the hierarchical classifier on true labels.

    Layer 1 is fitted on every row against 'binaryMatchClassification'.
    Layer 2 is fitted only on rows whose true binary label is 'KNOWN',
    against 'generalMatchClassification'.
    Layer 3 is fitted only on rows whose true general label is 'UNHAPPY',
    against 'detailedMatchClassification'.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature rows.
    y_train : pandas.DataFrame
        Label rows with the three columns named above. Must have one row per
        row of X_train, in the same order (layer 1 already relies on this).
    classifier1, classifier2, classifier3 : estimators exposing ``fit(X, y)``
        One estimator per layer.

    Returns
    -------
    tuple
        ``(clf1, clf2, clf3)``: whatever each estimator's ``fit`` returned.
    """
    # Layer 1: KNOWN vs. UNKNOWN on all rows.
    clf1 = classifier1.fit(X_train, y_train['binaryMatchClassification'])

    # Rows are selected with positional boolean masks instead of index labels:
    # with duplicated index labels, a label-based lookup would also pull in
    # rows that did not match the filter.
    known_mask = (y_train['binaryMatchClassification'] == 'KNOWN').to_numpy()
    clf2 = classifier2.fit(
        X_train.loc[known_mask],
        y_train.loc[known_mask, 'generalMatchClassification'],
    )

    unhappy_mask = (y_train['generalMatchClassification'] == 'UNHAPPY').to_numpy()
    clf3 = classifier3.fit(
        X_train.loc[unhappy_mask],
        y_train.loc[unhappy_mask, 'detailedMatchClassification'],
    )

    return clf1,clf2,clf3

In [323]:
# Test Model
def testHierarchy(X_test,y_test,clf1,clf2,clf3):
    
    predictions = y_test[:]

    #-------------------------------------------------------------------------------------------
    
    X_test_1 = X_test

    predictions['predLayer1'] = clf1.predict(X_test_1)

    #-------------------------------------------------------------------------------------------

    index_test_2 = predictions.loc[(predictions['predLayer1'] == 'KNOWN')].index
    X_test_2 = X_test.loc[index_test_2]

    predictions.loc[index_test_2, 'predLayer2'] = clf2.predict(X_test_2)

    #-------------------------------------------------------------------------------------------

    index_test_3 = predictions.loc[(predictions['predLayer2'] == 'UNHAPPY')].index
    X_test_3 = X_test.loc[index_test_3]

    predictions.loc[index_test_3, 'predLayer3'] = clf3.predict(X_test_3)
    
    #-------------------------------------------------------------------------------------------
    
    happyLabelIndex = predictions.loc[predictions['predLayer2'] == 'HAPPY'].index
    unknownLabelIndex = predictions.loc[predictions['predLayer1'] == 'UNKNOWN'].index
    
    predictions['binaryPrediction'] = predictions['predLayer1']
    predictions['generalPrediction'] = predictions['predLayer2'].fillna('UNKNOWN')
    predictions['detailedPrediction'] = predictions['predLayer3']
    predictions.loc[happyLabelIndex, 'detailedPrediction'] = 'HAPPY'
    predictions.loc[unknownLabelIndex, 'detailedPrediction'] = 'UNKNOWN'
    
    return predictions[['binaryPrediction','generalPrediction','detailedPrediction']]