In [245]:
import pandas as pd
import numpy as np
import pyodbc as py

from datetime import date, timedelta, datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import preprocessing
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.feature_selection import chi2, SelectKBest, SelectPercentile, mutual_info_classif, RFE, RFECV, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn import neighbors
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

import functions
import importlib
importlib.reload(functions)

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)

#### Load Data

In [10]:
importlib.reload(functions)

<module 'functions' from '/Users/LV/Documents/GitHub/Seminar-QM-BA/functions.py'>

In [2]:
df = pd.read_csv('/Users/LV/Desktop/data_bol_complete.csv', low_memory = True)

In [3]:
df['orderDate']                   = pd.to_datetime(df['orderDate'])
df['cancellationDate']            = pd.to_datetime(df['cancellationDate'])
df['promisedDeliveryDate']        = pd.to_datetime(df['promisedDeliveryDate'])
df['shipmentDate']                = pd.to_datetime(df['shipmentDate'])
df['dateTimeFirstDeliveryMoment'] = pd.to_datetime(df['dateTimeFirstDeliveryMoment'])
df['startDateCase']               = pd.to_datetime(df['startDateCase'])
df['returnDateTime']              = pd.to_datetime(df['returnDateTime'])
df['registrationDateSeller']      = pd.to_datetime(df['registrationDateSeller'])

In [None]:
df['detailedMatchClassification'] = df['detailedMatchClassification'].replace({'KNOWN HAPPY':'HAPPY',
                                                                               'KNOWN MILDLY UNHAPPY':'MILDLY UNHAPPY',
                                                                               'KNOWN MEDIUM UNHAPPY':'MEDIUM UNHAPPY',
                                                                               'KNOWN HEAVILY UNHAPPY':'HEAVILY UNHAPPY'})

In [4]:
#Fixed Columns:
DATE = ['orderDate']
BASIC = ['totalPrice','quantityOrdered','fulfilmentByBol','countryCodeNL','countryOriginNL','countryOriginBE',
        'countryOriginDE','productTitleLength','promisedDeliveryDays','partnerSellingDays', 'orderCorona']
WEEK = ['orderMonday','orderTuesday','orderWednesday','orderThursday','orderFriday','orderSaturday','orderSunday']
MONTH = ['orderJanuary','orderFebruary','orderMarch','orderApril','orderMay','orderJune',
         'orderJuly','orderAugust','orderSeptember','orderOctober','orderNovember','orderDecember']
YEAR = ['orderYear2020']
GROUP = ['groupHealth','groupHome','groupSports','groupComputer','groupPets','groupToys','groupBooks', 
         'groupBaby', 'groupMusic', 'groupFood','groupOffice','groupFashion','groupOther','groupCar']

#Dynamic Columns:
TRANSPORTERX = ['transporterPOSTNL/X','transporterDHL/X','transporterDPD/X','transporterBRIEF/X','transporterOTHER/X']
KNOWNX = ['caseKnownX','returnKnownX','cancellationKnownX','onTimeDeliveryKnownX','lateDeliveryKnownX']
PRODUCTX = ['productOrderCountX','productTotalCountX','productTotalReturnedX','productReturnFractionX']
SELLERX = ['sellerDailyOrdersX']
HISTORICX = []
historic_variable = ['transporterCode','sellerId','productGroup']
for x in range(len(historic_variable)):
    HISTORICX = HISTORICX + [historic_variable[x]+'HistoricHappyX',historic_variable[x]+'HistoricUnhappyX',historic_variable[x]+'HistoricUnknownX']

#Determinants
DETERMINANT = ['noReturn', 'noCase', 'noCancellation', 'onTimeDelivery']

#Classifications
CLASSIFICATION = ['generalMatchClassification','detailedMatchClassification','binaryMatchClassification','determinantClassification']

In [415]:
X_col = BASIC + WEEK + MONTH + YEAR + GROUP + TRANSPORTERX + KNOWNX + PRODUCTX + SELLERX + HISTORICX
Y_col = ['generalMatchClassification','detailedMatchClassification','binaryMatchClassification']

#### Sample data

In [421]:
df_ = df.sample(n = 500000, replace = False, random_state = 1)

### Hierarchical Classification Models

In [423]:
# classifier1 = LogisticRegression(random_state=0, class_weight='balanced')
# classifier2 = LogisticRegression(random_state=0, class_weight='balanced')
# classifier3 = LogisticRegression(random_state=0, class_weight='balanced')

classifier1 = RandomForestClassifier(random_state=0, class_weight='balanced', n_estimators = 10)
classifier2 = RandomForestClassifier(random_state=0, class_weight='balanced', n_estimators = 10)
classifier3 = RandomForestClassifier(random_state=0, class_weight='balanced', n_estimators = 10)

#### Single fit single point in time

In [None]:
X, y = functions.dataX(df_, DATE, X_col, Y_col, historic_variable, 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=False)

In [346]:
clf = HierarchicalClassification1(classifier1,classifier2,classifier3)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test, y_test)

print(metrics.classification_report(y_test['binaryMatchClassification'], predictions['binaryPrediction']))
print(metrics.classification_report(y_test['generalMatchClassification'], predictions['generalPrediction']))
print(metrics.classification_report(y_test['detailedMatchClassification'], predictions['detailedPrediction']))

#### Cross-validation single point in time

In [None]:
X, y = functions.dataX(df_, DATE, X_col, Y_col, historic_variable, 0)

In [404]:
clf = HierarchicalClassification1(classifier1,classifier2,classifier3)
results = classifyLabelsHC(test, X, y, 3)

In [417]:
pd.DataFrame.from_dict(results, orient='index')

Unnamed: 0,0
accuracy,0.752515
precision_HAPPY,0.874248
recall_HAPPY,0.773223
f1_HAPPY,0.819905
precision_UNHAPPY,0.184566
recall_UNHAPPY,0.210775
f1_UNHAPPY,0.195751
precision_UNKNOWN,0.798142
recall_UNKNOWN,0.921181
f1_UNKNOWN,0.855141


#### Cross-validation over time

In [424]:
PREDICT_DAYS = 5
REP = 3

resultDic = {}

classifier = HierarchicalClassification1(classifier1,classifier2,classifier3)

for DAYS in range(PREDICT_DAYS+1):
    
    X, y = functions.dataX(df_, DATE, X_col, Y_col, historic_variable, DAYS)

    result = classifyLabelsHC(classifier, X, y, 3)

    resultDic[DAYS] = result
    
    print('DAYS: ',DAYS)

RESULT = pd.DataFrame.from_dict(resultDic, orient='index')
RESULT

DAYS:  0
DAYS:  1
DAYS:  2
DAYS:  3
DAYS:  4


Unnamed: 0,accuracy,precision_HAPPY,recall_HAPPY,f1_HAPPY,precision_UNHAPPY,recall_UNHAPPY,f1_UNHAPPY,precision_UNKNOWN,recall_UNKNOWN,f1_UNKNOWN
0,0.834539,0.837432,0.984391,0.904975,0.256588,0.062972,0.101111,0.884651,0.856341,0.870158
1,0.862349,0.86504,0.988791,0.922774,0.588327,0.230598,0.330789,0.899555,0.873697,0.88635
2,0.898685,0.906787,0.991891,0.947428,0.745943,0.442316,0.555216,0.918003,0.903239,0.910547
3,0.918253,0.92949,0.994836,0.961044,0.794632,0.565294,0.659853,0.930821,0.91379,0.922214
4,0.938128,0.94883,0.995287,0.971502,0.85815,0.65726,0.744371,0.941006,0.94114,0.941069


#### Functions

In [407]:
class HierarchicalClassification1():
    """
    Hierarchical classification using the tree: unknown - known -> happy - unhappy -> mildly - medium - heavily
    Input: 3 classifiers for each parent node
    """
    def __init__(self, classifier1, classifier2, classifier3):
        self.clf1 = classifier1
        self.clf2 = classifier2
        self.clf3 = classifier3
    
    def fit(self, X_train, y_train):
        
        X_train_1 = X_train
        y_train_1 = y_train['binaryMatchClassification']
        self.clf1 = classifier1.fit(X_train_1, y_train_1)

        index_train_2 = y_train.loc[(y_train['binaryMatchClassification'] == 'KNOWN')].index
        X_train_2 = X_train.loc[index_train_2]
        y_train_2 = y_train['generalMatchClassification'].loc[index_train_2]
        self.clf2 = classifier2.fit(X_train_2, y_train_2)

        index_train_3 = y_train.loc[(y_train['generalMatchClassification'] == 'UNHAPPY')].index
        X_train_3 = X_train.loc[index_train_3]
        y_train_3 = y_train['detailedMatchClassification'].loc[index_train_3]
        self.clf3 = classifier3.fit(X_train_3, y_train_3)
        
    def predict(self, X_test, y_test):
        
        predictions = y_test[:]

        X_test_1 = X_test
        predictions['predLayer1'] = self.clf1.predict(X_test_1)

        index_test_2 = predictions.loc[(predictions['predLayer1'] == 'KNOWN')].index
        X_test_2 = X_test.loc[index_test_2]
        predictions.loc[index_test_2, 'predLayer2'] = self.clf2.predict(X_test_2)

        index_test_3 = predictions.loc[(predictions['predLayer2'] == 'UNHAPPY')].index
        X_test_3 = X_test.loc[index_test_3]
        predictions.loc[index_test_3, 'predLayer3'] = self.clf3.predict(X_test_3)

        happyLabelIndex = predictions.loc[predictions['predLayer2'] == 'HAPPY'].index
        unknownLabelIndex = predictions.loc[predictions['predLayer1'] == 'UNKNOWN'].index

        predictions['binaryPrediction'] = predictions['predLayer1']
        predictions['generalPrediction'] = predictions['predLayer2'].fillna('UNKNOWN')
        predictions['detailedPrediction'] = predictions['predLayer3']
        predictions.loc[happyLabelIndex, 'detailedPrediction'] = 'HAPPY'
        predictions.loc[unknownLabelIndex, 'detailedPrediction'] = 'UNKNOWN'
    
        return predictions[['binaryPrediction','generalPrediction','detailedPrediction']]

In [406]:
def classifyLabelsHC(classifier, X, y, n, split = 'TimeSeries', smote = False, scale = None, NN = False):

    scaler = preprocessing.MinMaxScaler()
    labels = np.unique(y['generalMatchClassification'])
    
    acc,pre,rec,f1,results = {},{},{},{},{}
        
    if split == 'Random':
        cv = StratifiedKFold(n_splits = n, random_state = 0, shuffle = True)   
    else:
        cv = TimeSeriesSplit(n_splits = n)
    
    count = 1

    for train_index, test_index in cv.split(X):
        
        if scale != None:
            X_scaled = pd.DataFrame(scaler.fit_transform(X))
            X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
        else:   
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        classifier.fit(X_train,y_train)
        predictions = classifier.predict(X_test,y_test)
        
        #define which label you want to evaluate
        y_pred_general = predictions['generalPrediction']
        y_test_general = y_test['generalMatchClassification']
        
        accuracy = metrics.accuracy_score(y_test_general, y_pred_general)
        scores = metrics.precision_recall_fscore_support(y_test_general, y_pred_general, average = None, labels = labels, beta = 1)
        
        acc[count] = accuracy
        pre[count] = scores[0]
        rec[count] = scores[1]
        f1[count] = scores[2]
        
        count += 1

    results['accuracy'] = sum(acc.values()) / n
    
    for ix,label in enumerate(labels):
        results[('precision_'+label)] = (sum(pre.values()) / n)[ix]
        results[('recall_'+label)] = (sum(rec.values()) / n)[ix]
        results[('f1_'+label)] = (sum(f1.values()) / n)[ix]
     
    return results