In [245]:
import pandas as pd
import numpy as np
import pyodbc as py

from datetime import date, timedelta, datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import preprocessing
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.feature_selection import chi2, SelectKBest, SelectPercentile, mutual_info_classif, RFE, RFECV, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn import neighbors
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

# Project-local helper module; this notebook calls functions.dataX from it.
import functions
import importlib
# Re-execute functions.py so edits made to it after the first import are picked up
# without restarting the kernel.
importlib.reload(functions)

import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/sklearn warnings (e.g. convergence,
# SettingWithCopy) that may point at real problems.
warnings.filterwarnings(action="ignore")

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

#### Load Data

In [10]:
# Re-execute functions.py so that edits made since it was imported take effect
# (repeats the reload already done in the imports cell).
importlib.reload(functions)

<module 'functions' from '/Users/LV/Documents/GitHub/Seminar-QM-BA/functions.py'>

In [2]:
# NOTE(review): absolute, machine-specific location; consider a configurable
# data directory so the notebook runs on other machines.
DATA_PATH = '/Users/LV/Desktop/data_bol_complete.csv'

# Raw order-level data set, read in full.
df = pd.read_csv(DATA_PATH, low_memory=True)

In [3]:
# Columns that hold dates / timestamps and must be parsed to datetime dtype.
DATE_COLUMNS = [
    'orderDate',
    'cancellationDate',
    'promisedDeliveryDate',
    'shipmentDate',
    'dateTimeFirstDeliveryMoment',
    'startDateCase',
    'returnDateTime',
    'registrationDateSeller',
]

# Convert each column in place, in the order listed above.
for date_column in DATE_COLUMNS:
    df[date_column] = pd.to_datetime(df[date_column])

In [4]:
# ---- Fixed columns -----------------------------------------------------------
DATE = ['orderDate']
BASIC = [
    'totalPrice', 'quantityOrdered', 'fulfilmentByBol',
    'countryCodeNL', 'countryOriginNL', 'countryOriginBE', 'countryOriginDE',
    'productTitleLength', 'promisedDeliveryDays', 'partnerSellingDays', 'orderCorona',
]
WEEK = [
    'orderMonday', 'orderTuesday', 'orderWednesday', 'orderThursday',
    'orderFriday', 'orderSaturday', 'orderSunday',
]
MONTH = [
    'orderJanuary', 'orderFebruary', 'orderMarch', 'orderApril',
    'orderMay', 'orderJune', 'orderJuly', 'orderAugust',
    'orderSeptember', 'orderOctober', 'orderNovember', 'orderDecember',
]
YEAR = ['orderYear2020']
GROUP = [
    'groupHealth', 'groupHome', 'groupSports', 'groupComputer', 'groupPets',
    'groupToys', 'groupBooks', 'groupBaby', 'groupMusic', 'groupFood',
    'groupOffice', 'groupFashion', 'groupOther', 'groupCar',
]

# ---- Dynamic columns ---------------------------------------------------------
# Every name ends in an 'X' placeholder.
# NOTE(review): presumably resolved by functions.dataX; confirm there.
TRANSPORTERX = [
    'transporterPOSTNL/X', 'transporterDHL/X', 'transporterDPD/X',
    'transporterBRIEF/X', 'transporterOTHER/X',
]
KNOWNX = [
    'caseKnownX', 'returnKnownX', 'cancellationKnownX',
    'onTimeDeliveryKnownX', 'lateDeliveryKnownX',
]
PRODUCTX = [
    'productOrderCountX', 'productTotalCountX',
    'productTotalReturnedX', 'productReturnFractionX',
]
SELLERX = ['sellerDailyOrdersX']

# For each historic variable, three columns: Happy / Unhappy / Unknown (in that order).
historic_variable = ['transporterCode', 'sellerId', 'productGroup']
HISTORICX = [
    variable + suffix
    for variable in historic_variable
    for suffix in ('HistoricHappyX', 'HistoricUnhappyX', 'HistoricUnknownX')
]

# ---- Determinants ------------------------------------------------------------
DETERMINANT = ['noReturn', 'noCase', 'noCancellation', 'onTimeDelivery']

# ---- Classifications ---------------------------------------------------------
CLASSIFICATION = [
    'generalMatchClassification',
    'detailedMatchClassification',
    'binaryMatchClassification',
    'determinantClassification',
]

In [267]:
# Feature columns: every fixed and dynamic group concatenated, in this order.
X_col = [
    *BASIC, *WEEK, *MONTH, *YEAR, *GROUP,
    *TRANSPORTERX, *KNOWNX, *PRODUCTX, *SELLERX, *HISTORICX,
]

# Target columns, one per level of the hierarchy.
Y_col = [
    'generalMatchClassification',
    'detailedMatchClassification',
    'binaryMatchClassification',
]

# Reproducible random subset of one million rows, drawn without replacement.
df_ = df.sample(n=1_000_000, replace=False, random_state=1)

In [268]:
# Build the feature matrix and the label frame from the sampled rows.
# NOTE(review): the meaning of the trailing 0 is defined in functions.dataX; confirm there.
X, y = functions.dataX(df_, DATE, X_col, Y_col, historic_variable, 0)

# Unshuffled 80/20 split: the test set is the final 20% of rows in the order dataX returned them.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    shuffle=False,
)

#### Hierarchical Classification Models

In [269]:
def addHierarchyLayer(X,y,classifier):
    """Fit ``classifier`` on ``(X, y)`` and return its predictions for the same ``X``.

    Parameters
    ----------
    X : feature matrix accepted by ``classifier.fit`` and ``classifier.predict``.
    y : target labels, one per row of ``X``.
    classifier : estimator whose ``fit(X, y)`` returns the fitted estimator and
        which exposes ``predict(X)``.

    Returns
    -------
    pandas.Series
        One prediction per row of ``X``. When ``X`` is a pandas DataFrame or
        Series the result carries ``X``'s index, so it can be aligned with or
        assigned back to the rows it was computed from; for any other input
        the default positional index is used.

    Notes
    -----
    The predictions are in-sample: the estimator is scored on the very rows it
    was fitted on.
    """
    clf = classifier.fit(X, y)
    prediction = clf.predict(X)

    # Keep the row labels of pandas inputs. A plain pd.Series(prediction) would
    # renumber the rows 0..n-1 and silently misalign with a sampled/filtered X.
    # (isinstance check rather than getattr: list.index is a method, not an index.)
    row_index = X.index if isinstance(X, (pd.DataFrame, pd.Series)) else None

    return pd.Series(prediction, index=row_index)

# In-sample KNOWN/UNKNOWN predictions from a logistic regression fitted on all of X
# (training and test rows alike).
layer1 = addHierarchyLayer(
    X=X,
    y=y['binaryMatchClassification'],
    classifier=LogisticRegression(random_state=0),
)

In [270]:
# One independent estimator per hierarchy layer, all configured identically.
# Alternative kept for reference:
#   LogisticRegression(random_state=0, class_weight='balanced') for every layer.
classifier1, classifier2, classifier3 = (
    RandomForestClassifier(random_state=0, class_weight='balanced', n_estimators=10)
    for _layer in range(3)
)

In [271]:
# Train Model
# Each layer is fitted on the rows selected by the TRUE labels of the layer above it.

# Layer 1: KNOWN vs. UNKNOWN, fitted on every training row.
X_train_1 = X_train
y_train_1 = y_train['binaryMatchClassification']
clf1 = classifier1.fit(X_train_1, y_train_1)

#-------------------------------------------------------------------------------------------

# Layer 2: general match class, fitted only on rows whose binary label is KNOWN.
index_train_2 = y_train[y_train['binaryMatchClassification'].eq('KNOWN')].index
X_train_2 = X_train.loc[index_train_2]
y_train_2 = y_train.loc[index_train_2, 'generalMatchClassification']
clf2 = classifier2.fit(X_train_2, y_train_2)

#-------------------------------------------------------------------------------------------

# Layer 3: detailed match class, fitted only on rows whose general label is UNHAPPY.
index_train_3 = y_train[y_train['generalMatchClassification'].eq('UNHAPPY')].index
X_train_3 = X_train.loc[index_train_3]
y_train_3 = y_train.loc[index_train_3, 'detailedMatchClassification']
clf3 = classifier3.fit(X_train_3, y_train_3)

In [272]:
# Test Model
# Route the held-out rows through the three fitted layers. A layer only scores
# the rows that the previous layer's PREDICTION sent down the hierarchy; rows
# that never reach a layer keep a missing value in that layer's column.

# Work on an explicit copy: `y_test[:]` is merely a slice of y_test, so adding
# prediction columns to it depends on pandas' view-vs-copy behaviour.
predictions = y_test.copy()

X_test_1 = X_test

predictions['predLayer1'] = clf1.predict(X_test_1)

#-------------------------------------------------------------------------------------------

# Create the column first so it exists (all missing) even when no row is routed to layer 2.
predictions['predLayer2'] = pd.Series(np.nan, index=predictions.index, dtype=object)

index_test_2 = predictions.loc[(predictions['predLayer1'] == 'KNOWN')].index
X_test_2 = X_test.loc[index_test_2]

# scikit-learn estimators raise on an input with zero samples, so skip empty subsets.
if len(index_test_2) > 0:
    predictions.loc[index_test_2, 'predLayer2'] = clf2.predict(X_test_2)

#-------------------------------------------------------------------------------------------

# Same pattern for layer 3: only rows predicted UNHAPPY by layer 2 are scored.
predictions['predLayer3'] = pd.Series(np.nan, index=predictions.index, dtype=object)

index_test_3 = predictions.loc[(predictions['predLayer2'] == 'UNHAPPY')].index
X_test_3 = X_test.loc[index_test_3]

if len(index_test_3) > 0:
    predictions.loc[index_test_3, 'predLayer3'] = clf3.predict(X_test_3)

In [273]:
# Metrics
# Rows without a layer-2 prediction (layer 1 did not predict KNOWN) are scored as UNKNOWN.
predictions['generalMatchClassificationPrediction'] = (
    predictions['predLayer2'].fillna('UNKNOWN')
)

# Per-class precision / recall / F1 of the combined prediction against the true general class.
print(
    metrics.classification_report(
        y_true=predictions['generalMatchClassification'],
        y_pred=predictions['generalMatchClassificationPrediction'],
    )
)

              precision    recall  f1-score   support

       HAPPY       0.83      0.99      0.90    111157
     UNHAPPY       0.24      0.06      0.09     23717
     UNKNOWN       0.90      0.86      0.88     65126

    accuracy                           0.84    200000
   macro avg       0.66      0.64      0.63    200000
weighted avg       0.78      0.84      0.80    200000



In [185]:
# True general class of the rows that layer 1 predicted as KNOWN.
# NOTE(review): the saved output of this cell seems to come from an earlier
# run (its total exceeds the 200000-row test set reported above); re-run to refresh.
index_test_2 = predictions[predictions['predLayer1'].eq('KNOWN')].index
y['generalMatchClassification'].loc[index_test_2].value_counts()

HAPPY      257058
UNHAPPY     38068
UNKNOWN     10726
Name: generalMatchClassification, dtype: int64

In [134]:
# How often layer 3 predicted each detailed class (rows never routed to layer 3 are excluded).
predictions.loc[:, 'predLayer3'].value_counts()

KNOWN MILDLY UNHAPPY     49049
KNOWN MEDIUM UNHAPPY     13320
KNOWN HEAVILY UNHAPPY     4222
Name: predLayer3, dtype: int64

In [135]:
# True distribution of the detailed match class among the evaluated rows.
predictions.loc[:, 'detailedMatchClassification'].value_counts()

KNOWN HAPPY              281236
UNKNOWN                  158712
KNOWN MILDLY UNHAPPY      43535
KNOWN MEDIUM UNHAPPY      10216
KNOWN HEAVILY UNHAPPY      6301
Name: detailedMatchClassification, dtype: int64

In [137]:
# True distribution of the general match class among the evaluated rows.
predictions.loc[:, 'generalMatchClassification'].value_counts()

HAPPY      281236
UNKNOWN    158712
UNHAPPY     60052
Name: generalMatchClassification, dtype: int64