In [1]:
import pandas as pd
import numpy as np
import pyodbc as py

from datetime import date, timedelta, datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import preprocessing
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.feature_selection import chi2, SelectKBest, SelectPercentile, mutual_info_classif, RFE, RFECV, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn import neighbors
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

import functions
import importlib
importlib.reload(functions)

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)

#### Load Data

In [2]:
# Re-import the local `functions` module so that edits to functions.py take effect without a kernel restart.
importlib.reload(functions)

<module 'functions' from '/Users/LV/Documents/GitHub/Seminar-QM-BA/functions.py'>

In [3]:
# Location of the input CSV, kept under one name so it can be repointed in a single place.
DATA_PATH = '/Users/LV/Desktop/data_bol_complete.csv'
df = pd.read_csv(DATA_PATH, low_memory = True)

In [4]:
# Columns that are converted with pd.to_datetime, in the order they are processed.
DATETIME_COLUMNS = ['orderDate', 'cancellationDate', 'promisedDeliveryDate', 'shipmentDate',
                    'dateTimeFirstDeliveryMoment', 'startDateCase', 'returnDateTime',
                    'registrationDateSeller']

for column_name in DATETIME_COLUMNS:
    df[column_name] = pd.to_datetime(df[column_name])

In [5]:
# ---- Fixed columns ----
DATE = ['orderDate']
BASIC = [
    'totalPrice', 'quantityOrdered', 'fulfilmentByBol', 'countryCodeNL',
    'countryOriginNL', 'countryOriginBE', 'countryOriginDE', 'productTitleLength',
    'promisedDeliveryDays', 'partnerSellingDays', 'orderCorona',
]
WEEK = [
    'orderMonday', 'orderTuesday', 'orderWednesday', 'orderThursday',
    'orderFriday', 'orderSaturday', 'orderSunday',
]
MONTH = [
    'orderJanuary', 'orderFebruary', 'orderMarch', 'orderApril',
    'orderMay', 'orderJune', 'orderJuly', 'orderAugust',
    'orderSeptember', 'orderOctober', 'orderNovember', 'orderDecember',
]
YEAR = ['orderYear2020']
GROUP = [
    'groupHealth', 'groupHome', 'groupSports', 'groupComputer', 'groupPets',
    'groupToys', 'groupBooks', 'groupBaby', 'groupMusic', 'groupFood',
    'groupOffice', 'groupFashion', 'groupOther', 'groupCar',
]

# ---- Dynamic columns (names end in X) ----
TRANSPORTERX = [
    'transporterPOSTNL/X', 'transporterDHL/X', 'transporterDPD/X',
    'transporterBRIEF/X', 'transporterOTHER/X',
]
KNOWNX = [
    'caseKnownX', 'returnKnownX', 'cancellationKnownX',
    'onTimeDeliveryKnownX', 'lateDeliveryKnownX',
]
PRODUCTX = [
    'productOrderCountX', 'productTotalCountX',
    'productTotalReturnedX', 'productReturnFractionX',
]
SELLERX = ['sellerDailyOrdersX']

# One Happy/Unhappy/Unknown history column per grouping variable, grouped by variable.
historic_variable = ['transporterCode', 'sellerId', 'productGroup']
HISTORICX = [
    variable + suffix
    for variable in historic_variable
    for suffix in ('HistoricHappyX', 'HistoricUnhappyX', 'HistoricUnknownX')
]

# ---- Determinants ----
DETERMINANT = ['noReturn', 'noCase', 'noCancellation', 'onTimeDelivery']

# ---- Classification targets ----
CLASSIFICATION = [
    'generalMatchClassification', 'detailedMatchClassification',
    'binaryMatchClassification', 'determinantClassification',
]

In [353]:
# Feature columns: the concatenation of all fixed and dynamic column groups.
X_col = BASIC + WEEK + MONTH + YEAR + GROUP + TRANSPORTERX + KNOWNX + PRODUCTX + SELLERX + HISTORICX
# Target columns passed to functions.dataX.
# NOTE(review): classifyLabelsHC and HierarchicalClassification1.fit index y['binaryMatchClassification'],
# which is not listed here; confirm the intended targets before re-running the cells that use them.
Y_col = ['detailedMatchClassification','generalMatchClassification']

#### Sample data

In [367]:
# Work on a reproducible random subsample of df.
SAMPLE_SIZE = 500000   # maximum number of rows to keep
SAMPLE_SEED = 1        # fixed seed so the same rows are drawn on every run
# Cap the request at len(df): sampling without replacement raises ValueError when n exceeds the frame length.
df_ = df.sample(n = min(SAMPLE_SIZE, len(df)), replace = False, random_state = SAMPLE_SEED)

### Hierarchical Classification Models

In [319]:
# classifier = LogisticRegression(random_state=0, class_weight='balanced')
# classifier = RandomForestClassifier(random_state=0, class_weight='balanced', n_estimators = 10)
# classifier = svm.LinearSVC(C=1, penalty="l1", dual=False, class_weight = 'balanced')
# classifier = HistGradientBoostingClassifier(random_state=0)
# classifier = DecisionTreeClassifier(random_state=0, max_depth=10, class_weight='balanced')

In [392]:
# Label tree: ORDERS -> {UNKNOWN, KNOWN}, KNOWN -> {HAPPY, UNHAPPY}, UNHAPPY -> three severity labels.
ch = ClassHierarchy('ORDERS')
for parent_label, child_labels in [
        ('ORDERS',  ['UNKNOWN','KNOWN']),
        ('KNOWN',   ['HAPPY','UNHAPPY']),
        ('UNHAPPY', ['MILDLY UNHAPPY','MEDIUM UNHAPPY','HEAVILY UNHAPPY'])]:
    ch.add_node(child_labels, parent_label)

# Attach one HistGradientBoostingClassifier to every parent node of the tree.
HC = HierarchicalClassifier(ch)
HC.fit_classifiers({stage_label: HistGradientBoostingClassifier(random_state=0)
                    for stage_label in ('ORDERS', 'KNOWN', 'UNHAPPY')})

In [374]:
# ch = ClassHierarchy('ORDERS')
# ch.add_node(['OTHER','UNHAPPY'], 'ORDERS')
# ch.add_node(['HAPPY','UNKNOWN'], 'OTHER')
# ch.add_node(['MILDLY UNHAPPY','MEDIUM UNHAPPY','HEAVILY UNHAPPY'], 'UNHAPPY')

# HC = HierarchicalClassifier(ch)
# HC.fit_classifiers({'ORDERS'  : HistGradientBoostingClassifier(random_state=0),
#                     'OTHER'   : HistGradientBoostingClassifier(random_state=0),
#                     'UNHAPPY' : HistGradientBoostingClassifier(random_state=0)})

#### Single fit single point in time

In [364]:
# Build features and targets with functions.dataX. The trailing 5 sits where the
# "Cross-validation over time" loop passes DAYS — presumably a number of days; confirm in functions.py.
X, y = functions.dataX(df_, DATE, X_col, Y_col, historic_variable, 5)
# Positional row numbers, split together with X and y so each subset's original positions are available.
index = range(0, X.shape[0])
# shuffle=False keeps the row order: the first 80% of rows become the training set, the last 20% the test set.
X_train, X_test, y_train, y_test, ix_train, ix_test = train_test_split(X, y, index, test_size=0.2, random_state=0, shuffle=False)

In [395]:
# Fit every stage of the hierarchy on the detailed labels, then predict one label per test row.
HC = HC.fit(X_train,y_train['detailedMatchClassification'])
pred = HC.predict(X_test)

In [418]:
# Print the per-class precision / recall / F1 report for the detailed labels of the test split.
class_report(y_test['detailedMatchClassification'], pred)

                 precision    recall  f1-score   support

          HAPPY       0.97      1.00      0.98     27774
HEAVILY UNHAPPY       0.97      0.43      0.60       535
 MEDIUM UNHAPPY       0.83      0.56      0.67       956
 MILDLY UNHAPPY       0.94      0.74      0.83      4453
        UNKNOWN       0.94      0.99      0.97     16282

       accuracy                           0.96     50000
      macro avg       0.93      0.74      0.81     50000
   weighted avg       0.96      0.96      0.95     50000



In [420]:
# Returns (accuracy, precision, recall, F1); the last three are macro-averaged by default.
global_scores(y_test['detailedMatchClassification'], pred)

(0.95772, 0.9309057535403497, 0.7443621310561637, 0.8094917880793103)

In [415]:
# Returns per-class precision, recall and F1 arrays, ordered by the sorted unique true labels.
local_scores(y_test['detailedMatchClassification'], pred)

(array([0.97086018, 0.96666667, 0.83463339, 0.93810335, 0.94426518]),
 array([0.99805574, 0.43364486, 0.55962343, 0.7419717 , 0.98851492]),
 array([0.98427014, 0.59870968, 0.67000626, 0.82858934, 0.96588352]))

In [407]:
# Hierarchical precision: overlap of the {label + ancestors} sets divided by the total size of the predicted sets.
precision_score_ancestors(ch, y_test['detailedMatchClassification'], pred)

0.9728588108702122

In [409]:
# Hierarchical recall: overlap of the {label + ancestors} sets divided by the total size of the true sets.
recall_score_ancestors(ch, y_test['detailedMatchClassification'], pred)

0.9478597399121144

In [411]:
# Hierarchical F1: combines the ancestor-based precision and recall with beta = 1.
f1_score_ancestors(ch, y_test['detailedMatchClassification'], pred)

0.9601965879561631

In [326]:
# clf = HierarchicalClassification1(classifier1,classifier2,classifier3)
# clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(metrics.classification_report(y_test['detailedMatchClassification'], predictions['detailedPrediction']))
# print(metrics.classification_report(y_test['generalMatchClassification'], predictions['generalPrediction']))
# print(metrics.classification_report(y_test['detailedMatchClassification'], predictions['detailedPrediction']))

In [None]:
# a = clf.predict_proba(X_test, y_test, 0.6)
# comb = pd.concat([y_test['generalMatchClassification'], a['generalPrediction']],axis=1).dropna()
# print(metrics.classification_report(comb['generalMatchClassification'], comb['generalPrediction']))

#### Cross-validation single point in time

In [56]:
# Rebuild X and y with 0 as the last argument of functions.dataX (the "Cross-validation over time" loop passes DAYS there).
X, y = functions.dataX(df_, DATE, X_col, Y_col, historic_variable, 0)

In [65]:
# NOTE(review): classifier1, classifier2 and classifier3 are not assigned in any visible cell — define them before running this.
clf = HierarchicalClassification1(classifier1,classifier2,classifier3)
# Evaluate over 3 folds with the default split ('TimeSeries') and no feature scaling.
results = classifyLabelsHC(clf, X, y, 3, scale = None)
# Display the averaged metrics as a one-column table (one row per metric).
pd.DataFrame.from_dict(results, orient='index')

Unnamed: 0,0
binaryAccuracy,0.922259
binaryPrecision_KNOWN,0.936825
binaryRecall_KNOWN,0.951168
binaryF1_KNOWN,0.94393
binaryPrecision_UNKNOWN,0.887381
binaryRecall_UNKNOWN,0.857556
binaryF1_UNKNOWN,0.872166
generalAccuracy,0.83622
generalPrecision_HAPPY,0.840076
generalRecall_HAPPY,0.985269


#### Cross-validation over time

In [46]:
# Largest DAYS value evaluated; the loop below runs DAYS = 0 .. PREDICT_DAYS inclusive.
PREDICT_DAYS = 5
# NOTE(review): REP is not used below; the fold count passed to classifyLabelsHC is the literal 3.
REP = 3

# Metric dictionaries returned by classifyLabelsHC, keyed by DAYS.
resultDic = {}

# NOTE(review): classifier1, classifier2 and classifier3 are not assigned in any visible cell.
classifier = HierarchicalClassification1(classifier1,classifier2,classifier3)

for DAYS in range(PREDICT_DAYS+1):
    
    # Rebuild the data set for this DAYS value.
    X, y = functions.dataX(df_, DATE, X_col, Y_col, historic_variable, DAYS)

    # 3-fold evaluation with the default split ('TimeSeries').
    result = classifyLabelsHC(classifier, X, y, 3)

    resultDic[DAYS] = result
    
    print('DAYS: ',DAYS)

# One column per DAYS value, one row per metric.
RESULT = pd.DataFrame.from_dict(resultDic, orient='columns')
RESULT

DAYS:  0


Unnamed: 0,binaryAccuracy,binaryPrecision_KNOWN,binaryRecall_KNOWN,binaryF1_KNOWN,binaryPrecision_UNKNOWN,binaryRecall_UNKNOWN,binaryF1_UNKNOWN,generalAccuracy,generalPrecision_HAPPY,generalRecall_HAPPY,generalF1_HAPPY,generalPrecision_UNHAPPY,generalRecall_UNHAPPY,generalF1_UNHAPPY,generalPrecision_UNKNOWN,generalRecall_UNKNOWN,generalF1_UNKNOWN,detailedAccuracy,detailedPrecision_HAPPY,detailedRecall_HAPPY,detailedF1_HAPPY,detailedPrecision_HEAVILY UNHAPPY,detailedRecall_HEAVILY UNHAPPY,detailedF1_HEAVILY UNHAPPY,detailedPrecision_MEDIUM UNHAPPY,detailedRecall_MEDIUM UNHAPPY,detailedF1_MEDIUM UNHAPPY,detailedPrecision_MILDLY UNHAPPY,detailedRecall_MILDLY UNHAPPY,detailedF1_MILDLY UNHAPPY,detailedPrecision_UNKNOWN,detailedRecall_UNKNOWN,detailedF1_UNKNOWN
0,0.902832,0.964288,0.892245,0.926835,0.792926,0.925504,0.853988,0.759539,0.868611,0.793111,0.828991,0.170905,0.167677,0.169169,0.792926,0.925504,0.853988,0.751328,0.868611,0.793111,0.828991,0.033826,0.018343,0.02209,0.025296,0.035721,0.02935,0.134898,0.126293,0.130079,0.792926,0.925504,0.853988


#### Functions

In [419]:
def global_scores(y_true, y_pred, average = 'macro'):
    """
    Overall accuracy plus averaged precision, recall and F-score.

    Returns a 4-tuple (accuracy, precision, recall, f_score); the last three come from
    scikit-learn's precision_recall_fscore_support with the requested `average`.
    """
    overall_accuracy = metrics.accuracy_score(y_true, y_pred)
    precision, recall, f_score, _support = metrics.precision_recall_fscore_support(
        y_true, y_pred, average = average)
    return overall_accuracy, precision, recall, f_score

def local_scores(y_true, y_pred):
    """
    Per-class precision, recall and F1.

    Classes are the sorted unique values of `y_true`; each returned array has one
    entry per class, in that order.
    """
    observed_labels = np.unique(y_true)
    per_class = metrics.precision_recall_fscore_support(
        y_true, y_pred, labels = observed_labels, beta = 1, average = None)
    precision, recall, f_score = per_class[0], per_class[1], per_class[2]
    return precision, recall, f_score

def class_report(y_true, y_pred):
    """Print scikit-learn's text classification report for the given true and predicted labels."""
    report_text = metrics.classification_report(y_true, y_pred)
    print(report_text)

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0

def _aggregate_class_sets(set_function, y_true, y_pred):
    """
    Sum set sizes over all (true, predicted) label pairs.

    Every label is expanded to the set {label} | set_function(label); with
    `_get_ancestors` as set_function this is the label plus its ancestors.

    Returns (true_sum, predicted_sum, intersection_sum): the total size of the
    true sets, of the predicted sets, and of their pairwise intersections.
    """
    intersection_sum = 0
    true_sum = 0
    predicted_sum = 0
    for true, pred in zip(y_true, y_pred):
        true_set = {true, *set_function(true)}
        pred_set = {pred, *set_function(pred)}
        intersection_sum += len(true_set & pred_set)
        true_sum += len(true_set)
        predicted_sum += len(pred_set)
    return (true_sum, predicted_sum, intersection_sum)

def _fbeta_score_class_sets(set_function, y_true, y_pred, beta=1):
    """
    F-beta score over the expanded label sets.

    Returns 0.0 instead of raising ZeroDivisionError when the input is empty or
    when no true set overlaps its predicted set (precision and recall both 0).
    """
    true_sum, predicted_sum, intersection_sum = _aggregate_class_sets(set_function, y_true, y_pred)
    precision = _safe_ratio(intersection_sum, predicted_sum)
    recall = _safe_ratio(intersection_sum, true_sum)
    return _safe_ratio((beta ** 2 + 1) * precision * recall, (beta ** 2 * precision) + recall)

def precision_score_ancestors(class_hierarchy, y_true, y_pred):
    """Hierarchical precision: shared labels/ancestors divided by predicted labels/ancestors (0.0 for empty input)."""
    true_sum, predicted_sum, intersection_sum = _aggregate_class_sets(
        class_hierarchy._get_ancestors, y_true, y_pred)
    return _safe_ratio(intersection_sum, predicted_sum)

def recall_score_ancestors(class_hierarchy, y_true, y_pred):
    """Hierarchical recall: shared labels/ancestors divided by true labels/ancestors (0.0 for empty input)."""
    true_sum, predicted_sum, intersection_sum = _aggregate_class_sets(
        class_hierarchy._get_ancestors, y_true, y_pred)
    return _safe_ratio(intersection_sum, true_sum)

def f1_score_ancestors(class_hierarchy, y_true, y_pred):
    """Hierarchical F1 score built from the ancestor-based precision and recall."""
    return _fbeta_score_class_sets(class_hierarchy._get_ancestors, y_true, y_pred)

In [390]:
class ClassHierarchy:
    """
    Tree of class labels stored as a child -> parent mapping.

    `root` is the top label; `nodes` maps every registered child label to its parent.
    """

    def __init__(self, root):
        self.root = root
        self.nodes = {}

    def add_node(self, children, parent):
        """Register every label in `children` as a direct child of `parent`."""
        self.nodes.update({child: parent for child in children})

    def _get_children(self, parent):
        """Return the direct children of `parent`, sorted."""
        return sorted(label for label, owner in self.nodes.items() if owner == parent)

    def _get_parent(self, child):
        """Return the parent of `child`; the root and unregistered labels map to the root."""
        registered = child in self.nodes
        if registered and child != self.root:
            return self.nodes[child]
        return self.root

    def _get_ancestors(self, child):
        """Return the ancestors of `child`, nearest first, excluding both the child and the root."""
        lineage = []
        node = self._get_parent(child)
        while node != self.root:
            lineage.append(node)
            node = self._get_parent(node)
        return lineage

In [372]:
class HierarchicalClassifier:
    """
    Hierarchical classifier with one local classifier per parent node.

    A "stage" is created for every node of the class hierarchy that has children.
    Each stage holds a classifier that chooses among that node's children; at
    prediction time rows are routed from the root down the tree.

    Usage: construct with a class hierarchy, attach classifiers with
    `fit_classifiers`, then call `fit` and `predict`.
    """

    def __init__(self, class_hierarchy):
        # stage name (parent label) -> dict with 'depth', 'labels', 'classes', 'target'
        # and, once assigned, 'classifier'
        self.stages = {}
        self.class_hierarchy = class_hierarchy
        self._create_stages(self.stages, self.class_hierarchy.root, 0)

    def _create_stages(self, stages, parent, depth):
        """Recursively add a stage for `parent` and for every descendant that has children."""
        children = self.class_hierarchy._get_children(parent)

        if len(children) > 0:
            stages[parent] = {
                'depth': depth,
                'labels': children,
                # The parent itself is kept as a class for rows labelled at the parent level.
                'classes': children + [parent],
                'target': 'target_stage_' + parent,
            }

            for node in children:
                self._create_stages(stages, node, depth + 1)

    def _recode_label(self, classes, label):
        """Walk `label` up the hierarchy until it is one of `classes` or the root is reached."""
        while label != self.class_hierarchy.root and label not in classes:
            label = self.class_hierarchy._get_parent(label)
        return label

    def _prep_data(self, X, y):
        """
        Combine X and y and add one recoded target column per stage.

        Returns (df, Xcols): `df` has the features in integer-named columns
        0..n_features-1, the label in column n_features, and one
        'target_stage_<name>' column per stage; `Xcols` is the feature column range.
        """
        Xcols = range(0, X.shape[1])
        Ycol = X.shape[1]

        df = pd.concat([X, y], axis=1, ignore_index=True)
        labels = df[Ycol]
        # Series.map recodes label by label; far cheaper than a row-wise DataFrame.apply.
        for stage_info in self.stages.values():
            classes = stage_info['classes']
            df[stage_info['target']] = labels.map(
                lambda label, classes=classes: self._recode_label(classes, label))
        return df, Xcols

    def fit_classifiers(self, classifiers):
        """
        Assign an (unfitted) classifier to each stage.

        `classifiers` maps stage name -> classifier and must have exactly the
        stage names as keys; otherwise ValueError is raised.
        """
        if classifiers.keys() != self.stages.keys():
            # Join the stage names explicitly: concatenating str + dict_keys raised
            # TypeError and hid the intended ValueError.
            raise ValueError('Your assigned classifiers do not match the stages of the hierarchy, fit a classifier to each of: '
                             + ', '.join(str(stage) for stage in self.stages))
        for stage, classifier in classifiers.items():
            self.stages[stage]['classifier'] = classifier

    def fit(self, X, y):
        """
        Build a multi-classifier from training data (X, y).

        Each stage classifier is fitted on the rows whose recoded label belongs to
        that stage. `X` is a DataFrame and `y` a Series/DataFrame sharing its index.
        Returns self.
        """
        df, Xcols = self._prep_data(X, y)
        feature_columns = list(Xcols)

        for stage_name, stage_info in self.stages.items():
            in_stage = df[stage_info['target']].isin(stage_info['classes'])

            X_train = df.loc[in_stage, feature_columns]
            # Restore the caller's column names so fit and predict see the same features.
            X_train.columns = X.columns
            # 1-D target, as classifiers expect (a one-column frame triggers conversion warnings).
            y_train = df.loc[in_stage, stage_info['target']]

            # NOTE: a stage with no training rows is passed through unchanged; the classifier decides how to fail.
            stage_info['classifier'] = stage_info['classifier'].fit(X_train, y_train)

        return self

    def _predict_stages(self, X):
        """
        Route rows through the stages and return one column per stage.

        Column k holds the most specific label assigned to each row after the
        first k+1 stages, so the last column is the final prediction.
        """
        root = self.class_hierarchy.root
        # Every row starts at the root and is refined stage by stage.
        current = pd.Series([root] * len(X), index=X.index, dtype=object)
        stage_predictions = {}

        for stage_name, stage_info in self.stages.items():
            current = current.copy()
            routed = current == stage_name
            # Skip stages that received no rows: predicting on an empty frame raises in scikit-learn.
            if routed.any():
                current.loc[routed] = stage_info['classifier'].predict(X[routed])
            stage_predictions[stage_name] = current

        if not stage_predictions:
            # Hierarchy without children: every row keeps the root label.
            stage_predictions[root] = current
        return pd.DataFrame(stage_predictions)

    def predict(self, X):
        """Return the final predicted label for every row of X as a Series indexed like X."""
        y_hat = self._predict_stages(X)
        # Return only final predicted class
        return y_hat.iloc[:, y_hat.shape[1] - 1]

In [49]:
class HierarchicalClassification1:
    """
    Hierarchical classification using the tree: unknown - known -> happy - unhappy -> mildly - medium - heavily
    Input: 3 classifiers, one for each parent node

    Layer 1 separates KNOWN from UNKNOWN, layer 2 HAPPY from UNHAPPY (fitted on
    KNOWN rows only), layer 3 the detailed classes (fitted on UNHAPPY rows only).
    """
    def __init__(self, classifier1, classifier2, classifier3):
        self.clf1 = classifier1
        self.clf2 = classifier2
        self.clf3 = classifier3

    def fit(self, X_train, y_train):
        """
        Fit the three layer classifiers.

        `y_train` must contain the columns 'binaryMatchClassification',
        'generalMatchClassification' and 'detailedMatchClassification' and share
        its index with `X_train`. Returns self.
        """
        X_train_1 = X_train
        y_train_1 = y_train['binaryMatchClassification']

        index_train_2 = y_train.loc[(y_train['binaryMatchClassification'] == 'KNOWN')].index
        X_train_2 = X_train.loc[index_train_2]
        y_train_2 = y_train['generalMatchClassification'].loc[index_train_2]

        index_train_3 = y_train.loc[(y_train['generalMatchClassification'] == 'UNHAPPY')].index
        X_train_3 = X_train.loc[index_train_3]
        y_train_3 = y_train['detailedMatchClassification'].loc[index_train_3]

        # Fit the classifiers handed to the constructor, not module-level globals.
        self.clf1 = self.clf1.fit(X_train_1, y_train_1)
        self.clf2 = self.clf2.fit(X_train_2, y_train_2)
        self.clf3 = self.clf3.fit(X_train_3, y_train_3)

        return self

    def predict(self, X_test):
        """
        Predict all three label levels for every row of X_test.

        Returns a DataFrame with columns 'binaryPrediction', 'generalPrediction'
        and 'detailedPrediction', indexed like X_test when it has an index.
        """
        predictions = pd.DataFrame({
            'predLayer1': self.clf1.predict(X_test),
            'predLayer2': self.clf2.predict(X_test),
            'predLayer3': self.clf3.predict(X_test),
        }, index = getattr(X_test, 'index', None))

        return self.labelPredictions(predictions)

    def labelPredictions(self, predictions):
        """
        Combine the raw per-layer predictions into consistent labels per level.

        A row predicted UNKNOWN at layer 1 is UNKNOWN at every level; a row
        predicted HAPPY at layer 2 is HAPPY at the detailed level.
        """
        # Boolean masks stay correct even when the index contains duplicate labels.
        is_happy = predictions['predLayer2'] == 'HAPPY'
        is_unknown = predictions['predLayer1'] == 'UNKNOWN'

        predictions['binaryPrediction'] = predictions['predLayer1']

        predictions['generalPrediction'] = predictions['predLayer2']
        predictions.loc[is_unknown, 'generalPrediction'] = 'UNKNOWN'

        predictions['detailedPrediction'] = predictions['predLayer3']
        predictions.loc[is_happy, 'detailedPrediction'] = 'HAPPY'
        # UNKNOWN is applied last so it overrides HAPPY.
        predictions.loc[is_unknown, 'detailedPrediction'] = 'UNKNOWN'

        return predictions[['binaryPrediction','generalPrediction','detailedPrediction']]

    def predict_proba(self, X_test, threshold = 0.5):
        """
        Class probabilities of all three layers for every row of X_test.

        Returns a DataFrame with one column per layer and class, named
        'probLayer<k>_<class>', indexed like X_test when it has an index.
        `threshold` is forwarded to labelPredictionsProba.
        """
        index = getattr(X_test, 'index', None)
        frames = []
        for layer, clf in enumerate((self.clf1, self.clf2, self.clf3), start = 1):
            headings = ['probLayer{}_{}'.format(layer, label) for label in clf.classes_]
            frames.append(pd.DataFrame(clf.predict_proba(X_test), columns = headings, index = index))

        predictions = pd.concat(frames, axis = 1)

        return self.labelPredictionsProba(predictions, threshold)

    def labelPredictionsProba(self, predictions, threshold):
        """
        Placeholder for threshold-based labelling; currently returns `predictions` unchanged.

        TODO: the draft below (kept from the original) sketches the intended logic.
        """
#         X_test_1 = X_test
#         class_1 = self.clf1.classes_
#         knownIndex = np.where(class_1 == 'KNOWN')[0][0]
#         predictions['probKnown'] = self.clf1.predict_proba(X_test_1)[:,knownIndex]

#         index_test_2 = predictions.loc[(predictions['probKnown'] >= threshold)].index
#         X_test_2 = X_test.loc[index_test_2]
#         class_2 = self.clf2.classes_
#         unhappyIndex = np.where(class_2 == 'UNHAPPY')[0][0]
#         predictions.loc[index_test_2, 'probUnhappy'] = self.clf2.predict_proba(X_test_2)[:,unhappyIndex]

#         index_test_3 = predictions.loc[(predictions['probUnhappy'] >= threshold)].index
#         X_test_3 = X_test.loc[index_test_3]
#         predictions.loc[index_test_3, 'predLayer3'] = self.clf3.predict(X_test_3)

#         predictions.loc[predictions['probKnown'] >= threshold, 'binaryPrediction'] = 'KNOWN'
#         predictions.loc[predictions['probKnown'] <= (1-threshold), 'binaryPrediction'] = 'UNKNOWN'

#         predictions.loc[predictions['probUnhappy'] >= threshold, 'generalPrediction'] = 'UNHAPPY'
#         predictions.loc[predictions['probUnhappy'] <= (1-threshold), 'generalPrediction'] = 'HAPPY'
#         predictions.loc[predictions['binaryPrediction'] == 'UNKNOWN', 'generalPrediction'] = 'UNKNOWN'

#         predictions['detailedPrediction'] = predictions['predLayer3']
#         predictions.loc[predictions['probUnhappy'] <= (1-threshold), 'detailedPrediction'] = 'HAPPY'
#         predictions.loc[predictions['probKnown'] <= (1-threshold), 'detailedPrediction'] = 'UNKNOWN'

        return predictions

In [45]:
def classifyLabelsHC(classifier, X, y, n, split = 'TimeSeries', smote = False, scale = None, NN = False):
    """
    Cross-validate a three-level hierarchical classifier and average its metrics.

    Parameters
    ----------
    classifier : object with fit(X, y) and predict(X); predict must return a frame with the
                 columns 'binaryPrediction', 'generalPrediction' and 'detailedPrediction'.
    X          : feature DataFrame.
    y          : DataFrame with the columns 'binaryMatchClassification',
                 'generalMatchClassification' and 'detailedMatchClassification'.
    n          : number of cross-validation splits.
    split      : 'Random' for a shuffled StratifiedKFold (stratified on the detailed labels);
                 any other value uses TimeSeriesSplit.
    scale      : any value other than None enables min-max scaling of the features.
    smote, NN  : currently unused; kept for interface compatibility.

    Returns
    -------
    dict mapping '<level>Accuracy', '<level>Precision_<label>', '<level>Recall_<label>' and
    '<level>F1_<label>' to the mean over the n folds.
    """
    levels = ['binary','general','detailed']

    # The label sets are fold-independent, so compute them once.
    label_sets = {level: np.unique(y[level+'MatchClassification']) for level in levels}
    storage = {level: {'acc': [], 'pre': [], 'rec': [], 'f1': []} for level in levels}

    if split == 'Random':
        cv = StratifiedKFold(n_splits = n, random_state = 0, shuffle = True)
        # StratifiedKFold.split requires the labels to stratify on.
        folds = cv.split(X, y['detailedMatchClassification'])
    else:
        cv = TimeSeriesSplit(n_splits = n)
        folds = cv.split(X)

    for train_index, test_index in folds:

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        if scale is not None:
            # Fit the scaler on the training fold only so no test information leaks in;
            # keep index and column names so X stays aligned with y.
            scaler = preprocessing.MinMaxScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), index = X_train.index, columns = X_train.columns)
            X_test = pd.DataFrame(scaler.transform(X_test), index = X_test.index, columns = X_test.columns)

        classifier.fit(X_train,y_train)
        predictions = classifier.predict(X_test)

        #Calculate performance metrics for each level
        for level in levels:

            y_pred_ = predictions[level+'Prediction']
            y_test_ = y_test[level+'MatchClassification']

            accuracy = metrics.accuracy_score(y_test_, y_pred_)
            scores = metrics.precision_recall_fscore_support(y_test_, y_pred_, average = None,
                                                             labels = label_sets[level], beta = 1)

            storage[level]['acc'].append(accuracy)
            storage[level]['pre'].append(scores[0])
            storage[level]['rec'].append(scores[1])
            storage[level]['f1'].append(scores[2])

    #Calculate averages of each metric
    results = {}
    for level in levels:

        results[level+'Accuracy'] = sum(storage[level]['acc']) / n

        # Per-label mean arrays, computed once per level instead of once per label.
        mean_precision = sum(storage[level]['pre']) / n
        mean_recall = sum(storage[level]['rec']) / n
        mean_f1 = sum(storage[level]['f1']) / n

        for ix,lab in enumerate(label_sets[level]):
            results[(level+'Precision_'+lab)] = mean_precision[ix]
            results[(level+'Recall_'+lab)] = mean_recall[ix]
            results[(level+'F1_'+lab)] = mean_f1[ix]

    return results

In [13]:
def neuralNetworkSetup(y, train_index, classifier = None):
    """
    Prepare one-hot training targets and balanced class weights for a neural network.

    Parameters
    ----------
    y           : Series of class labels for all rows.
    train_index : positional indices of the training rows.
    classifier  : estimator whose `class_weight` parameter is set through set_params.
                  When omitted, the module-level `clf` is used, as before.

    Returns
    -------
    DataFrame with the one-hot encoded targets of the training rows; column k
    corresponds to the k-th sorted unique label of `y`.
    """
    labels = np.unique(y)
    # label -> integer code, following the sorted label order.
    # Invert with dict(enumerate(labels)) to map predicted codes back to labels.
    label_int_mapping = {label: code for code, label in enumerate(labels)}

    y_encoded = y.map(label_int_mapping)
    y_dummy = pd.DataFrame(np_utils.to_categorical(y_encoded))

    # Keyword arguments: `classes` and `y` are keyword-only in current scikit-learn.
    class_weights = class_weight.compute_class_weight('balanced', classes = labels, y = y.iloc[train_index])
    class_weights = dict(enumerate(class_weights))

    # Fall back to the notebook-level `clf` only when no estimator is passed explicitly.
    target_classifier = clf if classifier is None else classifier
    target_classifier.set_params(class_weight = class_weights)

    y_train = y_dummy.iloc[train_index]

    return y_train